ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,129 @@
1
+ {
2
+ "description": "ECIP Knowledge Store (M03) — Cache hit rate by type and repo, Redis/pgvector performance",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "Cache Performance",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "Cache Hit Rate",
13
+ "type": "gauge",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "percentunit",
19
+ "min": 0,
20
+ "max": 1,
21
+ "thresholds": {
22
+ "mode": "absolute",
23
+ "steps": [
24
+ { "color": "red", "value": null },
25
+ { "color": "yellow", "value": 0.6 },
26
+ { "color": "green", "value": 0.8 }
27
+ ]
28
+ }
29
+ }
30
+ },
31
+ "targets": [
32
+ {
33
+ "expr": "cache_hit_rate{job=\"ecip-knowledge-store\", repo=~\"$repo\"}",
34
+ "legendFormat": "{{cache_type}} — {{repo}}"
35
+ }
36
+ ]
37
+ },
38
+ {
39
+ "title": "Cache Hit Rate Over Time",
40
+ "type": "timeseries",
41
+ "datasource": "Prometheus",
42
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
43
+ "fieldConfig": {
44
+ "defaults": { "unit": "percentunit" }
45
+ },
46
+ "targets": [
47
+ {
48
+ "expr": "cache_hit_rate{job=~\"ecip-knowledge-store|ecip-query-service\", repo=~\"$repo\"}",
49
+ "legendFormat": "{{cache_type}} — {{repo}}"
50
+ }
51
+ ]
52
+ },
53
+ {
54
+ "title": "Knowledge Store Write Duration (p95)",
55
+ "type": "timeseries",
56
+ "datasource": "Prometheus",
57
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
58
+ "fieldConfig": {
59
+ "defaults": {
60
+ "unit": "ms",
61
+ "thresholds": {
62
+ "mode": "absolute",
63
+ "steps": [
64
+ { "color": "green", "value": null },
65
+ { "color": "yellow", "value": 150 },
66
+ { "color": "red", "value": 200 }
67
+ ]
68
+ }
69
+ }
70
+ },
71
+ "targets": [
72
+ {
73
+ "expr": "histogram_quantile(0.95, sum(rate(knowledge_store_write_duration_ms_bucket[5m])) by (le, store_type))",
74
+ "legendFormat": "p95 — {{store_type}}"
75
+ }
76
+ ]
77
+ },
78
+ {
79
+ "title": "pgvector Write Duration (p95)",
80
+ "type": "timeseries",
81
+ "datasource": "Prometheus",
82
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
83
+ "fieldConfig": {
84
+ "defaults": { "unit": "ms" }
85
+ },
86
+ "targets": [
87
+ {
88
+ "expr": "histogram_quantile(0.95, sum(rate(knowledge_store_write_duration_ms_bucket{store_type=\"pgvector\"}[5m])) by (le, namespace))",
89
+ "legendFormat": "p95 — {{namespace}}"
90
+ }
91
+ ]
92
+ },
93
+ {
94
+ "title": "HNSW Rebuild Duration",
95
+ "type": "timeseries",
96
+ "datasource": "Prometheus",
97
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
98
+ "fieldConfig": {
99
+ "defaults": { "unit": "ms" }
100
+ },
101
+ "targets": [
102
+ {
103
+ "expr": "histogram_quantile(0.95, sum(rate(hnsw_rebuild_duration_ms_bucket[5m])) by (le, repo))",
104
+ "legendFormat": "{{repo}}"
105
+ }
106
+ ]
107
+ }
108
+ ],
109
+ "refresh": "30s",
110
+ "schemaVersion": 39,
111
+ "tags": ["ecip", "m03", "knowledge-store", "cache"],
112
+ "templating": {
113
+ "list": [
114
+ {
115
+ "name": "repo",
116
+ "type": "query",
117
+ "datasource": "Prometheus",
118
+ "query": "label_values(cache_hit_rate, repo)",
119
+ "refresh": 2,
120
+ "includeAll": true,
121
+ "multi": true
122
+ }
123
+ ]
124
+ },
125
+ "time": { "from": "now-1h", "to": "now" },
126
+ "title": "ECIP — Cache Performance",
127
+ "uid": "ecip-cache-performance",
128
+ "version": 1
129
+ }
@@ -0,0 +1,93 @@
1
+ {
2
+ "description": "ECIP Query Service (M04) — Cross-repo fan-out depth distribution, cycle warnings",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "Cross-Repo Fan-out",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "Fan-out Depth Distribution",
13
+ "type": "histogram",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": { "unit": "short" }
18
+ },
19
+ "targets": [
20
+ {
21
+ "expr": "sum(rate(cross_repo_fanout_count_bucket[5m])) by (le, depth)",
22
+ "legendFormat": "depth={{depth}}"
23
+ }
24
+ ]
25
+ },
26
+ {
27
+ "title": "Fan-out Depth Over Time",
28
+ "type": "timeseries",
29
+ "datasource": "Prometheus",
30
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
31
+ "fieldConfig": {
32
+ "defaults": { "unit": "short" }
33
+ },
34
+ "targets": [
35
+ {
36
+ "expr": "histogram_quantile(0.95, sum(rate(cross_repo_fanout_count_bucket[5m])) by (le))",
37
+ "legendFormat": "p95 depth"
38
+ },
39
+ {
40
+ "expr": "histogram_quantile(0.50, sum(rate(cross_repo_fanout_count_bucket[5m])) by (le))",
41
+ "legendFormat": "p50 depth"
42
+ }
43
+ ]
44
+ },
45
+ {
46
+ "title": "Fan-out by Repo (top 10)",
47
+ "type": "bargauge",
48
+ "datasource": "Prometheus",
49
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
50
+ "fieldConfig": {
51
+ "defaults": { "unit": "short" }
52
+ },
53
+ "targets": [
54
+ {
55
+ "expr": "topk(10, sum(rate(cross_repo_fanout_count_count[5m])) by (repo))",
56
+ "legendFormat": "{{repo}}",
57
+ "instant": true
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "title": "Depth > 2 Warnings (Cycle Risk)",
63
+ "type": "timeseries",
64
+ "datasource": "Prometheus",
65
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
66
+ "fieldConfig": {
67
+ "defaults": {
68
+ "unit": "short",
69
+ "thresholds": {
70
+ "mode": "absolute",
71
+ "steps": [
72
+ { "color": "green", "value": null },
73
+ { "color": "red", "value": 1 }
74
+ ]
75
+ }
76
+ }
77
+ },
78
+ "targets": [
79
+ {
80
+ "expr": "sum(rate(cross_repo_fanout_count_bucket{le=\"+Inf\"}[5m])) - sum(rate(cross_repo_fanout_count_bucket{le=\"2\"}[5m]))",
81
+ "legendFormat": "Depth > 2 queries"
82
+ }
83
+ ]
84
+ }
85
+ ],
86
+ "refresh": "30s",
87
+ "schemaVersion": 39,
88
+ "tags": ["ecip", "m04", "cross-repo", "fanout"],
89
+ "time": { "from": "now-6h", "to": "now" },
90
+ "title": "ECIP — Cross-Repo Fan-out",
91
+ "uid": "ecip-cross-repo-fanout",
92
+ "version": 1
93
+ }
@@ -0,0 +1,129 @@
1
+ {
2
+ "description": "ECIP Event Bus (M07) — DLQ depth, DLQ age, retry counts",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "Event Bus DLQ",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "DLQ Depth by Topic",
13
+ "type": "stat",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "short",
19
+ "thresholds": {
20
+ "mode": "absolute",
21
+ "steps": [
22
+ { "color": "green", "value": null },
23
+ { "color": "yellow", "value": 50 },
24
+ { "color": "red", "value": 100 }
25
+ ]
26
+ }
27
+ }
28
+ },
29
+ "targets": [
30
+ {
31
+ "expr": "event_bus_dlq_depth",
32
+ "legendFormat": "{{topic}}"
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "title": "DLQ Depth Over Time",
38
+ "type": "timeseries",
39
+ "datasource": "Prometheus",
40
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
41
+ "fieldConfig": {
42
+ "defaults": { "unit": "short" }
43
+ },
44
+ "targets": [
45
+ {
46
+ "expr": "event_bus_dlq_depth",
47
+ "legendFormat": "{{topic}}"
48
+ }
49
+ ]
50
+ },
51
+ {
52
+ "title": "Webhook Processing Latency (p95)",
53
+ "type": "timeseries",
54
+ "datasource": "Prometheus",
55
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
56
+ "fieldConfig": {
57
+ "defaults": { "unit": "ms" }
58
+ },
59
+ "targets": [
60
+ {
61
+ "expr": "histogram_quantile(0.95, sum(rate(webhook_processing_duration_ms_bucket[5m])) by (le, topic))",
62
+ "legendFormat": "p95 — {{topic}}"
63
+ }
64
+ ]
65
+ },
66
+ {
67
+ "title": "Kafka Consumer Lag",
68
+ "type": "timeseries",
69
+ "datasource": "Prometheus",
70
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
71
+ "fieldConfig": {
72
+ "defaults": { "unit": "short" }
73
+ },
74
+ "targets": [
75
+ {
76
+ "expr": "sum(kafka_consumergroup_lag{group=~\"ecip-event-bus.*\"}) by (topic)",
77
+ "legendFormat": "{{topic}}"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "title": "DLQ Retry Counts",
83
+ "type": "timeseries",
84
+ "datasource": "Prometheus",
85
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
86
+ "fieldConfig": {
87
+ "defaults": { "unit": "short" }
88
+ },
89
+ "targets": [
90
+ {
91
+ "expr": "sum(rate(event_bus_dlq_retries_total[5m])) by (topic)",
92
+ "legendFormat": "{{topic}}"
93
+ }
94
+ ]
95
+ },
96
+ {
97
+ "title": "DLQ Message Age (oldest)",
98
+ "type": "stat",
99
+ "datasource": "Prometheus",
100
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
101
+ "fieldConfig": {
102
+ "defaults": {
103
+ "unit": "s",
104
+ "thresholds": {
105
+ "mode": "absolute",
106
+ "steps": [
107
+ { "color": "green", "value": null },
108
+ { "color": "yellow", "value": 3600 },
109
+ { "color": "red", "value": 86400 }
110
+ ]
111
+ }
112
+ }
113
+ },
114
+ "targets": [
115
+ {
116
+ "expr": "max(event_bus_dlq_oldest_message_age_seconds) by (topic)",
117
+ "legendFormat": "{{topic}}"
118
+ }
119
+ ]
120
+ }
121
+ ],
122
+ "refresh": "30s",
123
+ "schemaVersion": 39,
124
+ "tags": ["ecip", "m07", "event-bus", "dlq"],
125
+ "time": { "from": "now-3h", "to": "now" },
126
+ "title": "ECIP — Event Bus DLQ",
127
+ "uid": "ecip-event-bus-dlq",
128
+ "version": 1
129
+ }
@@ -0,0 +1,104 @@
1
+ {
2
+ "description": "ECIP Analysis Engine (M02) — LSP daemon status, restart rate, OOM events",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "LSP Daemon Health",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "LSP Daemon Restart Rate (/hour)",
13
+ "type": "stat",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "short",
19
+ "thresholds": {
20
+ "mode": "absolute",
21
+ "steps": [
22
+ { "color": "green", "value": null },
23
+ { "color": "yellow", "value": 1 },
24
+ { "color": "red", "value": 2 }
25
+ ]
26
+ }
27
+ }
28
+ },
29
+ "targets": [
30
+ {
31
+ "expr": "sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)",
32
+ "legendFormat": "{{repo}} — {{language}}"
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "title": "LSP Daemon Restarts Over Time",
38
+ "type": "timeseries",
39
+ "datasource": "Prometheus",
40
+ "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
41
+ "fieldConfig": {
42
+ "defaults": { "unit": "short" }
43
+ },
44
+ "targets": [
45
+ {
46
+ "expr": "sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)",
47
+ "legendFormat": "{{repo}} — {{language}}"
48
+ }
49
+ ]
50
+ },
51
+ {
52
+ "title": "LSP Daemon Memory Usage",
53
+ "type": "timeseries",
54
+ "datasource": "Prometheus",
55
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
56
+ "fieldConfig": {
57
+ "defaults": { "unit": "bytes" }
58
+ },
59
+ "targets": [
60
+ {
61
+ "expr": "container_memory_working_set_bytes{pod=~\"lsp-daemon.*\"}",
62
+ "legendFormat": "{{pod}}"
63
+ }
64
+ ]
65
+ },
66
+ {
67
+ "title": "LSP Daemon CPU Usage",
68
+ "type": "timeseries",
69
+ "datasource": "Prometheus",
70
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
71
+ "fieldConfig": {
72
+ "defaults": { "unit": "short" }
73
+ },
74
+ "targets": [
75
+ {
76
+ "expr": "rate(container_cpu_usage_seconds_total{pod=~\"lsp-daemon.*\"}[5m])",
77
+ "legendFormat": "{{pod}}"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "title": "OOM Kill Events",
83
+ "type": "timeseries",
84
+ "datasource": "Prometheus",
85
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
86
+ "fieldConfig": {
87
+ "defaults": { "unit": "short" }
88
+ },
89
+ "targets": [
90
+ {
91
+ "expr": "sum(increase(kube_pod_container_status_last_terminated_reason{container=~\"lsp-daemon.*\", reason=\"OOMKilled\"}[1h])) by (pod)",
92
+ "legendFormat": "OOM — {{pod}}"
93
+ }
94
+ ]
95
+ }
96
+ ],
97
+ "refresh": "30s",
98
+ "schemaVersion": 39,
99
+ "tags": ["ecip", "m02", "lsp-daemon", "health"],
100
+ "time": { "from": "now-3h", "to": "now" },
101
+ "title": "ECIP — LSP Daemon Health",
102
+ "uid": "ecip-lsp-daemon-health",
103
+ "version": 1
104
+ }
@@ -0,0 +1,114 @@
1
+ {
2
+ "description": "ECIP Query Service (M04) + MCP Server (M05) — Fan-out topology, latency per target_repo",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "MCP Call Graph",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "MCP Call Duration p95 by Tool",
13
+ "type": "timeseries",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "ms",
19
+ "thresholds": {
20
+ "mode": "absolute",
21
+ "steps": [
22
+ { "color": "green", "value": null },
23
+ { "color": "yellow", "value": 500 },
24
+ { "color": "red", "value": 800 }
25
+ ]
26
+ }
27
+ }
28
+ },
29
+ "targets": [
30
+ {
31
+ "expr": "histogram_quantile(0.95, sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, tool_name))",
32
+ "legendFormat": "{{tool_name}}"
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "title": "MCP Call Duration p95 by Target Repo",
38
+ "type": "timeseries",
39
+ "datasource": "Prometheus",
40
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
41
+ "fieldConfig": {
42
+ "defaults": { "unit": "ms" }
43
+ },
44
+ "targets": [
45
+ {
46
+ "expr": "histogram_quantile(0.95, sum(rate(mcp_call_duration_ms_bucket{target_repo=~\"$target_repo\"}[5m])) by (le, target_repo))",
47
+ "legendFormat": "{{target_repo}}"
48
+ }
49
+ ]
50
+ },
51
+ {
52
+ "title": "MCP Call Rate",
53
+ "type": "timeseries",
54
+ "datasource": "Prometheus",
55
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
56
+ "fieldConfig": {
57
+ "defaults": { "unit": "reqps" }
58
+ },
59
+ "targets": [
60
+ {
61
+ "expr": "sum(rate(mcp_call_duration_ms_count[5m])) by (tool_name)",
62
+ "legendFormat": "{{tool_name}}"
63
+ }
64
+ ]
65
+ },
66
+ {
67
+ "title": "MCP Auth Failure Rate",
68
+ "type": "timeseries",
69
+ "datasource": "Prometheus",
70
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
71
+ "fieldConfig": {
72
+ "defaults": { "unit": "short" }
73
+ },
74
+ "targets": [
75
+ {
76
+ "expr": "sum(rate(auth_failure_total{module=\"M05\"}[5m]))",
77
+ "legendFormat": "Auth Failures/s"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "title": "MCP Fan-out Trace View",
83
+ "type": "nodeGraph",
84
+ "datasource": "Tempo",
85
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 17 },
86
+ "targets": [
87
+ {
88
+ "queryType": "traceql",
89
+ "query": "{ span.ecip.module = \"M05\" }"
90
+ }
91
+ ]
92
+ }
93
+ ],
94
+ "refresh": "30s",
95
+ "schemaVersion": 39,
96
+ "tags": ["ecip", "m04", "m05", "mcp", "call-graph"],
97
+ "templating": {
98
+ "list": [
99
+ {
100
+ "name": "target_repo",
101
+ "type": "query",
102
+ "datasource": "Prometheus",
103
+ "query": "label_values(mcp_call_duration_ms_bucket, target_repo)",
104
+ "refresh": 2,
105
+ "includeAll": true,
106
+ "multi": true
107
+ }
108
+ ]
109
+ },
110
+ "time": { "from": "now-1h", "to": "now" },
111
+ "title": "ECIP — MCP Call Graph",
112
+ "uid": "ecip-mcp-call-graph",
113
+ "version": 1
114
+ }