ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# Dashboard Guide — ECIP Grafana Dashboards
|
|
2
|
+
|
|
3
|
+
> **Audience:** ECIP engineers, SREs, on-call
|
|
4
|
+
> **Location:** Grafana → ECIP folder
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Dashboard Inventory
|
|
9
|
+
|
|
10
|
+
| Dashboard | File | Primary Module | Key SLA |
|
|
11
|
+
|---|---|---|---|
|
|
12
|
+
| Query Latency | `query-latency.json` | M04 | p95 < 2000ms |
|
|
13
|
+
| Analysis Throughput | `analysis-throughput.json` | M02 | — |
|
|
14
|
+
| Cache Performance | `cache-performance.json` | M03 | Hit rate > 85% |
|
|
15
|
+
| LSP Daemon Health | `lsp-daemon-health.json` | M02 | — |
|
|
16
|
+
| MCP Call Graph | `mcp-call-graph.json` | M04/M05 | p95 < 500ms |
|
|
17
|
+
| Event Bus DLQ | `event-bus-dlq.json` | M07 | DLQ depth = 0 |
|
|
18
|
+
| Cross-Repo Fan-out | `cross-repo-fanout.json` | M04 | — |
|
|
19
|
+
| Security Events | `security-events.json` | All | — |
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Delivery Sequence
|
|
24
|
+
|
|
25
|
+
Dashboards are delivered in module-dependency order, not all at once:
|
|
26
|
+
|
|
27
|
+
1. **Week 2** — `lsp-daemon-health.json`, `event-bus-dlq.json` (M02, M07 start early)
|
|
28
|
+
2. **Week 3** — `cache-performance.json`, `analysis-throughput.json` (M03 foundation)
|
|
29
|
+
3. **Week 4** — `security-events.json` (tied to M08-T08)
|
|
30
|
+
4. **Week 11** — `query-latency.json`, `mcp-call-graph.json` (M04 comes online)
|
|
31
|
+
5. **Week 17** — `cross-repo-fanout.json` (M05 comes online)
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## How Dashboards Are Provisioned
|
|
36
|
+
|
|
37
|
+
Dashboards are stored as JSON files in `dashboards/`. The Grafana sidecar automatically discovers ConfigMaps with label `grafana_dashboard: "1"` and loads them.
|
|
38
|
+
|
|
39
|
+
**To add/modify a dashboard:**
|
|
40
|
+
1. Edit the JSON file in `dashboards/`
|
|
41
|
+
2. Commit and push — the Helm chart creates a ConfigMap per dashboard
|
|
42
|
+
3. Grafana sidecar reloads within 60 seconds
|
|
43
|
+
|
|
44
|
+
**Do not edit dashboards in the Grafana UI** — changes will be overwritten on the next Helm deploy.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Dashboard Details
|
|
49
|
+
|
|
50
|
+
### Query Latency (`query-latency.json`)
|
|
51
|
+
|
|
52
|
+
**What it shows:** p50, p95, p99 latency for the M04 Query Service.
|
|
53
|
+
|
|
54
|
+
**Key panels:**
|
|
55
|
+
- Latency histogram with SLA threshold lines at 2000ms
|
|
56
|
+
- Error rate percentage
|
|
57
|
+
- Requests per second
|
|
58
|
+
|
|
59
|
+
**Key PromQL:**
|
|
60
|
+
```promql
|
|
61
|
+
histogram_quantile(0.95, sum(rate(ecip_query_duration_ms_bucket{module="M04"}[5m])) by (le))
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**When to look:** During incidents, SLA breach alerts, capacity planning.
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
### Analysis Throughput (`analysis-throughput.json`)
|
|
69
|
+
|
|
70
|
+
**What it shows:** M02 Analysis Engine processing rate and duration.
|
|
71
|
+
|
|
72
|
+
**Key panels:**
|
|
73
|
+
- Analyses per minute
|
|
74
|
+
- p95 analysis duration
|
|
75
|
+
- Error rate
|
|
76
|
+
- Active analyses gauge
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
### Cache Performance (`cache-performance.json`)
|
|
81
|
+
|
|
82
|
+
**What it shows:** M03 Knowledge Store Redis cache hit rates.
|
|
83
|
+
|
|
84
|
+
**Key panels:**
|
|
85
|
+
- Cache hit rate percentage (target: > 85%)
|
|
86
|
+
- Miss rate with type breakdown
|
|
87
|
+
- Eviction rate
|
|
88
|
+
- Memory usage
|
|
89
|
+
|
|
90
|
+
**When to look:** Cache degradation alerts, latency spikes in M04.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
### LSP Daemon Health (`lsp-daemon-health.json`)
|
|
95
|
+
|
|
96
|
+
**What it shows:** M02 LSP daemon pool status.
|
|
97
|
+
|
|
98
|
+
**Key panels:**
|
|
99
|
+
- Active daemon count
|
|
100
|
+
- Restart rate (alert fires when there are more than 3 restarts in 5 minutes)
|
|
101
|
+
- OOM kill events
|
|
102
|
+
- Memory per daemon
|
|
103
|
+
|
|
104
|
+
**When to look:** `LSPDaemonRestartRate` or `LSPDaemonOOMKill` alerts.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
### MCP Call Graph (`mcp-call-graph.json`)
|
|
109
|
+
|
|
110
|
+
**What it shows:** M04/M05 MCP tool call performance.
|
|
111
|
+
|
|
112
|
+
**Key panels:**
|
|
113
|
+
- Call duration by tool
|
|
114
|
+
- Fan-out depth per query
|
|
115
|
+
- Error rate by tool
|
|
116
|
+
- Concurrent call count
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
### Event Bus DLQ (`event-bus-dlq.json`)
|
|
121
|
+
|
|
122
|
+
**What it shows:** M07 Kafka dead-letter queue depth and age.
|
|
123
|
+
|
|
124
|
+
**Key panels:**
|
|
125
|
+
- DLQ message depth (target: 0)
|
|
126
|
+
- Oldest message age
|
|
127
|
+
- DLQ ingestion rate
|
|
128
|
+
- Processing lag
|
|
129
|
+
|
|
130
|
+
**When to look:** `DLQDepthExceeded` alerts, event processing failures.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
### Cross-Repo Fan-out (`cross-repo-fanout.json`)
|
|
135
|
+
|
|
136
|
+
**What it shows:** M04 cross-repository dependency resolution depth.
|
|
137
|
+
|
|
138
|
+
**Key panels:**
|
|
139
|
+
- Fan-out depth histogram
|
|
140
|
+
- Repos queried per request
|
|
141
|
+
- Timeout rate
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
### Security Events (`security-events.json`)
|
|
146
|
+
|
|
147
|
+
**What it shows:** Auth failures and RBAC denials from Elasticsearch.
|
|
148
|
+
|
|
149
|
+
**Key panels:**
|
|
150
|
+
- Auth failures over time
|
|
151
|
+
- RBAC denials by resource
|
|
152
|
+
- Top denied users (hashed)
|
|
153
|
+
- Geographic distribution of failures
|
|
154
|
+
|
|
155
|
+
**Data source:** Elasticsearch (`ecip-security-events` index)
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Ownership Model (Post-Week 28)
|
|
160
|
+
|
|
161
|
+
After the build phase:
|
|
162
|
+
- **Module teams** own panels that relate to their metrics
|
|
163
|
+
- **Platform team** owns infrastructure-level panels (Collector health, Prometheus performance, Tempo storage)
|
|
164
|
+
|
|
165
|
+
When adding new metrics to your module, update the relevant dashboard JSON and submit a PR.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
*Last updated: March 2026 · Platform Team*
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* ECIP M08 — Dashboard JSON Linter
|
|
4
|
+
*
|
|
5
|
+
* Validates all Grafana dashboard JSON files in dashboards/.
|
|
6
|
+
*
|
|
7
|
+
* Checks:
|
|
8
|
+
* 1. Valid JSON (parseable without error)
|
|
9
|
+
* 2. Required top-level fields: title, uid, panels
|
|
10
|
+
* 3. schemaVersion is present and ≥ 30
|
|
11
|
+
* 4. uid is non-empty and unique across all dashboards
|
|
12
|
+
* 5. Every panel has: title, type, targets (except row type)
|
|
13
|
+
* 6. No panel has an empty targets array (except row/text/news/dashlist)
|
|
14
|
+
* 7. Every target carries a query field: expr, query, or rawSql
|
|
15
|
+
* 8. Template variables (if present) have a name
|
|
16
|
+
*
|
|
17
|
+
* Exit codes:
|
|
18
|
+
* 0 — all dashboards pass
|
|
19
|
+
* 1 — one or more dashboards have lint errors
|
|
20
|
+
*
|
|
21
|
+
* Usage:
|
|
22
|
+
* node scripts/lint-dashboards.js
|
|
23
|
+
* node scripts/lint-dashboards.js dashboards/query-latency.json
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
'use strict';
|
|
27
|
+
|
|
28
|
+
const fs = require('fs');
|
|
29
|
+
const path = require('path');
|
|
30
|
+
|
|
31
|
+
const DASHBOARDS_DIR = path.resolve(__dirname, '..', 'dashboards');
|
|
32
|
+
const REQUIRED_FIELDS = ['title', 'uid', 'panels'];
|
|
33
|
+
// Panel types that lintDashboard exempts from the 'targets' checks.
const PANEL_TYPES_NO_TARGETS = new Set(['row', 'text', 'news', 'dashlist']);
|
|
34
|
+
const MIN_SCHEMA_VERSION = 30;
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Lint a single dashboard
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/**
 * Lint a list of panel objects, appending any problems to `errors`.
 *
 * Row panels may carry their own `panels` array (collapsed rows); those
 * nested panels are linted recursively so they are not silently skipped.
 *
 * @param {Array} panels - Panel entries to check.
 * @param {string} prefix - Label prefix used in error messages.
 * @param {string[]} errors - Accumulator that receives error messages.
 */
function lintPanelList(panels, prefix, errors) {
  panels.forEach((panel, i) => {
    const label = `${prefix} → panel[${i}]`;

    // A null / primitive entry would otherwise crash on property access.
    if (panel === null || typeof panel !== 'object') {
      errors.push(`${label}: Panel is not an object`);
      return;
    }

    if (!panel.title) {
      errors.push(`${label}: Missing 'title'`);
    }
    if (!panel.type) {
      errors.push(`${label}: Missing 'type'`);
    }

    // Targets check (skipped for panel types that never query a datasource)
    if (panel.type && !PANEL_TYPES_NO_TARGETS.has(panel.type)) {
      const shownTitle = panel.title || 'untitled';
      if (!Array.isArray(panel.targets)) {
        errors.push(`${label} (${shownTitle}): Missing 'targets' array`);
      } else if (panel.targets.length === 0) {
        errors.push(`${label} (${shownTitle}): 'targets' array is empty`);
      } else {
        // Every target must carry at least one recognised query field.
        panel.targets.forEach((target, ti) => {
          const hasQuery =
            target !== null &&
            typeof target === 'object' &&
            (target.expr !== undefined || target.query !== undefined || target.rawSql !== undefined);
          if (!hasQuery) {
            errors.push(`${label} → target[${ti}]: No 'expr', 'query', or 'rawSql' field`);
          }
        });
      }
    }

    // Collapsed rows keep their child panels nested under the row.
    if (panel.type === 'row' && Array.isArray(panel.panels)) {
      lintPanelList(panel.panels, label, errors);
    }
  });
}

/**
 * Lint a single Grafana dashboard JSON file.
 *
 * Never throws for bad input: unreadable files, malformed JSON and
 * structurally invalid dashboards are all reported through `errors`.
 *
 * @param {string} filePath - Path to the dashboard JSON file.
 * @returns {{filename: string, errors: string[], uid: (string|null)}}
 *   `filename` is the base name of `filePath`; `errors` is empty when the
 *   dashboard passes; `uid` is the dashboard uid when it is a non-empty
 *   string, otherwise null (so invalid uids do not take part in the
 *   caller's duplicate check).
 */
function lintDashboard(filePath) {
  const errors = [];
  const filename = path.basename(filePath);

  // 1a. The file must be readable. Reported separately from parse errors so
  //     a missing file is not mislabelled as "Invalid JSON".
  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf8');
  } catch (e) {
    errors.push(`${filename}: Cannot read file — ${e.message}`);
    return { filename, errors, uid: null };
  }

  // 1b. Valid JSON
  let dashboard;
  try {
    dashboard = JSON.parse(raw);
  } catch (e) {
    errors.push(`${filename}: Invalid JSON — ${e.message}`);
    return { filename, errors, uid: null };
  }

  // `null`, arrays and primitives are valid JSON but not dashboards; the
  // `in` operator below would throw on null/primitives.
  if (dashboard === null || typeof dashboard !== 'object' || Array.isArray(dashboard)) {
    errors.push(`${filename}: Top-level JSON value must be an object`);
    return { filename, errors, uid: null };
  }

  // 2. Required top-level fields
  for (const field of REQUIRED_FIELDS) {
    if (!(field in dashboard)) {
      errors.push(`${filename}: Missing required field '${field}'`);
    }
  }

  // 3. schemaVersion must be present, numeric and >= MIN_SCHEMA_VERSION
  const schemaVersion = dashboard.schemaVersion;
  if (schemaVersion == null) {
    errors.push(`${filename}: Missing 'schemaVersion'`);
  } else if (Number.isNaN(Number(schemaVersion))) {
    // Without this guard a NaN comparison is false and the value would pass.
    errors.push(`${filename}: schemaVersion '${schemaVersion}' is not a number`);
  } else if (schemaVersion < MIN_SCHEMA_VERSION) {
    errors.push(`${filename}: schemaVersion ${schemaVersion} is below minimum (${MIN_SCHEMA_VERSION})`);
  }

  // 4. uid
  const uid = dashboard.uid;
  const uidIsValid = typeof uid === 'string' && uid.trim() !== '';
  if (!uidIsValid) {
    errors.push(`${filename}: 'uid' is empty or missing`);
  }

  // 5-7. Panel and target validation
  const panels = dashboard.panels || [];
  if (!Array.isArray(panels)) {
    errors.push(`${filename}: 'panels' is not an array`);
  } else {
    lintPanelList(panels, filename, errors);
  }

  // 8. Template variables
  const templateVars = dashboard.templating && dashboard.templating.list;
  if (templateVars) {
    if (!Array.isArray(templateVars)) {
      errors.push(`${filename}: 'templating.list' is not an array`);
    } else {
      templateVars.forEach((tVar, i) => {
        if (!tVar || !tVar.name) {
          errors.push(`${filename} → templating[${i}]: Missing variable 'name'`);
        }
      });
    }
  }

  return { filename, errors, uid: uidIsValid ? uid : null };
}
|
|
118
|
+
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
// Main
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
/**
 * CLI entry point.
 *
 * Lints the files named on the command line, or every `*.json` file in
 * DASHBOARDS_DIR when no arguments are given, enforces uid uniqueness
 * across the linted set, prints a per-file report and exits with 0 when
 * everything passes or 1 when any lint error was found.
 */
function main() {
  // Work out which files to lint.
  const cliArgs = process.argv.slice(2);
  let files;
  if (cliArgs.length > 0) {
    files = cliArgs.map((f) => path.resolve(f));
  } else {
    if (!fs.existsSync(DASHBOARDS_DIR)) {
      console.error(`Dashboard directory not found: ${DASHBOARDS_DIR}`);
      process.exit(1);
    }
    files = [];
    for (const entry of fs.readdirSync(DASHBOARDS_DIR)) {
      if (entry.endsWith('.json')) {
        files.push(path.join(DASHBOARDS_DIR, entry));
      }
    }
  }

  if (files.length === 0) {
    console.log('No dashboard JSON files found.');
    process.exit(0);
  }

  console.log(`\n📊 ECIP M08 — Dashboard Lint`);
  console.log(` Checking ${files.length} dashboard(s)...\n`);

  const allResults = [];
  for (const file of files) {
    allResults.push(lintDashboard(file));
  }

  // A uid may be claimed by only one dashboard; the first file seen keeps it
  // and every later file with the same uid gets an error.
  const uidMap = new Map();
  for (const result of allResults) {
    if (!result.uid) {
      continue;
    }
    if (!uidMap.has(result.uid)) {
      uidMap.set(result.uid, result.filename);
      continue;
    }
    result.errors.push(
      `${result.filename}: Duplicate UID '${result.uid}' — also used by ${uidMap.get(result.uid)}`,
    );
  }

  // Per-file report, tallying totals as we go.
  let totalErrors = 0;
  let failingFiles = 0;
  for (const result of allResults) {
    const errorCount = result.errors.length;
    if (errorCount === 0) {
      console.log(` ✅ ${result.filename}`);
      continue;
    }
    console.log(` ❌ ${result.filename} — ${errorCount} error(s)`);
    for (const e of result.errors) {
      console.log(` ${e}`);
    }
    totalErrors += errorCount;
    failingFiles += 1;
  }

  console.log('');
  if (totalErrors > 0) {
    console.log(`❌ ${totalErrors} lint error(s) across ${failingFiles} file(s).\n`);
    process.exit(1);
  } else {
    console.log(`✅ All ${files.length} dashboard(s) pass lint checks.\n`);
    process.exit(0);
  }
}
|
|
183
|
+
|
|
184
|
+
main();
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Grafana Tempo Datasource Provisioning
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Auto-provisions Tempo as a Grafana datasource via sidecar.
|
|
5
|
+
# =============================================================================
|
|
6
|
+
apiVersion: 1
|
|
7
|
+
|
|
8
|
+
datasources:
|
|
9
|
+
- name: Tempo
|
|
10
|
+
type: tempo
|
|
11
|
+
access: proxy
|
|
12
|
+
url: http://tempo.monitoring:3200
|
|
13
|
+
uid: tempo
|
|
14
|
+
isDefault: false
|
|
15
|
+
editable: true
|
|
16
|
+
jsonData:
|
|
17
|
+
httpMethod: GET
|
|
18
|
+
tracesToMetrics:
|
|
19
|
+
datasourceUid: prometheus
|
|
20
|
+
tags:
|
|
21
|
+
- key: service.name
|
|
22
|
+
value: service
|
|
23
|
+
- key: ecip.module
|
|
24
|
+
value: module
|
|
25
|
+
tracesToLogs:
|
|
26
|
+
datasourceUid: loki
|
|
27
|
+
tags:
|
|
28
|
+
- key: trace_id
|
|
29
|
+
mappedTags:
|
|
30
|
+
- key: service.name
|
|
31
|
+
value: service_name
|
|
32
|
+
- key: ecip.module
|
|
33
|
+
value: module
|
|
34
|
+
mapTagNamesEnabled: true
|
|
35
|
+
spanStartTimeShift: "-1h"
|
|
36
|
+
spanEndTimeShift: "1h"
|
|
37
|
+
filterByTraceID: true
|
|
38
|
+
filterBySpanID: false
|
|
39
|
+
nodeGraph:
|
|
40
|
+
enabled: true
|
|
41
|
+
serviceMap:
|
|
42
|
+
datasourceUid: prometheus
|
|
43
|
+
search:
|
|
44
|
+
hide: false
|
|
45
|
+
lokiSearch:
|
|
46
|
+
datasourceUid: loki
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Grafana Tempo Helm Values
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# S3-backed trace storage with 14-day retention.
|
|
5
|
+
# Microservices mode for production scalability.
|
|
6
|
+
# =============================================================================
|
|
7
|
+
|
|
8
|
+
tempo:
|
|
9
|
+
# Microservices mode — ingester, compactor and distributor are sized separately below; single-tenant
|
|
10
|
+
multitenancyEnabled: false
|
|
11
|
+
|
|
12
|
+
storage:
|
|
13
|
+
trace:
|
|
14
|
+
backend: s3
|
|
15
|
+
s3:
|
|
16
|
+
bucket: ecip-tempo-traces
|
|
17
|
+
endpoint: s3.amazonaws.com
|
|
18
|
+
region: us-east-1
|
|
19
|
+
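# Credentials from K8s secret (injected via Helm as environment variables).
# NOTE(review): ${VAR} placeholders are only substituted when Tempo runs with -config.expand-env=true — confirm the chart sets it.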
|
|
20
|
+
access_key: ${TEMPO_S3_ACCESS_KEY}
|
|
21
|
+
secret_key: ${TEMPO_S3_SECRET_KEY}
|
|
22
|
+
|
|
23
|
+
retention:
|
|
24
|
+
# 14-day trace retention — per design doc
|
|
25
|
+
max_block_duration: 1h
|
|
26
|
+
max_compaction_objects: 6000000
|
|
27
|
+
compaction:
|
|
28
|
+
compacted_block_retention: 336h # 14 days — NOTE(review): in Tempo's compactor config, trace retention is set by `block_retention`; `compacted_block_retention` only controls how long already-compacted blocks are kept. Confirm this key yields the intended 14-day retention.
|
|
29
|
+
|
|
30
|
+
server:
|
|
31
|
+
http_listen_port: 3200
|
|
32
|
+
grpc_listen_port: 9095
|
|
33
|
+
|
|
34
|
+
# OTLP receiver for traces from OTel Collector
|
|
35
|
+
receivers:
|
|
36
|
+
otlp:
|
|
37
|
+
protocols:
|
|
38
|
+
grpc:
|
|
39
|
+
endpoint: 0.0.0.0:4317
|
|
40
|
+
http:
|
|
41
|
+
endpoint: 0.0.0.0:4318
|
|
42
|
+
|
|
43
|
+
# Query frontend for Grafana
|
|
44
|
+
query_frontend:
|
|
45
|
+
search:
|
|
46
|
+
max_duration: 168h # Allow searching up to 7 days back
|
|
47
|
+
default_result_limit: 20
|
|
48
|
+
|
|
49
|
+
# Resource limits
|
|
50
|
+
resources:
|
|
51
|
+
requests:
|
|
52
|
+
cpu: 500m
|
|
53
|
+
memory: 1Gi
|
|
54
|
+
limits:
|
|
55
|
+
cpu: 2000m
|
|
56
|
+
memory: 4Gi
|
|
57
|
+
|
|
58
|
+
persistence:
|
|
59
|
+
enabled: true
|
|
60
|
+
size: 50Gi
|
|
61
|
+
storageClassName: standard
|
|
62
|
+
|
|
63
|
+
# Ingester configuration
|
|
64
|
+
ingester:
|
|
65
|
+
replicas: 2
|
|
66
|
+
resources:
|
|
67
|
+
requests:
|
|
68
|
+
cpu: 500m
|
|
69
|
+
memory: 1Gi
|
|
70
|
+
limits:
|
|
71
|
+
cpu: 1000m
|
|
72
|
+
memory: 2Gi
|
|
73
|
+
|
|
74
|
+
# Compactor — merges and deduplicates blocks
|
|
75
|
+
compactor:
|
|
76
|
+
replicas: 1
|
|
77
|
+
resources:
|
|
78
|
+
requests:
|
|
79
|
+
cpu: 250m
|
|
80
|
+
memory: 512Mi
|
|
81
|
+
limits:
|
|
82
|
+
cpu: 500m
|
|
83
|
+
memory: 1Gi
|
|
84
|
+
|
|
85
|
+
# Distributor — receives spans and distributes to ingesters
|
|
86
|
+
distributor:
|
|
87
|
+
replicas: 2
|
|
88
|
+
resources:
|
|
89
|
+
requests:
|
|
90
|
+
cpu: 250m
|
|
91
|
+
memory: 512Mi
|
|
92
|
+
limits:
|
|
93
|
+
cpu: 1000m
|
|
94
|
+
memory: 1Gi
|