ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metric Label Validation Tests
|
|
3
|
+
*
|
|
4
|
+
* Validates that all metric emissions include the exact label sets
|
|
5
|
+
* defined in the ECIP metrics catalog. Uses a mock Prometheus registry
|
|
6
|
+
* to capture emitted metrics and validates label presence and types.
|
|
7
|
+
*
|
|
8
|
+
* Prevents high-cardinality labels (e.g., user_id) from being used
|
|
9
|
+
* as Prometheus labels, which would cause OOM.
|
|
10
|
+
*/
|
|
11
|
+
import { describe, it, expect, beforeEach } from 'vitest';
|
|
12
|
+
|
|
13
|
+
// --- Metrics Catalog ---
// Allowed label names and instrument type for each ECIP metric, as specified
// in the design doc. Key order is preserved because the parameterised tests
// iterate over Object.keys(METRICS_CATALOG).

/** Builds a single catalog entry from an instrument type and its label names. */
const metricSpec = (
  type: string,
  ...labels: string[]
): { labels: string[]; type: string } => ({ labels, type });

const METRICS_CATALOG: Record<string, { labels: string[]; type: string }> = {
  // Query and analysis pipeline
  ecip_query_duration_ms: metricSpec('histogram', 'module', 'method', 'status'),
  ecip_analysis_duration_ms: metricSpec('histogram', 'module', 'repo', 'language'),
  ecip_analysis_throughput_total: metricSpec('counter', 'module', 'status'),
  ecip_analysis_backlog_size: metricSpec('gauge', 'module'),

  // Cache
  ecip_cache_hit_rate: metricSpec('gauge', 'module', 'cache_type'),
  ecip_cache_miss_total: metricSpec('counter', 'module', 'cache_type', 'reason'),

  // LSP daemon
  ecip_lsp_daemon_restarts_total: metricSpec('counter', 'module', 'language', 'reason'),
  ecip_lsp_daemon_active: metricSpec('gauge', 'module', 'language'),

  // MCP
  ecip_mcp_call_duration_ms: metricSpec('histogram', 'module', 'tool', 'status'),

  // Event bus / dead-letter queue
  ecip_dlq_depth: metricSpec('gauge', 'module', 'topic'),
  ecip_dlq_oldest_message_age_seconds: metricSpec('gauge', 'module', 'topic'),

  // Cross-repo fan-out
  ecip_cross_repo_fanout_depth: metricSpec('histogram', 'module'),

  // Security
  ecip_auth_failures_total: metricSpec('counter', 'module', 'reason'),
  ecip_rbac_denials_total: metricSpec('counter', 'module', 'action', 'reason'),
  ecip_service_auth_failures_total: metricSpec('counter', 'module', 'target_service'),
  ecip_filter_authorized_repos_duration_ms: metricSpec('histogram', 'module'),

  // gRPC and knowledge store
  ecip_grpc_request_duration_ms: metricSpec('histogram', 'module', 'service', 'method', 'status'),
  ecip_knowledge_store_write_duration_ms: metricSpec('histogram', 'module', 'operation'),
};
|
|
89
|
+
|
|
90
|
+
// High-cardinality identifiers that must NEVER be used as Prometheus labels.
const PROHIBITED_LABELS: string[] = [
  // user identity
  'user_id',
  'user_name',
  'email',
  // network origin
  'ip_address',
  'source_ip',
  // per-request correlation identifiers
  'trace_id',
  'span_id',
  'request_id',
  'session_id',
  // per-object identifiers
  'file_path',
  'sha',
];
|
|
104
|
+
|
|
105
|
+
describe('Metrics Catalog Validation', () => {
  // Fixtures shared by every parameterised case in this suite.
  const metricNames = Object.keys(METRICS_CATALOG);
  const SNAKE_CASE = /^[a-z][a-z0-9_]*$/;
  const VALID_TYPES = ['counter', 'gauge', 'histogram', 'summary'];

  /** Label names declared for a catalog metric. */
  const labelsOf = (metricName: string): string[] => METRICS_CATALOG[metricName].labels;

  /** Catalog metric names of one instrument type, in catalog order. */
  const namesOfType = (type: string): string[] =>
    metricNames.filter((name) => METRICS_CATALOG[name].type === type);

  it('should have at least one metric in the catalog', () => {
    expect(metricNames.length).toBeGreaterThan(0);
  });

  describe('metric naming conventions', () => {
    it.each(metricNames)('%s should follow the ecip_ prefix convention', (metricName) => {
      expect(metricName).toMatch(/^ecip_/);
    });

    it.each(metricNames)('%s should use snake_case', (metricName) => {
      expect(metricName).toMatch(SNAKE_CASE);
    });

    it.each(metricNames)('%s should have a valid type', (metricName) => {
      const { type } = METRICS_CATALOG[metricName];
      expect(VALID_TYPES).toContain(type);
    });
  });

  describe('label validation', () => {
    it.each(metricNames)('%s should include the module label', (metricName) => {
      expect(labelsOf(metricName)).toContain('module');
    });

    it.each(metricNames)(
      '%s should not contain prohibited high-cardinality labels',
      (metricName) => {
        const declared = labelsOf(metricName);
        PROHIBITED_LABELS.forEach((prohibited) => {
          expect(declared).not.toContain(prohibited);
        });
      }
    );

    it.each(metricNames)('%s labels should use snake_case', (metricName) => {
      labelsOf(metricName).forEach((label) => {
        expect(label).toMatch(SNAKE_CASE);
      });
    });

    it.each(metricNames)('%s should have no more than 5 labels', (metricName) => {
      // Every extra label multiplies the number of series; cap it.
      expect(labelsOf(metricName).length).toBeLessThanOrEqual(5);
    });

    it.each(metricNames)('%s should have no duplicate labels', (metricName) => {
      const declared = labelsOf(metricName);
      expect(new Set(declared).size).toBe(declared.length);
    });
  });

  describe('histogram metrics', () => {
    it.each(namesOfType('histogram'))(
      '%s should end with _ms, _seconds, _bytes, or _depth for histograms',
      (metricName) => {
        expect(metricName).toMatch(/(_ms|_seconds|_bytes|_depth)$/);
      }
    );
  });

  describe('counter metrics', () => {
    it.each(namesOfType('counter'))('%s should end with _total for counters', (metricName) => {
      expect(metricName).toMatch(/_total$/);
    });
  });
});
|
|
203
|
+
|
|
204
|
+
describe('Mock Metric Emission', () => {
  // Simulates metric emission into an in-memory list and validates each
  // emission's label set against METRICS_CATALOG and PROHIBITED_LABELS.
  interface MetricEmission {
    name: string;
    labels: Record<string, string>;
    value: number;
  }

  let emittedMetrics: MetricEmission[];

  beforeEach(() => {
    emittedMetrics = [];
  });

  /**
   * Own-property check. Plain `obj[key]` / `key in obj` also see inherited
   * members such as `constructor` or `toString`, which must not count as
   * catalog entries or as supplied labels.
   */
  function hasOwn(target: object, key: string): boolean {
    return Object.prototype.hasOwnProperty.call(target, key);
  }

  /** Records one emission so a test can validate it afterwards. */
  function emitMetric(name: string, labels: Record<string, string>, value: number): void {
    emittedMetrics.push({ name, labels, value });
  }

  /**
   * Validates one emission against the catalog.
   *
   * @param emission - The recorded metric emission.
   * @returns Human-readable problems in a fixed order: missing labels,
   *   unexpected labels, then prohibited labels. Empty when the emission is
   *   valid. An unknown metric name yields a single "Unknown metric" error.
   */
  function validateEmission(emission: MetricEmission): string[] {
    const errors: string[] = [];

    if (!hasOwn(METRICS_CATALOG, emission.name)) {
      errors.push(`Unknown metric: ${emission.name}`);
      return errors;
    }
    const catalogEntry = METRICS_CATALOG[emission.name];
    const emittedLabels = Object.keys(emission.labels);

    // Check all required labels are present
    for (const required of catalogEntry.labels) {
      if (!hasOwn(emission.labels, required)) {
        errors.push(`Missing required label "${required}" for metric ${emission.name}`);
      }
    }

    // Check no extra labels are present
    for (const label of emittedLabels) {
      if (!catalogEntry.labels.includes(label)) {
        errors.push(`Unexpected label "${label}" for metric ${emission.name}`);
      }
    }

    // Check no prohibited labels (reported in addition to "Unexpected label")
    for (const label of emittedLabels) {
      if (PROHIBITED_LABELS.includes(label)) {
        errors.push(`Prohibited high-cardinality label "${label}" in metric ${emission.name}`);
      }
    }

    return errors;
  }

  it('should accept a correctly labeled query_duration_ms emission', () => {
    emitMetric('ecip_query_duration_ms', { module: 'M04', method: 'search', status: 'ok' }, 142);
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors).toEqual([]);
  });

  it('should reject a metric missing the module label', () => {
    emitMetric('ecip_query_duration_ms', { method: 'search', status: 'ok' }, 142);
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors).toContain('Missing required label "module" for metric ecip_query_duration_ms');
  });

  it('should reject a metric with a prohibited user_id label', () => {
    emitMetric(
      'ecip_query_duration_ms',
      { module: 'M04', method: 'search', status: 'ok', user_id: 'u123' },
      142
    );
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors.some((e) => e.includes('user_id'))).toBe(true);
  });

  it('should reject an unknown metric name', () => {
    emitMetric('unknown_metric', { module: 'M04' }, 1);
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors).toContain('Unknown metric: unknown_metric');
  });

  it('should treat Object.prototype member names as unknown metrics', () => {
    emitMetric('constructor', { module: 'M04' }, 1);
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors).toEqual(['Unknown metric: constructor']);
  });

  it('should reject extra labels not in the catalog', () => {
    emitMetric('ecip_analysis_backlog_size', { module: 'M02', extra_label: 'bad' }, 500);
    const errors = validateEmission(emittedMetrics[0]);
    expect(errors.some((e) => e.includes('Unexpected label'))).toBe(true);
  });
});
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OTel Pipeline Integration Test (Testcontainers)
|
|
3
|
+
*
|
|
4
|
+
* The most important test in M08 (per design doc §8).
|
|
5
|
+
*
|
|
6
|
+
* Setup: Testcontainers spins up an OTel Collector and Grafana Tempo.
|
|
7
|
+
* Spans are posted to the collector as hand-built OTLP/HTTP JSON payloads (no SDK is imported in this file).
|
|
8
|
+
*
|
|
9
|
+
* Assertions:
|
|
10
|
+
* 1. Emit 5 spans across a simulated async service boundary
|
|
11
|
+
* 2. All 5 spans retrievable from Tempo within 10 seconds
|
|
12
|
+
* 3. traceparent header correctly propagated (child spans have correct parent ID)
|
|
13
|
+
* 4. Error spans are 100% sampled
|
|
14
|
+
* 5. Normal spans under default sampling may/may not be present
|
|
15
|
+
*
|
|
16
|
+
* NOTE: This test requires Docker. It is skipped in environments without Docker.
|
|
17
|
+
*/
|
|
18
|
+
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
|
19
|
+
import { GenericContainer, StartedTestContainer, Wait } from 'testcontainers';
|
|
20
|
+
import * as http from 'node:http';
|
|
21
|
+
|
|
22
|
+
// Configuration for the test containers
const COLLECTOR_IMAGE = 'otel/opentelemetry-collector-contrib:0.96.0';
const TEMPO_IMAGE = 'grafana/tempo:2.3.1';
/** Tempo HTTP API port; the tests query /api/traces/<id> on it. */
const TEMPO_QUERY_PORT = 3200;
/**
 * Port of Tempo's OTLP gRPC receiver. The collector's `otlp` exporter speaks
 * gRPC, so it must target this port rather than the HTTP query port.
 * NOTE(review): assumes the Tempo config enables the OTLP gRPC receiver on 4317 — confirm.
 */
const TEMPO_OTLP_GRPC_PORT = 4317;
/** Collector OTLP/HTTP receiver port that the tests post spans to. */
const OTLP_HTTP_PORT = 4318;
/** Port of the collector health_check extension probed by the container wait strategy. */
const COLLECTOR_HEALTH_CHECK_PORT = 13133;

// Collector config for the test — minimal, routes traces to Tempo.
// `tempo` is the hostname the collector container is expected to resolve.
const TEST_COLLECTOR_CONFIG = `
extensions:
  health_check:
    endpoint: 0.0.0.0:${COLLECTOR_HEALTH_CHECK_PORT}

receivers:
  otlp:
    protocols:
      http:
        endpoint: 0.0.0.0:${OTLP_HTTP_PORT}

processors:
  batch:
    timeout: 1s

exporters:
  otlp/tempo:
    endpoint: tempo:${TEMPO_OTLP_GRPC_PORT}
    tls:
      insecure: true

service:
  extensions: [health_check]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/tempo]
`;
|
|
53
|
+
|
|
54
|
+
/**
 * Helper: POST a single span to an OTLP/HTTP endpoint as a JSON payload.
 *
 * The span is reported under the resource `service.name = ecip-test-service`
 * with kind INTERNAL. Its start time is "now" and its end time is start plus
 * `durationMs`.
 *
 * @param endpoint - Base URL of the OTLP/HTTP receiver; `/v1/traces` is appended.
 * @param options - Span fields. `statusCode` defaults to 0 (UNSET); `durationMs`
 *   defaults to 100 when omitted, negative or not finite; `parentSpanId`
 *   defaults to the empty string (root span).
 * @returns Resolves once the receiver answers with a 2xx status.
 * @throws Rejects with an Error on a non-2xx status or a transport error.
 */
async function sendOtlpSpan(
  endpoint: string,
  options: {
    traceId: string;
    spanId: string;
    parentSpanId?: string;
    name: string;
    statusCode?: number;
    durationMs?: number;
  }
): Promise<void> {
  // Epoch nanoseconds exceed Number.MAX_SAFE_INTEGER, so do the arithmetic in
  // BigInt to keep the emitted timestamps exact.
  const nanosPerMilli = BigInt(1_000_000);
  const requestedMs = options.durationMs;
  const durationMs =
    requestedMs !== undefined && Number.isFinite(requestedMs) && requestedMs >= 0
      ? Math.round(requestedMs) // an explicit 0 is honoured, not replaced by the default
      : 100;
  const startTimeUnixNano = BigInt(Date.now()) * nanosPerMilli;
  const endTimeUnixNano = startTimeUnixNano + BigInt(durationMs) * nanosPerMilli;

  const span = {
    traceId: options.traceId,
    spanId: options.spanId,
    parentSpanId: options.parentSpanId ?? '',
    name: options.name,
    kind: 1, // SPAN_KIND_INTERNAL
    startTimeUnixNano: startTimeUnixNano.toString(),
    endTimeUnixNano: endTimeUnixNano.toString(),
    status: {
      code: options.statusCode ?? 0, // 0=UNSET, 1=OK, 2=ERROR
    },
  };

  const payload = {
    resourceSpans: [
      {
        resource: {
          attributes: [{ key: 'service.name', value: { stringValue: 'ecip-test-service' } }],
        },
        scopeSpans: [{ spans: [span] }],
      },
    ],
  };
  const body = JSON.stringify(payload);

  return new Promise((resolve, reject) => {
    const url = new URL(`${endpoint}/v1/traces`);
    const req = http.request(
      {
        hostname: url.hostname,
        port: url.port,
        path: url.pathname,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
        },
      },
      (res) => {
        // Drain the response so the underlying socket is released.
        res.resume();
        const status = res.statusCode;
        if (status && status >= 200 && status < 300) {
          resolve();
        } else {
          reject(new Error(`OTLP export failed with status ${status}`));
        }
      }
    );
    req.on('error', reject);
    req.end(body);
  });
}
|
|
124
|
+
|
|
125
|
+
/**
 * Helper: Query Tempo for a trace by ID.
 *
 * Issues `GET <tempoEndpoint>/api/traces/<traceId>`.
 *
 * @param tempoEndpoint - Base URL of the Tempo HTTP API.
 * @param traceId - Trace ID to look up.
 * @returns The parsed JSON body on HTTP 200. `null` when the trace is not
 *   available: any non-200 status (including 404) or a body that is not
 *   valid JSON.
 * @throws Rejects on transport-level errors (request or response stream).
 */
async function queryTempo(
  tempoEndpoint: string,
  traceId: string
): Promise<any | null> {
  return new Promise((resolve, reject) => {
    const url = `${tempoEndpoint}/api/traces/${traceId}`;
    http
      .get(url, (res) => {
        // Only a 200 carries a trace. An error document from another status
        // must not be mistaken for one.
        if (res.statusCode !== 200) {
          res.resume(); // drain so the socket is released
          resolve(null);
          return;
        }

        // Decode as UTF-8 across chunk boundaries so multi-byte characters
        // split between chunks are not corrupted.
        res.setEncoding('utf8');
        let data = '';
        res.on('data', (chunk: string) => {
          data += chunk;
        });
        res.on('error', reject);
        res.on('end', () => {
          try {
            resolve(JSON.parse(data));
          } catch {
            resolve(null);
          }
        });
      })
      .on('error', reject);
  });
}
|
|
151
|
+
|
|
152
|
+
/**
 * Helper: Poll Tempo until a trace shows up or the time budget runs out.
 *
 * Queries once per second. The budget is checked before each query, so a
 * non-positive `timeoutMs` returns `null` without querying.
 *
 * @param tempoEndpoint - Base URL of the Tempo HTTP API.
 * @param traceId - Trace ID to wait for.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 * @returns The first truthy result from `queryTempo`, or `null` on timeout.
 */
async function waitForTrace(
  tempoEndpoint: string,
  traceId: string,
  timeoutMs: number = 10000
): Promise<any | null> {
  const pollIntervalMs = 1000;
  const startedAt = Date.now();

  for (;;) {
    const withinBudget = Date.now() - startedAt < timeoutMs;
    if (!withinBudget) {
      return null;
    }

    const trace = await queryTempo(tempoEndpoint, traceId);
    if (trace) {
      return trace;
    }

    await new Promise((wake) => setTimeout(wake, pollIntervalMs));
  }
}
|
|
168
|
+
|
|
169
|
+
// Produce a random lowercase hex string for trace/span IDs:
// two hex characters per requested byte.
function generateHexId(bytes: number): string {
  const targetLength = bytes * 2;
  let hex = '';
  while (hex.length < targetLength) {
    // One uniformly random nibble (0..15) rendered as a single hex digit.
    hex += Math.floor(Math.random() * 16).toString(16);
  }
  return hex;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Integration test suite — requires Docker
 *
 * These tests validate the end-to-end OTel pipeline:
 * OTLP/HTTP client → Collector → Tempo → Query API
 *
 * When Docker is missing, or the containers fail to start, every test logs a
 * message and returns early (it is reported as passed, not as skipped).
 *
 * NOTE(review): the setup below looks incomplete and probably always ends in
 * the "containers failed to start" path — confirm and fix separately:
 *  - TEST_COLLECTOR_CONFIG is never handed to the collector container, so the
 *    image's default configuration is what runs.
 *  - Tempo is started with `-config.file=/etc/tempo.yaml` but no such file is
 *    copied into the container here.
 *  - The two containers are not attached to a shared network, so the
 *    collector presumably cannot resolve a `tempo` host.
 */
describe('OTel Pipeline Integration', () => {
  // Port probed by the collector wait strategy; it has to be exposed for the
  // probe to have a mapped port to connect to.
  const COLLECTOR_HEALTH_PORT = 13133;
  // Each test may poll Tempo for up to 10 s, so give it an explicit budget
  // above that instead of relying on the runner's default test timeout.
  const TEST_TIMEOUT_MS = 20000;
  const TRACE_WAIT_MS = 10000;

  let tempoContainer: StartedTestContainer | undefined;
  let collectorContainer: StartedTestContainer | undefined;
  let tempoEndpoint: string;
  let collectorEndpoint: string;
  let dockerAvailable = false;

  /** Logs the skip reason and tells the caller whether to return early. */
  function shouldSkip(): boolean {
    if (!dockerAvailable) {
      console.log('Skipping: Docker not available');
      return true;
    }
    return false;
  }

  /**
   * Flattens a Tempo trace response into a list of spans, walking every batch
   * and every scope (not just the first scope of each batch).
   * Assumes the `batches[].scopeSpans[].spans[]` layout — TODO confirm for the
   * Tempo version in use. Anything else yields an empty list.
   */
  function collectSpans(trace: any): any[] {
    const batches: any[] = Array.isArray(trace?.batches) ? trace.batches : [];
    return batches.flatMap((batch: any) => {
      const scopes: any[] = Array.isArray(batch?.scopeSpans) ? batch.scopeSpans : [];
      return scopes.flatMap((scope: any) => (Array.isArray(scope?.spans) ? scope.spans : []));
    });
  }

  /**
   * Polls Tempo until the trace holds at least `expected` spans or the budget
   * is used up. Spans are sent in separate requests, so the first response
   * that contains the trace may not contain all of them yet.
   *
   * @returns The spans seen in the last response (possibly fewer than expected).
   */
  async function waitForSpans(
    traceId: string,
    expected: number,
    timeoutMs: number = TRACE_WAIT_MS
  ): Promise<any[]> {
    const deadline = Date.now() + timeoutMs;
    let spans: any[] = [];
    while (Date.now() < deadline) {
      const trace = await waitForTrace(tempoEndpoint, traceId, deadline - Date.now());
      spans = collectSpans(trace);
      if (spans.length >= expected) break;
      await new Promise((wake) => setTimeout(wake, 500));
    }
    return spans;
  }

  beforeAll(async () => {
    // Check if Docker is available
    try {
      const { execSync } = await import('node:child_process');
      execSync('docker info', { stdio: 'ignore' });
      dockerAvailable = true;
    } catch {
      console.warn('Docker not available — skipping integration tests');
      return;
    }

    try {
      // Start Tempo
      tempoContainer = await new GenericContainer(TEMPO_IMAGE)
        .withExposedPorts(TEMPO_QUERY_PORT, 4317, 4318)
        .withCommand(['-config.file=/etc/tempo.yaml'])
        .withWaitStrategy(Wait.forHttp('/', TEMPO_QUERY_PORT).withStartupTimeout(30000))
        .start();

      tempoEndpoint = `http://${tempoContainer.getHost()}:${tempoContainer.getMappedPort(
        TEMPO_QUERY_PORT
      )}`;

      // Start OTel Collector. The health port is exposed as well because the
      // wait strategy below connects to it.
      collectorContainer = await new GenericContainer(COLLECTOR_IMAGE)
        .withExposedPorts(OTLP_HTTP_PORT, COLLECTOR_HEALTH_PORT)
        .withWaitStrategy(Wait.forHttp('/', COLLECTOR_HEALTH_PORT).withStartupTimeout(30000))
        .start();

      collectorEndpoint = `http://${collectorContainer.getHost()}:${collectorContainer.getMappedPort(
        OTLP_HTTP_PORT
      )}`;
    } catch (err) {
      // Best effort: a failed startup disables the suite instead of failing it.
      console.warn('Failed to start containers:', err);
      dockerAvailable = false;
    }
  }, 60000);

  afterAll(async () => {
    // Stop Tempo even when stopping the collector throws.
    try {
      if (collectorContainer) await collectorContainer.stop();
    } finally {
      if (tempoContainer) await tempoContainer.stop();
    }
  }, 30000);

  it(
    'should send spans to collector and retrieve from Tempo',
    async () => {
      if (shouldSkip()) return;

      const traceId = generateHexId(16);
      const rootSpanId = generateHexId(8);

      // Emit a root span
      await sendOtlpSpan(collectorEndpoint, {
        traceId,
        spanId: rootSpanId,
        name: 'root-operation',
        durationMs: 200,
      });

      // Wait and query
      const trace = await waitForTrace(tempoEndpoint, traceId, TRACE_WAIT_MS);
      expect(trace).not.toBeNull();
    },
    TEST_TIMEOUT_MS
  );

  it(
    'should propagate parent-child span relationships',
    async () => {
      if (shouldSkip()) return;

      const traceId = generateHexId(16);
      const parentSpanId = generateHexId(8);
      const childSpanId = generateHexId(8);

      // Root span
      await sendOtlpSpan(collectorEndpoint, {
        traceId,
        spanId: parentSpanId,
        name: 'parent-operation',
        durationMs: 500,
      });

      // Child span
      await sendOtlpSpan(collectorEndpoint, {
        traceId,
        spanId: childSpanId,
        parentSpanId: parentSpanId,
        name: 'child-operation',
        durationMs: 100,
      });

      // Both spans must arrive; the check is unconditional so that an
      // unexpected response shape fails the test instead of skipping it.
      const spans = await waitForSpans(traceId, 2);
      expect(spans.length).toBeGreaterThanOrEqual(2);

      // The child must carry a parent reference. Only presence is checked
      // because the ID encoding of the query response is not established here.
      const child = spans.find((span: any) => span?.name === 'child-operation');
      expect(child).toBeDefined();
      expect(child?.parentSpanId).toBeTruthy();
    },
    TEST_TIMEOUT_MS
  );

  it(
    'should emit 5 spans across async service boundary',
    async () => {
      if (shouldSkip()) return;

      const traceId = generateHexId(16);
      const rootSpanId = generateHexId(8);

      // Root span
      await sendOtlpSpan(collectorEndpoint, {
        traceId,
        spanId: rootSpanId,
        name: 'api-gateway-handler',
        durationMs: 1000,
      });

      // 4 child spans simulating async service calls
      const childSpanNames = [
        'knowledge-store-lookup',
        'registry-auth-check',
        'analysis-trigger',
        'mcp-tool-call',
      ];

      for (const name of childSpanNames) {
        await sendOtlpSpan(collectorEndpoint, {
          traceId,
          spanId: generateHexId(8),
          parentSpanId: rootSpanId,
          name,
          durationMs: Math.floor(Math.random() * 200) + 50,
        });
      }

      // All 5 spans should be retrievable
      const spans = await waitForSpans(traceId, 1 + childSpanNames.length);
      expect(spans).toHaveLength(1 + childSpanNames.length);
    },
    TEST_TIMEOUT_MS
  );

  it(
    'should capture error spans with status ERROR',
    async () => {
      if (shouldSkip()) return;

      const traceId = generateHexId(16);
      const spanId = generateHexId(8);

      // Emit an error span (status code 2 = ERROR)
      await sendOtlpSpan(collectorEndpoint, {
        traceId,
        spanId,
        name: 'failed-operation',
        statusCode: 2, // ERROR
        durationMs: 50,
      });

      // Error spans should always be sampled (100%).
      // NOTE(review): only presence of the trace is asserted; the returned
      // span's status is not inspected — confirm the response encoding first.
      const trace = await waitForTrace(tempoEndpoint, traceId, TRACE_WAIT_MS);
      expect(trace).not.toBeNull();
    },
    TEST_TIMEOUT_MS
  );
});
|
|
361
|
+
|
|
362
|
+
describe('OTel Pipeline — Unit Tests (no Docker required)', () => {
  // Shapes the generated identifiers are expected to have.
  const TRACE_ID_SHAPE = /^[0-9a-f]{32}$/;
  const SPAN_ID_SHAPE = /^[0-9a-f]{16}$/;

  it('should generate valid 32-char hex trace IDs', () => {
    expect(generateHexId(16)).toMatch(TRACE_ID_SHAPE);
  });

  it('should generate valid 16-char hex span IDs', () => {
    expect(generateHexId(8)).toMatch(SPAN_ID_SHAPE);
  });

  it('should construct valid OTLP JSON payload', () => {
    const traceId = generateHexId(16);
    const spanId = generateHexId(8);

    // Assemble the payload from the inside out: span → scope → resource.
    const span = {
      traceId,
      spanId,
      name: 'test-span',
      kind: 1,
      startTimeUnixNano: (Date.now() * 1_000_000).toString(),
      endTimeUnixNano: ((Date.now() + 100) * 1_000_000).toString(),
      status: { code: 0 },
    };
    const resource = {
      attributes: [{ key: 'service.name', value: { stringValue: 'test' } }],
    };
    const payload = {
      resourceSpans: [{ resource, scopeSpans: [{ spans: [span] }] }],
    };

    expect(payload.resourceSpans).toHaveLength(1);

    const emittedSpans = payload.resourceSpans[0].scopeSpans[0].spans;
    expect(emittedSpans).toHaveLength(1);
    expect(emittedSpans[0].traceId).toBe(traceId);
    expect(emittedSpans[0].spanId).toBe(spanId);
  });

  it('W3C traceparent header format should be valid', () => {
    // W3C Trace Context format: version-traceId-parentId-traceFlags
    const traceparent = ['00', generateHexId(16), generateHexId(8), '01'].join('-');

    expect(traceparent).toMatch(/^00-[0-9a-f]{32}-[0-9a-f]{16}-(00|01)$/);
  });
});
|