ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
package/tests/security-events.test.ts ADDED
@@ -0,0 +1,417 @@
1
+ /**
2
+ * ECIP M08 — Security Events Unit Tests
3
+ *
4
+ * Validates the security event emission contract (NFR-SEC-002, NFR-SEC-007):
5
+ * 1. emitAuthFailure() produces correct ECS-formatted events
6
+ * 2. emitRbacDenial() produces correct ECS-formatted events
7
+ * 3. No raw user_id (PII) appears in output — only hashed u_ prefix
8
+ * 4. Events route to stderr (dedicated OTel logger provider path)
9
+ * 5. Events include trace context for correlation
10
+ * 6. Security events NEVER pass through the general logger
11
+ */
12
+
13
+ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Mock OTel API to provide deterministic trace context
17
+ // ---------------------------------------------------------------------------
18
+
19
+ const MOCK_TRACE_ID = 'abc123def456abc123def456abc123de';
20
+ const MOCK_SPAN_ID = '1234567890abcdef';
21
+ // NOTE(review): the factory below repeats these literals instead of referencing the constants — presumably because vi.mock() calls are hoisted above top-level declarations; keep the values in sync.
22
+ vi.mock('@opentelemetry/api', () => ({
23
+ context: {
24
+ active: vi.fn(() => ({})),
25
+ },
26
+ trace: {
27
+ getSpan: vi.fn(() => ({
28
+ spanContext: () => ({
29
+ traceId: 'abc123def456abc123def456abc123de',
30
+ spanId: '1234567890abcdef',
31
+ }),
32
+ })),
33
+ },
34
+ }));
35
+
36
+ // Import AFTER mocking
37
+ import {
38
+ emitAuthFailure,
39
+ emitRbacDenial,
40
+ flushSecurityEvents,
41
+ type AuthFailureEvent,
42
+ type RbacDenialEvent,
43
+ } from '../logging-lib/nodejs/src/security-events';
44
+
45
+ // ---------------------------------------------------------------------------
46
+ // Test utilities
47
+ // ---------------------------------------------------------------------------
48
+ // Swaps process.stderr.write for a recorder. Returns the captured chunks (as strings) and a restore() that reinstates the original writer.
49
+ function captureStderr(): { output: string[]; restore: () => void } {
50
+ const output: string[] = [];
51
+ const originalWrite = process.stderr.write;
52
+ process.stderr.write = ((chunk: string | Uint8Array) => {
53
+ output.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8')); // a plain Uint8Array's toString() would yield "104,105", not text
54
+ return true;
55
+ }) as typeof process.stderr.write;
56
+
57
+ return {
58
+ output,
59
+ restore: () => {
60
+ process.stderr.write = originalWrite;
61
+ },
62
+ };
63
+ }
64
+
65
+ function parseSecurityEvent(raw: string): Record<string, unknown> {
66
+ const jsonLine = raw.trim(); // one stderr chunk holds one JSON document plus surrounding whitespace
67
+ return JSON.parse(jsonLine);
68
+ }
69
+
70
+ // ---------------------------------------------------------------------------
71
+ // Tests
72
+ // ---------------------------------------------------------------------------
73
+
74
+ describe('Security Events — emitAuthFailure()', () => {
75
+ let capture: ReturnType<typeof captureStderr>;
76
+ // Each test gets a fresh stderr capture; afterEach puts the original writer back.
77
+ beforeEach(() => {
78
+ capture = captureStderr();
79
+ });
80
+
81
+ afterEach(() => {
82
+ capture.restore();
83
+ });
84
+
85
+ it('should emit ECS-formatted auth failure event', () => {
86
+ emitAuthFailure({
87
+ userId: 'user@example.com',
88
+ reason: 'jwt_expired',
89
+ sourceIp: '10.0.14.22',
90
+ module: 'M01',
91
+ });
92
+ flushSecurityEvents();
93
+
94
+ expect(capture.output.length).toBe(1);
95
+ const event = parseSecurityEvent(capture.output[0]);
96
+
97
+ expect(event['event.kind']).toBe('event');
98
+ expect(event['event.category']).toBe('authentication');
99
+ expect(event['event.type']).toBe('denied');
100
+ expect(event['event.outcome']).toBe('failure');
101
+ expect(event['reason']).toBe('jwt_expired');
102
+ expect(event['source.ip']).toBe('10.0.14.22');
103
+ expect(event['module']).toBe('M01');
104
+ });
105
+
106
+ it('should include @timestamp in ISO-8601 format', () => {
107
+ emitAuthFailure({
108
+ userId: 'u_abc',
109
+ reason: 'jwt_invalid',
110
+ sourceIp: '192.168.1.1',
111
+ module: 'M01',
112
+ });
113
+ flushSecurityEvents();
114
+
115
+ const event = parseSecurityEvent(capture.output[0]);
116
+ const timestamp = event['@timestamp'] as string;
117
+
118
+ expect(timestamp).toBeDefined();
119
+ // Round-trip through Date: only the exact Date#toISOString() form survives unchanged
120
+ expect(new Date(timestamp).toISOString()).toBe(timestamp);
121
+ });
122
+
123
+ it('should include trace.id for distributed trace correlation', () => {
124
+ emitAuthFailure({
125
+ userId: 'u_abc',
126
+ reason: 'jwt_missing',
127
+ sourceIp: '10.0.0.1',
128
+ module: 'M01',
129
+ });
130
+ flushSecurityEvents();
131
+
132
+ const event = parseSecurityEvent(capture.output[0]);
133
+ // trace.id is always present (either active span or 'no-active-trace')
134
+ // Exact mock-based assertions are in logging-lib/nodejs/tests/
135
+ expect(event['trace.id']).toBeDefined();
136
+ expect(typeof event['trace.id']).toBe('string');
137
+ });
138
+
139
+ it('should accept all valid failure reasons', () => {
140
+ const reasons: AuthFailureEvent['reason'][] = [
141
+ 'jwt_expired',
142
+ 'jwt_invalid',
143
+ 'jwt_missing',
144
+ 'mtls_rejected',
145
+ ];
146
+
147
+ for (const reason of reasons) {
148
+ emitAuthFailure({
149
+ userId: 'u_test',
150
+ reason,
151
+ sourceIp: '10.0.0.1',
152
+ module: 'M01',
153
+ });
154
+ }
155
+ flushSecurityEvents();
156
+ // One captured write per emitted event, in emission order
157
+ expect(capture.output.length).toBe(reasons.length);
158
+ reasons.forEach((reason, i) => {
159
+ const event = parseSecurityEvent(capture.output[i]);
160
+ expect(event['reason']).toBe(reason);
161
+ });
162
+ });
163
+
164
+ it('should include metadata when provided', () => {
165
+ emitAuthFailure({
166
+ userId: 'u_abc',
167
+ reason: 'jwt_expired',
168
+ sourceIp: '10.0.14.22',
169
+ module: 'M01',
170
+ metadata: { requestPath: '/api/v1/query', userAgent: 'curl/8.5.0' },
171
+ });
172
+ flushSecurityEvents();
173
+
174
+ const event = parseSecurityEvent(capture.output[0]);
175
+ const metadata = event['metadata'] as Record<string, unknown>;
176
+ expect(metadata).toBeDefined();
177
+ expect(metadata['requestPath']).toBe('/api/v1/query');
178
+ });
179
+ });
180
+
181
+ describe('Security Events — emitRbacDenial()', () => {
182
+ let capture: ReturnType<typeof captureStderr>;
183
+ // Each test gets a fresh stderr capture; afterEach puts the original writer back.
184
+ beforeEach(() => {
185
+ capture = captureStderr();
186
+ });
187
+
188
+ afterEach(() => {
189
+ capture.restore();
190
+ });
191
+
192
+ it('should emit ECS-formatted RBAC denial event', () => {
193
+ emitRbacDenial({
194
+ userId: 'user@example.com',
195
+ resource: 'repo:acme/auth-service',
196
+ action: 'read',
197
+ reason: 'rbac_insufficient_role',
198
+ module: 'M06',
199
+ });
200
+ flushSecurityEvents();
201
+
202
+ expect(capture.output.length).toBe(1);
203
+ const event = parseSecurityEvent(capture.output[0]);
204
+
205
+ expect(event['event.kind']).toBe('event');
206
+ expect(event['event.category']).toBe('authorization');
207
+ expect(event['event.type']).toBe('denied');
208
+ expect(event['event.outcome']).toBe('failure');
209
+ expect(event['resource']).toBe('repo:acme/auth-service');
210
+ expect(event['action']).toBe('read');
211
+ expect(event['reason']).toBe('rbac_insufficient_role');
212
+ expect(event['module']).toBe('M06');
213
+ });
214
+
215
+ it('should accept all valid RBAC actions', () => {
216
+ const actions: RbacDenialEvent['action'][] = ['read', 'write', 'admin'];
217
+
218
+ for (const action of actions) {
219
+ emitRbacDenial({
220
+ userId: 'u_test',
221
+ resource: 'repo:test/repo',
222
+ action,
223
+ reason: 'denied',
224
+ module: 'M06',
225
+ });
226
+ }
227
+ flushSecurityEvents();
228
+ // One captured write per emitted event, in emission order
229
+ expect(capture.output.length).toBe(actions.length);
230
+ actions.forEach((action, i) => {
231
+ const event = parseSecurityEvent(capture.output[i]);
232
+ expect(event['action']).toBe(action);
233
+ });
234
+ });
235
+
236
+ it('should include trace.id for correlation', () => {
237
+ emitRbacDenial({
238
+ userId: 'u_abc',
239
+ resource: 'repo:acme/api',
240
+ action: 'write',
241
+ reason: 'no_write_permission',
242
+ module: 'M06',
243
+ });
244
+ flushSecurityEvents();
245
+
246
+ const event = parseSecurityEvent(capture.output[0]);
247
+ // trace.id is always present (either active span or 'no-active-trace')
248
+ // Exact mock-based assertions are in logging-lib/nodejs/tests/
249
+ expect(event['trace.id']).toBeDefined();
250
+ expect(typeof event['trace.id']).toBe('string');
251
+ });
252
+ });
253
+
254
+ describe('Security Events — PII Scrubbing (NFR-SEC-002)', () => {
255
+ let capture: ReturnType<typeof captureStderr>;
256
+ // Each test gets a fresh stderr capture; afterEach puts the original writer back.
257
+ beforeEach(() => {
258
+ capture = captureStderr();
259
+ });
260
+
261
+ afterEach(() => {
262
+ capture.restore();
263
+ });
264
+
265
+ it('should hash raw email addresses — never emit raw PII', () => {
266
+ const rawEmail = 'alice@acme-corp.com';
267
+
268
+ emitAuthFailure({
269
+ userId: rawEmail,
270
+ reason: 'jwt_expired',
271
+ sourceIp: '10.0.14.22',
272
+ module: 'M01',
273
+ });
274
+ flushSecurityEvents();
275
+
276
+ const event = parseSecurityEvent(capture.output[0]);
277
+ const userId = event['user.id'] as string;
278
+
279
+ // Raw email must NOT appear anywhere in the serialized event (checked on the raw line, not one field)
280
+ expect(capture.output[0]).not.toContain(rawEmail);
281
+
282
+ // user.id must be "u_" followed by exactly 12 lowercase hex characters
283
+ expect(userId).toMatch(/^u_[a-f0-9]{12}$/);
284
+ expect(userId).not.toBe(rawEmail);
285
+ });
286
+
287
+ it('should pass through already-hashed user IDs (u_ prefix)', () => {
288
+ const hashedId = 'u_abc123def456';
289
+
290
+ emitAuthFailure({
291
+ userId: hashedId,
292
+ reason: 'jwt_invalid',
293
+ sourceIp: '10.0.0.1',
294
+ module: 'M01',
295
+ });
296
+ flushSecurityEvents();
297
+
298
+ const event = parseSecurityEvent(capture.output[0]);
299
+ expect(event['user.id']).toBe(hashedId);
300
+ });
301
+
302
+ it('should produce consistent hashes for the same input', () => {
303
+ emitAuthFailure({
304
+ userId: 'bob@example.com',
305
+ reason: 'jwt_expired',
306
+ sourceIp: '10.0.0.1',
307
+ module: 'M01',
308
+ });
309
+ emitAuthFailure({
310
+ userId: 'bob@example.com',
311
+ reason: 'jwt_missing',
312
+ sourceIp: '10.0.0.2',
313
+ module: 'M04',
314
+ });
315
+ flushSecurityEvents();
316
+ // Same userId with a different reason, source IP and module must still map to the same user.id
317
+ const event1 = parseSecurityEvent(capture.output[0]);
318
+ const event2 = parseSecurityEvent(capture.output[1]);
319
+ expect(event1['user.id']).toBe(event2['user.id']);
320
+ });
321
+
322
+ it('should hash RBAC denial user IDs the same way', () => {
323
+ const rawEmail = 'charlie@acme-corp.com';
324
+
325
+ emitRbacDenial({
326
+ userId: rawEmail,
327
+ resource: 'repo:acme/secret-repo',
328
+ action: 'admin',
329
+ reason: 'no_admin_role',
330
+ module: 'M06',
331
+ });
332
+ flushSecurityEvents();
333
+
334
+ const event = parseSecurityEvent(capture.output[0]);
335
+ const userId = event['user.id'] as string;
336
+
337
+ expect(capture.output[0]).not.toContain(rawEmail);
338
+ expect(userId).toMatch(/^u_[a-f0-9]{12}$/);
339
+ });
340
+ });
341
+
342
+ describe('Security Events — Emission Path Isolation', () => {
343
+ let capture: ReturnType<typeof captureStderr>;
344
+ // Each test gets a fresh stderr capture; afterEach puts the original writer back.
345
+ beforeEach(() => {
346
+ capture = captureStderr();
347
+ });
348
+
349
+ afterEach(() => {
350
+ capture.restore();
351
+ });
352
+
353
+ it('should emit to stderr (not stdout) — separate from general logger', () => {
354
+ const stdoutSpy = vi.spyOn(process.stdout, 'write');
355
+ try {
356
+ emitAuthFailure({
357
+ userId: 'u_test',
358
+ reason: 'jwt_expired',
359
+ sourceIp: '10.0.0.1',
360
+ module: 'M01',
361
+ });
362
+ flushSecurityEvents();
363
+
364
+ // Security events go to stderr (picked up by OTel Collector security pipeline)
365
+ expect(capture.output.length).toBeGreaterThan(0);
366
+
367
+ // General logger writes to stdout — security events must NOT appear there
368
+ for (const call of stdoutSpy.mock.calls) {
369
+ const content = typeof call[0] === 'string' ? call[0] : call[0]?.toString() ?? '';
370
+ expect(content).not.toContain('event.category');
371
+ expect(content).not.toContain('authentication');
372
+ }
373
+ } finally {
374
+ stdoutSpy.mockRestore(); // runs even when an assertion above throws, so the spy never leaks into later tests
375
+ }
376
+ });
377
+
378
+ it('should emit each event as a single JSON line (for OTel filelog receiver)', () => {
379
+ emitAuthFailure({
380
+ userId: 'u_test',
381
+ reason: 'jwt_expired',
382
+ sourceIp: '10.0.0.1',
383
+ module: 'M01',
384
+ });
385
+ emitRbacDenial({
386
+ userId: 'u_test',
387
+ resource: 'repo:test/repo',
388
+ action: 'read',
389
+ reason: 'denied',
390
+ module: 'M06',
391
+ });
392
+ flushSecurityEvents();
393
+ // One write per event — without this check the loop below would pass vacuously on empty output
394
+ expect(capture.output.length).toBe(2);
395
+ for (const line of capture.output) {
396
+ const trimmed = line.trim();
397
+ // Should be valid JSON
398
+ expect(() => JSON.parse(trimmed)).not.toThrow();
399
+ // Should be a single line (no embedded newlines)
400
+ expect(trimmed).not.toContain('\n');
401
+ }
402
+ });
403
+
404
+ it('should buffer events and flush together', () => {
405
+ // Emit 3 events without flushing
406
+ emitAuthFailure({ userId: 'u_1', reason: 'jwt_expired', sourceIp: '1.1.1.1', module: 'M01' });
407
+ emitAuthFailure({ userId: 'u_2', reason: 'jwt_invalid', sourceIp: '2.2.2.2', module: 'M01' });
408
+ emitAuthFailure({ userId: 'u_3', reason: 'jwt_missing', sourceIp: '3.3.3.3', module: 'M01' });
409
+
410
+ // No output before flush
411
+ expect(capture.output.length).toBe(0);
412
+
413
+ // Flush emits all buffered events
414
+ flushSecurityEvents();
415
+ expect(capture.output.length).toBe(3);
416
+ });
417
+ });
package/tsconfig.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "ESNext",
5
+ "moduleResolution": "bundler",
6
+ "esModuleInterop": true,
7
+ "strict": true,
8
+ "skipLibCheck": true,
9
+ "outDir": "dist",
10
+ "rootDir": ".",
11
+ "declaration": true,
12
+ "resolveJsonModule": true,
13
+ "forceConsistentCasingInFileNames": true
14
+ },
15
+ "include": ["tests/**/*.ts", "logging-lib/nodejs/src/**/*.ts"],
16
+ "exclude": ["node_modules", "dist"]
17
+ }
package/vitest.config.ts ADDED
@@ -0,0 +1,21 @@
1
+ import { configDefaults, defineConfig } from 'vitest/config';
2
+ // Unit-test run: picks up tests/ and logging-lib/nodejs/tests/, skipping the Docker-backed OTel pipeline test.
3
+ export default defineConfig({
4
+ test: {
5
+ globals: true,
6
+ include: [
7
+ 'tests/**/*.test.ts',
8
+ 'logging-lib/nodejs/tests/**/*.test.ts',
9
+ ],
10
+ exclude: [
11
+ ...configDefaults.exclude, // setting `exclude` replaces Vitest's built-in list, so re-add it (a bare 'node_modules' glob does not match nested paths)
12
+ '**/otel-pipeline-integration.test.ts', // Requires Docker — run separately
13
+ ],
14
+ coverage: {
15
+ provider: 'v8',
16
+ reporter: ['text', 'lcov', 'html'],
17
+ reportsDirectory: 'coverage',
18
+ },
19
+ testTimeout: 30000,
20
+ },
21
+ });
package/vitest.integration.config.ts ADDED
@@ -0,0 +1,9 @@
1
+ import { defineConfig } from 'vitest/config';
2
+ // Integration-only Vitest config: selects just the OTel pipeline integration test.
3
+ export default defineConfig({
4
+ test: {
5
+ globals: true,
6
+ include: ['tests/otel-pipeline-integration.test.ts'],
7
+ testTimeout: 120000, // 2 minutes — containers need time to start
8
+ },
9
+ });