ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,170 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Prometheus Helm Values (kube-prometheus-stack)
3
+ # =============================================================================
4
+ # Base values for dev/staging. Production overrides in values.prod.yaml.
5
+ # =============================================================================
6
+
7
+ prometheus:
8
+ prometheusSpec:
9
+ retention: 30d
10
+ retentionSize: 50GB
11
+
12
+ resources:
13
+ requests:
14
+ cpu: 500m
15
+ memory: 2Gi
16
+ limits:
17
+ cpu: 2000m
18
+ memory: 8Gi
19
+
20
+ storageSpec:
21
+ volumeClaimTemplate:
22
+ spec:
23
+ storageClassName: standard
24
+ accessModes: ["ReadWriteOnce"]
25
+ resources:
26
+ requests:
27
+ storage: 100Gi
28
+
29
+ # Scrape interval: 15s for application metrics, 30s for infrastructure
30
+ scrapeInterval: 15s
31
+ evaluationInterval: 15s
32
+
33
+ # Rule files from ConfigMap
34
+ ruleSelector:
35
+ matchLabels:
36
+ role: alert-rules
37
+ app: ecip
38
+
39
+ # Additional scrape configs loaded from ConfigMap
40
+ additionalScrapeConfigsSecret:
41
+ enabled: true
42
+ name: ecip-scrape-configs
43
+ key: scrape-configs.yaml
44
+
45
+ # Series limit alert — prevent high-cardinality OOM
46
+ additionalAlertManagerConfigs: []
47
+
48
+ # Enable admin API for snapshot/compaction
49
+ enableAdminAPI: true
50
+
51
+ alertmanager:
52
+ enabled: true
53
+ alertmanagerSpec:
54
+ resources:
55
+ requests:
56
+ cpu: 100m
57
+ memory: 128Mi
58
+ limits:
59
+ cpu: 200m
60
+ memory: 256Mi
61
+
62
+ config:
63
+ global:
64
+ resolve_timeout: 5m
65
+
66
+ route:
67
+ group_by: ['alertname', 'module', 'severity']
68
+ group_wait: 30s
69
+ group_interval: 5m
70
+ repeat_interval: 4h
71
+ receiver: 'slack-warnings'
72
+
73
+ routes:
74
+ # Critical alerts → PagerDuty + Slack
75
+ - match:
76
+ severity: critical
77
+ receiver: 'pagerduty-critical'
78
+ continue: true
79
+
80
+ - match:
81
+ severity: critical
82
+ receiver: 'slack-critical'
83
+
84
+ # Security alerts → dedicated Slack channel
85
+ - match:
86
+ team: security
87
+ receiver: 'slack-security'
88
+
89
+ # Warning alerts → Slack only
90
+ - match:
91
+ severity: warning
92
+ receiver: 'slack-warnings'
93
+
94
+ receivers:
95
+ - name: 'pagerduty-critical'
96
+ pagerduty_configs:
97
+ - routing_key: '<PAGERDUTY_ROUTING_KEY>'
98
+ severity: critical
99
+ description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
100
+
101
+ - name: 'slack-critical'
102
+ slack_configs:
103
+ - api_url: '<SLACK_WEBHOOK_ALERTS>'
104
+ channel: '#ecip-alerts'
105
+ title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
106
+ text: '{{ .CommonAnnotations.description }}'
107
+ send_resolved: true
108
+
109
+ - name: 'slack-warnings'
110
+ slack_configs:
111
+ - api_url: '<SLACK_WEBHOOK_ALERTS_WARN>'
112
+ channel: '#ecip-alerts-warn'
113
+ title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
114
+ text: '{{ .CommonAnnotations.description }}'
115
+ send_resolved: true
116
+
117
+ - name: 'slack-security'
118
+ slack_configs:
119
+ - api_url: '<SLACK_WEBHOOK_SECURITY>'
120
+ channel: '#ecip-security'
121
+ title: '🔒 SECURITY: {{ .GroupLabels.alertname }}'
122
+ text: '{{ .CommonAnnotations.description }}'
123
+ send_resolved: true
124
+
125
+ grafana:
126
+ enabled: true
127
+ adminPassword: admin
128
+ persistence:
129
+ enabled: true
130
+ size: 10Gi
131
+
132
+ sidecar:
133
+ dashboards:
134
+ enabled: true
135
+ label: grafana_dashboard
136
+ folder: /var/lib/grafana/dashboards/ecip
137
+ provider:
138
+ foldersFromFilesStructure: false
139
+ folder: ECIP
140
+
141
+ datasources:
142
+ datasources.yaml:
143
+ apiVersion: 1
144
+ datasources:
145
+ - name: Prometheus
146
+ type: prometheus
147
+ url: http://prometheus-operated:9090
148
+ access: proxy
149
+ isDefault: true
+ uid: prometheus
150
+
151
+ - name: Tempo
152
+ type: tempo
153
+ url: http://tempo:3200
154
+ access: proxy
155
+ jsonData:
156
+ tracesToMetrics:
157
+ datasourceUid: prometheus
158
+ nodeGraph:
159
+ enabled: true
160
+ serviceMap:
161
+ datasourceUid: prometheus
162
+
163
+ - name: Elasticsearch
164
+ type: elasticsearch
165
+ url: http://elasticsearch:9200
166
+ access: proxy
167
+ jsonData:
168
+ index: ecip-security-events-*
169
+ timeField: "@timestamp"
170
+ esVersion: "8.0.0"
@@ -0,0 +1,97 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Prometheus Recording Rules
3
+ # =============================================================================
4
+ # Pre-computed rate/ratio metrics for dashboard performance.
5
+ # Without these, every Grafana query recalculates from raw histograms.
6
+ # =============================================================================
7
+ groups:
8
+ - name: ecip.recording.latency
9
+ interval: 30s
10
+ rules:
11
+ # Query latency quantiles (pre-computed)
12
+ - record: ecip:query_duration_ms:p50
13
+ expr: >
14
+ histogram_quantile(0.50,
15
+ sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
16
+ )
17
+
18
+ - record: ecip:query_duration_ms:p95
19
+ expr: >
20
+ histogram_quantile(0.95,
21
+ sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
22
+ )
23
+
24
+ - record: ecip:query_duration_ms:p99
25
+ expr: >
26
+ histogram_quantile(0.99,
27
+ sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
28
+ )
29
+
30
+ # Analysis duration quantiles
31
+ - record: ecip:analysis_duration_ms:p50
32
+ expr: >
33
+ histogram_quantile(0.50,
34
+ sum(rate(analysis_duration_ms_bucket{job="ecip-analysis-engine"}[5m])) by (le)
35
+ )
36
+
37
+ - record: ecip:analysis_duration_ms:p95
38
+ expr: >
39
+ histogram_quantile(0.95,
40
+ sum(rate(analysis_duration_ms_bucket{job="ecip-analysis-engine"}[5m])) by (le)
41
+ )
42
+
43
+ # MCP call latency
44
+ - record: ecip:mcp_call_duration_ms:p95
45
+ expr: >
46
+ histogram_quantile(0.95,
47
+ sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, tool_name)
48
+ )
49
+
50
+ # gRPC request latency
51
+ - record: ecip:grpc_request_duration_ms:p95
52
+ expr: >
53
+ histogram_quantile(0.95,
54
+ sum(rate(grpc_request_duration_ms_bucket[5m])) by (le, service, method)
55
+ )
56
+
57
+ # Knowledge store write latency
58
+ - record: ecip:knowledge_store_write_duration_ms:p95
59
+ expr: >
60
+ histogram_quantile(0.95,
61
+ sum(rate(knowledge_store_write_duration_ms_bucket[5m])) by (le, store_type)
62
+ )
63
+
64
+ - name: ecip.recording.rates
65
+ interval: 30s
66
+ rules:
67
+ # Query throughput (requests/sec)
68
+ - record: ecip:query_rate:rps
69
+ expr: >
70
+ sum(rate(query_duration_ms_count{job="ecip-query-service"}[5m])) by (mode)
71
+
72
+ # Analysis throughput (events/sec)
73
+ - record: ecip:analysis_rate:rps
74
+ expr: >
75
+ sum(rate(analysis_duration_ms_count{job="ecip-analysis-engine"}[5m]))
76
+
77
+ # Error rates
78
+ - record: ecip:query_error_rate:ratio
79
+ expr: >
80
+ sum(rate(query_duration_ms_count{job="ecip-query-service",status_code=~"5.."}[5m]))
81
+ /
82
+ sum(rate(query_duration_ms_count{job="ecip-query-service"}[5m]))
83
+
84
+ # LSP daemon restart rate (per hour)
85
+ - record: ecip:lsp_daemon_restart_rate:per_hour
86
+ expr: >
87
+ sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)
88
+
89
+ # Auth failure rate
90
+ - record: ecip:auth_failure_rate:5m
91
+ expr: >
92
+ sum(increase(auth_failure_total[5m])) by (reason)
93
+
94
+ # RBAC denial rate
95
+ - record: ecip:rbac_denial_rate:5m
96
+ expr: >
97
+ sum(increase(rbac_denial_total[5m])) by (resource, action)
@@ -0,0 +1,122 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Prometheus Scrape Configs
3
+ # =============================================================================
4
+ # Per-service scrape targets. Updated as modules come online.
5
+ # Loaded into Prometheus via Secret/ConfigMap reference.
6
+ # =============================================================================
7
+
8
+ # --- M01: API Gateway ---
9
+ - job_name: ecip-api-gateway
10
+ scrape_interval: 15s
11
+ kubernetes_sd_configs:
12
+ - role: pod
13
+ namespaces:
14
+ names: [ecip]
15
+ relabel_configs:
16
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
17
+ regex: ecip-api-gateway
18
+ action: keep
19
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
20
+ target_label: __address__
21
+ regex: ([^:]+)(?::\d+)?;(\d+)
22
+ replacement: ${1}:${2}
23
+ - target_label: module
24
+ replacement: M01
25
+
26
+ # --- M02: Analysis Engine ---
27
+ - job_name: ecip-analysis-engine
28
+ scrape_interval: 15s
29
+ kubernetes_sd_configs:
30
+ - role: pod
31
+ namespaces:
32
+ names: [ecip]
33
+ relabel_configs:
34
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
35
+ regex: ecip-analysis-engine
36
+ action: keep
37
+ - target_label: module
38
+ replacement: M02
39
+
40
+ # --- M03: Knowledge Store ---
41
+ - job_name: ecip-knowledge-store
42
+ scrape_interval: 15s
43
+ kubernetes_sd_configs:
44
+ - role: pod
45
+ namespaces:
46
+ names: [ecip]
47
+ relabel_configs:
48
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
49
+ regex: ecip-knowledge-store
50
+ action: keep
51
+ - target_label: module
52
+ replacement: M03
53
+
54
+ # --- M04: Query Service ---
55
+ - job_name: ecip-query-service
56
+ scrape_interval: 15s
57
+ kubernetes_sd_configs:
58
+ - role: pod
59
+ namespaces:
60
+ names: [ecip]
61
+ relabel_configs:
62
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
63
+ regex: ecip-query-service
64
+ action: keep
65
+ - target_label: module
66
+ replacement: M04
67
+
68
+ # --- M05: MCP Server ---
69
+ - job_name: ecip-mcp-server
70
+ scrape_interval: 15s
71
+ kubernetes_sd_configs:
72
+ - role: pod
73
+ namespaces:
74
+ names: [ecip]
75
+ relabel_configs:
76
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
77
+ regex: ecip-mcp-server
78
+ action: keep
79
+ - target_label: module
80
+ replacement: M05
81
+
82
+ # --- M06: Registry Service ---
83
+ - job_name: ecip-registry-service
84
+ scrape_interval: 15s
85
+ kubernetes_sd_configs:
86
+ - role: pod
87
+ namespaces:
88
+ names: [ecip]
89
+ relabel_configs:
90
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
91
+ regex: ecip-registry-service
92
+ action: keep
93
+ - target_label: module
94
+ replacement: M06
95
+
96
+ # --- M07: Event Bus ---
97
+ - job_name: ecip-event-bus
98
+ scrape_interval: 15s
99
+ kubernetes_sd_configs:
100
+ - role: pod
101
+ namespaces:
102
+ names: [ecip]
103
+ relabel_configs:
104
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
105
+ regex: ecip-event-bus
106
+ action: keep
107
+ - target_label: module
108
+ replacement: M07
109
+
110
+ # --- OTel Collector self-metrics ---
111
+ - job_name: otel-collector
112
+ scrape_interval: 30s
113
+ kubernetes_sd_configs:
114
+ - role: pod
115
+ namespaces:
116
+ names: [monitoring]
117
+ relabel_configs:
118
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
119
+ regex: otel-collector
120
+ action: keep
121
+ - target_label: module
122
+ replacement: M08
@@ -0,0 +1,239 @@
1
+ # SDK Integration Guide — `@ecip/observability`
2
+
3
+ > **Priority:** P0 — Must be completed in Week 1
4
+ > **Audience:** All ECIP module teams
5
+ > **Time to integrate:** ≤ 30 minutes per service
6
+
7
+ ---
8
+
9
+ ## Overview
10
+
11
+ Every ECIP service **must** emit structured logs, traces, and metrics through the `@ecip/observability` SDK (Node.js/TypeScript) or the `ecip-observability` package (Python). Direct use of `console.log`, `print()`, raw Pino, or raw OpenTelemetry SDK is prohibited in production code paths.
12
+
13
+ The CI gate (M08-T10) blocks PRs that do not satisfy integration requirements.
14
+
15
+ ---
16
+
17
+ ## Node.js / TypeScript (M01, M03, M04, M05, M06, M07)
18
+
19
+ ### Step 1 — Install
20
+
21
+ ```bash
22
+ npm install @ecip/observability @opentelemetry/sdk-node
23
+ ```
24
+
25
+ ### Step 2 — Initialize Tracer (BEFORE all other imports)
26
+
27
+ Create `src/instrument.ts` as the **very first imported file** in your entry point:
28
+
29
+ ```typescript
30
+ // src/instrument.ts
31
+ import { initTracer } from '@ecip/observability';
32
+
33
+ initTracer({
34
+ serviceName: 'ecip-api-gateway', // use your module name
35
+ otlpEndpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
36
+ });
37
+ ```
38
+
39
+ In your entry `src/server.ts` or `src/index.ts`:
40
+
41
+ ```typescript
42
+ import './instrument'; // MUST be the first import
43
+ import express from 'express';
44
+ // ... rest of your imports
45
+ ```
46
+
47
+ ### Step 3 — Create Logger with Mandatory Fields
48
+
49
+ ```typescript
50
+ import { createLogger } from '@ecip/observability';
51
+
52
+ // All four fields are REQUIRED at compile time
53
+ const log = createLogger({
54
+ repo: 'acme-corp/auth-service',
55
+ branch: 'main',
56
+ user_id: ctx.userId,
57
+ module: 'M01',
58
+ });
59
+
60
+ log.info({ duration_ms: 43, cached: false }, 'Request handled');
61
+ log.warn({ queue_depth: 150 }, 'Backlog growing');
62
+ log.error({ err }, 'Unhandled exception');
63
+ ```
64
+
65
+ > **TypeScript will fail compilation** if any of `repo`, `branch`, `user_id`, or `module` is missing.
66
+
67
+ ### Step 4 — Add Trace Middleware
68
+
69
+ ```typescript
70
+ import { traceMiddleware } from '@ecip/observability';
71
+
72
+ const app = express();
73
+ app.use(traceMiddleware());
74
+ ```
75
+
76
+ This automatically:
77
+ - Creates a span for every incoming request
78
+ - Propagates W3C `traceparent` headers
79
+ - Injects `trace_id` and `span_id` into all log lines
80
+
81
+ ### Step 5 — Emit Security Events
82
+
83
+ ```typescript
84
+ import { emitAuthFailure, emitRbacDenial } from '@ecip/observability';
85
+
86
+ // Authentication failure (e.g., invalid JWT)
87
+ emitAuthFailure({
88
+ userId: ctx.userId,
89
+ sourceIp: req.ip,
90
+ method: req.method,
91
+ path: req.path,
92
+ reason: 'jwt_expired',
93
+ });
94
+
95
+ // RBAC denial (valid auth, insufficient permissions)
96
+ emitRbacDenial({
97
+ userId: ctx.userId,
98
+ resource: repoId,
99
+ action: 'write',
100
+ reason: 'rbac_insufficient_role',
101
+ });
102
+ ```
103
+
104
+ > Security events route to a **dedicated** Elasticsearch index via a separate OTel pipeline. Never log security events through the general logger.
105
+
106
+ ### Step 6 — Custom Spans (Optional)
107
+
108
+ ```typescript
109
+ import { withSpan } from '@ecip/observability';
110
+
111
+ const result = await withSpan('knowledge-store.lookup', async (span) => {
112
+ span.setAttribute('repo', repoId);
113
+ const data = await ksClient.lookup(repoId, query);
114
+ span.setAttribute('result_count', data.length);
115
+ return data;
116
+ });
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Python (M02 — Analysis Engine)
122
+
123
+ ### Step 1 — Install
124
+
125
+ ```bash
126
+ pip install ecip-observability opentelemetry-sdk
127
+ ```
128
+
129
+ ### Step 2 — Initialize Tracer
130
+
131
+ ```python
132
+ # At the very top of your __main__ or entry point
133
+ from ecip_observability import init_tracer, get_logger
134
+
135
+ init_tracer(service_name="ecip-analysis-engine")
136
+ ```
137
+
138
+ ### Step 3 — Create Logger
139
+
140
+ ```python
141
+ log = get_logger(
142
+ repo="acme-corp/auth-service",
143
+ branch="main",
144
+ user_id=user_id,
145
+ module="M02",
146
+ )
147
+
148
+ log.info("Analysis complete", duration_ms=14200, files_indexed=47)
149
+ ```
150
+
151
+ > `get_logger()` raises `MissingObservabilityContext` if any required field is missing.
152
+
153
+ ### Step 4 — Auto-Span Decorator
154
+
155
+ ```python
156
+ from ecip_observability import traced
157
+
158
+ @traced(name="lsp.symbol_extraction")
159
+ def extract_symbols(file_path: str) -> list:
160
+ # Span automatically started/ended
161
+ # Exceptions auto-captured as span errors
162
+ ...
163
+
164
+ # Also works with async functions
165
+ @traced(name="lsp.hover_info")
166
+ async def get_hover_info(file_path: str, line: int) -> dict:
167
+ ...
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Helm Value Injection
173
+
174
+ All OTel configuration is injected by the Helm chart. **Module teams do not hardcode endpoints.**
175
+
176
+ Add this to your module's Helm `values.yaml`:
177
+
178
+ ```yaml
179
+ env:
180
+ - name: OTEL_EXPORTER_OTLP_ENDPOINT
181
+ value: "http://otel-collector.monitoring:4318"
182
+ - name: OTEL_SERVICE_NAME
183
+ valueFrom:
184
+ fieldRef:
185
+ fieldPath: metadata.labels['app.kubernetes.io/name']
186
+ - name: OTEL_RESOURCE_ATTRIBUTES
187
+ value: "ecip.module=M04,deployment.environment=production"
188
+ ```
189
+
190
+ ---
191
+
192
+ ## CI Gate Requirements
193
+
194
+ A module PR is **blocked from merging** unless all of the following are true:
195
+
196
+ | # | Requirement | Enforcement |
197
+ |---|---|---|
198
+ | 1 | `@ecip/observability` or `ecip-observability` in dependency list | Package manifest check |
199
+ | 2 | `initTracer()` called before the service starts accepting requests | Startup integration test |
200
+ | 3 | At least one log line per request handler uses `createLogger()` with all mandatory fields | ESLint rule (Node.js), Ruff rule (Python) |
201
+ | 4 | No `console.log` or `print()` in production code paths | ESLint `no-console` / Ruff `T201` |
202
+
203
+ ---
204
+
205
+ ## Troubleshooting
206
+
207
+ ### "Traces not showing up in Tempo"
208
+
209
+ 1. Check collector health: `curl http://otel-collector.monitoring:13133/healthz`
210
+ 2. Check zpages: `http://otel-collector.monitoring:55679/debug/tracez`
211
+ 3. Verify `OTEL_EXPORTER_OTLP_ENDPOINT` is set in your pod
212
+ 4. Remember: only 5% of healthy traces are sampled. Error traces are always sampled.
213
+
214
+ ### "Logger TypeScript compilation error"
215
+
216
+ You're missing a mandatory field. All four fields are required:
217
+ ```typescript
218
+ createLogger({ repo, branch, user_id, module })
219
+ ```
220
+
221
+ ### "Security events not in Elasticsearch"
222
+
223
+ Security events use a separate OTel log provider. Ensure you're using `emitAuthFailure()` / `emitRbacDenial()`, not the general logger.
224
+
225
+ ---
226
+
227
+ ## Collector Endpoints
228
+
229
+ | Protocol | Endpoint | Use Case |
230
+ |---|---|---|
231
+ | gRPC | `otel-collector.monitoring:4317` | Default for SDK auto-instrumentation |
232
+ | HTTP | `otel-collector.monitoring:4318` | SDKs that don't support gRPC |
233
+ | Health | `otel-collector.monitoring:13133` | Liveness/readiness probes |
234
+ | Metrics | `otel-collector.monitoring:8888` | Collector self-monitoring |
235
+ | zPages | `otel-collector.monitoring:55679` | Debug trace/span pipelines |
236
+
237
+ ---
238
+
239
+ *Last updated: March 2026 · Platform Team*