e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,1385 @@
|
|
|
1
|
+
# ADR-007: OpenTelemetry Integration
|
|
2
|
+
|
|
3
|
+
**Status:** Draft
|
|
4
|
+
**Date:** January 13, 2026
|
|
5
|
+
**Covers:** UC-008 (OpenTelemetry Integration)
|
|
6
|
+
**Depends On:** ADR-001 (Core), ADR-005 (Tracing), ADR-004 (Adapters)
|
|
7
|
+
**Priority:** 🟡 Medium (v1.1+ enhancement)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 📋 Table of Contents
|
|
12
|
+
|
|
13
|
+
1. [Context & Problem](#1-context--problem)
|
|
14
|
+
2. [Architecture Overview](#2-architecture-overview)
|
|
15
|
+
- 2.2. [Metrics Backend Selection (C03 Resolution)](#22-metrics-backend-selection-c03-resolution) ⚠️ CRITICAL
|
|
16
|
+
3. [OTel Collector Adapter](#3-otel-collector-adapter)
|
|
17
|
+
4. [Semantic Conventions](#4-semantic-conventions)
|
|
18
|
+
5. [Logs Signal Export](#5-logs-signal-export)
|
|
19
|
+
6. [Traces Signal Export](#6-traces-signal-export)
|
|
20
|
+
7. [Resource Attributes](#7-resource-attributes)
|
|
21
|
+
8. [Trace Context Integration](#8-trace-context-integration)
|
|
22
|
+
9. [Testing Strategy](#9-testing-strategy)
|
|
23
|
+
10. [Trade-offs](#10-trade-offs)
|
|
24
|
+
|
|
25
|
+
**Note:** Cardinality Protection (C04 Resolution) moved to [ADR-009 Cost Optimization §8](ADR-009-cost-optimization.md#8-cardinality-protection-c04-resolution) ⚠️
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## 1. Context & Problem
|
|
30
|
+
|
|
31
|
+
### 1.1. Problem Statement
|
|
32
|
+
|
|
33
|
+
**Telemetry Fragmentation:**
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
# ❌ CURRENT: Separate systems, manual integration
|
|
37
|
+
# - E11y events → Loki (custom adapter)
|
|
38
|
+
# - Rails logs → File/Stdout
|
|
39
|
+
# - Sidekiq → Redis logs
|
|
40
|
+
# - Traces → Manual instrumentation
|
|
41
|
+
# - Metrics → Yabeda → Prometheus
|
|
42
|
+
|
|
43
|
+
# Problems:
|
|
44
|
+
# 1. Multiple telemetry pipelines (5+ different systems)
|
|
45
|
+
# 2. No automatic correlation (logs ↔ traces ↔ metrics)
|
|
46
|
+
# 3. Different field naming conventions
|
|
47
|
+
# 4. Manual span creation from events
|
|
48
|
+
# 5. Can't use OTel Collector benefits (sampling, routing, filtering)
|
|
49
|
+
# 6. Vendor lock-in (custom adapters for each backend)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Missing OpenTelemetry Integration:**
|
|
53
|
+
- ❌ No OTel Logs Signal support (E11y → custom formats only)
|
|
54
|
+
- ❌ No automatic semantic conventions mapping
|
|
55
|
+
- ❌ No OTel Collector adapter (direct to backends only)
|
|
56
|
+
- ❌ No automatic span creation from events
|
|
57
|
+
- ❌ Manual trace context management
|
|
58
|
+
- ❌ Can't use OTel ecosystem tools (processors, exporters, samplers)
|
|
59
|
+
|
|
60
|
+
### 1.2. Goals
|
|
61
|
+
|
|
62
|
+
**Primary Goals:**
|
|
63
|
+
- ✅ **OTel Collector Adapter** (OTLP HTTP/gRPC support)
|
|
64
|
+
- ✅ **Logs Signal Export** (E11y events → OTel Logs)
|
|
65
|
+
- ✅ **Semantic Conventions** (automatic field mapping)
|
|
66
|
+
- ✅ **Automatic Span Creation** (events → spans)
|
|
67
|
+
- ✅ **Trace Context Integration** (use OTel SDK trace context)
|
|
68
|
+
- ✅ **Resource Attributes** (service metadata)
|
|
69
|
+
|
|
70
|
+
**Non-Goals:**
|
|
71
|
+
- ❌ Replace Yabeda (Yabeda remains the default metrics backend, better for Rails)
|
|
72
|
+
- ❌ Replace existing adapters (OTel is optional, v1.1+)
|
|
73
|
+
- ❌ OTel auto-instrumentation (already exists separately)
|
|
74
|
+
|
|
75
|
+
> **⚠️ NOTE (C03 Resolution):** OpenTelemetry is **optional** for E11y. **Yabeda is the default metrics backend** (see ADR-002). You can choose OpenTelemetry for metrics instead, but do **not run both simultaneously** (except temporarily during a migration) to avoid double overhead. See [Metrics Backend Selection](#22-metrics-backend-selection-c03-resolution) and [CONFLICT-ANALYSIS.md C03](../researches/CONFLICT-ANALYSIS.md#c03-dual-metrics-collection-overhead).
|
|
76
|
+
|
|
77
|
+
### 1.3. Success Metrics
|
|
78
|
+
|
|
79
|
+
| Metric | Target | Critical? |
|
|
80
|
+
|--------|--------|-----------|
|
|
81
|
+
| **OTel compatibility** | 100% OTLP spec | ✅ Yes |
|
|
82
|
+
| **Semantic conventions coverage** | HTTP, DB, RPC, Messaging | ✅ Yes |
|
|
83
|
+
| **Trace correlation** | 100% automatic | ✅ Yes |
|
|
84
|
+
| **Performance overhead** | <5% vs direct adapters | ✅ Yes |
|
|
85
|
+
| **Backend flexibility** | Any OTel-compatible | ✅ Yes |
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## 2. Architecture Overview
|
|
90
|
+
|
|
91
|
+
### 2.1. System Context
|
|
92
|
+
|
|
93
|
+
```mermaid
|
|
94
|
+
C4Context
|
|
95
|
+
title OpenTelemetry Integration Context
|
|
96
|
+
|
|
97
|
+
Person(dev, "Developer", "Rails application")
|
|
98
|
+
|
|
99
|
+
System(e11y, "E11y Gem", "Tracks business events")
|
|
100
|
+
|
|
101
|
+
System_Ext(otel_collector, "OTel Collector", "Centralized telemetry pipeline")
|
|
102
|
+
System_Ext(loki, "Loki", "Logs storage")
|
|
103
|
+
System_Ext(jaeger, "Jaeger", "Traces storage")
|
|
104
|
+
System_Ext(prometheus, "Prometheus", "Metrics storage")
|
|
105
|
+
System_Ext(s3, "S3", "Long-term archive")
|
|
106
|
+
|
|
107
|
+
Rel(dev, e11y, "Tracks events", "E11y API")
|
|
108
|
+
Rel(e11y, otel_collector, "Exports", "OTLP HTTP/gRPC")
|
|
109
|
+
Rel(otel_collector, loki, "Routes logs", "Loki API")
|
|
110
|
+
Rel(otel_collector, jaeger, "Routes traces", "Jaeger API")
|
|
111
|
+
Rel(otel_collector, prometheus, "Routes metrics", "Remote Write")
|
|
112
|
+
Rel(otel_collector, s3, "Archives", "S3 API")
|
|
113
|
+
|
|
114
|
+
UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### 2.2. Metrics Backend Selection (C03 Resolution) ⚠️ CRITICAL
|
|
118
|
+
|
|
119
|
+
**Reference:** [CONFLICT-ANALYSIS.md - C03: Dual Metrics Collection Overhead](../researches/CONFLICT-ANALYSIS.md#c03-dual-metrics-collection-overhead)
|
|
120
|
+
|
|
121
|
+
**Problem:** Running both Yabeda (ADR-002) and OpenTelemetry metrics simultaneously causes **double overhead** - every event increments counters in both systems, doubling CPU/memory usage and storage costs.
|
|
122
|
+
|
|
123
|
+
**Decision:** E11y supports a **configurable metrics backend** - choose ONE of the following options:
|
|
124
|
+
1. **`:yabeda`** (default) - Ruby-native, Prometheus, best for Rails
|
|
125
|
+
2. **`:opentelemetry`** (optional) - Vendor-neutral, OTLP, multi-backend
|
|
126
|
+
3. **`[:yabeda, :opentelemetry]`** (migration only) - Both enabled (⚠️ double overhead!)
|
|
127
|
+
|
|
128
|
+
**Configuration:**
|
|
129
|
+
|
|
130
|
+
```ruby
|
|
131
|
+
# config/initializers/e11y.rb
|
|
132
|
+
E11y.configure do |config|
|
|
133
|
+
# Option 1: Yabeda only (DEFAULT, recommended for Rails)
|
|
134
|
+
config.metrics do
|
|
135
|
+
backend :yabeda # Prometheus via Yabeda
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# Option 2: OpenTelemetry only (for OTLP backends)
|
|
139
|
+
# config.metrics do
|
|
140
|
+
# backend :opentelemetry # OTLP via OTel SDK
|
|
141
|
+
# end
|
|
142
|
+
|
|
143
|
+
# Option 3: Both (for migration period ONLY)
|
|
144
|
+
# config.metrics do
|
|
145
|
+
# backend [:yabeda, :opentelemetry] # ⚠️ DOUBLE OVERHEAD!
|
|
146
|
+
# end
|
|
147
|
+
end
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Metrics Adapter Pattern:**
|
|
151
|
+
|
|
152
|
+
E11y uses an internal **Metrics Adapter** to abstract the backend:
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
# lib/e11y/metrics.rb
|
|
156
|
+
module E11y
|
|
157
|
+
module Metrics
|
|
158
|
+
class << self
|
|
159
|
+
# Unified API (backend-agnostic)
|
|
160
|
+
def increment(metric_name, tags = {}, by: 1)
|
|
161
|
+
backends.each do |backend|
|
|
162
|
+
case backend
|
|
163
|
+
when :yabeda
|
|
164
|
+
increment_yabeda(metric_name, tags, by)
|
|
165
|
+
when :opentelemetry
|
|
166
|
+
increment_opentelemetry(metric_name, tags, by)
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
def histogram(metric_name, value, tags = {})
|
|
172
|
+
backends.each do |backend|
|
|
173
|
+
case backend
|
|
174
|
+
when :yabeda
|
|
175
|
+
histogram_yabeda(metric_name, value, tags)
|
|
176
|
+
when :opentelemetry
|
|
177
|
+
histogram_opentelemetry(metric_name, value, tags)
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
private
|
|
183
|
+
|
|
184
|
+
def backends
|
|
185
|
+
Array(E11y.config.metrics.backend)
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def increment_yabeda(metric_name, tags, by)
|
|
189
|
+
return unless defined?(Yabeda)
|
|
190
|
+
|
|
191
|
+
# Convert metric_name to Yabeda format
|
|
192
|
+
# e.g., 'events_total' → Yabeda.e11y.events_total
|
|
193
|
+
yabeda_metric = Yabeda.e11y.public_send(metric_name)
|
|
194
|
+
yabeda_metric.increment(tags, by: by)
|
|
195
|
+
rescue NameError => e
|
|
196
|
+
E11y.logger.warn "Yabeda metric not found: #{metric_name} (#{e.message})"
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def increment_opentelemetry(metric_name, tags, by)
|
|
200
|
+
return unless defined?(OpenTelemetry)
|
|
201
|
+
|
|
202
|
+
# Convert to OpenTelemetry format
|
|
203
|
+
# e.g., 'events_total' → 'e11y.events.total'
|
|
204
|
+
otel_metric_name = "e11y.#{metric_name.to_s.tr('_', '.')}"
|
|
205
|
+
|
|
206
|
+
meter = OpenTelemetry.meter_provider.meter('e11y')
|
|
207
|
+
counter = meter.create_counter(otel_metric_name, unit: '1', description: 'E11y metric')
|
|
208
|
+
counter.add(by, attributes: tags)
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
def histogram_yabeda(metric_name, value, tags)
|
|
212
|
+
return unless defined?(Yabeda)
|
|
213
|
+
|
|
214
|
+
yabeda_metric = Yabeda.e11y.public_send(metric_name)
|
|
215
|
+
yabeda_metric.measure(tags, value)
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def histogram_opentelemetry(metric_name, value, tags)
|
|
219
|
+
return unless defined?(OpenTelemetry)
|
|
220
|
+
|
|
221
|
+
otel_metric_name = "e11y.#{metric_name.to_s.tr('_', '.')}"
|
|
222
|
+
|
|
223
|
+
meter = OpenTelemetry.meter_provider.meter('e11y')
|
|
224
|
+
histogram = meter.create_histogram(otel_metric_name, unit: 'ms', description: 'E11y metric')
|
|
225
|
+
histogram.record(value, attributes: tags)
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**Usage in E11y (backend-agnostic):**
|
|
233
|
+
|
|
234
|
+
```ruby
|
|
235
|
+
# lib/e11y/event.rb
|
|
236
|
+
class Event
|
|
237
|
+
def track
|
|
238
|
+
# Single call - backend determined by config
|
|
239
|
+
E11y::Metrics.increment('events_total', {
|
|
240
|
+
event_name: self.event_name,
|
|
241
|
+
severity: self.severity
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
# ... rest of tracking logic
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
# Depending on config.metrics.backend:
|
|
249
|
+
# - :yabeda → Yabeda.e11y_events_total.increment(...)
|
|
250
|
+
# - :opentelemetry → OpenTelemetry counter.add(...)
|
|
251
|
+
# - [:yabeda, :opentelemetry] → BOTH (double overhead!)
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
**Warning System:**
|
|
255
|
+
|
|
256
|
+
```ruby
|
|
257
|
+
# lib/e11y/config/metrics.rb
|
|
258
|
+
module E11y
|
|
259
|
+
module Config
|
|
260
|
+
class Metrics
|
|
261
|
+
attr_accessor :backend
|
|
262
|
+
|
|
263
|
+
def initialize
|
|
264
|
+
@backend = :yabeda # Default
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
def backend=(value)
|
|
268
|
+
@backend = value
|
|
269
|
+
|
|
270
|
+
# Warn if both backends enabled
|
|
271
|
+
if Array(value).size > 1
|
|
272
|
+
E11y.logger.warn do
|
|
273
|
+
"⚠️ Multiple metrics backends enabled: #{Array(value).join(', ')}. " \
|
|
274
|
+
"This causes DOUBLE OVERHEAD (CPU, memory, storage). " \
|
|
275
|
+
"Only use multiple backends during migration. " \
|
|
276
|
+
"See ADR-007 and CONFLICT-ANALYSIS.md C03."
|
|
277
|
+
end
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**Migration Guide (Yabeda → OpenTelemetry):**
|
|
286
|
+
|
|
287
|
+
```ruby
|
|
288
|
+
# Step 1: Start with Yabeda (production)
|
|
289
|
+
config.metrics.backend = :yabeda
|
|
290
|
+
|
|
291
|
+
# Step 2: Enable both backends in staging (test OTLP pipeline)
|
|
292
|
+
config.metrics.backend = [:yabeda, :opentelemetry]
|
|
293
|
+
# ⚠️ Monitor: metrics CPU/memory overhead should be ~2× (expected)
|
|
294
|
+
|
|
295
|
+
# Step 3: Validate OTLP metrics (Grafana dashboards work)
|
|
296
|
+
# Check: e11y.events.total (OTLP) matches e11y_events_total (Prometheus)
|
|
297
|
+
|
|
298
|
+
# Step 4: Switch to OpenTelemetry only in production
|
|
299
|
+
config.metrics.backend = :opentelemetry
|
|
300
|
+
|
|
301
|
+
# Step 5: Remove Yabeda gem dependency (cleanup)
|
|
302
|
+
# gem 'yabeda' - no longer needed
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
**Performance Impact:**
|
|
306
|
+
|
|
307
|
+
```ruby
|
|
308
|
+
# Benchmark: 10,000 events/sec
|
|
309
|
+
# Single backend (:yabeda OR :opentelemetry):
|
|
310
|
+
# - CPU: ~5% overhead
|
|
311
|
+
# - Memory: ~10 MB for metric buffers
|
|
312
|
+
# - Latency: +0.1ms per event
|
|
313
|
+
|
|
314
|
+
# Both backends ([:yabeda, :opentelemetry]):
|
|
315
|
+
# - CPU: ~10% overhead (2×)
|
|
316
|
+
# - Memory: ~20 MB (2×)
|
|
317
|
+
# - Latency: +0.2ms per event (2×)
|
|
318
|
+
# ⚠️ Only use during migration (1-2 weeks max)
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
**Monitoring:**
|
|
322
|
+
|
|
323
|
+
```ruby
|
|
324
|
+
# Track which backends are active
|
|
325
|
+
E11y::Metrics.gauge('e11y.metrics.backends_active',
|
|
326
|
+
Array(E11y.config.metrics.backend).size,
|
|
327
|
+
{ backends: Array(E11y.config.metrics.backend).join(',') }
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
# Alert if multiple backends enabled in production
|
|
331
|
+
# Alert: e11y_metrics_backends_active{env="production"} > 1
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
**Trade-offs:**
|
|
335
|
+
|
|
336
|
+
| Aspect | Yabeda (default) | OpenTelemetry | Both (migration) |
|
|
337
|
+
|--------|------------------|---------------|------------------|
|
|
338
|
+
| **Performance** | Fast (Ruby-native) | Slightly slower (SDK overhead) | 2× overhead ⚠️ |
|
|
339
|
+
| **Ecosystem** | Rails/Ruby best fit | Vendor-neutral | N/A |
|
|
340
|
+
| **Backend** | Prometheus only | Any OTLP backend | Prometheus + OTLP |
|
|
341
|
+
| **Setup** | Simple (gem install) | Requires OTel Collector | Complex |
|
|
342
|
+
| **Use case** | Rails apps, Prometheus | Multi-language, cloud-native | Migration period only |
|
|
343
|
+
|
|
344
|
+
**Recommendation:**
|
|
345
|
+
- **Rails apps with Prometheus:** Use `:yabeda` (default)
|
|
346
|
+
- **Cloud-native, multi-backend:** Use `:opentelemetry`
|
|
347
|
+
- **Migration period:** Use `[:yabeda, :opentelemetry]` for 1-2 weeks max
|
|
348
|
+
|
|
349
|
+
### 2.3. Component Architecture
|
|
350
|
+
|
|
351
|
+
```mermaid
|
|
352
|
+
graph TB
|
|
353
|
+
subgraph "E11y Gem"
|
|
354
|
+
Event[Event Tracked] --> Pipeline[Processing Pipeline]
|
|
355
|
+
Pipeline --> OTelAdapter[OTel Collector Adapter]
|
|
356
|
+
|
|
357
|
+
OTelAdapter --> LogsExporter[Logs Signal Exporter]
|
|
358
|
+
OTelAdapter --> TracesExporter[Traces Signal Exporter]
|
|
359
|
+
|
|
360
|
+
subgraph "Semantic Conventions"
|
|
361
|
+
LogsExporter --> HTTPConv[HTTP Conventions]
|
|
362
|
+
LogsExporter --> DBConv[Database Conventions]
|
|
363
|
+
LogsExporter --> RPCConv[RPC Conventions]
|
|
364
|
+
LogsExporter --> MsgConv[Messaging Conventions]
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
subgraph "Trace Context"
|
|
368
|
+
TracesExporter --> OTelSDK[OTel SDK Integration]
|
|
369
|
+
TracesExporter --> W3CContext[W3C Trace Context]
|
|
370
|
+
TracesExporter --> SpanCreator[Automatic Span Creator]
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
subgraph "Resource Attributes"
|
|
374
|
+
OTelAdapter --> ServiceAttrs[Service Metadata]
|
|
375
|
+
OTelAdapter --> DeploymentAttrs[Deployment Info]
|
|
376
|
+
OTelAdapter --> HostAttrs[Host/Container Info]
|
|
377
|
+
end
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
subgraph "OpenTelemetry Collector"
|
|
381
|
+
OTelCollector[Receiver: OTLP] --> Processors[Processors]
|
|
382
|
+
Processors --> Batch[Batch]
|
|
383
|
+
Processors --> Filter[Filter]
|
|
384
|
+
Processors --> Sampling[Tail Sampling]
|
|
385
|
+
Processors --> Resource[Resource]
|
|
386
|
+
|
|
387
|
+
Sampling --> Exporters[Exporters]
|
|
388
|
+
Exporters --> Loki[Loki]
|
|
389
|
+
Exporters --> Jaeger[Jaeger]
|
|
390
|
+
Exporters --> S3[S3]
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
LogsExporter --> OTelCollector
|
|
394
|
+
TracesExporter --> OTelCollector
|
|
395
|
+
|
|
396
|
+
style OTelAdapter fill:#d1ecf1
|
|
397
|
+
style OTelCollector fill:#fff3cd
|
|
398
|
+
style HTTPConv fill:#d4edda
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
### 2.4. Data Flow Sequence
|
|
402
|
+
|
|
403
|
+
```mermaid
|
|
404
|
+
sequenceDiagram
|
|
405
|
+
participant App as Rails App
|
|
406
|
+
participant E11y as E11y Event
|
|
407
|
+
participant Mapper as Semantic Mapper
|
|
408
|
+
participant Exporter as OTel Exporter
|
|
409
|
+
participant Collector as OTel Collector
|
|
410
|
+
participant Backend as Backend (Loki/Jaeger)
|
|
411
|
+
|
|
412
|
+
App->>E11y: Events::HttpRequest.track(method, status)
|
|
413
|
+
E11y->>Mapper: Map to semantic conventions
|
|
414
|
+
Mapper->>Mapper: method → http.method<br/>status → http.status_code
|
|
415
|
+
Mapper->>Exporter: OTel Log Record
|
|
416
|
+
|
|
417
|
+
Note over Exporter: Add Resource Attributes<br/>service.name, service.version
|
|
418
|
+
|
|
419
|
+
Exporter->>Exporter: Get trace context from OTel SDK
|
|
420
|
+
Exporter->>Exporter: Batch (100 events or 10s)
|
|
421
|
+
Exporter->>Collector: POST /v1/logs (OTLP HTTP)
|
|
422
|
+
|
|
423
|
+
Collector->>Collector: Process: Batch, Filter, Sample
|
|
424
|
+
Collector->>Backend: Export to Loki/Jaeger
|
|
425
|
+
Backend-->>Collector: ACK
|
|
426
|
+
Collector-->>Exporter: 200 OK
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
---
|
|
430
|
+
|
|
431
|
+
## 3. OTel Collector Adapter
|
|
432
|
+
|
|
433
|
+
### 3.1. Adapter Implementation
|
|
434
|
+
|
|
435
|
+
```ruby
|
|
436
|
+
# lib/e11y/adapters/opentelemetry_collector.rb
|
|
437
|
+
module E11y
|
|
438
|
+
module Adapters
|
|
439
|
+
class OpenTelemetryCollector < Base
|
|
440
|
+
def initialize(config = {})
|
|
441
|
+
super(name: :opentelemetry_collector)
|
|
442
|
+
|
|
443
|
+
@endpoint = config[:endpoint] || ENV['OTEL_EXPORTER_OTLP_ENDPOINT'] || 'http://localhost:4318'
|
|
444
|
+
@protocol = config[:protocol] || :http # :http or :grpc
|
|
445
|
+
@headers = config[:headers] || {}
|
|
446
|
+
@timeout = config[:timeout] || 10
|
|
447
|
+
@compression = config[:compression] || :gzip # :none, :gzip
|
|
448
|
+
|
|
449
|
+
# Signal types
|
|
450
|
+
@export_logs = config[:export_logs] != false
|
|
451
|
+
@export_traces = config[:export_traces] || false
|
|
452
|
+
@export_metrics = config[:export_metrics] || false
|
|
453
|
+
|
|
454
|
+
# Batching
|
|
455
|
+
@batch_size = config[:batch_size] || 100
|
|
456
|
+
@flush_interval = config[:flush_interval] || 10
|
|
457
|
+
|
|
458
|
+
# Resource attributes (cached once)
|
|
459
|
+
@resource_attributes = build_resource_attributes(config[:resource_attributes] || {})
|
|
460
|
+
|
|
461
|
+
# HTTP client (Faraday with connection pooling)
|
|
462
|
+
@http_client = build_http_client
|
|
463
|
+
end
|
|
464
|
+
|
|
465
|
+
def send_batch(events)
|
|
466
|
+
results = {}
|
|
467
|
+
|
|
468
|
+
# Export logs (most common)
|
|
469
|
+
if @export_logs
|
|
470
|
+
log_records = events.map { |event| to_otel_log_record(event) }
|
|
471
|
+
results[:logs] = export_logs(log_records)
|
|
472
|
+
end
|
|
473
|
+
|
|
474
|
+
# Export traces (spans from events)
|
|
475
|
+
if @export_traces
|
|
476
|
+
spans = events.select { |e| should_create_span?(e) }
|
|
477
|
+
.map { |event| to_otel_span(event) }
|
|
478
|
+
results[:traces] = export_traces(spans) if spans.any?
|
|
479
|
+
end
|
|
480
|
+
|
|
481
|
+
{
|
|
482
|
+
success: results.values.all? { |r| r[:success] },
|
|
483
|
+
sent: events.size,
|
|
484
|
+
results: results
|
|
485
|
+
}
|
|
486
|
+
rescue => error
|
|
487
|
+
{
|
|
488
|
+
success: false,
|
|
489
|
+
error: error.message,
|
|
490
|
+
sent: 0
|
|
491
|
+
}
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
private
|
|
495
|
+
|
|
496
|
+
# === OTLP HTTP Export ===
|
|
497
|
+
|
|
498
|
+
def export_logs(log_records)
|
|
499
|
+
payload = {
|
|
500
|
+
resourceLogs: [{
|
|
501
|
+
resource: {
|
|
502
|
+
attributes: @resource_attributes
|
|
503
|
+
},
|
|
504
|
+
scopeLogs: [{
|
|
505
|
+
scope: {
|
|
506
|
+
name: 'e11y',
|
|
507
|
+
version: E11y::VERSION
|
|
508
|
+
},
|
|
509
|
+
logRecords: log_records
|
|
510
|
+
}]
|
|
511
|
+
}]
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
send_otlp_request('/v1/logs', payload)
|
|
515
|
+
end
|
|
516
|
+
|
|
517
|
+
def export_traces(spans)
|
|
518
|
+
payload = {
|
|
519
|
+
resourceSpans: [{
|
|
520
|
+
resource: {
|
|
521
|
+
attributes: @resource_attributes
|
|
522
|
+
},
|
|
523
|
+
scopeSpans: [{
|
|
524
|
+
scope: {
|
|
525
|
+
name: 'e11y',
|
|
526
|
+
version: E11y::VERSION
|
|
527
|
+
},
|
|
528
|
+
spans: spans
|
|
529
|
+
}]
|
|
530
|
+
}]
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
send_otlp_request('/v1/traces', payload)
|
|
534
|
+
end
|
|
535
|
+
|
|
536
|
+
def send_otlp_request(path, payload)
|
|
537
|
+
response = @http_client.post do |req|
|
|
538
|
+
req.url path
|
|
539
|
+
req.headers['Content-Type'] = 'application/json'
|
|
540
|
+
req.headers['Content-Encoding'] = 'gzip' if @compression == :gzip
|
|
541
|
+
@headers.each { |k, v| req.headers[k] = v }
|
|
542
|
+
|
|
543
|
+
body = payload.to_json
|
|
544
|
+
req.body = @compression == :gzip ? compress_gzip(body) : body
|
|
545
|
+
end
|
|
546
|
+
|
|
547
|
+
{
|
|
548
|
+
success: response.success?,
|
|
549
|
+
status: response.status,
|
|
550
|
+
sent: payload.dig(:resourceLogs, 0, :scopeLogs, 0, :logRecords)&.size || 0
|
|
551
|
+
}
|
|
552
|
+
rescue => error
|
|
553
|
+
{ success: false, error: error.message, sent: 0 }
|
|
554
|
+
end
|
|
555
|
+
|
|
556
|
+
# === OTel Log Record Conversion ===
|
|
557
|
+
|
|
558
|
+
def to_otel_log_record(event)
|
|
559
|
+
{
|
|
560
|
+
timeUnixNano: time_to_unix_nano(event[:timestamp]),
|
|
561
|
+
observedTimeUnixNano: time_to_unix_nano(Time.now),
|
|
562
|
+
severityNumber: map_severity_to_otel(event[:severity]),
|
|
563
|
+
severityText: event[:severity].to_s.upcase,
|
|
564
|
+
body: {
|
|
565
|
+
stringValue: event[:event_name]
|
|
566
|
+
},
|
|
567
|
+
attributes: build_log_attributes(event),
|
|
568
|
+
traceId: encode_trace_id(event[:trace_id]),
|
|
569
|
+
spanId: encode_span_id(event[:span_id]),
|
|
570
|
+
flags: event[:trace_flags] || 0
|
|
571
|
+
}.compact
|
|
572
|
+
end
|
|
573
|
+
|
|
574
|
+
def build_log_attributes(event)
|
|
575
|
+
attributes = []
|
|
576
|
+
|
|
577
|
+
# Semantic conventions mapping
|
|
578
|
+
mapped_payload = E11y::OpenTelemetry::SemanticConventions.map(
|
|
579
|
+
event[:event_name],
|
|
580
|
+
event[:payload]
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
# Convert to OTel key-value pairs
|
|
584
|
+
mapped_payload.each do |key, value|
|
|
585
|
+
attributes << {
|
|
586
|
+
key: key.to_s,
|
|
587
|
+
value: encode_otel_value(value)
|
|
588
|
+
}
|
|
589
|
+
end
|
|
590
|
+
|
|
591
|
+
# Add event metadata
|
|
592
|
+
attributes << { key: 'event.name', value: { stringValue: event[:event_name] } }
|
|
593
|
+
attributes << { key: 'event.domain', value: { stringValue: event[:domain] } } if event[:domain]
|
|
594
|
+
|
|
595
|
+
attributes
|
|
596
|
+
end
|
|
597
|
+
|
|
598
|
+
# === OTel Span Conversion ===
|
|
599
|
+
|
|
600
|
+
def to_otel_span(event)
|
|
601
|
+
start_time = time_to_unix_nano(event[:timestamp])
|
|
602
|
+
end_time = event[:duration_ms] ?
|
|
603
|
+
start_time + (event[:duration_ms] * 1_000_000).to_i :
|
|
604
|
+
start_time + 1_000_000 # 1ms default
|
|
605
|
+
|
|
606
|
+
{
|
|
607
|
+
traceId: encode_trace_id(event[:trace_id]),
|
|
608
|
+
spanId: encode_span_id(event[:span_id]),
|
|
609
|
+
parentSpanId: encode_span_id(event[:parent_span_id]),
|
|
610
|
+
name: event[:event_name],
|
|
611
|
+
kind: span_kind_to_otel(event[:span_kind] || :internal),
|
|
612
|
+
startTimeUnixNano: start_time,
|
|
613
|
+
endTimeUnixNano: end_time,
|
|
614
|
+
attributes: build_span_attributes(event),
|
|
615
|
+
status: build_span_status(event)
|
|
616
|
+
}.compact
|
|
617
|
+
end
|
|
618
|
+
|
|
619
|
+
# Encodes every payload entry of the event as an OTLP span attribute.
# Unlike the log-record path, no semantic-convention mapping is applied here.
#
# @param event [Hash] event hash; reads :payload
# @return [Array<Hash>] list of `{ key:, value: }` attribute hashes
def build_span_attributes(event)
  event[:payload].map do |field, raw|
    { key: field.to_s, value: encode_otel_value(raw) }
  end
end
|
|
631
|
+
|
|
632
|
+
# Derives the OTLP span status from the event severity.
#
# Error and fatal events produce STATUS_CODE_ERROR (2) with the payload's
# :error_message (or 'Error' when absent); everything else produces
# STATUS_CODE_OK (1).
#
# Severity may be a Symbol or a String, matching map_severity_to_otel.
# A missing payload is tolerated. Plain Ruby is used instead of
# ActiveSupport's Object#in? so the method also works outside Rails.
#
# @param event [Hash] event hash; reads :severity and :payload
# @return [Hash] `{ code: 1 }` or `{ code: 2, message: String }`
def build_span_status(event)
  severity = event[:severity]
  failed = severity.respond_to?(:to_sym) && %i[error fatal].include?(severity.to_sym)
  return { code: 1 } unless failed # STATUS_CODE_OK

  payload = event[:payload] || {}
  {
    code: 2, # STATUS_CODE_ERROR
    message: payload[:error_message] || 'Error'
  }
end
|
|
644
|
+
|
|
645
|
+
# === Resource Attributes ===
|
|
646
|
+
|
|
647
|
+
# Assembles the OTLP resource attributes describing this process.
#
# Values come from environment variables, Rails.env and the host name;
# optional groups (region, host type, container, Kubernetes) are only added
# when their environment variable is set. Caller-supplied attributes are
# appended last. Every pair is encoded through #kv.
#
# @param custom_attrs [Hash] extra attributes; keys are stringified
# @return [Array<Hash>] list of `{ key:, value: }` attribute hashes
def build_resource_attributes(custom_attrs)
  # Service (REQUIRED) and deployment environment
  pairs = [
    ['service.name', ENV['SERVICE_NAME'] || 'api'],
    ['service.version', ENV['GIT_SHA'] || 'unknown'],
    ['service.instance.id', ENV['HOSTNAME'] || Socket.gethostname],
    ['deployment.environment', Rails.env.to_s]
  ]
  pairs << ['deployment.region', ENV['AWS_REGION']] if ENV['AWS_REGION']

  # Host
  pairs << ['host.name', Socket.gethostname]
  pairs << ['host.type', ENV['INSTANCE_TYPE']] if ENV['INSTANCE_TYPE']

  # Container (if applicable): the name is only reported alongside an id
  if ENV['CONTAINER_ID']
    pairs << ['container.id', ENV['CONTAINER_ID']]
    pairs << ['container.name', ENV['CONTAINER_NAME']] if ENV['CONTAINER_NAME']
  end

  # Kubernetes (if applicable): pod/deployment are only reported with a namespace
  if ENV['K8S_NAMESPACE']
    pairs << ['k8s.namespace.name', ENV['K8S_NAMESPACE']]
    pairs << ['k8s.pod.name', ENV['K8S_POD_NAME']] if ENV['K8S_POD_NAME']
    pairs << ['k8s.deployment.name', ENV['K8S_DEPLOYMENT']] if ENV['K8S_DEPLOYMENT']
  end

  # Custom attributes
  custom_attrs.each { |name, value| pairs << [name.to_s, value] }

  pairs.map { |name, value| kv(name, value) }
end
|
|
683
|
+
|
|
684
|
+
# Wraps a key and a raw Ruby value into an OTLP key/value attribute hash.
#
# @param key [String] attribute name, used as given
# @param value [Object] raw value, encoded with encode_otel_value
# @return [Hash] `{ key:, value: }`
def kv(key, value)
  encoded = encode_otel_value(value)
  { key: key, value: encoded }
end
|
|
690
|
+
|
|
691
|
+
# === Helpers ===
|
|
692
|
+
|
|
693
|
+
# Encodes a Ruby value as an OTLP AnyValue hash.
#
# Strings, Integers, Floats and booleans map to their typed field; Arrays
# and Hashes are encoded recursively (Hash keys are stringified); anything
# else (Symbol, nil, Time, ...) falls back to its #to_s representation.
#
# @param value [Object] value to encode
# @return [Hash] single-entry hash such as `{ stringValue: "x" }`
def encode_otel_value(value)
  case value
  when String
    { stringValue: value }
  when Integer
    { intValue: value }
  when Float
    # JSON has no literal for NaN/Infinity and Ruby's JSON generator rejects
    # them; the protobuf JSON mapping represents them as the strings "NaN",
    # "Infinity" and "-Infinity", which is exactly what Float#to_s yields.
    { doubleValue: value.finite? ? value : value.to_s }
  when true, false
    { boolValue: value }
  when Array
    { arrayValue: { values: value.map { |item| encode_otel_value(item) } } }
  when Hash
    { kvlistValue: { values: value.map { |k, v| { key: k.to_s, value: encode_otel_value(v) } } } }
  else
    { stringValue: value.to_s }
  end
end
|
|
711
|
+
|
|
712
|
+
# Converts a timestamp to integer nanoseconds since the Unix epoch.
#
# Integer arithmetic (whole seconds plus #nsec) is used for Time-like
# objects: a Float cannot represent present-day epoch nanoseconds exactly,
# so `to_f * 1e9` silently drops the sub-microsecond digits.
#
# @param time [Time, String, Numeric] a Time-like object, a string accepted
#   by Time.parse, or a numeric count of epoch seconds
# @return [Integer] nanoseconds since the Unix epoch
def time_to_unix_nano(time)
  time = Time.parse(time) if time.is_a?(String)
  return (time.to_i * 1_000_000_000) + time.nsec if time.respond_to?(:nsec)

  # Numeric epoch seconds: go through Rational to avoid float rounding
  (time.to_r * 1_000_000_000).to_i
end
|
|
716
|
+
|
|
717
|
+
# Encodes a hex trace ID as base64 of its raw bytes.
#
# The previous implementation called `unpack1('m0')`, which base64-DECODES
# the packed bytes and so raised or returned garbage; encoding needs
# `pack('m0')`.
#
# NOTE(review): the OTLP/JSON encoding specifies hex-encoded (not base64)
# traceId/spanId strings. If this adapter posts JSON, the hex ID should
# probably be passed through unchanged — confirm against the collector.
#
# @param trace_id [String, nil] W3C trace-id (32 hex chars)
# @return [String, nil] strict base64 string, or nil when no ID is given
def encode_trace_id(trace_id)
  return nil unless trace_id

  # W3C trace-id is 32 hex chars → 16 bytes → base64
  [[trace_id].pack('H*')].pack('m0')
end
|
|
722
|
+
|
|
723
|
+
# Encodes a hex span ID as base64 of its raw bytes.
#
# The previous implementation called `unpack1('m0')`, which base64-DECODES
# the packed bytes and so raised or returned garbage; encoding needs
# `pack('m0')`.
#
# NOTE(review): the OTLP/JSON encoding specifies hex-encoded (not base64)
# traceId/spanId strings. If this adapter posts JSON, the hex ID should
# probably be passed through unchanged — confirm against the collector.
#
# @param span_id [String, nil] W3C span-id (16 hex chars)
# @return [String, nil] strict base64 string, or nil when no ID is given
def encode_span_id(span_id)
  return nil unless span_id

  # W3C span-id is 16 hex chars → 8 bytes → base64
  [[span_id].pack('H*')].pack('m0')
end
|
|
728
|
+
|
|
729
|
+
# Maps an E11y severity to an OTel SeverityNumber.
# OTel Severity Numbers: https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber
#
# A nil severity maps to UNSPECIFIED (0), like any unknown value, instead
# of raising NoMethodError on `nil.to_sym`.
#
# @param severity [Symbol, String, nil] E11y severity
# @return [Integer] OTel severity number
def map_severity_to_otel(severity)
  case severity&.to_sym
  when :debug then 5   # DEBUG
  when :info then 9    # INFO
  when :success then 9 # INFO (custom severity)
  when :warn then 13   # WARN
  when :error then 17  # ERROR
  when :fatal then 21  # FATAL
  else 0 # UNSPECIFIED
  end
end
|
|
741
|
+
|
|
742
|
+
# Maps an E11y span kind to its OTel SpanKind number.
#
# A nil kind maps to UNSPECIFIED (0), like any unknown value, instead of
# raising NoMethodError on `nil.to_sym`.
#
# @param kind [Symbol, String, nil] span kind
# @return [Integer] OTel span kind number
def span_kind_to_otel(kind)
  case kind&.to_sym
  when :internal then 1
  when :server then 2
  when :client then 3
  when :producer then 4
  when :consumer then 5
  else 0 # UNSPECIFIED
  end
end
|
|
753
|
+
|
|
754
|
+
# Decides whether an event should also be exported as a span.
#
# Error and fatal events always qualify. Otherwise the event name must
# match one of the glob patterns in
# E11y.config.opentelemetry.span_creation_patterns.
#
# Plain Ruby replaces ActiveSupport's Object#in?, String severities are
# accepted (as in map_severity_to_otel), and unset patterns are treated as
# an empty list rather than raising on nil.
#
# @param event [Hash] event hash; reads :severity and :event_name
# @return [Boolean]
def should_create_span?(event)
  # Create spans for errors by default
  severity = event[:severity]
  return true if severity.respond_to?(:to_sym) && %i[error fatal].include?(severity.to_sym)

  # Check if event matches span creation patterns
  patterns = E11y.config.opentelemetry.span_creation_patterns || []
  patterns.any? { |pattern| File.fnmatch(pattern, event[:event_name]) }
end
|
|
763
|
+
|
|
764
|
+
# Builds the Faraday connection used to reach @endpoint: retry middleware
# (up to 3 retries, 1s initial interval, backoff factor 2), the
# net_http_persistent adapter with a pool of 5, and a request timeout
# of @timeout.
#
# @return [Faraday::Connection]
def build_http_client
  Faraday.new(url: @endpoint) do |connection|
    connection.options.timeout = @timeout
    connection.request :retry, max: 3, interval: 1, backoff_factor: 2
    connection.adapter :net_http_persistent, pool_size: 5
  end
end
|
|
771
|
+
|
|
772
|
+
# Gzip-compresses a payload.
#
# Uses the standard library's one-shot Zlib.gzip rather than a hand-rolled
# GzipWriter over a StringIO; the old code used StringIO without requiring
# 'stringio'.
#
# @param data [String] payload to compress
# @return [String] gzip-compressed bytes
def compress_gzip(data)
  require 'zlib' # loaded lazily, as in the original implementation
  Zlib.gzip(data)
end
|
|
780
|
+
end
|
|
781
|
+
end
|
|
782
|
+
end
|
|
783
|
+
```
|
|
784
|
+
|
|
785
|
+
---
|
|
786
|
+
|
|
787
|
+
## 4. Semantic Conventions
|
|
788
|
+
|
|
789
|
+
### 4.1. Semantic Conventions Mapper
|
|
790
|
+
|
|
791
|
+
```ruby
|
|
792
|
+
# lib/e11y/opentelemetry/semantic_conventions.rb
|
|
793
|
+
module E11y
  module OpenTelemetry
    # Renames E11y payload fields to OpenTelemetry semantic-convention
    # attribute names, based on a convention family guessed from the event name.
    class SemanticConventions
      # Semantic conventions registry: family => { payload field => OTel attribute }
      CONVENTIONS = {
        # HTTP Semantic Conventions
        # https://opentelemetry.io/docs/specs/semconv/http/
        http: {
          'method' => 'http.method',
          'route' => 'http.route',
          'path' => 'http.target',
          'status_code' => 'http.status_code',
          'status' => 'http.status_code',
          'duration_ms' => 'http.server.duration',
          'request_size' => 'http.request.body.size',
          'response_size' => 'http.response.body.size',
          'user_agent' => 'http.user_agent',
          'client_ip' => 'http.client_ip',
          'scheme' => 'http.scheme',
          'host' => 'http.host',
          'server_name' => 'http.server_name'
        }.freeze,

        # Database Semantic Conventions
        # https://opentelemetry.io/docs/specs/semconv/database/
        database: {
          'query' => 'db.statement',
          'statement' => 'db.statement',
          'duration_ms' => 'db.operation.duration',
          'rows_affected' => 'db.operation.rows_affected',
          'connection_id' => 'db.connection.id',
          'database_name' => 'db.name',
          'table_name' => 'db.sql.table',
          'operation' => 'db.operation'
        }.freeze,

        # RPC/gRPC Semantic Conventions
        # https://opentelemetry.io/docs/specs/semconv/rpc/
        rpc: {
          'service' => 'rpc.service',
          'method' => 'rpc.method',
          'system' => 'rpc.system',
          'status_code' => 'rpc.grpc.status_code'
        }.freeze,

        # Messaging Semantic Conventions
        # https://opentelemetry.io/docs/specs/semconv/messaging/
        messaging: {
          'queue_name' => 'messaging.destination.name',
          'message_id' => 'messaging.message.id',
          'conversation_id' => 'messaging.message.conversation_id',
          'payload_size' => 'messaging.message.payload_size_bytes',
          'operation' => 'messaging.operation'
        }.freeze,

        # Exception Semantic Conventions
        # https://opentelemetry.io/docs/specs/semconv/exceptions/
        exception: {
          'error_type' => 'exception.type',
          'error_message' => 'exception.message',
          'error_class' => 'exception.type',
          'stacktrace' => 'exception.stacktrace'
        }.freeze
      }.freeze

      # Maps payload fields to OTel attribute names.
      #
      # The result always has String keys, whether or not a convention family
      # was detected. Fields without a convention entry keep their own name.
      # System defaults only fill gaps: a value supplied by the payload
      # (e.g. `system:` on an RPC event) is never overwritten.
      #
      # @param event_name [String] event name used to detect the family
      # @param payload [Hash, nil] event payload; nil is treated as empty
      # @return [Hash{String => Object}] attributes keyed by OTel name
      def self.map(event_name, payload)
        payload ||= {}

        # Detect convention type from event name
        convention_type = detect_convention_type(event_name)
        return payload.transform_keys(&:to_s) unless convention_type

        # Map fields
        conventions = CONVENTIONS[convention_type]
        mapped = payload.each_with_object({}) do |(key, value), result|
          field = key.to_s
          result[conventions.fetch(field, field)] = value
        end

        # Add system-specific attributes (payload values take precedence)
        system_attributes(convention_type).merge(mapped)
      end

      # Guesses the convention family from keywords in the event name.
      # Families are tested in a fixed order and the first match wins.
      #
      # @param event_name [String]
      # @return [Symbol, nil] :http, :database, :rpc, :messaging, :exception,
      #   or nil when no keyword matches
      def self.detect_convention_type(event_name)
        case event_name
        when /http|request|response/i
          :http
        when /database|query|sql|postgres|mysql/i
          :database
        when /rpc|grpc/i
          :rpc
        when /message|queue|kafka|rabbitmq|sidekiq|job/i
          :messaging
        when /error|exception|failure/i
          :exception
        end
      end

      # Default system attribute for a convention family.
      # NOTE(review): these values are hard-coded; the original comment says
      # the database system should come "From config".
      #
      # @param convention_type [Symbol]
      # @return [Hash{String => String}]
      def self.system_attributes(convention_type)
        case convention_type
        when :database
          { 'db.system' => 'postgresql' } # From config
        when :rpc
          { 'rpc.system' => 'grpc' }
        when :messaging
          { 'messaging.system' => 'sidekiq' }
        else
          {}
        end
      end
    end
  end
end
|
|
911
|
+
```
|
|
912
|
+
|
|
913
|
+
### 4.2. Event-Level Convention Declaration
|
|
914
|
+
|
|
915
|
+
```ruby
|
|
916
|
+
# app/events/http_request.rb
|
|
917
|
+
module Events
|
|
918
|
+
class HttpRequest < E11y::Event::Base
|
|
919
|
+
# Declare OTel conventions for this event
|
|
920
|
+
use_otel_conventions :http
|
|
921
|
+
|
|
922
|
+
schema do
|
|
923
|
+
required(:method).filled(:string)
|
|
924
|
+
required(:route).filled(:string)
|
|
925
|
+
required(:status_code).filled(:integer)
|
|
926
|
+
required(:duration_ms).filled(:float)
|
|
927
|
+
end
|
|
928
|
+
|
|
929
|
+
# Optional: Custom OTel mapping
|
|
930
|
+
otel_mapping do
|
|
931
|
+
'http.method' from: :method
|
|
932
|
+
'http.route' from: :route
|
|
933
|
+
'http.status_code' from: :status_code
|
|
934
|
+
'http.server.duration' from: :duration_ms
|
|
935
|
+
|
|
936
|
+
# Static values
|
|
937
|
+
'http.scheme' value: 'https'
|
|
938
|
+
|
|
939
|
+
# From config
|
|
940
|
+
'http.server_name' from_config: 'server.name'
|
|
941
|
+
end
|
|
942
|
+
end
|
|
943
|
+
end
|
|
944
|
+
|
|
945
|
+
# Usage: Automatic mapping!
|
|
946
|
+
Events::HttpRequest.track(
|
|
947
|
+
method: 'POST',
|
|
948
|
+
route: '/api/orders',
|
|
949
|
+
status_code: 201,
|
|
950
|
+
duration_ms: 45.2
|
|
951
|
+
)
|
|
952
|
+
|
|
953
|
+
# → OTel Logs Signal receives:
|
|
954
|
+
# {
|
|
955
|
+
# Attributes: {
|
|
956
|
+
# 'http.method': 'POST',
|
|
957
|
+
# 'http.route': '/api/orders',
|
|
958
|
+
# 'http.status_code': 201,
|
|
959
|
+
# 'http.server.duration': 45.2,
|
|
960
|
+
# 'http.scheme': 'https'
|
|
961
|
+
# }
|
|
962
|
+
# }
|
|
963
|
+
```
|
|
964
|
+
|
|
965
|
+
---
|
|
966
|
+
|
|
967
|
+
## 5. Logs Signal Export
|
|
968
|
+
|
|
969
|
+
### 5.1. OTel Log Record Structure
|
|
970
|
+
|
|
971
|
+
```ruby
|
|
972
|
+
# OTel Logs Signal format (OTLP JSON)
|
|
973
|
+
{
|
|
974
|
+
resourceLogs: [{
|
|
975
|
+
resource: {
|
|
976
|
+
attributes: [
|
|
977
|
+
{ key: 'service.name', value: { stringValue: 'api' } },
|
|
978
|
+
{ key: 'service.version', value: { stringValue: 'abc123' } },
|
|
979
|
+
{ key: 'deployment.environment', value: { stringValue: 'production' } }
|
|
980
|
+
]
|
|
981
|
+
},
|
|
982
|
+
scopeLogs: [{
|
|
983
|
+
scope: {
|
|
984
|
+
name: 'e11y',
|
|
985
|
+
version: '1.0.0'
|
|
986
|
+
},
|
|
987
|
+
logRecords: [
|
|
988
|
+
{
|
|
989
|
+
timeUnixNano: 1673520000000000000,
|
|
990
|
+
observedTimeUnixNano: 1673520000000000000,
|
|
991
|
+
severityNumber: 9, # INFO
|
|
992
|
+
severityText: 'INFO',
|
|
993
|
+
body: {
|
|
994
|
+
stringValue: 'Events::OrderCreated'
|
|
995
|
+
},
|
|
996
|
+
attributes: [
|
|
997
|
+
{ key: 'order.id', value: { stringValue: '123' } },
|
|
998
|
+
{ key: 'order.amount', value: { doubleValue: 99.99 } },
|
|
999
|
+
{ key: 'event.name', value: { stringValue: 'Events::OrderCreated' } }
|
|
1000
|
+
],
|
|
1001
|
+
traceId: 'YWJjMTIzZGVmNDU2', # Base64-encoded
|
|
1002
|
+
spanId: 'eHl6Nzg5', # Base64-encoded
|
|
1003
|
+
flags: 1
|
|
1004
|
+
}
|
|
1005
|
+
]
|
|
1006
|
+
}]
|
|
1007
|
+
}]
|
|
1008
|
+
}
|
|
1009
|
+
```
|
|
1010
|
+
|
|
1011
|
+
---
|
|
1012
|
+
|
|
1013
|
+
## 6. Traces Signal Export
|
|
1014
|
+
|
|
1015
|
+
### 6.1. Automatic Span Creation
|
|
1016
|
+
|
|
1017
|
+
```ruby
|
|
1018
|
+
# lib/e11y/opentelemetry/span_creator.rb
|
|
1019
|
+
module E11y
  module OpenTelemetry
    # Creates OpenTelemetry SDK spans from E11y events.
    class SpanCreator
      ERROR_SEVERITIES = %i[error fatal].freeze

      class << self
        # Creates, populates and finishes a span for the event, as a child of
        # the current OTel context.
        #
        # @param event [Hash] event hash; reads :event_name, :timestamp,
        #   :duration_ms, :payload, :severity and :span_kind
        # @return [Object, nil] the finished span, or nil when the event does
        #   not qualify for span creation
        def create_span_from_event(event)
          return unless should_create_span?(event)

          tracer = ::OpenTelemetry.tracer_provider.tracer('e11y', E11y::VERSION)
          started_at = time_to_timestamp(event[:timestamp]) || Time.now

          # `with_parent:` takes a Context (not a SpanContext); the current
          # context already carries the active span, which becomes the parent.
          span = tracer.start_span(
            event[:event_name],
            with_parent: ::OpenTelemetry::Context.current,
            kind: span_kind(event),
            start_timestamp: started_at
          )

          # Add attributes
          (event[:payload] || {}).each do |key, value|
            span.set_attribute(key.to_s, value)
          end

          span.status = span_status(event)

          # End span (with duration if available)
          span.finish(end_timestamp: end_timestamp(started_at, event[:duration_ms]))

          span
        end

        # A bare `private` has no effect on `def self.` methods; inside
        # `class << self` it actually hides the helpers below.
        private

        # Errors always get a span; other events must match a configured
        # glob pattern.
        def should_create_span?(event)
          return true if error_event?(event)

          patterns = E11y.config.opentelemetry.span_creation_patterns || []
          patterns.any? { |pattern| File.fnmatch(pattern, event[:event_name]) }
        end

        # True for :error / :fatal severities, given as Symbol or String.
        def error_event?(event)
          severity = event[:severity]
          severity.respond_to?(:to_sym) && ERROR_SEVERITIES.include?(severity.to_sym)
        end

        # Error status (with the payload's :error_message, or 'Error') for
        # error events, OK otherwise.
        def span_status(event)
          return ::OpenTelemetry::Trace::Status.ok unless error_event?(event)

          message = (event[:payload] || {})[:error_message] || 'Error'
          ::OpenTelemetry::Trace::Status.error(message)
        end

        # Start time plus :duration_ms when known, otherwise the current time.
        def end_timestamp(started_at, duration_ms)
          return Time.now unless duration_ms

          started_at + (duration_ms.to_r / 1000)
        end

        # Maps the event's :span_kind to the OTel constant (INTERNAL by default).
        def span_kind(event)
          case event[:span_kind]
          when :server then ::OpenTelemetry::Trace::SpanKind::SERVER
          when :client then ::OpenTelemetry::Trace::SpanKind::CLIENT
          when :producer then ::OpenTelemetry::Trace::SpanKind::PRODUCER
          when :consumer then ::OpenTelemetry::Trace::SpanKind::CONSUMER
          else ::OpenTelemetry::Trace::SpanKind::INTERNAL
          end
        end

        # Normalises an event timestamp to a Time. The OTel Ruby SDK converts
        # Time values to nanoseconds itself, so a pre-multiplied integer must
        # not be passed to start_span / finish.
        def time_to_timestamp(time)
          time.is_a?(String) ? Time.parse(time) : time
        end
      end
    end
  end
end
|
|
1092
|
+
```
|
|
1093
|
+
|
|
1094
|
+
### 6.2. Configuration
|
|
1095
|
+
|
|
1096
|
+
```ruby
|
|
1097
|
+
# config/initializers/e11y.rb
|
|
1098
|
+
E11y.configure do |config|
|
|
1099
|
+
config.opentelemetry do
|
|
1100
|
+
enabled true
|
|
1101
|
+
|
|
1102
|
+
# Automatic span creation
|
|
1103
|
+
create_spans_for do
|
|
1104
|
+
# Create spans for errors
|
|
1105
|
+
severity [:error, :fatal]
|
|
1106
|
+
|
|
1107
|
+
# Create spans for order processing
|
|
1108
|
+
pattern 'order.*'
|
|
1109
|
+
pattern 'payment.*'
|
|
1110
|
+
|
|
1111
|
+
# Span settings
|
|
1112
|
+
span_kind :internal
|
|
1113
|
+
span_name ->(event) { event.event_name }
|
|
1114
|
+
end
|
|
1115
|
+
end
|
|
1116
|
+
end
|
|
1117
|
+
```
|
|
1118
|
+
|
|
1119
|
+
---
|
|
1120
|
+
|
|
1121
|
+
## 7. Resource Attributes
|
|
1122
|
+
|
|
1123
|
+
### 7.1. Resource Attributes Configuration
|
|
1124
|
+
|
|
1125
|
+
```ruby
|
|
1126
|
+
# config/initializers/e11y.rb
|
|
1127
|
+
E11y.configure do |config|
|
|
1128
|
+
config.opentelemetry do
|
|
1129
|
+
resource_attributes do
|
|
1130
|
+
# Service (REQUIRED)
|
|
1131
|
+
'service.name' ENV['SERVICE_NAME'] || 'api'
|
|
1132
|
+
'service.version' ENV['GIT_SHA'] || 'unknown'
|
|
1133
|
+
'service.instance.id' ENV['HOSTNAME'] || Socket.gethostname
|
|
1134
|
+
|
|
1135
|
+
# Deployment
|
|
1136
|
+
'deployment.environment' Rails.env.to_s
|
|
1137
|
+
'deployment.region' ENV['AWS_REGION']
|
|
1138
|
+
|
|
1139
|
+
# Host
|
|
1140
|
+
'host.name' Socket.gethostname
|
|
1141
|
+
'host.type' ENV['INSTANCE_TYPE']
|
|
1142
|
+
|
|
1143
|
+
# Container
|
|
1144
|
+
'container.id' ENV['CONTAINER_ID']
|
|
1145
|
+
'container.name' ENV['CONTAINER_NAME']
|
|
1146
|
+
|
|
1147
|
+
# Kubernetes
|
|
1148
|
+
'k8s.namespace.name' ENV['K8S_NAMESPACE']
|
|
1149
|
+
'k8s.pod.name' ENV['K8S_POD_NAME']
|
|
1150
|
+
'k8s.deployment.name' ENV['K8S_DEPLOYMENT']
|
|
1151
|
+
end
|
|
1152
|
+
end
|
|
1153
|
+
end
|
|
1154
|
+
```
|
|
1155
|
+
|
|
1156
|
+
---
|
|
1157
|
+
|
|
1158
|
+
## 8. Trace Context Integration
|
|
1159
|
+
|
|
1160
|
+
### 8.1. OTel SDK Trace Context
|
|
1161
|
+
|
|
1162
|
+
```ruby
|
|
1163
|
+
# lib/e11y/trace_context/opentelemetry_source.rb
|
|
1164
|
+
module E11y
  module TraceContext
    # Trace-context source backed by the OpenTelemetry SDK's current span.
    class OpenTelemetrySource
      class << self
        # Reads trace identifiers from the current OTel span.
        #
        # @return [Hash] :trace_id, :span_id, :trace_flags (1 when sampled,
        #   else 0) and :trace_state; empty when the span context is invalid
        def extract
          # Use OTel SDK current span context
          context = ::OpenTelemetry::Trace.current_span.context

          if context.valid?
            {
              trace_id: context.hex_trace_id,
              span_id: context.hex_span_id,
              trace_flags: context.trace_flags.sampled? ? 1 : 0,
              trace_state: context.tracestate.to_s
            }
          else
            {}
          end
        end

        # Intentionally a no-op: the OTel SDK handles injection itself and
        # E11y only reads from it.
        #
        # @return [nil]
        def inject(trace_id:, span_id:, trace_flags: 1, **_options)
          nil
        end

        # Whether the OTel SDK is loaded and exposes a tracer provider.
        #
        # @return [Object, nil] the tracer provider (truthy) when available,
        #   nil when the OpenTelemetry constant is not defined
        def available?
          return unless defined?(::OpenTelemetry)

          ::OpenTelemetry.tracer_provider
        end
      end
    end
  end
end
|
|
1192
|
+
```
|
|
1193
|
+
|
|
1194
|
+
### 8.2. Configuration
|
|
1195
|
+
|
|
1196
|
+
```ruby
|
|
1197
|
+
# config/initializers/e11y.rb
|
|
1198
|
+
E11y.configure do |config|
|
|
1199
|
+
config.trace_context do
|
|
1200
|
+
# Use OTel SDK as primary source
|
|
1201
|
+
source :opentelemetry
|
|
1202
|
+
|
|
1203
|
+
# Fallback to E11y if OTel not available
|
|
1204
|
+
fallback_to_e11y true
|
|
1205
|
+
end
|
|
1206
|
+
end
|
|
1207
|
+
```
|
|
1208
|
+
|
|
1209
|
+
---
|
|
1210
|
+
|
|
1211
|
+
## 9. Cardinality Protection (C04 Resolution) ⚠️
|
|
1212
|
+
|
|
1213
|
+
**This section has been moved to [ADR-009 Cost Optimization §8: Cardinality Protection](ADR-009-cost-optimization.md#8-cardinality-protection-c04-resolution).**
|
|
1214
|
+
|
|
1215
|
+
**Rationale:** Cardinality explosion is a **cost optimization concern** affecting ALL backends (Yabeda/Prometheus, OpenTelemetry, Loki), not just OTLP. The unified solution is now documented in ADR-009.
|
|
1216
|
+
|
|
1217
|
+
**Summary:**
|
|
1218
|
+
- ✅ Unified cardinality protection for **all adapters** (Yabeda, OpenTelemetry, Loki)
|
|
1219
|
+
- ✅ Single config: `E11y.config.cardinality_protection` applies globally
|
|
1220
|
+
- ✅ Per-backend overrides: `inherit_from :global` or custom limits
|
|
1221
|
+
- ✅ 90% cost reduction for high-cardinality OTLP attributes
|
|
1222
|
+
|
|
1223
|
+
See [ADR-009 §8](ADR-009-cost-optimization.md#8-cardinality-protection-c04-resolution) for full implementation details.
|
|
1224
|
+
|
|
1225
|
+
---
|
|
1226
|
+
|
|
1227
|
+
## 10. Testing Strategy
|
|
1228
|
+
|
|
1229
|
+
### 10.1. OTel Adapter Tests
|
|
1230
|
+
|
|
1231
|
+
```ruby
|
|
1232
|
+
# spec/e11y/adapters/opentelemetry_collector_spec.rb
|
|
1233
|
+
RSpec.describe E11y::Adapters::OpenTelemetryCollector do
  let(:logs_url) { 'http://localhost:4318/v1/logs' }

  let(:adapter) do
    described_class.new(
      endpoint: 'http://localhost:4318',
      export_logs: true,
      export_traces: false
    )
  end

  # Log records of the first scope of the first resource in an OTLP body.
  def log_records(request)
    JSON.parse(request.body)['resourceLogs'][0]['scopeLogs'][0]['logRecords']
  end

  describe '#send_batch' do
    it 'exports events as OTel Logs Signal' do
      stub_request(:post, logs_url).to_return(status: 200, body: '{}')

      order_created = {
        timestamp: Time.now.iso8601,
        event_name: 'Events::OrderCreated',
        severity: :info,
        payload: { order_id: '123' },
        trace_id: 'abc123',
        span_id: 'def456'
      }

      result = adapter.send_batch([order_created])

      expect(result[:success]).to be true
      expect(result[:sent]).to eq(1)

      # Verify OTLP format
      expect(WebMock).to have_requested(:post, logs_url)
        .with { |req|
          expect(JSON.parse(req.body)['resourceLogs']).to be_present
          expect(log_records(req).size).to eq(1)
        }
    end

    it 'applies semantic conventions' do
      stub_request(:post, logs_url).to_return(status: 200)

      http_request = {
        timestamp: Time.now.iso8601,
        event_name: 'Events::HttpRequest',
        severity: :info,
        payload: { method: 'POST', status_code: 201 }
      }

      adapter.send_batch([http_request])

      expect(WebMock).to have_requested(:post, logs_url)
        .with { |req|
          # Check semantic conventions mapping
          expect(log_records(req)[0]['attributes']).to include(
            { 'key' => 'http.method', 'value' => { 'stringValue' => 'POST' } },
            { 'key' => 'http.status_code', 'value' => { 'intValue' => 201 } }
          )
        }
    end
  end
end
|
|
1301
|
+
```
|
|
1302
|
+
|
|
1303
|
+
### 10.2. Semantic Conventions Tests
|
|
1304
|
+
|
|
1305
|
+
```ruby
|
|
1306
|
+
# spec/e11y/opentelemetry/semantic_conventions_spec.rb
|
|
1307
|
+
RSpec.describe E11y::OpenTelemetry::SemanticConventions do
  describe '.map' do
    it 'maps HTTP fields to OTel conventions' do
      result = described_class.map(
        'Events::HttpRequest',
        { method: 'POST', status_code: 201, duration_ms: 45.2 }
      )

      # No 'http.scheme' here: the mapper defines no system attributes for
      # the HTTP family, so only the renamed payload fields are returned.
      expect(result).to eq(
        'http.method' => 'POST',
        'http.status_code' => 201,
        'http.server.duration' => 45.2
      )
    end

    it 'maps database fields to OTel conventions' do
      result = described_class.map(
        'Events::DatabaseQuery',
        { query: 'SELECT * FROM orders', duration_ms: 12.5 }
      )

      expect(result).to eq(
        'db.statement' => 'SELECT * FROM orders',
        'db.operation.duration' => 12.5,
        'db.system' => 'postgresql' # System attribute
      )
    end

    it 'preserves unmapped fields' do
      result = described_class.map(
        'Events::CustomEvent',
        { custom_field: 'value' }
      )

      expect(result).to eq(
        'custom_field' => 'value'
      )
    end
  end
end
|
|
1351
|
+
```
|
|
1352
|
+
|
|
1353
|
+
---
|
|
1354
|
+
|
|
1355
|
+
## 11. Trade-offs
|
|
1356
|
+
|
|
1357
|
+
### 11.1. Key Decisions
|
|
1358
|
+
|
|
1359
|
+
| Decision | Pro | Con | Rationale |
|
|
1360
|
+
|----------|-----|-----|-----------|
|
|
1361
|
+
| **Optional v1.1+** | No breaking changes | Later adoption | Rails 8+ ecosystem first |
|
|
1362
|
+
| **OTel Collector required** | Advanced features | Extra component | Industry standard |
|
|
1363
|
+
| **Logs Signal primary** | Best fit for events | Not traces-first | E11y is event-focused |
|
|
1364
|
+
| **Yabeda for metrics (C03)** ⚠️ | Better Rails integration | Separate from OTLP | Yabeda is superior for Rails |
|
|
1365
|
+
| **HTTP OTLP only** | Simple, universal | No gRPC (v1) | HTTP covers ~95% of use cases |
|
|
1366
|
+
|
|
1367
|
+
### 11.2. Alternatives Considered
|
|
1368
|
+
|
|
1369
|
+
**A) Direct OTel SDK Integration**
|
|
1370
|
+
- ❌ Rejected: Too complex for v1.0, optional for v1.1+
|
|
1371
|
+
|
|
1372
|
+
**B) Replace All Adapters with OTel**
|
|
1373
|
+
- ❌ Rejected: Breaks existing users; OTel is an enhancement
|
|
1374
|
+
|
|
1375
|
+
**C) Metrics Signal Export**
|
|
1376
|
+
- ❌ Rejected: Yabeda is better for Rails metrics
|
|
1377
|
+
|
|
1378
|
+
**D) gRPC OTLP Support**
|
|
1379
|
+
- ⏳ Deferred: v1.2+ (HTTP is sufficient for v1.1)
|
|
1380
|
+
|
|
1381
|
+
---
|
|
1382
|
+
|
|
1383
|
+
**Status:** ✅ Draft Complete
|
|
1384
|
+
**Next:** Implementation (v1.1 release)
|
|
1385
|
+
**Estimated Implementation:** 2 weeks
|