e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,1372 @@
|
|
|
1
|
+
# ADR-005: Tracing & Context Management
|
|
2
|
+
|
|
3
|
+
**Status:** Draft
|
|
4
|
+
**Date:** January 12, 2026
|
|
5
|
+
**Covers:** UC-006 (Trace Context Management), UC-009 (Multi-Service Tracing)
|
|
6
|
+
**Depends On:** ADR-001 (Core), ADR-008 (Rails Integration)
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## 📋 Table of Contents
|
|
11
|
+
|
|
12
|
+
1. [Context & Problem](#1-context--problem)
|
|
13
|
+
2. [Architecture Overview](#2-architecture-overview)
|
|
14
|
+
3. [Current (Thread-Local Storage)](#3-current-thread-local-storage)
|
|
15
|
+
4. [Trace ID Generation](#4-trace-id-generation)
|
|
16
|
+
5. [W3C Trace Context](#5-w3c-trace-context)
|
|
17
|
+
6. [Context Propagation](#6-context-propagation)
|
|
18
|
+
7. [Sampling Decisions](#7-sampling-decisions)
|
|
19
|
+
8. [Context Inheritance](#8-context-inheritance)
|
|
20
|
+
- 8.3. [Background Job Tracing Strategy (C17 Resolution)](#83-background-job-tracing-strategy-c17-resolution) ⚠️ CRITICAL
|
|
21
|
+
- 8.3.1. The Problem: Unbounded Traces
|
|
22
|
+
- 8.3.2. Decision: Hybrid Model (New Trace + Parent Link)
|
|
23
|
+
- 8.3.3. SidekiqTraceMiddleware Implementation
|
|
24
|
+
- 8.3.4. Configuration
|
|
25
|
+
- 8.3.5. Querying Full Flow (Request → Job)
|
|
26
|
+
- 8.3.6. Schema Changes
|
|
27
|
+
- 8.3.7. Trade-offs (C17 Resolution)
|
|
28
|
+
9. [Trade-offs](#9-trade-offs)
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 1. Context & Problem
|
|
33
|
+
|
|
34
|
+
### 1.1. Problem Statement
|
|
35
|
+
|
|
36
|
+
**Current Pain Points:**
|
|
37
|
+
|
|
38
|
+
1. **No Trace Correlation:**
|
|
39
|
+
```ruby
|
|
40
|
+
# ❌ Can't correlate events across requests
|
|
41
|
+
Events::OrderCreated.track(order_id: 123)
|
|
42
|
+
# Later in Sidekiq:
|
|
43
|
+
Events::EmailSent.track(order_id: 123)
|
|
44
|
+
# → No way to link these two events
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
2. **Lost Context in Background Jobs:**
|
|
48
|
+
```ruby
|
|
49
|
+
# ❌ User context lost when job executes
|
|
50
|
+
def create
|
|
51
|
+
@current_user = User.find(params[:user_id])
|
|
52
|
+
SendEmailJob.perform_later(order_id: 123)
|
|
53
|
+
# Job has no idea about @current_user
|
|
54
|
+
end
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
3. **No Cross-Service Tracing:**
|
|
58
|
+
```ruby
|
|
59
|
+
# ❌ Can't trace requests across microservices
|
|
60
|
+
response = HTTP.get("https://payment-service/charge")
|
|
61
|
+
# Payment service has no idea this is part of the same trace
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
4. **Manual Context Passing:**
|
|
65
|
+
```ruby
|
|
66
|
+
# ❌ Must manually pass context everywhere
|
|
67
|
+
def process_order(order_id, trace_id:, user_id:, request_id:)
|
|
68
|
+
Events::OrderProcessing.track(
|
|
69
|
+
order_id: order_id,
|
|
70
|
+
trace_id: trace_id,
|
|
71
|
+
user_id: user_id,
|
|
72
|
+
request_id: request_id
|
|
73
|
+
)
|
|
74
|
+
end
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 1.2. Goals
|
|
78
|
+
|
|
79
|
+
**Primary Goals:**
|
|
80
|
+
- ✅ **Automatic trace ID generation** for every request/job
|
|
81
|
+
- ✅ **W3C Trace Context** standard support
|
|
82
|
+
- ✅ **Thread-safe context storage** (`ActiveSupport::CurrentAttributes` pattern)
|
|
83
|
+
- ✅ **Context propagation** to background jobs
|
|
84
|
+
- ✅ **Cross-service tracing** via HTTP headers
|
|
85
|
+
- ✅ **Sampling consistency** across distributed traces
|
|
86
|
+
|
|
87
|
+
**Non-Goals:**
|
|
88
|
+
- ❌ Full OpenTelemetry SDK (see ADR-007)
|
|
89
|
+
- ❌ Automatic span creation (manual spans only)
|
|
90
|
+
- ❌ Distributed transactions
|
|
91
|
+
|
|
92
|
+
### 1.3. Success Metrics
|
|
93
|
+
|
|
94
|
+
| Metric | Target | Critical? |
|
|
95
|
+
|--------|--------|-----------|
|
|
96
|
+
| **Context lookup overhead** | <100ns p99 | ✅ Yes |
|
|
97
|
+
| **Trace ID collision rate** | <1 in 10^15 | ✅ Yes |
|
|
98
|
+
| **Context propagation rate** | >99.9% | ✅ Yes |
|
|
99
|
+
| **Cross-service trace coverage** | >95% | ✅ Yes |
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## 2. Architecture Overview
|
|
104
|
+
|
|
105
|
+
### 2.1. System Context
|
|
106
|
+
|
|
107
|
+
```mermaid
|
|
108
|
+
C4Context
|
|
109
|
+
title Tracing & Context Management Context
|
|
110
|
+
|
|
111
|
+
Person(user, "User", "Makes HTTP request")
|
|
112
|
+
|
|
113
|
+
System(rails_app, "Rails App", "Web application")
|
|
114
|
+
|
|
115
|
+
System(e11y, "E11y Gem", "Context management")
|
|
116
|
+
|
|
117
|
+
System_Ext(sidekiq, "Sidekiq", "Background jobs")
|
|
118
|
+
System_Ext(payment_service, "Payment Service", "Microservice")
|
|
119
|
+
System_Ext(frontend, "Frontend", "SPA/Mobile")
|
|
120
|
+
|
|
121
|
+
Rel(user, frontend, "Uses")
|
|
122
|
+
Rel(frontend, rails_app, "HTTP request", "traceparent header")
|
|
123
|
+
Rel(rails_app, e11y, "Auto trace context", "E11y::Current")
|
|
124
|
+
Rel(e11y, sidekiq, "Propagate context", "Job metadata")
|
|
125
|
+
Rel(rails_app, payment_service, "HTTP request", "traceparent header")
|
|
126
|
+
Rel(payment_service, rails_app, "Uses E11y", "Same trace_id")
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 2.2. Component Architecture
|
|
130
|
+
|
|
131
|
+
```mermaid
|
|
132
|
+
graph TB
|
|
133
|
+
subgraph "Request Entry Points"
|
|
134
|
+
HTTPRequest[HTTP Request] --> Middleware[Rack Middleware]
|
|
135
|
+
JobExecution[Job Execution] --> JobMiddleware[Sidekiq/ActiveJob MW]
|
|
136
|
+
ManualAPI[Manual API Call] --> CurrentAPI[E11y::Current.set]
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
subgraph "Context Storage (Thread-Local)"
|
|
140
|
+
Middleware --> Current[E11y::Current]
|
|
141
|
+
JobMiddleware --> Current
|
|
142
|
+
CurrentAPI --> Current
|
|
143
|
+
|
|
144
|
+
Current --> TraceID[trace_id]
|
|
145
|
+
Current --> SpanID[span_id]
|
|
146
|
+
Current --> ParentSpanID[parent_span_id]
|
|
147
|
+
Current --> UserID[user_id]
|
|
148
|
+
Current --> RequestID[request_id]
|
|
149
|
+
Current --> Sampled[sampled]
|
|
150
|
+
Current --> Baggage[baggage]
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
subgraph "Trace ID Management"
|
|
154
|
+
Current --> TraceGenerator[Trace ID Generator]
|
|
155
|
+
TraceGenerator --> W3CParser[W3C Trace Context Parser]
|
|
156
|
+
TraceGenerator --> UUIDGen[UUID Generator]
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
subgraph "Context Propagation"
|
|
160
|
+
Current --> HTTPPropagator[HTTP Header Propagator]
|
|
161
|
+
Current --> JobPropagator[Job Metadata Propagator]
|
|
162
|
+
Current --> LogPropagator[Structured Log Propagator]
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
subgraph "Event Pipeline"
|
|
166
|
+
Current --> EventTracking[Event Tracking]
|
|
167
|
+
EventTracking --> EnrichWithContext[Auto-enrich with Context]
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
style Current fill:#d1ecf1
|
|
171
|
+
style TraceGenerator fill:#fff3cd
|
|
172
|
+
style HTTPPropagator fill:#d4edda
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### 2.3. Context Lifecycle
|
|
176
|
+
|
|
177
|
+
```mermaid
|
|
178
|
+
sequenceDiagram
|
|
179
|
+
participant Browser as Browser
|
|
180
|
+
participant Rack as Rack Middleware
|
|
181
|
+
participant Current as E11y::Current
|
|
182
|
+
participant Controller as Rails Controller
|
|
183
|
+
participant Sidekiq as Sidekiq Job
|
|
184
|
+
participant Event as Event Tracking
|
|
185
|
+
|
|
186
|
+
Note over Browser: User makes request
|
|
187
|
+
Browser->>Rack: GET /orders<br/>traceparent: 00-abc123...
|
|
188
|
+
|
|
189
|
+
Rack->>Current: Extract or generate trace_id
|
|
190
|
+
Current->>Current: Store in thread-local
|
|
191
|
+
|
|
192
|
+
Note over Current: trace_id: abc123<br/>span_id: def456<br/>user_id: 42
|
|
193
|
+
|
|
194
|
+
Rack->>Controller: Process request
|
|
195
|
+
|
|
196
|
+
Controller->>Event: Track event
|
|
197
|
+
Event->>Current: Read context
|
|
198
|
+
Current-->>Event: Return trace_id, user_id, etc
|
|
199
|
+
Event->>Event: Enrich event with context
|
|
200
|
+
|
|
201
|
+
Controller->>Sidekiq: Enqueue job
|
|
202
|
+
Sidekiq->>Current: Read trace_id
|
|
203
|
+
Current-->>Sidekiq: abc123
|
|
204
|
+
Sidekiq->>Sidekiq: Store in job metadata
|
|
205
|
+
|
|
206
|
+
Note over Sidekiq: Later... job executes
|
|
207
|
+
|
|
208
|
+
Sidekiq->>Current: Restore trace_id from metadata
|
|
209
|
+
Current->>Current: Store in NEW thread-local
|
|
210
|
+
|
|
211
|
+
Note over Current: Same trace_id: abc123<br/>New span_id: ghi789<br/>parent_span_id: def456
|
|
212
|
+
|
|
213
|
+
Sidekiq->>Event: Track job event
|
|
214
|
+
Event->>Current: Read context
|
|
215
|
+
Current-->>Event: Same trace_id!
|
|
216
|
+
|
|
217
|
+
Rack->>Current: Reset context
|
|
218
|
+
Current->>Current: Clear thread-local
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## 3. Current (Thread-Local Storage)
|
|
224
|
+
|
|
225
|
+
### 3.1. E11y::Current Implementation
|
|
226
|
+
|
|
227
|
+
**Design Decision:** Use `ActiveSupport::CurrentAttributes` for thread-safe storage.
|
|
228
|
+
|
|
229
|
+
```ruby
|
|
230
|
+
# lib/e11y/current.rb
|
|
231
|
+
module E11y
|
|
232
|
+
class Current < ActiveSupport::CurrentAttributes
|
|
233
|
+
# Core trace attributes
|
|
234
|
+
attribute :trace_id
|
|
235
|
+
attribute :span_id
|
|
236
|
+
attribute :parent_span_id
|
|
237
|
+
|
|
238
|
+
# Request/job attributes
|
|
239
|
+
attribute :request_id
|
|
240
|
+
attribute :job_id
|
|
241
|
+
attribute :job_class
|
|
242
|
+
|
|
243
|
+
# User/tenant attributes
|
|
244
|
+
attribute :user_id
|
|
245
|
+
attribute :tenant_id
|
|
246
|
+
attribute :organization_id
|
|
247
|
+
|
|
248
|
+
# Sampling decision
|
|
249
|
+
attribute :sampled
|
|
250
|
+
|
|
251
|
+
# Custom baggage (key-value pairs)
|
|
252
|
+
attribute :baggage
|
|
253
|
+
|
|
254
|
+
# IP and user agent (for security/audit)
|
|
255
|
+
attribute :ip_address
|
|
256
|
+
attribute :user_agent
|
|
257
|
+
|
|
258
|
+
# Delegation methods for convenience
|
|
259
|
+
class << self
|
|
260
|
+
# Set multiple attributes at once
|
|
261
|
+
def set(attributes = {})
|
|
262
|
+
attributes.each do |key, value|
|
|
263
|
+
public_send(:"#{key}=", value)
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
# Get all current attributes as hash
|
|
268
|
+
def to_h
|
|
269
|
+
{
|
|
270
|
+
trace_id: trace_id,
|
|
271
|
+
span_id: span_id,
|
|
272
|
+
parent_span_id: parent_span_id,
|
|
273
|
+
request_id: request_id,
|
|
274
|
+
job_id: job_id,
|
|
275
|
+
job_class: job_class,
|
|
276
|
+
user_id: user_id,
|
|
277
|
+
tenant_id: tenant_id,
|
|
278
|
+
organization_id: organization_id,
|
|
279
|
+
sampled: sampled,
|
|
280
|
+
baggage: baggage,
|
|
281
|
+
ip_address: ip_address,
|
|
282
|
+
user_agent: user_agent
|
|
283
|
+
}.compact
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
# Check if we're in a traced context
|
|
287
|
+
def traced?
|
|
288
|
+
trace_id.present?
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
# Create a new span (child of current span)
|
|
292
|
+
def create_span(name)
|
|
293
|
+
new_span_id = E11y::TraceContext.generate_span_id
|
|
294
|
+
|
|
295
|
+
span = Span.new(
|
|
296
|
+
trace_id: trace_id || E11y::TraceContext.generate_id,
|
|
297
|
+
span_id: new_span_id,
|
|
298
|
+
parent_span_id: span_id,
|
|
299
|
+
name: name,
|
|
300
|
+
started_at: Time.now
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
# Update current span_id
|
|
304
|
+
self.span_id = new_span_id
|
|
305
|
+
|
|
306
|
+
span
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
# Add baggage (key-value metadata)
|
|
310
|
+
def add_baggage(key, value)
|
|
311
|
+
self.baggage ||= {}
|
|
312
|
+
self.baggage[key.to_s] = value.to_s
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
# Get baggage value
|
|
316
|
+
def get_baggage(key)
|
|
317
|
+
baggage&.[](key.to_s)
|
|
318
|
+
end
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
# Span helper class
|
|
322
|
+
class Span
|
|
323
|
+
attr_reader :trace_id, :span_id, :parent_span_id, :name, :started_at
|
|
324
|
+
attr_accessor :finished_at, :status, :attributes
|
|
325
|
+
|
|
326
|
+
def initialize(trace_id:, span_id:, parent_span_id:, name:, started_at:)
|
|
327
|
+
@trace_id = trace_id
|
|
328
|
+
@span_id = span_id
|
|
329
|
+
@parent_span_id = parent_span_id
|
|
330
|
+
@name = name
|
|
331
|
+
@started_at = started_at
|
|
332
|
+
@attributes = {}
|
|
333
|
+
end
|
|
334
|
+
|
|
335
|
+
def finish(status: :ok)
|
|
336
|
+
@finished_at = Time.now
|
|
337
|
+
@status = status
|
|
338
|
+
|
|
339
|
+
duration = (@finished_at - @started_at) * 1000 # ms
|
|
340
|
+
|
|
341
|
+
# Track span as event
|
|
342
|
+
Events::Span.track(
|
|
343
|
+
span_name: @name,
|
|
344
|
+
trace_id: @trace_id,
|
|
345
|
+
span_id: @span_id,
|
|
346
|
+
parent_span_id: @parent_span_id,
|
|
347
|
+
duration: duration,
|
|
348
|
+
status: status,
|
|
349
|
+
attributes: @attributes
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
self
|
|
353
|
+
end
|
|
354
|
+
|
|
355
|
+
def add_attribute(key, value)
|
|
356
|
+
@attributes[key] = value
|
|
357
|
+
end
|
|
358
|
+
end
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
### 3.2. Usage Examples
|
|
364
|
+
|
|
365
|
+
```ruby
|
|
366
|
+
# Read current context
|
|
367
|
+
E11y::Current.trace_id # => "abc123..."
|
|
368
|
+
E11y::Current.user_id # => 42
|
|
369
|
+
|
|
370
|
+
# Set context manually
|
|
371
|
+
E11y::Current.set(
|
|
372
|
+
trace_id: 'custom-trace-123',
|
|
373
|
+
user_id: 99,
|
|
374
|
+
tenant_id: 'acme-corp'
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
# Check if traced
|
|
378
|
+
if E11y::Current.traced?
|
|
379
|
+
puts "We're in a trace!"
|
|
380
|
+
end
|
|
381
|
+
|
|
382
|
+
# Get all context
|
|
383
|
+
E11y::Current.to_h
|
|
384
|
+
# => { trace_id: "abc...", user_id: 42, ... }
|
|
385
|
+
|
|
386
|
+
# Create a manual span
|
|
387
|
+
span = E11y::Current.create_span('database_query')
|
|
388
|
+
span.add_attribute('table', 'orders')
|
|
389
|
+
# ... do work ...
|
|
390
|
+
span.finish(status: :ok)
|
|
391
|
+
|
|
392
|
+
# Baggage (propagated metadata)
|
|
393
|
+
E11y::Current.add_baggage('experiment_id', 'exp-42')
|
|
394
|
+
E11y::Current.get_baggage('experiment_id') # => "exp-42"
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
---
|
|
398
|
+
|
|
399
|
+
## 4. Trace ID Generation
|
|
400
|
+
|
|
401
|
+
### 4.1. Trace ID Generator
|
|
402
|
+
|
|
403
|
+
```ruby
|
|
404
|
+
# lib/e11y/trace_context/id_generator.rb
|
|
405
|
+
module E11y
|
|
406
|
+
module TraceContext
|
|
407
|
+
module IDGenerator
|
|
408
|
+
# W3C Trace Context format:
|
|
409
|
+
# trace-id: 32 hex chars (128 bits)
|
|
410
|
+
# span-id: 16 hex chars (64 bits)
|
|
411
|
+
|
|
412
|
+
# Generate trace ID (128 bits = 16 bytes)
|
|
413
|
+
def self.generate_trace_id
|
|
414
|
+
SecureRandom.hex(16) # 32 hex chars
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
# Generate span ID (64 bits = 8 bytes)
|
|
418
|
+
def self.generate_span_id
|
|
419
|
+
SecureRandom.hex(8) # 16 hex chars
|
|
420
|
+
end
|
|
421
|
+
|
|
422
|
+
# Validate trace ID format
|
|
423
|
+
def self.valid_trace_id?(trace_id)
|
|
424
|
+
trace_id.is_a?(String) &&
|
|
425
|
+
trace_id.match?(/\A[0-9a-f]{32}\z/) &&
|
|
426
|
+
trace_id != '00000000000000000000000000000000' # Not all zeros
|
|
427
|
+
end
|
|
428
|
+
|
|
429
|
+
# Validate span ID format
|
|
430
|
+
def self.valid_span_id?(span_id)
|
|
431
|
+
span_id.is_a?(String) &&
|
|
432
|
+
span_id.match?(/\A[0-9a-f]{16}\z/) &&
|
|
433
|
+
span_id != '0000000000000000' # Not all zeros
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
# Convert UUID to trace ID (for compatibility)
|
|
437
|
+
def self.uuid_to_trace_id(uuid)
|
|
438
|
+
uuid.delete('-').downcase
|
|
439
|
+
end
|
|
440
|
+
end
|
|
441
|
+
|
|
442
|
+
# Convenience methods
|
|
443
|
+
def self.generate_id
|
|
444
|
+
IDGenerator.generate_trace_id
|
|
445
|
+
end
|
|
446
|
+
|
|
447
|
+
def self.generate_span_id
|
|
448
|
+
IDGenerator.generate_span_id
|
|
449
|
+
end
|
|
450
|
+
end
|
|
451
|
+
end
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
---
|
|
455
|
+
|
|
456
|
+
## 5. W3C Trace Context
|
|
457
|
+
|
|
458
|
+
### 5.1. W3C Trace Context Standard
|
|
459
|
+
|
|
460
|
+
**Format:** `traceparent: 00-{trace-id}-{parent-id}-{trace-flags}`
|
|
461
|
+
|
|
462
|
+
**Example:** `traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01`
|
|
463
|
+
|
|
464
|
+
- `00` = version
|
|
465
|
+
- `0af7651916cd43dd8448eb211c80319c` = trace-id (32 hex chars)
|
|
466
|
+
- `b7ad6b7169203331` = parent-id (16 hex chars)
|
|
467
|
+
- `01` = trace-flags (01 = sampled)
|
|
468
|
+
|
|
469
|
+
### 5.2. W3C Parser & Generator
|
|
470
|
+
|
|
471
|
+
```ruby
|
|
472
|
+
# lib/e11y/trace_context/w3c.rb
|
|
473
|
+
module E11y
|
|
474
|
+
module TraceContext
|
|
475
|
+
module W3C
|
|
476
|
+
VERSION = '00'
|
|
477
|
+
|
|
478
|
+
# Parse W3C traceparent header
|
|
479
|
+
def self.parse_traceparent(header)
|
|
480
|
+
return nil unless header.is_a?(String)
|
|
481
|
+
|
|
482
|
+
parts = header.split('-')
|
|
483
|
+
return nil unless parts.size == 4
|
|
484
|
+
|
|
485
|
+
version, trace_id, parent_id, flags = parts
|
|
486
|
+
|
|
487
|
+
# Validate format
|
|
488
|
+
return nil unless version == VERSION
|
|
489
|
+
return nil unless IDGenerator.valid_trace_id?(trace_id)
|
|
490
|
+
return nil unless IDGenerator.valid_span_id?(parent_id)
|
|
491
|
+
|
|
492
|
+
sampled = (flags.to_i(16) & 0x01) == 1
|
|
493
|
+
|
|
494
|
+
{
|
|
495
|
+
version: version,
|
|
496
|
+
trace_id: trace_id,
|
|
497
|
+
parent_span_id: parent_id,
|
|
498
|
+
sampled: sampled,
|
|
499
|
+
flags: flags
|
|
500
|
+
}
|
|
501
|
+
end
|
|
502
|
+
|
|
503
|
+
# Generate W3C traceparent header
|
|
504
|
+
def self.generate_traceparent(trace_id:, span_id:, sampled: true)
|
|
505
|
+
flags = sampled ? '01' : '00'
|
|
506
|
+
|
|
507
|
+
"#{VERSION}-#{trace_id}-#{span_id}-#{flags}"
|
|
508
|
+
end
|
|
509
|
+
|
|
510
|
+
# Parse W3C tracestate header (optional)
|
|
511
|
+
def self.parse_tracestate(header)
|
|
512
|
+
return {} unless header.is_a?(String)
|
|
513
|
+
|
|
514
|
+
header.split(',').each_with_object({}) do |entry, hash|
|
|
515
|
+
key, value = entry.split('=', 2)
|
|
516
|
+
hash[key.strip] = value.strip if key && value
|
|
517
|
+
end
|
|
518
|
+
end
|
|
519
|
+
|
|
520
|
+
# Generate W3C tracestate header
|
|
521
|
+
def self.generate_tracestate(state_hash)
|
|
522
|
+
state_hash.map { |k, v| "#{k}=#{v}" }.join(',')
|
|
523
|
+
end
|
|
524
|
+
end
|
|
525
|
+
end
|
|
526
|
+
end
|
|
527
|
+
```
|
|
528
|
+
|
|
529
|
+
### 5.3. HTTP Header Extraction
|
|
530
|
+
|
|
531
|
+
```ruby
|
|
532
|
+
# lib/e11y/trace_context/http_extractor.rb
|
|
533
|
+
module E11y
|
|
534
|
+
module TraceContext
|
|
535
|
+
class HTTPExtractor
|
|
536
|
+
TRACEPARENT_HEADER = 'HTTP_TRACEPARENT'
|
|
537
|
+
TRACESTATE_HEADER = 'HTTP_TRACESTATE'
|
|
538
|
+
|
|
539
|
+
# Legacy headers (fallback)
|
|
540
|
+
X_REQUEST_ID = 'HTTP_X_REQUEST_ID'
|
|
541
|
+
X_TRACE_ID = 'HTTP_X_TRACE_ID'
|
|
542
|
+
X_CORRELATION_ID = 'HTTP_X_CORRELATION_ID'
|
|
543
|
+
|
|
544
|
+
def self.extract(env)
|
|
545
|
+
# Try W3C Trace Context first
|
|
546
|
+
if env[TRACEPARENT_HEADER]
|
|
547
|
+
extract_w3c(env)
|
|
548
|
+
# Fallback to legacy headers
|
|
549
|
+
elsif env[X_TRACE_ID]
|
|
550
|
+
extract_legacy(env)
|
|
551
|
+
else
|
|
552
|
+
# No trace context → generate new
|
|
553
|
+
generate_new
|
|
554
|
+
end
|
|
555
|
+
end
|
|
556
|
+
|
|
557
|
+
private
|
|
558
|
+
|
|
559
|
+
def self.extract_w3c(env)
|
|
560
|
+
context = W3C.parse_traceparent(env[TRACEPARENT_HEADER])
|
|
561
|
+
return generate_new unless context
|
|
562
|
+
|
|
563
|
+
tracestate = W3C.parse_tracestate(env[TRACESTATE_HEADER])
|
|
564
|
+
|
|
565
|
+
{
|
|
566
|
+
trace_id: context[:trace_id],
|
|
567
|
+
parent_span_id: context[:parent_span_id],
|
|
568
|
+
span_id: IDGenerator.generate_span_id, # New span for this service
|
|
569
|
+
sampled: context[:sampled],
|
|
570
|
+
tracestate: tracestate,
|
|
571
|
+
format: :w3c
|
|
572
|
+
}
|
|
573
|
+
end
|
|
574
|
+
|
|
575
|
+
def self.extract_legacy(env)
|
|
576
|
+
trace_id = env[X_TRACE_ID] ||
|
|
577
|
+
env[X_REQUEST_ID] ||
|
|
578
|
+
env[X_CORRELATION_ID]
|
|
579
|
+
|
|
580
|
+
# Convert to W3C format if needed
|
|
581
|
+
trace_id = normalize_trace_id(trace_id)
|
|
582
|
+
|
|
583
|
+
{
|
|
584
|
+
trace_id: trace_id,
|
|
585
|
+
span_id: IDGenerator.generate_span_id,
|
|
586
|
+
sampled: true, # Assume sampled for legacy
|
|
587
|
+
format: :legacy
|
|
588
|
+
}
|
|
589
|
+
end
|
|
590
|
+
|
|
591
|
+
def self.generate_new
|
|
592
|
+
{
|
|
593
|
+
trace_id: IDGenerator.generate_trace_id,
|
|
594
|
+
span_id: IDGenerator.generate_span_id,
|
|
595
|
+
sampled: true,
|
|
596
|
+
format: :new
|
|
597
|
+
}
|
|
598
|
+
end
|
|
599
|
+
|
|
600
|
+
def self.normalize_trace_id(trace_id)
|
|
601
|
+
# If UUID format, convert to W3C
|
|
602
|
+
if trace_id.include?('-') && trace_id.length == 36
|
|
603
|
+
IDGenerator.uuid_to_trace_id(trace_id)
|
|
604
|
+
# If already 32 hex chars, use as-is
|
|
605
|
+
elsif trace_id.match?(/\A[0-9a-f]{32}\z/i)
|
|
606
|
+
trace_id.downcase
|
|
607
|
+
# Otherwise, hash it to 32 hex chars
|
|
608
|
+
else
|
|
609
|
+
Digest::SHA256.hexdigest(trace_id)[0...32]
|
|
610
|
+
end
|
|
611
|
+
end
|
|
612
|
+
end
|
|
613
|
+
end
|
|
614
|
+
end
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
---
|
|
618
|
+
|
|
619
|
+
## 6. Context Propagation
|
|
620
|
+
|
|
621
|
+
### 6.1. HTTP Propagator (Outgoing Requests)
|
|
622
|
+
|
|
623
|
+
```ruby
|
|
624
|
+
# lib/e11y/trace_context/http_propagator.rb
|
|
625
|
+
module E11y
|
|
626
|
+
module TraceContext
|
|
627
|
+
class HTTPPropagator
|
|
628
|
+
# Inject trace context into HTTP headers
|
|
629
|
+
def self.inject(headers = {})
|
|
630
|
+
return headers unless E11y::Current.traced?
|
|
631
|
+
|
|
632
|
+
trace_id = E11y::Current.trace_id
|
|
633
|
+
span_id = E11y::Current.span_id
|
|
634
|
+
sampled = E11y::Current.sampled
|
|
635
|
+
|
|
636
|
+
# W3C Trace Context
|
|
637
|
+
headers['traceparent'] = W3C.generate_traceparent(
|
|
638
|
+
trace_id: trace_id,
|
|
639
|
+
span_id: span_id,
|
|
640
|
+
sampled: sampled
|
|
641
|
+
)
|
|
642
|
+
|
|
643
|
+
# Add tracestate if baggage present
|
|
644
|
+
if E11y::Current.baggage&.any?
|
|
645
|
+
headers['tracestate'] = W3C.generate_tracestate(
|
|
646
|
+
E11y::Current.baggage
|
|
647
|
+
)
|
|
648
|
+
end
|
|
649
|
+
|
|
650
|
+
# Legacy headers (for backwards compatibility)
|
|
651
|
+
headers['X-Request-ID'] = E11y::Current.request_id if E11y::Current.request_id
|
|
652
|
+
headers['X-Trace-ID'] = trace_id
|
|
653
|
+
|
|
654
|
+
headers
|
|
655
|
+
end
|
|
656
|
+
|
|
657
|
+
# Helper for common HTTP clients
|
|
658
|
+
def self.wrap_faraday(conn)
|
|
659
|
+
conn.use :instrumentation do |faraday|
|
|
660
|
+
faraday.request :headers do |req|
|
|
661
|
+
inject(req.headers)
|
|
662
|
+
end
|
|
663
|
+
end
|
|
664
|
+
end
|
|
665
|
+
|
|
666
|
+
def self.wrap_http_rb(http)
|
|
667
|
+
headers = inject
|
|
668
|
+
headers.each { |k, v| http = http.headers(k => v) }
|
|
669
|
+
http
|
|
670
|
+
end
|
|
671
|
+
end
|
|
672
|
+
end
|
|
673
|
+
end
|
|
674
|
+
```
|
|
675
|
+
|
|
676
|
+
### 6.2. Job Propagator (Sidekiq/ActiveJob)
|
|
677
|
+
|
|
678
|
+
Job propagation is already covered in ADR-008 (Rails Integration); the core logic is shown here:
|
|
679
|
+
|
|
680
|
+
```ruby
|
|
681
|
+
# lib/e11y/trace_context/job_propagator.rb
|
|
682
|
+
module E11y
  module TraceContext
    # Carries trace context across the enqueue/execute boundary by reading and
    # writing `e11y_*` keys in a job's metadata hash.
    class JobPropagator
      # Writes the current trace context into +job_metadata+ (in place).
      #
      # @param job_metadata [Hash] job payload/metadata to decorate
      # @return [Hash] the same object; untouched when nothing is being traced
      def self.inject(job_metadata = {})
        current = E11y::Current
        return job_metadata unless current.traced?

        # Always present once a trace is active.
        {
          'e11y_trace_id' => current.trace_id,
          'e11y_span_id' => current.span_id,
          'e11y_sampled' => current.sampled
        }.each { |key, value| job_metadata[key] = value }

        # Baggage is only written when it has at least one entry.
        baggage = current.baggage
        job_metadata['e11y_baggage'] = baggage if baggage&.any?

        # User/tenant context is optional and skipped when unset.
        user_id = current.user_id
        job_metadata['e11y_user_id'] = user_id if user_id

        tenant_id = current.tenant_id
        job_metadata['e11y_tenant_id'] = tenant_id if tenant_id

        job_metadata
      end

      # Rebuilds a trace context from job metadata written by {.inject}.
      #
      # The enqueuer's span becomes the parent and a fresh span id is
      # generated for the job itself.
      #
      # @param job_metadata [Hash] job payload/metadata
      # @return [Hash] symbol-keyed context, or `{}` when no trace id is stored
      def self.extract(job_metadata)
        trace_id = job_metadata['e11y_trace_id']
        return {} unless trace_id

        {
          trace_id: trace_id,
          parent_span_id: job_metadata['e11y_span_id'],
          span_id: IDGenerator.generate_span_id,
          sampled: job_metadata['e11y_sampled'],
          baggage: job_metadata['e11y_baggage'],
          user_id: job_metadata['e11y_user_id'],
          tenant_id: job_metadata['e11y_tenant_id']
        }
      end
    end
  end
end
|
|
722
|
+
```
|
|
723
|
+
|
|
724
|
+
### 6.3. Structured Log Propagator
|
|
725
|
+
|
|
726
|
+
```ruby
|
|
727
|
+
# lib/e11y/trace_context/log_propagator.rb
|
|
728
|
+
module E11y
  module TraceContext
    # Enriches structured log entries with the current trace context.
    class LogPropagator
      # Returns a copy of +log_entry+ with the active trace fields merged in.
      #
      # Only the trace fields are compacted. Compacting the merged result would
      # also strip nil-valued fields the caller put there on purpose, and a nil
      # context value (e.g. no current user) would overwrite and then delete a
      # value the caller supplied under the same key.
      #
      # @param log_entry [Hash] structured log payload
      # @return [Hash] +log_entry+ itself when nothing is being traced,
      #   otherwise a new hash; the argument is never mutated
      def self.inject(log_entry = {})
        return log_entry unless E11y::Current.traced?

        trace_fields = {
          trace_id: E11y::Current.trace_id,
          span_id: E11y::Current.span_id,
          parent_span_id: E11y::Current.parent_span_id,
          user_id: E11y::Current.user_id,
          tenant_id: E11y::Current.tenant_id
        }.compact

        log_entry.merge(trace_fields)
      end
    end
  end
end
|
|
746
|
+
```
|
|
747
|
+
|
|
748
|
+
---
|
|
749
|
+
|
|
750
|
+
## 7. Sampling Decisions
|
|
751
|
+
|
|
752
|
+
### 7.1. Trace-Consistent Sampling
|
|
753
|
+
|
|
754
|
+
**Design Decision:** The sampling decision is made once, at the trace entry point, and is then propagated unchanged to everything downstream in that trace.
|
|
755
|
+
|
|
756
|
+
```ruby
|
|
757
|
+
# lib/e11y/trace_context/sampler.rb
|
|
758
|
+
module E11y
  module TraceContext
    # Decides whether a trace is recorded.
    #
    # A decision inherited from the parent wins; otherwise a sample rate is
    # chosen from the context and compared against a random draw.
    class Sampler
      # @param config [#default_sample_rate, #per_event_sample_rates]
      #   NOTE(review): `per_event_sample_rates` is looked up with `[]` by exact
      #   event name, so wildcard entries such as 'payment.*' would not match —
      #   confirm how the configuration DSL expands them.
      def initialize(config)
        @default_rate = config.default_sample_rate
        @per_event_rates = config.per_event_sample_rates
      end

      # Decide if this trace should be sampled.
      #
      # A `:sampled` key holding `nil` is treated as "no decision yet".
      # Propagated contexts always carry the key, even when the enqueuer was
      # not traced, and returning that `nil` would drop the trace without any
      # sampling rule being applied.
      #
      # NOTE(review): the `respect_parent_sampling` setting is not consulted
      # here — confirm where it is enforced.
      #
      # @param context [Hash] may contain :sampled, :error, :event_name, :user_id
      # @return [Boolean]
      def should_sample?(context = {})
        inherited_decision = context[:sampled]
        return inherited_decision unless inherited_decision.nil?

        rand < determine_sample_rate(context)
      end

      private

      # Picks the sample rate: errors, then per-event rate, then debug users,
      # then the default.
      #
      # @param context [Hash]
      # @return [Numeric] rate in 0.0..1.0
      def determine_sample_rate(context)
        # Priority 1: always sample errors
        return 1.0 if context[:error]

        # Priority 2: per-event sampling
        event_name = context[:event_name]
        event_rate = event_name && @per_event_rates[event_name]
        return event_rate if event_rate

        # Priority 3: per-user sampling (for debugging)
        user_id = context[:user_id]
        return 1.0 if user_id && debug_user?(user_id)

        @default_rate
      end

      # @param user_id [Object]
      # @return [Boolean] whether the user is listed in the global debug_users
      def debug_user?(user_id)
        E11y.config.debug_users.include?(user_id)
      end
    end
  end
end
|
|
806
|
+
```
|
|
807
|
+
|
|
808
|
+
### 7.2. Sampling Configuration
|
|
809
|
+
|
|
810
|
+
```ruby
|
|
811
|
+
# config/initializers/e11y.rb
|
|
812
|
+
E11y.configure do |config|
|
|
813
|
+
config.tracing do
|
|
814
|
+
# Default sample rate (10% of traces)
|
|
815
|
+
default_sample_rate 0.1
|
|
816
|
+
|
|
817
|
+
# Per-event sampling
|
|
818
|
+
per_event_sample_rates do
|
|
819
|
+
event 'payment.processed', sample_rate: 1.0 # Always sample
|
|
820
|
+
event 'order.created', sample_rate: 0.5 # 50%
|
|
821
|
+
event 'health_check', sample_rate: 0.01 # 1%
|
|
822
|
+
end
|
|
823
|
+
|
|
824
|
+
# Always sample for debug users
|
|
825
|
+
debug_users [123, 456] # User IDs
|
|
826
|
+
|
|
827
|
+
# Respect parent sampling decision
|
|
828
|
+
respect_parent_sampling true # Default: true
|
|
829
|
+
end
|
|
830
|
+
end
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
---
|
|
834
|
+
|
|
835
|
+
## 8. Context Inheritance
|
|
836
|
+
|
|
837
|
+
### 8.1. Context Inheritance Patterns
|
|
838
|
+
|
|
839
|
+
```ruby
|
|
840
|
+
# lib/e11y/trace_context/inheritance.rb
|
|
841
|
+
module E11y
  module TraceContext
    # Helpers for running code under a trace context derived from a parent.
    module Inheritance
      # Runs the block with a context inherited from +parent_context+: same
      # trace, the parent's span as `parent_span_id`, and a fresh span id.
      # The previous context is restored afterwards, even if the block raises.
      #
      # @param parent_context [Hash] symbol-keyed context (:trace_id, :span_id,
      #   :sampled, :baggage, :user_id, :tenant_id)
      # @yield executed with the inherited context installed
      # @return [Object] the block's return value
      def self.with_inherited_context(parent_context, &block)
        # Take a COPY of the snapshot. If `attributes` hands back the live
        # backing hash (as ActiveSupport::CurrentAttributes does), `set` below
        # would mutate the snapshot too and the restore would be a no-op.
        previous_context = E11y::Current.attributes.dup

        inherited_context = {
          trace_id: parent_context[:trace_id],
          parent_span_id: parent_context[:span_id],
          span_id: IDGenerator.generate_span_id,
          sampled: parent_context[:sampled],
          baggage: parent_context[:baggage]&.dup,
          user_id: parent_context[:user_id],
          tenant_id: parent_context[:tenant_id]
        }

        begin
          E11y::Current.set(inherited_context)
          yield
        ensure
          # Keys that were not part of the snapshot must be cleared as well,
          # otherwise the inherited values leak into the caller's context.
          # Snapshot entries are merged last so they win over the nil defaults.
          cleared = inherited_context.keys.to_h { |key| [key, nil] }
          E11y::Current.set(cleared.merge(previous_context))
        end
      end

      # Fork context for parallel execution: captures the current context and
      # runs the block in a new Thread under a context inherited from it.
      #
      # @yield executed inside the new thread
      # @return [Thread] the started thread; callers are expected to join it
      def self.fork_context(&block)
        parent_context = E11y::Current.to_h

        Thread.new do
          with_inherited_context(parent_context, &block)
        end
      end
    end
  end
end
|
|
877
|
+
```
|
|
878
|
+
|
|
879
|
+
### 8.2. Usage Examples
|
|
880
|
+
|
|
881
|
+
```ruby
|
|
882
|
+
# Execute with inherited context
|
|
883
|
+
parent_context = E11y::Current.to_h
|
|
884
|
+
|
|
885
|
+
E11y::TraceContext::Inheritance.with_inherited_context(parent_context) do
|
|
886
|
+
# This block runs with parent's trace_id but new span_id
|
|
887
|
+
Events::ChildTask.track(task_id: 42)
|
|
888
|
+
end
|
|
889
|
+
|
|
890
|
+
# Fork context for parallel execution
|
|
891
|
+
threads = 5.times.map do |i|
|
|
892
|
+
E11y::TraceContext::Inheritance.fork_context do
|
|
893
|
+
# Each thread gets its own span but shares trace_id
|
|
894
|
+
Events::ParallelTask.track(index: i)
|
|
895
|
+
end
|
|
896
|
+
end
|
|
897
|
+
|
|
898
|
+
threads.each(&:join)
|
|
899
|
+
```
|
|
900
|
+
|
|
901
|
+
### 8.3. Background Job Tracing Strategy (C17 Resolution)
|
|
902
|
+
|
|
903
|
+
> **⚠️ CRITICAL: C17 Conflict Resolution - Background Job Tracing Strategy**
|
|
904
|
+
> **See:** [CONFLICT-ANALYSIS.md C17](researches/CONFLICT-ANALYSIS.md#c17-sidekiq-job-trace-context--parent-request-trace-uc-010--uc-009) for detailed analysis
|
|
905
|
+
> **Problem:** Should Sidekiq jobs inherit parent trace_id or start new trace?
|
|
906
|
+
> **Solution:** Hybrid model - jobs start NEW trace but LINK to parent
|
|
907
|
+
|
|
908
|
+
#### 8.3.1. The Problem: Unbounded Traces
|
|
909
|
+
|
|
910
|
+
**When a web request enqueues a background job, two competing models exist:**
|
|
911
|
+
|
|
912
|
+
```ruby
|
|
913
|
+
# Scenario:
|
|
914
|
+
# Web request (trace_id: abc-123) enqueues Sidekiq job
|
|
915
|
+
|
|
916
|
+
# Model A: Job INHERITS parent trace_id (same trace_id)
|
|
917
|
+
# Result: ONE continuous trace (request → job)
|
|
918
|
+
# Problem: Trace duration UNBOUNDED (job may run hours later!)
|
|
919
|
+
# Problem: SLO metrics SKEWED (trace includes async work)
|
|
920
|
+
|
|
921
|
+
# Model B: Job STARTS new trace_id (new trace)
|
|
922
|
+
# Result: TWO separate traces (request trace + job trace)
|
|
923
|
+
# Problem: Can't see full end-to-end flow in single trace
|
|
924
|
+
# Problem: Lost context (job doesn't know parent)
|
|
925
|
+
```
|
|
926
|
+
|
|
927
|
+
**Architectural Trade-off:**
|
|
928
|
+
- ✅ **Model A (inherit):** Complete trace, easy debugging
|
|
929
|
+
- ❌ **Model A (inherit):** Unbounded duration, skewed SLOs
|
|
930
|
+
- ✅ **Model B (new trace):** Bounded traces, accurate SLOs
|
|
931
|
+
- ❌ **Model B (new trace):** Lost parent context, complex querying
|
|
932
|
+
|
|
933
|
+
#### 8.3.2. Decision: Hybrid Model (New Trace + Parent Link)
|
|
934
|
+
|
|
935
|
+
**Approved Solution:**
|
|
936
|
+
Jobs start **NEW trace** (`trace_id`) but **LINK to parent** (`parent_trace_id` field).
|
|
937
|
+
|
|
938
|
+
```ruby
|
|
939
|
+
# lib/e11y/trace_context/job_strategy.rb
|
|
940
|
+
module E11y
  module TraceContext
    # Builds the trace context a background job runs under, given the context
    # captured when the job was enqueued.
    class JobStrategy
      # Trace strategies for background jobs. Each entry maps a strategy name
      # to a lambda taking the parent context and returning the job context.
      STRATEGIES = {
        # Job starts NEW trace, stores link to parent (RECOMMENDED)
        start_new_with_link: ->(parent_context) {
          {
            trace_id: IDGenerator.generate_trace_id,
            span_id: IDGenerator.generate_span_id,
            parent_trace_id: parent_context[:trace_id],
            parent_span_id: parent_context[:span_id]
          }.merge(inherited_fields(parent_context))
        },

        # Job INHERITS parent trace_id (same trace).
        # Falls back to a new trace id when the job was enqueued outside any
        # trace; otherwise the job would run with `trace_id: nil`.
        inherit_parent: ->(parent_context) {
          {
            trace_id: parent_context[:trace_id] || IDGenerator.generate_trace_id,
            parent_span_id: parent_context[:span_id],
            span_id: IDGenerator.generate_span_id
          }.merge(inherited_fields(parent_context))
        },

        # Job starts NEW trace, NO link (isolated)
        start_new_isolated: ->(parent_context) {
          {
            trace_id: IDGenerator.generate_trace_id,
            span_id: IDGenerator.generate_span_id,
            parent_trace_id: nil
          }.merge(inherited_fields(parent_context))
        }
      }.freeze

      # Apply strategy to create job trace context.
      #
      # @param strategy [Symbol] one of the STRATEGIES keys
      # @param parent_context [Hash] symbol-keyed context from the enqueuer;
      #   may be empty when the enqueuer was not traced
      # @return [Hash] symbol-keyed trace context for the job
      # @raise [ArgumentError] when +strategy+ is not a known strategy
      def self.apply(strategy, parent_context)
        strategy_fn = STRATEGIES.fetch(strategy) do
          raise ArgumentError, "Unknown strategy: #{strategy}"
        end

        strategy_fn.call(parent_context)
      end

      # Fields every strategy carries over unchanged from the parent.
      # Baggage is copied so the job cannot mutate the caller's hash.
      #
      # @param parent_context [Hash]
      # @return [Hash]
      def self.inherited_fields(parent_context)
        {
          sampled: parent_context[:sampled],
          baggage: parent_context[:baggage]&.dup,
          user_id: parent_context[:user_id],
          tenant_id: parent_context[:tenant_id]
        }
      end
      private_class_method :inherited_fields
    end
  end
end
|
|
997
|
+
```
|
|
998
|
+
|
|
999
|
+
#### 8.3.3. SidekiqTraceMiddleware Implementation
|
|
1000
|
+
|
|
1001
|
+
**Sidekiq server middleware (job execution):**
|
|
1002
|
+
|
|
1003
|
+
```ruby
|
|
1004
|
+
# lib/e11y/middleware/sidekiq_trace_middleware.rb
|
|
1005
|
+
module E11y
  module Middleware
    # Sidekiq server middleware: installs the job's trace context, emits
    # JobStarted / JobCompleted / JobFailed events, and always resets the
    # context once the job finishes.
    class SidekiqTraceMiddleware
      # Strategy used when the worker class does not choose one.
      DEFAULT_STRATEGY = :start_new_with_link

      # @param worker [Object] job instance being executed
      # @param job [Hash] Sidekiq job payload (string keys)
      # @param queue [String] queue name
      # @yield runs the job (and the rest of the middleware chain)
      # @raise re-raises any StandardError from the job after tracking it
      def call(worker, job, queue)
        # Extract parent context from job metadata
        parent_context = extract_parent_context(job)

        # Apply strategy to create job trace context
        job_context = E11y::TraceContext::JobStrategy.apply(
          trace_strategy_for(worker),
          parent_context
        )

        # Set trace context for job execution
        E11y::Current.set(job_context)

        Events::JobStarted.track(
          job_class: worker.class.name,
          job_id: job['jid'],
          queue: queue,
          parent_trace_id: job_context[:parent_trace_id] # link to the enqueuing trace
        )

        yield

        Events::JobCompleted.track(
          job_class: worker.class.name,
          job_id: job['jid'],
          queue: queue
        )
      rescue StandardError => e
        Events::JobFailed.track(
          job_class: worker.class.name,
          job_id: job['jid'],
          queue: queue,
          error_class: e.class.name,
          error_message: e.message
        )
        raise
      ensure
        # Never let one job's context bleed into the next job on this thread.
        E11y::Current.reset
      end

      private

      # Picks the trace strategy for a worker.
      #
      # Workers that never declare `e11y_trace_strategy` do not respond to it,
      # so the macro is probed first instead of raising NoMethodError.
      #
      # NOTE(review): the global `tracing.background_jobs.default_strategy`
      # setting is not consulted here — confirm where it is applied.
      #
      # @param worker [Object]
      # @return [Symbol]
      def trace_strategy_for(worker)
        worker_class = worker.class
        declared = worker_class.e11y_trace_strategy if worker_class.respond_to?(:e11y_trace_strategy)

        declared || DEFAULT_STRATEGY
      end

      # Reads the `e11y_*` keys written at enqueue time.
      #
      # @param job [Hash] Sidekiq job payload
      # @return [Hash] symbol-keyed parent context without nil entries
      def extract_parent_context(job)
        {
          trace_id: job['e11y_trace_id'],
          span_id: job['e11y_span_id'],
          sampled: job['e11y_sampled'],
          baggage: job['e11y_baggage'],
          user_id: job['e11y_user_id'],
          tenant_id: job['e11y_tenant_id']
        }.compact
      end
    end
  end
end
|
|
1069
|
+
|
|
1070
|
+
# Configure Sidekiq server
|
|
1071
|
+
Sidekiq.configure_server do |config|
|
|
1072
|
+
config.server_middleware do |chain|
|
|
1073
|
+
chain.add E11y::Middleware::SidekiqTraceMiddleware
|
|
1074
|
+
end
|
|
1075
|
+
end
|
|
1076
|
+
```
|
|
1077
|
+
|
|
1078
|
+
**Sidekiq client middleware (job enqueue):**
|
|
1079
|
+
|
|
1080
|
+
```ruby
|
|
1081
|
+
# lib/e11y/middleware/sidekiq_client_middleware.rb
|
|
1082
|
+
module E11y
  module Middleware
    # Sidekiq client middleware: stamps the current trace context onto the job
    # payload at enqueue time so the server middleware can pick it up.
    class SidekiqClientMiddleware
      # Delegates to JobPropagator.inject rather than repeating its key list,
      # so the enqueue side and the propagator cannot drift apart. The
      # propagator is a no-op when nothing is being traced.
      #
      # @param worker_class [Class, String] job class being enqueued (unused)
      # @param job [Hash] Sidekiq job payload, decorated in place
      # @param queue [String] target queue (unused)
      # @param redis_pool [Object] Sidekiq Redis pool (unused)
      # @yield continues the middleware chain / pushes the job
      # @return [Object] whatever the rest of the chain returns
      def call(worker_class, job, queue, redis_pool)
        E11y::TraceContext::JobPropagator.inject(job)

        yield
      end
    end
  end
end
|
|
1101
|
+
|
|
1102
|
+
# Configure Sidekiq client
|
|
1103
|
+
Sidekiq.configure_client do |config|
|
|
1104
|
+
config.client_middleware do |chain|
|
|
1105
|
+
chain.add E11y::Middleware::SidekiqClientMiddleware
|
|
1106
|
+
end
|
|
1107
|
+
end
|
|
1108
|
+
```
|
|
1109
|
+
|
|
1110
|
+
#### 8.3.4. Configuration
|
|
1111
|
+
|
|
1112
|
+
**Global default strategy:**
|
|
1113
|
+
|
|
1114
|
+
```ruby
|
|
1115
|
+
# config/initializers/e11y.rb
|
|
1116
|
+
E11y.configure do |config|
|
|
1117
|
+
config.tracing do |tracing|
|
|
1118
|
+
# Default strategy for ALL jobs
|
|
1119
|
+
tracing.background_jobs.default_strategy = :start_new_with_link
|
|
1120
|
+
|
|
1121
|
+
# Alternative strategies:
|
|
1122
|
+
# - :inherit_parent (job uses same trace_id as parent)
|
|
1123
|
+
# - :start_new_isolated (job gets new trace, no link)
|
|
1124
|
+
end
|
|
1125
|
+
end
|
|
1126
|
+
```
|
|
1127
|
+
|
|
1128
|
+
**Per-job strategy override:**
|
|
1129
|
+
|
|
1130
|
+
```ruby
|
|
1131
|
+
# app/jobs/urgent_email_job.rb
|
|
1132
|
+
class UrgentEmailJob < ApplicationJob
|
|
1133
|
+
include Sidekiq::Job
|
|
1134
|
+
|
|
1135
|
+
# Override: Fast jobs (< 1 sec) can inherit parent trace
|
|
1136
|
+
e11y_trace_strategy :inherit_parent
|
|
1137
|
+
|
|
1138
|
+
def perform(order_id)
|
|
1139
|
+
# This job runs in SAME trace as parent request
|
|
1140
|
+
Events::EmailSent.track(order_id: order_id)
|
|
1141
|
+
end
|
|
1142
|
+
end
|
|
1143
|
+
|
|
1144
|
+
# app/jobs/batch_report_job.rb
|
|
1145
|
+
class BatchReportJob < ApplicationJob
|
|
1146
|
+
include Sidekiq::Job
|
|
1147
|
+
|
|
1148
|
+
# Override: Slow jobs (hours later) should start new trace
|
|
1149
|
+
e11y_trace_strategy :start_new_with_link # (default)
|
|
1150
|
+
|
|
1151
|
+
def perform(report_id)
|
|
1152
|
+
# This job runs in NEW trace, linked to parent
|
|
1153
|
+
Events::ReportGenerated.track(report_id: report_id)
|
|
1154
|
+
end
|
|
1155
|
+
end
|
|
1156
|
+
```
|
|
1157
|
+
|
|
1158
|
+
#### 8.3.5. Querying Full Flow (Request → Job)
|
|
1159
|
+
|
|
1160
|
+
**How to reconstruct full end-to-end flow:**
|
|
1161
|
+
|
|
1162
|
+
```ruby
|
|
1163
|
+
# Find parent request trace
|
|
1164
|
+
parent_trace = Trace.find_by(trace_id: 'abc-123')
|
|
1165
|
+
|
|
1166
|
+
# Find all child job traces (via parent_trace_id link)
|
|
1167
|
+
child_traces = Trace.where(parent_trace_id: 'abc-123')
|
|
1168
|
+
|
|
1169
|
+
# Result:
|
|
1170
|
+
# Parent trace: abc-123 (request)
|
|
1171
|
+
# → Child trace: xyz-789 (SendOrderEmailJob)
|
|
1172
|
+
# → Child trace: def-456 (ProcessPaymentJob)
|
|
1173
|
+
|
|
1174
|
+
# Equivalent SQL query for the full flow (SQL, not Ruby):
|
|
1175
|
+
SELECT * FROM events
|
|
1176
|
+
WHERE trace_id = 'abc-123' -- Parent request events
|
|
1177
|
+
OR parent_trace_id = 'abc-123' -- Child job events
|
|
1178
|
+
ORDER BY created_at;
|
|
1179
|
+
```
|
|
1180
|
+
|
|
1181
|
+
**Example flow with hybrid model:**
|
|
1182
|
+
|
|
1183
|
+
```text
|
|
1184
|
+
# 1. Web request (trace_id: abc-123)
|
|
1185
|
+
POST /orders
|
|
1186
|
+
→ Events::OrderCreated (trace_id: abc-123, span_id: span-001)
|
|
1187
|
+
→ Enqueue SendOrderEmailJob (metadata: {e11y_trace_id: 'abc-123'})
|
|
1188
|
+
|
|
1189
|
+
# 2. Sidekiq job execution (NEW trace_id: xyz-789)
|
|
1190
|
+
SendOrderEmailJob#perform
|
|
1191
|
+
→ SidekiqTraceMiddleware applies :start_new_with_link strategy
|
|
1192
|
+
→ NEW trace_id: xyz-789, parent_trace_id: abc-123
|
|
1193
|
+
→ Events::JobStarted (trace_id: xyz-789, parent_trace_id: abc-123)
|
|
1194
|
+
→ Events::EmailSent (trace_id: xyz-789, span_id: span-001)
|
|
1195
|
+
→ Events::JobCompleted (trace_id: xyz-789)
|
|
1196
|
+
|
|
1197
|
+
# Result: TWO traces with LINK
|
|
1198
|
+
# Trace abc-123: OrderCreated (request)
|
|
1199
|
+
# Trace xyz-789: JobStarted, EmailSent, JobCompleted (linked via parent_trace_id)
|
|
1200
|
+
```
|
|
1201
|
+
|
|
1202
|
+
#### 8.3.6. Schema Changes
|
|
1203
|
+
|
|
1204
|
+
**Add `parent_trace_id` field to events table:**
|
|
1205
|
+
|
|
1206
|
+
```ruby
|
|
1207
|
+
# db/migrate/XXXXXX_add_parent_trace_id_to_events.rb
|
|
1208
|
+
# Adds the nullable `parent_trace_id` link column to `events`, plus indexes.
class AddParentTraceIdToEvents < ActiveRecord::Migration[8.0]
  def change
    # 32 characters matches a 128-bit trace id rendered as hex.
    add_column :events, :parent_trace_id, :string, limit: 32, null: true

    # Serves the "child job events" side of the full-flow query
    # (`... OR parent_trace_id = X`).
    add_index :events, :parent_trace_id

    # Serves lookups that filter on trace_id (optionally narrowed further by
    # parent_trace_id). A composite index cannot be used for the
    # `parent_trace_id = X` branch of an OR on its own, because
    # parent_trace_id is not the leading column; that branch relies on the
    # single-column index above.
    add_index :events, %i[trace_id parent_trace_id]
  end
end
|
|
1217
|
+
```
|
|
1218
|
+
|
|
1219
|
+
**Update Event base class:**
|
|
1220
|
+
|
|
1221
|
+
```ruby
|
|
1222
|
+
# lib/e11y/event.rb
|
|
1223
|
+
module E11y
  class Event
    # Link to the trace that enqueued/spawned the one this event belongs to.
    attribute :parent_trace_id, :string

    # Context fields that default to the value held by E11y::Current.
    # Explicitly supplied attributes always win; a field is only filled from
    # the current context when it is still unset after `super`.
    #
    # @param attributes [Hash] explicit event attributes
    def initialize(attributes = {})
      super

      %i[trace_id span_id parent_trace_id user_id tenant_id].each do |field|
        next if public_send(field)

        public_send("#{field}=", E11y::Current.public_send(field))
      end
    end
  end
end
|
|
1239
|
+
```
|
|
1240
|
+
|
|
1241
|
+
#### 8.3.7. Trade-offs (C17 Resolution)
|
|
1242
|
+
|
|
1243
|
+
| Aspect | Hybrid Model (start_new_with_link) | Inherit Parent | Start New Isolated |
|
|
1244
|
+
|--------|-------------------------------------|----------------|--------------------|
|
|
1245
|
+
| **Trace Boundaries** | ✅ Clear (request vs job) | ❌ Unbounded (spans hours) | ✅ Clear (no link) |
|
|
1246
|
+
| **SLO Accuracy** | ✅ Accurate (separate latencies) | ❌ Skewed (includes job time) | ✅ Accurate |
|
|
1247
|
+
| **End-to-End Visibility** | ✅ Can reconstruct (via link) | ✅ Single trace view | ❌ Lost (no link) |
|
|
1248
|
+
| **Querying Complexity** | ⚠️ Must follow links (JOIN) | ✅ Simple (single trace_id) | ✅ Simple (isolated) |
|
|
1249
|
+
| **Storage Cost** | ⚠️ Two trace IDs to store | ✅ Single trace_id | ✅ Single trace_id |
|
|
1250
|
+
| **Use Case** | ✅ **RECOMMENDED (default)** | ⚠️ Fast jobs only (< 1s) | ⚠️ Isolated jobs only |
|
|
1251
|
+
|
|
1252
|
+
**Why Hybrid Model is Default:**
|
|
1253
|
+
1. ✅ **Clear trace boundaries** - Request SLO ≠ Job SLO
|
|
1254
|
+
2. ✅ **Accurate metrics** - Can measure request latency separately from job latency
|
|
1255
|
+
3. ✅ **Bounded traces** - Traces have clear start/end (not hours long)
|
|
1256
|
+
4. ✅ **Still linked** - Can reconstruct full flow via `parent_trace_id`
|
|
1257
|
+
5. ✅ **Flexible** - Can override per-job if needed
|
|
1258
|
+
|
|
1259
|
+
**Related Conflicts:**
|
|
1260
|
+
- **C05:** Trace-aware sampling (see ADR-009 §3.6)
|
|
1261
|
+
- **C11:** Stratified sampling (see ADR-009 §3.7)
|
|
1262
|
+
- **UC-010:** Background Job Tracking
|
|
1263
|
+
- **UC-009:** Multi-Service Tracing
|
|
1264
|
+
|
|
1265
|
+
---
|
|
1266
|
+
|
|
1267
|
+
## 9. Trade-offs
|
|
1268
|
+
|
|
1269
|
+
### 9.1. Key Decisions
|
|
1270
|
+
|
|
1271
|
+
| Decision | Pro | Con | Rationale |
|
|
1272
|
+
|----------|-----|-----|-----------|
|
|
1273
|
+
| **ActiveSupport::CurrentAttributes** | Rails-native, thread-safe | Rails dependency | Perfect fit for Rails 8+ |
|
|
1274
|
+
| **W3C Trace Context** | Industry standard | More complex than UUID | Future-proof, interop |
|
|
1275
|
+
| **128-bit trace ID** | Negligible collision risk | Longer strings | W3C requirement |
|
|
1276
|
+
| **Trace-consistent sampling** | Distributed traces work | Complex propagation | Critical for multi-service |
|
|
1277
|
+
| **Auto-enrich events** | Zero boilerplate | Implicit behavior | DX > explicit |
|
|
1278
|
+
| **Baggage propagation** | Flexible metadata | Size overhead | Limited use, opt-in |
|
|
1279
|
+
| **Manual spans** | Simple | Less automation | v1.0 scope |
|
|
1280
|
+
| **Hybrid job tracing (C17)** ⚠️ | Clear boundaries, accurate SLOs | More complex queries | Prevents unbounded traces |
|
|
1281
|
+
|
|
1282
|
+
### 9.2. Alternatives Considered
|
|
1283
|
+
|
|
1284
|
+
**A) Global variables for context**
|
|
1285
|
+
- ❌ Rejected: Not thread-safe
|
|
1286
|
+
|
|
1287
|
+
**B) OpenTelemetry SDK**
|
|
1288
|
+
- ❌ Rejected for v1.0: Too heavy, see ADR-007
|
|
1289
|
+
|
|
1290
|
+
**C) UUID v4 for trace ID**
|
|
1291
|
+
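- ❌ Rejected: Not W3C compliant as-is (the canonical hyphenated UUID string is not a valid `trace-id`, which must be 32 lowercase hex characters)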
|
|
1292
|
+
|
|
1293
|
+
**D) Automatic span creation**
|
|
1294
|
+
- ❌ Rejected for v1.0: Complexity, performance
|
|
1295
|
+
|
|
1296
|
+
**E) Context stored in Database**
|
|
1297
|
+
- ❌ Rejected: Too slow, high overhead
|
|
1298
|
+
|
|
1299
|
+
---
|
|
1300
|
+
|
|
1301
|
+
## 10. Configuration Reference
|
|
1302
|
+
|
|
1303
|
+
```ruby
|
|
1304
|
+
# config/initializers/e11y.rb
|
|
1305
|
+
E11y.configure do |config|
|
|
1306
|
+
config.tracing do
|
|
1307
|
+
# Enable tracing
|
|
1308
|
+
enabled true
|
|
1309
|
+
|
|
1310
|
+
# Trace ID generation
|
|
1311
|
+
id_generator :secure_random # or :uuid, :custom
|
|
1312
|
+
|
|
1313
|
+
# W3C Trace Context
|
|
1314
|
+
w3c_trace_context do
|
|
1315
|
+
enabled true
|
|
1316
|
+
version '00'
|
|
1317
|
+
|
|
1318
|
+
# Legacy header support (fallback)
|
|
1319
|
+
support_legacy_headers true
|
|
1320
|
+
legacy_headers ['X-Request-ID', 'X-Trace-ID', 'X-Correlation-ID']
|
|
1321
|
+
end
|
|
1322
|
+
|
|
1323
|
+
# Sampling
|
|
1324
|
+
sampling do
|
|
1325
|
+
default_sample_rate 0.1
|
|
1326
|
+
|
|
1327
|
+
per_event_sample_rates do
|
|
1328
|
+
event 'payment.*', sample_rate: 1.0
|
|
1329
|
+
event 'health_check', sample_rate: 0.01
|
|
1330
|
+
end
|
|
1331
|
+
|
|
1332
|
+
debug_users [123, 456]
|
|
1333
|
+
respect_parent_sampling true
|
|
1334
|
+
end
|
|
1335
|
+
|
|
1336
|
+
# Context propagation
|
|
1337
|
+
propagation do
|
|
1338
|
+
# HTTP outgoing requests
|
|
1339
|
+
http do
|
|
1340
|
+
enabled true
|
|
1341
|
+
inject_headers ['traceparent', 'tracestate', 'X-Request-ID']
|
|
1342
|
+
end
|
|
1343
|
+
|
|
1344
|
+
# Background jobs
|
|
1345
|
+
jobs do
|
|
1346
|
+
enabled true
|
|
1347
|
+
propagate_baggage true
|
|
1348
|
+
propagate_user_context true
|
|
1349
|
+
end
|
|
1350
|
+
|
|
1351
|
+
# Structured logs
|
|
1352
|
+
logs do
|
|
1353
|
+
enabled true
|
|
1354
|
+
fields [:trace_id, :span_id, :user_id]
|
|
1355
|
+
end
|
|
1356
|
+
end
|
|
1357
|
+
|
|
1358
|
+
# Baggage (optional metadata)
|
|
1359
|
+
baggage do
|
|
1360
|
+
enabled true
|
|
1361
|
+
max_size 1024 # bytes
|
|
1362
|
+
max_entries 10
|
|
1363
|
+
end
|
|
1364
|
+
end
|
|
1365
|
+
end
|
|
1366
|
+
```
|
|
1367
|
+
|
|
1368
|
+
---
|
|
1369
|
+
|
|
1370
|
+
**Status:** ✅ Draft Complete
|
|
1371
|
+
**Next:** ADR-011 (Testing Strategy) or ADR-012 (Event Evolution)
|
|
1372
|
+
**Estimated Implementation:** 2 weeks
|