e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,1043 @@
1
+ # UC-009: Multi-Service Tracing
2
+
3
+ **Status:** v1.1+ Enhancement
4
+ **Complexity:** Advanced
5
+ **Setup Time:** 30-40 minutes
6
+ **Target Users:** Platform Engineers, SRE, Microservices Teams
7
+
8
+ ---
9
+
10
+ ## 📋 Overview
11
+
12
+ ### Problem Statement
13
+
14
+ **The microservices debugging nightmare:**
15
+ ```ruby
16
+ # ❌ BEFORE: Lost context across services
17
+ # Service A (API):
18
+ Rails.logger.info "Order created: #{order_id}"
19
+
20
+ # Service B (Payment):
21
+ Rails.logger.info "Processing payment" # ← Which order?!
22
+
23
+ # Service C (Fulfillment):
24
+ Rails.logger.info "Shipping order" # ← Which payment?!
25
+
26
+ # Problems:
27
+ # 1. No correlation between services
28
+ # 2. Can't see complete request flow
29
+ # 3. Manual trace_id passing (error-prone)
30
+ # 4. Different logging formats per service
31
+ # 5. Can't measure cross-service latency
32
+ # 6. Debugging takes hours (grep through multiple logs)
33
+ ```
34
+
35
+ ### E11y Solution
36
+
37
+ **Automatic distributed tracing:**
38
+ ```ruby
39
+ # ✅ AFTER: Automatic trace propagation across services
40
+
41
+ # Service A (API) - trace_id: abc-123
42
+ Events::OrderCreated.track(order_id: '789')
43
+ # → HTTP call to Service B with W3C Trace Context header
44
+
45
+ # Service B (Payment) - trace_id: abc-123 (preserved!)
46
+ Events::PaymentProcessing.track(order_id: '789', amount: 99.99)
47
+ # → HTTP call to Service C with W3C Trace Context header
48
+
49
+ # Service C (Fulfillment) - trace_id: abc-123 (preserved!)
50
+ Events::OrderShipping.track(order_id: '789', tracking: 'TRACK123')
51
+
52
+ # Grafana query: {trace_id="abc-123"}
53
+ # 10:00:00.000 [service-a] order.created
54
+ # 10:00:00.050 [service-b] payment.processing
55
+ # 10:00:02.120 [service-c] order.shipping
56
+ # → Complete distributed trace!
57
+ ```
58
+
59
+ ---
60
+
61
+ ## 🎯 Features
62
+
63
+ > **Implementation:** See [ADR-005: Tracing Context](../ADR-005-tracing-context.md) for complete architecture, including [Section 5: W3C Trace Context](../ADR-005-tracing-context.md#5-w3c-trace-context), [Section 6.1: HTTP Propagator](../ADR-005-tracing-context.md#61-http-propagator-outgoing-requests), and [Section 8: Context Inheritance](../ADR-005-tracing-context.md#8-context-inheritance-thread-fiber-support).
64
+
65
+ ### 1. Automatic W3C Trace Context Propagation
66
+
67
+ **Zero-config HTTP header propagation:**
68
+ ```ruby
69
+ # Service A: API Gateway
70
+ class OrdersController < ApplicationController
71
+ def create
72
+ order = Order.create!(order_params)
73
+
74
+ # Track event (trace_id automatically set from request)
75
+ Events::OrderCreated.track(
76
+ order_id: order.id,
77
+ user_id: current_user.id
78
+ )
79
+
80
+ # Call Payment Service (trace_id automatically propagated!)
81
+ response = PaymentServiceClient.charge(
82
+ order_id: order.id,
83
+ amount: order.total
84
+ )
85
+
86
+ render json: order
87
+ end
88
+ end
89
+
90
+ # HTTP Request to Payment Service includes:
91
+ # traceparent: 00-abc123...-def456...-01
92
+ #              ^  ^         ^         ^
93
+ #              |  |         |         +-- flags (sampled)
94
+ #              |  |         +------------ span_id
95
+ #              |  +---------------------- trace_id
96
+ #              +------------------------- version
97
+ ```
98
+
99
+ **HTTP client middleware — Faraday, Net::HTTP, HTTParty (automatic!):**
100
+ ```ruby
101
+ # config/initializers/e11y.rb
102
+ E11y.configure do |config|
103
+ config.trace_propagation do
104
+ # Faraday middleware (auto-inject trace headers)
105
+ faraday enabled: true
106
+
107
+ # Net::HTTP middleware
108
+ net_http enabled: true
109
+
110
+ # HTTParty
111
+ httparty enabled: true
112
+ end
113
+ end
114
+
115
+ # Now ALL HTTP clients automatically propagate trace context!
116
+ conn = Faraday.new(url: 'http://payment-service')
117
+ conn.post('/charges', { amount: 99.99 })
118
+ # → Automatically includes traceparent header ✨
119
+ ```
120
+
121
+ ---
122
+
123
+ ### 2. Service-to-Service Event Correlation
124
+
125
+ **Explicit service boundaries:**
126
+ ```ruby
127
+ # Service A: API Gateway
128
+ module Events
129
+ class OrderCreated < E11y::Event::Base
130
+ service_boundary :outgoing # This service initiates call
131
+
132
+ schema do
133
+ required(:order_id).filled(:string)
134
+ required(:user_id).filled(:string)
135
+ required(:amount).filled(:decimal)
136
+ end
137
+ end
138
+ end
139
+
140
+ # Track + call downstream service
141
+ Events::OrderCreated.track(order_id: '789', user_id: '123', amount: 99.99)
142
+
143
+ # HTTP call to Service B
144
+ PaymentServiceClient.charge(order_id: '789', amount: 99.99)
145
+
146
+ # ---
147
+
148
+ # Service B: Payment Service
149
+ module Events
150
+ class PaymentReceived < E11y::Event::Base
151
+ service_boundary :incoming # This service receives call
152
+
153
+ schema do
154
+ required(:order_id).filled(:string)
155
+ required(:amount).filled(:decimal)
156
+ end
157
+ end
158
+
159
+ class PaymentProcessed < E11y::Event::Base
160
+ service_boundary :outgoing # This service initiates next call
161
+
162
+ schema do
163
+ required(:order_id).filled(:string)
164
+ required(:transaction_id).filled(:string)
165
+ end
166
+ end
167
+ end
168
+
169
+ class ChargesController < ApplicationController
170
+ def create
171
+ # Track incoming request (trace_id from header!)
172
+ Events::PaymentReceived.track(
173
+ order_id: params[:order_id],
174
+ amount: params[:amount]
175
+ )
176
+
177
+ # Process payment
178
+ transaction = charge_card(params[:amount])
179
+
180
+ # Track outgoing event
181
+ Events::PaymentProcessed.track(
182
+ order_id: params[:order_id],
183
+ transaction_id: transaction.id
184
+ )
185
+
186
+ # Call Fulfillment Service
187
+ FulfillmentServiceClient.ship_order(
188
+ order_id: params[:order_id],
189
+ transaction_id: transaction.id
190
+ )
191
+
192
+ render json: { transaction_id: transaction.id }
193
+ end
194
+ end
195
+
196
+ # Timeline in Grafana: {trace_id="abc-123"}
197
+ # 10:00:00.000 [api-gateway] order.created
198
+ # 10:00:00.050 [payment] payment.received
199
+ # 10:00:02.000 [payment] payment.processed
200
+ # 10:00:02.050 [fulfillment] order.received
201
+ # 10:00:05.000 [fulfillment] order.shipped
202
+ # → Complete multi-service trace!
203
+ ```
204
+
205
+ ---
206
+
207
+ ### 3. Background Job Trace Propagation (C17 Resolution) ⚠️
208
+
209
+ > **Implementation:** See [ADR-005 Section 8.3: Background Job Tracing Strategy](../ADR-005-tracing-context.md#83-background-job-tracing-strategy-c17-resolution) for the hybrid tracing model.
210
+
211
+ **Hybrid Tracing Model (New Trace + Parent Link):**
212
+ ```ruby
213
+ # Service A: API Gateway
214
+ class OrdersController < ApplicationController
215
+ def create
216
+ order = Order.create!(order_params)
217
+
218
+ # Track event (trace_id: abc-123)
219
+ Events::OrderCreated.track(order_id: order.id)
220
+ # Current trace: abc-123
221
+
222
+ # Enqueue job (parent trace context automatically passed!)
223
+ ProcessOrderJob.perform_later(order.id)
224
+ # → Sidekiq metadata: { parent_trace_id: 'abc-123' }
225
+
226
+ render json: order
227
+ end
228
+ end
229
+
230
+ # Service B: Worker Service (Sidekiq)
231
+ class ProcessOrderJob < ApplicationJob
232
+ def perform(order_id)
233
+ # ✅ NEW TRACE STARTED: trace_id: xyz-789 (fresh trace for job!)
234
+ # ✅ PARENT LINKED: parent_trace_id: abc-123 (link to request trace)
235
+
236
+ Events::OrderProcessingStarted.track(
237
+ order_id: order_id
238
+ # Metadata auto-added:
239
+ # - trace_id: xyz-789 (job's own trace)
240
+ # - parent_trace_id: abc-123 (link to parent request)
241
+ )
242
+
243
+ # Call Payment Service (NEW trace_id propagated!)
244
+ PaymentServiceClient.charge(order_id: order_id)
245
+ # → HTTP header: traceparent: xyz-789 (job's trace!)
246
+
247
+ Events::OrderProcessingCompleted.track(order_id: order_id)
248
+ end
249
+ end
250
+
251
+ # Service C: Payment Service
252
+ class ChargesController < ApplicationController
253
+ def create
254
+ # trace_id: xyz-789 (from job's HTTP header)
255
+ # parent_trace_id: abc-123 (preserved from job)
256
+ Events::PaymentProcessed.track(...)
257
+ end
258
+ end
259
+
260
+ # Timeline 1: Request Trace {trace_id="abc-123"}
261
+ # 10:00:00.000 [api-gateway] order.created (trace_id=abc-123)
262
+ # 10:00:00.010 [api-gateway] job.enqueued (trace_id=abc-123)
263
+
264
+ # Timeline 2: Job Trace {trace_id="xyz-789", parent_trace_id="abc-123"}
265
+ # 10:00:05.000 [worker] job.started (trace_id=xyz-789, parent=abc-123)
266
+ # 10:00:05.010 [worker] order.processing.started (trace_id=xyz-789)
267
+ # 10:00:05.050 [payment] payment.processed (trace_id=xyz-789)
268
+ # 10:00:07.000 [worker] order.processing.completed (trace_id=xyz-789)
269
+
270
+ # Query to see full flow (request + job):
271
+ # Loki: {service=~".+"} | json | trace_id="abc-123" or parent_trace_id="abc-123"
272
+ # → Shows BOTH request trace AND linked job trace!
273
+ ```
274
+
275
+ **Why a hybrid model (instead of reusing the same trace_id)?**
276
+
277
+ 1. **Bounded trace duration:** A job may run for hours or days, unlike a ~100 ms request
278
+ 2. **SLO accuracy:** Request SLO (P99 200ms) ≠ Job SLO (P99 5 minutes)
279
+ 3. **Trace clarity:** Separate timelines for sync (request) vs async (job) operations
280
+ 4. **Link preserved:** `parent_trace_id` allows the full flow to be reconstructed
281
+
282
+ See [ADR-005 §8.3](../ADR-005-tracing-context.md#83-background-job-tracing-strategy-c17-resolution) for detailed rationale.
283
+
284
+ **Visual Diagram: Request Trace → Job Trace (with parent link)**
285
+
286
+ ```mermaid
287
+ graph LR
288
+ subgraph "Request Trace (trace_id=abc-123)"
289
+ A[HTTP Request] --> B[order.created]
290
+ B --> C[job.enqueued]
291
+ end
292
+
293
+ subgraph "Job Trace (trace_id=xyz-789, parent=abc-123)"
294
+ D[job.started] --> E[order.processing.started]
295
+ E --> F[payment.processed]
296
+ F --> G[order.processing.completed]
297
+ end
298
+
299
+ C -.parent_trace_id: abc-123.-> D
300
+
301
+ style C fill:#f9f,stroke:#333,stroke-width:2px
302
+ style D fill:#9ff,stroke:#333,stroke-width:2px
303
+ ```
304
+
305
+ **Query Examples:**
306
+
307
+ ```text
308
+ # 1. Find all events in request trace
309
+ Loki query: {trace_id="abc-123"}
310
+ # Result:
311
+ # - order.created
312
+ # - job.enqueued
313
+
314
+ # 2. Find all events in job trace
315
+ Loki query: {trace_id="xyz-789"}
316
+ # Result:
317
+ # - job.started
318
+ # - order.processing.started
319
+ # - payment.processed
320
+ # - order.processing.completed
321
+
322
+ # 3. Find FULL FLOW (request + linked jobs)
323
+ Loki query: {service=~".+"} | json | trace_id="abc-123" or parent_trace_id="abc-123"
324
+ # Result: ALL events from request AND its child jobs!
325
+ # - order.created (trace=abc-123)
326
+ # - job.enqueued (trace=abc-123)
327
+ # - job.started (trace=xyz-789, parent=abc-123)
328
+ # - order.processing.started (trace=xyz-789, parent=abc-123)
329
+ # - payment.processed (trace=xyz-789, parent=abc-123)
330
+ # - order.processing.completed (trace=xyz-789, parent=abc-123)
331
+ ```
332
+
333
+ **Database Schema (parent_trace_id field):**
334
+
335
+ ```sql
336
+ -- events table
337
+ CREATE TABLE events (
338
+ id BIGSERIAL PRIMARY KEY,
339
+ event_name VARCHAR(255),
340
+ trace_id VARCHAR(36),
341
+ parent_trace_id VARCHAR(36), -- ✅ NEW: Link to parent trace!
342
+ -- ... other fields
343
+ );
344
+
345
+ -- Index for efficient queries
346
+ CREATE INDEX idx_events_parent_trace_id ON events(parent_trace_id);
347
+
348
+ -- Query to reconstruct full flow:
349
+ SELECT * FROM events
350
+ WHERE trace_id = 'abc-123'
351
+ OR parent_trace_id = 'abc-123'
352
+ ORDER BY timestamp ASC;
353
+ ```
354
+
355
+ ---
356
+
357
+ ### 4. Cross-Service Latency Measurement
358
+
359
+ **Automatic service-to-service timing:**
360
+ ```ruby
361
+ # config/initializers/e11y.rb
362
+ E11y.configure do |config|
363
+ config.distributed_tracing do
364
+ # Measure cross-service latency
365
+ measure_service_latency true
366
+
367
+ # Track service hops
368
+ track_service_hops true
369
+ end
370
+ end
371
+
372
+ # Service A: API Gateway
373
+ Events::OrderCreated.track(
374
+ order_id: '789',
375
+ timestamp_sent: Time.current # Added automatically by E11y (shown here for illustration only)
376
+ )
377
+
378
+ # Service B: Payment Service
379
+ Events::PaymentReceived.track(
380
+ order_id: '789',
381
+ timestamp_received: Time.current # Added automatically by E11y (shown here for illustration only)
382
+ )
383
+
384
+ # E11y automatically calculates:
385
+ # - Network latency: timestamp_received - timestamp_sent
386
+ # - Service hop count: 1 (API → Payment)
387
+ # - Total trace duration: last event - first event
388
+
389
+ # Metrics (automatic!):
390
+ # e11y_service_to_service_latency_ms{from="api",to="payment"} = 50
391
+ # e11y_service_hops{trace_id="abc-123"} = 3
392
+ # e11y_trace_duration_ms{trace_id="abc-123"} = 5020
393
+ ```
394
+
395
+ ---
396
+
397
+ ### 5. Service Mesh Integration
398
+
399
+ > **Implementation:** See [ADR-005 Section 5.3: HTTP Header Extraction](../ADR-005-tracing-context.md#53-http-header-extraction-w3c-legacy-headers) for W3C standard header support enabling service mesh compatibility.
400
+
401
+ **Automatic integration with Istio/Linkerd:**
402
+ ```ruby
403
+ # config/initializers/e11y.rb
404
+ E11y.configure do |config|
405
+ config.service_mesh do
406
+ # Detect service mesh
407
+ auto_detect true # Detects Istio/Linkerd
408
+
409
+ # Use mesh headers
410
+ use_mesh_headers true
411
+
412
+ # Mesh-specific headers
413
+ istio do
414
+ use_headers [
415
+ 'x-request-id',
416
+ 'x-b3-traceid',
417
+ 'x-b3-spanid',
418
+ 'x-b3-parentspanid',
419
+ 'x-b3-sampled',
420
+ 'x-b3-flags'
421
+ ]
422
+ end
423
+
424
+ linkerd do
425
+ use_headers [
426
+ 'l5d-ctx-trace',
427
+ 'l5d-ctx-deadline'
428
+ ]
429
+ end
430
+ end
431
+ end
432
+
433
+ # E11y automatically:
434
+ # 1. Reads trace context from mesh headers
435
+ # 2. Propagates context to downstream services
436
+ # 3. Respects mesh sampling decisions
437
+ # 4. Correlates E11y events with mesh spans
438
+ ```
439
+
440
+ ---
441
+
442
+ ## 💻 Implementation Examples
443
+
444
+ ### Example 1: E-commerce Order Flow (3 Services)
445
+
446
+ ```ruby
447
+ # === SERVICE A: API GATEWAY ===
448
+ # app/controllers/orders_controller.rb
449
+ class OrdersController < ApplicationController
450
+ def create
451
+ # Start distributed trace
452
+ Events::OrderCreationStarted.track(
453
+ user_id: current_user.id,
454
+ cart_total: cart.total
455
+ )
456
+
457
+ # Create order
458
+ order = Order.create!(
459
+ user_id: current_user.id,
460
+ items: cart.items,
461
+ total: cart.total
462
+ )
463
+
464
+ # Track order created
465
+ Events::OrderCreated.track(
466
+ order_id: order.id,
467
+ user_id: current_user.id,
468
+ amount: order.total,
469
+ items_count: order.items.count
470
+ )
471
+
472
+ # Call Payment Service (trace_id auto-propagated!)
473
+ begin
474
+ payment = PaymentServiceClient.charge(
475
+ order_id: order.id,
476
+ amount: order.total,
477
+ payment_method: params[:payment_method]
478
+ )
479
+
480
+ Events::PaymentInitiated.track(
481
+ order_id: order.id,
482
+ payment_id: payment.id,
483
+ amount: order.total
484
+ )
485
+
486
+ render json: { order: order, payment: payment }
487
+ rescue PaymentServiceClient::Error => e
488
+ Events::PaymentFailed.track(
489
+ order_id: order.id,
490
+ error: e.message,
491
+ severity: :error
492
+ )
493
+
494
+ render json: { error: e.message }, status: :unprocessable_entity
495
+ end
496
+ end
497
+ end
498
+
499
+ # === SERVICE B: PAYMENT SERVICE ===
500
+ # app/controllers/charges_controller.rb
501
+ class ChargesController < ApplicationController
502
+ def create
503
+ # Trace context automatically extracted from headers!
504
+ Events::PaymentRequestReceived.track(
505
+ order_id: params[:order_id],
506
+ amount: params[:amount],
507
+ payment_method: params[:payment_method]
508
+ )
509
+
510
+ # Process payment
511
+ transaction = StripeGateway.charge(
512
+ amount: params[:amount],
513
+ source: params[:payment_method]
514
+ )
515
+
516
+ Events::PaymentSucceeded.track(
517
+ order_id: params[:order_id],
518
+ transaction_id: transaction.id,
519
+ amount: transaction.amount,
520
+ severity: :success
521
+ )
522
+
523
+ # Call Fulfillment Service (trace_id auto-propagated!)
524
+ FulfillmentServiceClient.create_shipment(
525
+ order_id: params[:order_id],
526
+ transaction_id: transaction.id
527
+ )
528
+
529
+ render json: { transaction: transaction }
530
+ rescue StripeGateway::Error => e
531
+ Events::PaymentFailed.track(
532
+ order_id: params[:order_id],
533
+ error: e.message,
534
+ error_code: e.code,
535
+ severity: :error
536
+ )
537
+
538
+ render json: { error: e.message }, status: :unprocessable_entity
539
+ end
540
+ end
541
+
542
+ # === SERVICE C: FULFILLMENT SERVICE ===
543
+ # app/controllers/shipments_controller.rb
544
+ class ShipmentsController < ApplicationController
545
+ def create
546
+ Events::ShipmentRequestReceived.track(
547
+ order_id: params[:order_id],
548
+ transaction_id: params[:transaction_id]
549
+ )
550
+
551
+ # Create shipment
552
+ shipment = Shipment.create!(
553
+ order_id: params[:order_id],
554
+ carrier: 'USPS',
555
+ tracking_number: generate_tracking_number
556
+ )
557
+
558
+ Events::ShipmentCreated.track(
559
+ order_id: params[:order_id],
560
+ shipment_id: shipment.id,
561
+ tracking_number: shipment.tracking_number,
562
+ estimated_delivery: shipment.estimated_delivery,
563
+ severity: :success
564
+ )
565
+
566
+ render json: { shipment: shipment }
567
+ end
568
+ end
569
+
570
+ # === GRAFANA QUERY ===
571
+ # {trace_id="abc-123"} | json | line_format "{{.timestamp}} [{{.service}}] {{.event_name}}"
572
+ #
573
+ # Result:
574
+ # 10:00:00.000 [api-gateway] order.creation.started
575
+ # 10:00:00.050 [api-gateway] order.created
576
+ # 10:00:00.060 [api-gateway] payment.initiated
577
+ # 10:00:00.100 [payment] payment.request.received
578
+ # 10:00:02.150 [payment] payment.succeeded
579
+ # 10:00:02.200 [fulfillment] shipment.request.received
580
+ # 10:00:03.500 [fulfillment] shipment.created
581
+ # → Complete 7-step distributed trace!
582
+ ```
583
+
584
+ ---
585
+
586
+ ### Example 2: GraphQL Federation (4 Services)
587
+
588
+ ```ruby
589
+ # === SERVICE A: GATEWAY (GraphQL) ===
590
+ class Mutations::CreateOrder < Mutations::BaseMutation
591
+ def resolve(input:)
592
+ Events::GraphqlMutationStarted.track(
593
+ mutation: 'createOrder',
594
+ user_id: context[:current_user].id
595
+ )
596
+
597
+ # Query User Service (trace_id propagated)
598
+ user = UserServiceClient.get_user(context[:current_user].id)
599
+
600
+ # Query Product Service (trace_id propagated)
601
+ products = ProductServiceClient.get_products(input[:product_ids])
602
+
603
+ # Query Order Service (trace_id propagated)
604
+ order = OrderServiceClient.create_order(
605
+ user_id: user.id,
606
+ products: products
607
+ )
608
+
609
+ Events::GraphqlMutationCompleted.track(
610
+ mutation: 'createOrder',
611
+ order_id: order.id,
612
+ duration_ms: duration
613
+ )
614
+
615
+ { order: order }
616
+ end
617
+ end
618
+
619
+ # === SERVICE B: USER SERVICE ===
620
+ class UsersController < ApplicationController
621
+ def show
622
+ Events::UserFetchRequested.track(user_id: params[:id])
623
+
624
+ user = User.find(params[:id])
625
+
626
+ Events::UserFetched.track(
627
+ user_id: user.id,
628
+ user_segment: user.segment
629
+ )
630
+
631
+ render json: user
632
+ end
633
+ end
634
+
635
+ # === SERVICE C: PRODUCT SERVICE ===
636
+ class ProductsController < ApplicationController
637
+ def index
638
+ Events::ProductsFetchRequested.track(
639
+ product_ids: params[:ids]
640
+ )
641
+
642
+ products = Product.where(id: params[:ids])
643
+
644
+ Events::ProductsFetched.track(
645
+ product_ids: products.map(&:id),
646
+ count: products.count
647
+ )
648
+
649
+ render json: products
650
+ end
651
+ end
652
+
653
+ # === SERVICE D: ORDER SERVICE ===
654
+ class OrdersController < ApplicationController
655
+ def create
656
+ Events::OrderCreationRequested.track(
657
+ user_id: params[:user_id],
658
+ product_ids: params[:product_ids]
659
+ )
660
+
661
+ order = Order.create!(
662
+ user_id: params[:user_id],
663
+ order_items: params[:products].map { |p|
664
+ OrderItem.new(product_id: p[:id], quantity: p[:quantity])
665
+ }
666
+ )
667
+
668
+ Events::OrderCreated.track(
669
+ order_id: order.id,
670
+ user_id: order.user_id,
671
+ total: order.total
672
+ )
673
+
674
+ render json: order
675
+ end
676
+ end
677
+
678
+ # === JAEGER TRACE VIEW ===
679
+ # Span: graphql.mutation.createOrder (2.5s)
680
+ # ├─ Span: user.fetch (50ms) [user-service]
681
+ # ├─ Span: products.fetch (120ms) [product-service]
682
+ # └─ Span: order.create (2.3s) [order-service]
683
+ # └─ Span: db.insert (200ms)
684
+ # → Complete GraphQL federation trace!
685
+ ```
686
+
687
+ ---
688
+
689
+ ### Example 3: Event-Driven Architecture (Kafka)
690
+
691
+ ```ruby
692
+ # === SERVICE A: ORDER SERVICE ===
693
+ class OrdersController < ApplicationController
694
+ def create
695
+ order = Order.create!(order_params)
696
+
697
+ # Track event
698
+ Events::OrderCreated.track(
699
+ order_id: order.id,
700
+ user_id: current_user.id,
701
+ amount: order.total
702
+ )
703
+
704
+ # Publish to Kafka (trace_id in message metadata!)
705
+ KafkaProducer.publish(
706
+ topic: 'orders.created',
707
+ key: order.id,
708
+ value: order.to_json,
709
+ headers: {
710
+ 'traceparent' => E11y::TraceContext.current.to_traceparent
711
+ # ↑ W3C Trace Context header in Kafka message!
712
+ }
713
+ )
714
+
715
+ render json: order
716
+ end
717
+ end
718
+
719
+ # === SERVICE B: NOTIFICATION SERVICE (Kafka Consumer) ===
720
+ class OrderCreatedConsumer
721
+ def consume(message)
722
+ # Extract trace context from Kafka headers
723
+ trace_context = E11y::TraceContext.from_traceparent(
724
+ message.headers['traceparent']
725
+ )
726
+
727
+ # Set trace context (restores trace_id!)
728
+ E11y::TraceContext.with_context(trace_context) do
729
+ order_data = JSON.parse(message.value)
730
+
731
+ Events::NotificationRequested.track(
732
+ order_id: order_data['id'],
733
+ notification_type: 'order_confirmation'
734
+ )
735
+
736
+ # Send email
737
+ OrderMailer.confirmation(order_data['id']).deliver_now
738
+
739
+ Events::NotificationSent.track(
740
+ order_id: order_data['id'],
741
+ notification_type: 'email',
742
+ severity: :success
743
+ )
744
+ end
745
+ end
746
+ end
747
+
748
+ # === SERVICE C: ANALYTICS SERVICE (Kafka Consumer) ===
749
+ class OrderCreatedConsumer
750
+ def consume(message)
751
+ trace_context = E11y::TraceContext.from_traceparent(
752
+ message.headers['traceparent']
753
+ )
754
+
755
+ E11y::TraceContext.with_context(trace_context) do
756
+ order_data = JSON.parse(message.value)
757
+
758
+ Events::AnalyticsEventProcessed.track(
759
+ order_id: order_data['id'],
760
+ event_type: 'order_created',
761
+ user_segment: calculate_segment(order_data['user_id'])
762
+ )
763
+ end
764
+ end
765
+ end
766
+
767
+ # === TIMELINE: {trace_id="abc-123"} ===
768
+ # 10:00:00.000 [order-service] order.created
769
+ # 10:00:00.010 [kafka] message.published (topic: orders.created)
770
+ # 10:00:00.500 [notification-service] notification.requested
771
+ # 10:00:00.550 [analytics-service] analytics.event.processed
772
+ # 10:00:01.200 [notification-service] notification.sent
773
+ # → Complete event-driven trace!
774
+ ```
775
+
776
+ ---
777
+
778
+ ## 🔧 Configuration
779
+
780
+ ### Full Configuration
781
+
782
+ ```ruby
783
+ # config/initializers/e11y.rb
784
+ E11y.configure do |config|
785
+ config.distributed_tracing do
786
+ # === TRACE PROPAGATION ===
787
+ propagation do
788
+ # W3C Trace Context (standard)
789
+ w3c_trace_context enabled: true
790
+
791
+ # B3 (Zipkin)
792
+ b3 enabled: true, single_header: true
793
+
794
+ # Jaeger
795
+ jaeger enabled: false
796
+
797
+ # Datadog
798
+ datadog enabled: false
799
+ end
800
+
801
+ # === HTTP CLIENTS ===
802
+ http_clients do
803
+ # Faraday
804
+ faraday do
805
+ enabled true
806
+ inject_headers true
807
+ extract_headers true
808
+ end
809
+
810
+ # Net::HTTP
811
+ net_http do
812
+ enabled true
813
+ inject_headers true
814
+ end
815
+
816
+ # HTTParty
817
+ httparty do
818
+ enabled true
819
+ inject_headers true
820
+ end
821
+
822
+ # RestClient
823
+ rest_client do
824
+ enabled true
825
+ inject_headers true
826
+ end
827
+ end
828
+
829
+ # === BACKGROUND JOBS ===
830
+ background_jobs do
831
+ # Sidekiq
832
+ sidekiq do
833
+ enabled true
834
+ propagate_trace_context true
835
+ store_in_job_metadata true
836
+ end
837
+
838
+ # ActiveJob
839
+ active_job do
840
+ enabled true
841
+ propagate_trace_context true
842
+ end
843
+ end
844
+
845
+ # === MESSAGE QUEUES ===
846
+ message_queues do
847
+ # Kafka
848
+ kafka do
849
+ enabled true
850
+ inject_headers true
851
+ extract_headers true
852
+ header_name 'traceparent'
853
+ end
854
+
855
+ # RabbitMQ
856
+ rabbitmq do
857
+ enabled true
858
+ inject_headers true
859
+ extract_headers true
860
+ end
861
+
862
+ # AWS SQS
863
+ sqs do
864
+ enabled true
865
+ use_message_attributes true
866
+ end
867
+ end
868
+
869
+ # === SERVICE MESH ===
870
+ service_mesh do
871
+ auto_detect true
872
+
873
+ # Istio
874
+ istio do
875
+ enabled true
876
+ use_headers true
877
+ end
878
+
879
+ # Linkerd
880
+ linkerd do
881
+ enabled true
882
+ use_headers true
883
+ end
884
+ end
885
+
886
+ # === METRICS ===
887
+ metrics do
888
+ # Service-to-service latency
889
+ measure_service_latency true
890
+
891
+ # Service hops
892
+ track_service_hops true
893
+
894
+ # Trace duration
895
+ measure_trace_duration true
896
+ end
897
+ end
898
+ end
899
+ ```
900
+
901
+ ---
902
+
903
+ ## 📊 Monitoring Distributed Traces
904
+
905
+ ### Grafana Queries
906
+
907
+ ```text
908
+ # 1. Find all events for a trace
909
+ {trace_id="abc-123"} | json
910
+
911
+ # 2. Service-to-service flow
912
+ {trace_id="abc-123"}
913
+ | json
914
+ | line_format "{{.timestamp}} [{{.service}}] {{.event_name}}"
915
+
916
+ # 3. Cross-service latency
917
+ sum by(from_service, to_service) (
918
+ e11y_service_to_service_latency_ms
919
+ )
920
+
921
+ # 4. Service hops per trace (95th percentile over a 5m rate window)
922
+ histogram_quantile(0.95,
923
+ sum(rate(e11y_service_hops_bucket[5m])) by (le)
924
+ )
925
+
926
+ # 5. Failed distributed traces
927
+ {trace_id=~".+"}
928
+ | json
929
+ | severity="error"
930
+ | line_format "{{.trace_id}} {{.service}} {{.event_name}}"
931
+ ```
932
+
933
+ ---
934
+
935
+ ## 💡 Best Practices
936
+
937
+ ### ✅ DO
938
+
939
+ **1. Use W3C Trace Context (standard)**
940
+ ```ruby
941
+ # ✅ GOOD: Industry standard
942
+ config.distributed_tracing do
943
+ propagation do
944
+ w3c_trace_context enabled: true
945
+ end
946
+ end
947
+ ```
948
+
949
+ **2. Mark service boundaries**
950
+ ```ruby
951
+ # ✅ GOOD: Clear boundaries
952
+ module Events
953
+ class OrderCreated < E11y::Event::Base
954
+ service_boundary :outgoing # This service → next service
955
+ end
956
+
957
+ class PaymentReceived < E11y::Event::Base
958
+ service_boundary :incoming # From previous service
959
+ end
960
+ end
961
+ ```
962
+
963
+ **3. Propagate context in async jobs**
964
+ ```ruby
965
+ # ✅ GOOD: Preserve trace_id in jobs
966
+ config.background_jobs do
967
+ sidekiq do
968
+ propagate_trace_context true
969
+ end
970
+ end
971
+ ```
972
+
973
+ ---
974
+
975
+ ### ❌ DON'T
976
+
977
+ **1. Don't use different context formats**
978
+ ```ruby
979
+ # ❌ BAD: Mixed formats (incompatible!)
980
+ # Service A: W3C Trace Context
981
+ # Service B: B3 format
982
+ # Service C: Custom format
983
+ # → Can't correlate!
984
+
985
+ # ✅ GOOD: Single format everywhere
986
+ config.distributed_tracing do
987
+ propagation do
988
+ w3c_trace_context enabled: true # All services!
989
+ end
990
+ end
991
+ ```
992
+
993
+ **2. Don't forget message queue propagation**
994
+ ```ruby
995
+ # ❌ BAD: Kafka without trace headers
996
+ KafkaProducer.publish(topic: 'orders', value: order.to_json)
997
+ # → Trace context lost!
998
+
999
+ # ✅ GOOD: Include trace headers
1000
+ KafkaProducer.publish(
1001
+ topic: 'orders',
1002
+ value: order.to_json,
1003
+ headers: {
1004
+ 'traceparent' => E11y::TraceContext.current.to_traceparent
1005
+ }
1006
+ )
1007
+ ```
1008
+
1009
+ ---
1010
+
1011
+ ## 📚 Related Use Cases
1012
+
1013
+ - **[UC-006: Trace Context Management](./UC-006-trace-context-management.md)** - W3C Trace Context basics
1014
+ - **[UC-008: OpenTelemetry Integration](./UC-008-opentelemetry-integration.md)** - OTel setup
1015
+ - **[UC-010: Background Job Tracking](./UC-010-background-job-tracking.md)** - Async tracing
1016
+
1017
+ ---
1018
+
1019
+ ## 🎯 Summary
1020
+
1021
+ ### Distributed Tracing Benefits
1022
+
1023
+ | Without E11y | With E11y |
1024
+ |--------------|-----------|
1025
+ | Manual trace_id passing | Automatic propagation |
1026
+ | Different formats per service | W3C Trace Context (standard) |
1027
+ | Lost context in async jobs | Preserved automatically |
1028
+ | No cross-service visibility | Complete trace timeline |
1029
+ | Hours to debug issues | Minutes to find root cause |
1030
+
1031
+ **Setup Time:**
1032
+ - Per service: 10-15 min (config only!)
1033
+ - First trace: Immediate (automatic!)
1034
+
1035
+ **Debugging Time:**
1036
+ - Before: 2-4 hours (grep through logs)
1037
+ - After: 5-10 minutes (Grafana query)
1038
+
1039
+ ---
1040
+
1041
+ **Document Version:** 1.0
1042
+ **Last Updated:** January 12, 2026
1043
+ **Status:** ✅ Complete