e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,2750 @@
1
+ # ADR-013: Reliability & Error Handling
2
+
3
+ **Status:** Draft
4
+ **Date:** January 12, 2026
5
+ **Covers:** UC-021 (Error Handling, Retry Policy, DLQ)
6
+ **Depends On:** ADR-001 (Core), ADR-004 (Adapters), ADR-006 (Security)
7
+
8
+ ---
9
+
10
+ ## 📋 Table of Contents
11
+
12
+ 1. [Context & Problem](#1-context--problem)
13
+ 2. [Architecture Overview](#2-architecture-overview)
14
+ 3. [Retry Policy](#3-retry-policy)
15
+ - 3.5. [Retry Rate Limiting (C06 Resolution)](#35-retry-rate-limiting-c06-resolution) ⚠️ CRITICAL
16
+ - 3.5.1. The Problem: Thundering Herd on Adapter Recovery
17
+ - 3.5.2. Decision: Separate Retry Rate Limiter with Staged Batching
18
+ - 3.5.3. RetryHandler with Rate Limiting
19
+ - 3.5.4. Configuration
20
+ - 3.5.5. Staged Retry Batching
21
+ - 3.5.6. Retry Storm Scenario (Without Rate Limiting)
22
+ - 3.5.7. Monitoring Metrics
23
+ - 3.5.8. Trade-offs (C06 Resolution)
24
+ - 3.6. [Event Tracking in Background Jobs (C18 Resolution)](#36-event-tracking-in-background-jobs-c18-resolution) ⚠️ CRITICAL
25
+ - 3.6.1. The Problem: Observability Blocking Business Logic
26
+ - 3.6.2. Decision: Non-Failing Event Tracking in Jobs
27
+ - 3.6.3. SidekiqErrorHandlingMiddleware
28
+ - 3.6.4. Event Tracking with Error Handling
29
+ - 3.6.5. Configuration
30
+ - 3.6.6. Job Success Despite Event Tracking Failure
31
+ - 3.6.7. Alternative Approach: Separate Event Tracking Job
32
+ - 3.6.8. Monitoring & Alerting
33
+ - 3.6.9. Trade-offs (C18 Resolution)
34
+ 4. [Dead Letter Queue (DLQ)](#4-dead-letter-queue-dlq)
35
+ - 4.6. [Rate Limiting × DLQ Filter Interaction (C02 Resolution)](#46-rate-limiting--dlq-filter-interaction-c02-resolution) ⚠️ CRITICAL
36
+ - 4.6.1. The Problem: Critical Events Silently Dropped
37
+ - 4.6.2. Decision: Rate Limiter Respects DLQ Filter
38
+ - 4.6.3. Configuration: Bypass Rate Limiting for Critical Events
39
+ - 4.6.4. Rate-Limited Payment Event Example
40
+ - 4.6.5. Trade-offs: Bypass vs DLQ
41
+ - 4.6.6. Monitoring Metrics
42
+ - 4.6.7. Trade-offs (C02 Resolution)
43
+ 5. [Circuit Breaker](#5-circuit-breaker)
44
+ 6. [Graceful Degradation](#6-graceful-degradation)
45
+ 7. [Self-Healing](#7-self-healing)
46
+ 8. [Monitoring & Alerting](#8-monitoring--alerting)
47
+ 9. [Trade-offs](#9-trade-offs)
48
+
49
+ ---
50
+
51
+ ## 1. Context & Problem
52
+
53
+ ### 1.1. Problem Statement
54
+
55
+ **Failure Scenarios:**
56
+
57
+ 1. **Adapter Failures:**
58
+ ```ruby
59
+ # ❌ Loki is down → events are lost
60
+ Events::OrderPaid.track(order_id: 123)
61
+ # Loki: Connection refused
62
+ # → Event disappears forever
63
+ ```
64
+
65
+ 2. **Transient Errors:**
66
+ ```ruby
67
+ # ❌ Network timeout → no retry
68
+ Adapters::Loki.send(events)
69
+ # → 1 timeout = event lost
70
+ ```
71
+
72
+ 3. **Cascading Failures:**
73
+ ```ruby
74
+ # ❌ One adapter failure blocks others
75
+ adapters = [loki, sentry, elasticsearch]
76
+ adapters.each { |a| a.send(event) } # Loki hangs → Sentry never called
77
+ ```
78
+
79
+ 4. **No Persistent Storage:**
80
+ ```ruby
81
+ # ❌ Critical events lost on failure
82
+ Events::PaymentProcessed.track(amount: 100_000)
83
+ # → If all adapters fail, event is gone
84
+ ```
85
+
86
+ ### 1.2. Goals
87
+
88
+ **Primary Goals:**
89
+ - ✅ **Zero event loss** for critical events
90
+ - ✅ **Automatic retry** with exponential backoff
91
+ - ✅ **Circuit breaker** to prevent cascading failures
92
+ - ✅ **Dead Letter Queue** for persistent storage
93
+ - ✅ **Graceful degradation** when adapters fail
94
+ - ✅ **Self-healing** when adapters recover
95
+
96
+ **Non-Goals:**
97
+ - ❌ Guaranteed ordering or exactly-once delivery (delivery is at-least-once, so duplicates and reordering are possible)
98
+ - ❌ Distributed transactions across adapters
99
+ - ❌ Real-time replay from DLQ (manual/scheduled only)
100
+
101
+ ### 1.3. Success Metrics
102
+
103
+ | Metric | Target | Critical? |
104
+ |--------|--------|-----------|
105
+ | **Event loss rate** | <0.01% | ✅ Yes |
106
+ | **Recovery time** | <60s (circuit breaker) | ✅ Yes |
107
+ | **Retry overhead** | <10ms p99 | ✅ Yes |
108
+ | **DLQ write latency** | <5ms p99 | ✅ Yes |
109
+
110
+ ---
111
+
112
+ ## 2. Architecture Overview
113
+
114
+ ### 2.1. System Context
115
+
116
+ ```mermaid
117
+ C4Context
118
+ title Reliability & Error Handling Context
119
+
120
+ Person(app, "Rails App", "Tracks events")
121
+
122
+ System(e11y, "E11y Gem", "Reliable event delivery")
123
+
124
+ System_Ext(adapters, "Adapters", "Loki, Sentry, etc")
125
+ System_Ext(dlq_storage, "DLQ Storage", "File or Redis")
126
+ System_Ext(monitoring, "Monitoring", "Self-monitoring metrics")
127
+
128
+ Rel(app, e11y, "Tracks events", "E11y API")
129
+ Rel(e11y, adapters, "Sends events", "HTTP/gRPC")
130
+ Rel(e11y, dlq_storage, "Saves failed events", "File I/O or Redis")
131
+ Rel(e11y, monitoring, "Reports failures", "Yabeda metrics")
132
+
133
+ Rel(adapters, e11y, "Returns errors", "Exceptions")
134
+ ```
135
+
136
+ ### 2.2. Component Architecture
137
+
138
+ ```mermaid
139
+ graph TB
140
+ subgraph "Event Pipeline"
141
+ Event[Event Tracked] --> Buffer[Main Buffer]
142
+ Buffer --> Dispatcher[Event Dispatcher]
143
+ end
144
+
145
+ subgraph "Reliability Layer"
146
+ Dispatcher --> RetryHandler[Retry Handler]
147
+ RetryHandler --> CircuitBreaker[Circuit Breaker]
148
+ CircuitBreaker --> AdapterPool[Adapter Pool]
149
+ end
150
+
151
+ subgraph "Failure Handling"
152
+ AdapterPool -->|Success| Success[Success]
153
+ AdapterPool -->|Transient Error| RetryQueue[Retry Queue]
154
+ AdapterPool -->|Permanent Failure| DLQFilter[DLQ Filter]
155
+
156
+ RetryQueue -->|Max Retries| DLQFilter
157
+ DLQFilter -->|Save| DLQ[Dead Letter Queue]
158
+ DLQFilter -->|Discard| Discard[Discard]
159
+ end
160
+
161
+ subgraph "Recovery"
162
+ DLQ --> Replay[Manual/Scheduled Replay]
163
+ Replay --> RetryHandler
164
+
165
+ CircuitBreaker -->|Health Check| SelfHealing[Self-Healing]
166
+ SelfHealing -->|Recovered| CircuitBreaker
167
+ end
168
+
169
+ style RetryHandler fill:#fff3cd
170
+ style CircuitBreaker fill:#f8d7da
171
+ style DLQ fill:#d4edda
172
+ style SelfHealing fill:#d1ecf1
173
+ ```
174
+
175
+ ### 2.3. Error Flow Sequence
176
+
177
+ ```mermaid
178
+ sequenceDiagram
179
+ participant App as Rails App
180
+ participant Pipeline as E11y Pipeline
181
+ participant Retry as Retry Handler
182
+ participant CB as Circuit Breaker
183
+ participant Adapter as Adapter
184
+ participant DLQ as Dead Letter Queue
185
+
186
+ App->>Pipeline: Track event
187
+ Pipeline->>Retry: Send to adapters
188
+
189
+ Retry->>CB: Check circuit state
190
+
191
+ alt Circuit CLOSED (healthy)
192
+ CB->>Adapter: Send event
193
+
194
+ alt Transient error (timeout, 5xx)
195
+ Adapter-->>CB: Error
196
+ CB-->>Retry: Retry
197
+
198
+ loop Max 3 retries
199
+ Retry->>CB: Retry with backoff
200
+ CB->>Adapter: Send event
201
+
202
+ alt Success
203
+ Adapter-->>CB: OK
204
+ CB-->>Retry: Success
205
+ Retry-->>Pipeline: Done
206
+ else Still failing
207
+ Adapter-->>CB: Error
208
+ end
209
+ end
210
+
211
+ Note over Retry: Max retries exceeded
212
+ Retry->>DLQ: Save to DLQ
213
+ else Permanent error (4xx, invalid)
214
+ Adapter-->>CB: Error
215
+ CB-->>Retry: Permanent failure
216
+ Retry->>DLQ: Save to DLQ (if filter allows)
217
+ else Success
218
+ Adapter-->>CB: OK
219
+ CB-->>Retry: Success
220
+ Retry-->>Pipeline: Done
221
+ end
222
+ else Circuit OPEN (unhealthy)
223
+ CB-->>Retry: Fast fail
224
+ Retry->>DLQ: Save to DLQ
225
+
226
+ Note over CB: After 60s timeout
227
+ CB->>CB: Transition to HALF_OPEN
228
+ CB->>Adapter: Health check
229
+
230
+ alt Health check OK
231
+ CB->>CB: Transition to CLOSED
232
+ else Health check failed
233
+ CB->>CB: Back to OPEN
234
+ end
235
+ end
236
+ ```
237
+
238
+ ---
239
+
240
+ ## 3. Retry Policy
241
+
242
+ ### 3.1. Exponential Backoff with Jitter
243
+
244
+ ```ruby
245
+ # lib/e11y/reliability/retry_handler.rb
246
+ module E11y
247
+ module Reliability
248
+ class RetryHandler
249
+ def initialize(config)
250
+ @max_retries = config.max_retries
251
+ @base_delay = config.base_delay_ms
252
+ @max_delay = config.max_delay_ms
253
+ @jitter = config.jitter
254
+ @retry_on = config.retry_on_errors
255
+ end
256
+
257
+ def with_retry(adapter, event, &block)
258
+ attempt = 0
259
+ last_error = nil
260
+
261
+ loop do
262
+ begin
263
+ result = yield
264
+
265
+ # Track success metric
266
+ E11y::Metrics.increment('e11y.retry.success', {
267
+ adapter: adapter.name,
268
+ attempt: attempt
269
+ })
270
+
271
+ return result
272
+
273
+ rescue => error
274
+ attempt += 1
275
+ last_error = error
276
+
277
+ # Check if error is retryable
278
+ unless retryable_error?(error)
279
+ E11y::Metrics.increment('e11y.retry.permanent_failure', {
280
+ adapter: adapter.name,
281
+ error_class: error.class.name
282
+ })
283
+
284
+ raise RetryExhausted.new(error, permanent: true)
285
+ end
286
+
287
+ # Check max retries
288
+ if attempt > @max_retries
289
+ E11y::Metrics.increment('e11y.retry.exhausted', {
290
+ adapter: adapter.name,
291
+ attempts: attempt
292
+ })
293
+
294
+ raise RetryExhausted.new(error, attempts: attempt)
295
+ end
296
+
297
+ # Calculate backoff delay
298
+ delay = calculate_delay(attempt)
299
+
300
+ E11y::Metrics.increment('e11y.retry.attempt', {
301
+ adapter: adapter.name,
302
+ attempt: attempt,
303
+ delay_ms: delay
304
+ })
305
+
306
+ # Sleep with backoff
307
+ sleep(delay / 1000.0)
308
+ end
309
+ end
310
+ end
311
+
312
+ private
313
+
314
+ def retryable_error?(error)
315
+ case error
316
+ when *@retry_on
317
+ true
318
+ when Net::HTTPRetriableError, Net::OpenTimeout, Net::ReadTimeout
319
+ true
320
+ when Faraday::TimeoutError, Faraday::ConnectionFailed
321
+ true
322
+ when HTTP::TimeoutError, HTTP::ConnectionError
323
+ true
324
+ else
325
+ # Check HTTP status code if available
326
+ if error.respond_to?(:response)
327
+ status = error.response[:status] rescue nil
328
+ return status && (status >= 500 || status == 429)
329
+ end
330
+
331
+ false
332
+ end
333
+ end
334
+
335
+ def calculate_delay(attempt)
336
+ # Exponential backoff: base * 2^(attempt-1)
337
+ delay = @base_delay * (2 ** (attempt - 1))
338
+
339
+ # Cap at max_delay
340
+ delay = [@max_delay, delay].min
341
+
342
+ # Add jitter (± jitter%)
343
+ if @jitter > 0
344
+ jitter_amount = delay * @jitter
345
+ delay = delay + rand(-jitter_amount..jitter_amount)
346
+ end
347
+
348
+ delay.to_i
349
+ end
350
+ end
351
+
352
+ class RetryExhausted < StandardError
353
+ attr_reader :original_error, :attempts, :permanent
354
+
355
+ def initialize(original_error, attempts: nil, permanent: false)
356
+ @original_error = original_error
357
+ @attempts = attempts
358
+ @permanent = permanent
359
+
360
+ message = if permanent
361
+ "Permanent failure: #{original_error.message}"
362
+ else
363
+ "Retry exhausted after #{attempts} attempts: #{original_error.message}"
364
+ end
365
+
366
+ super(message)
367
+ end
368
+ end
369
+ end
370
+ end
371
+ ```
372
+
373
+ ### 3.2. Configuration
374
+
375
+ ```ruby
376
+ # config/initializers/e11y.rb
377
+ E11y.configure do |config|
378
+ config.error_handling.retry_policy do
379
+ # Max retry attempts (default: 3)
380
+ max_retries 3
381
+
382
+ # Base delay in milliseconds (default: 100ms)
383
+ base_delay_ms 100
384
+
385
+ # Max delay cap (default: 5000ms = 5s)
386
+ max_delay_ms 5000
387
+
388
+ # Jitter percentage (default: 0.1 = ±10%)
389
+ jitter 0.1
390
+
391
+ # Custom retryable errors
392
+ retry_on [
393
+ Net::HTTPRetriableError,
394
+ Faraday::TimeoutError,
395
+ Faraday::ConnectionFailed,
396
+ YourCustomError
397
+ ]
398
+
399
+ # Per-adapter retry config
400
+ adapter_overrides do
401
+ adapter :loki do
402
+ max_retries 5 # More retries for critical adapter
403
+ base_delay_ms 200
404
+ end
405
+
406
+ adapter :sentry do
407
+ max_retries 2 # Fewer retries for non-critical
408
+ base_delay_ms 50
409
+ end
410
+ end
411
+ end
412
+ end
413
+ ```
414
+
415
+ ### 3.3. Retry Timeline Example
416
+
417
+ ```
418
+ Attempt 1: Immediate (0ms)
419
+ └─ Error (timeout)
420
+
421
+ Attempt 2: 100ms backoff (± 10ms jitter) = ~90-110ms
422
+ └─ Error (timeout)
423
+
424
+ Attempt 3: 200ms backoff (± 20ms jitter) = ~180-220ms
425
+ └─ Error (timeout)
426
+
427
+ Attempt 4: 400ms backoff (± 40ms jitter) = ~360-440ms
428
+ └─ Success! Total time: ~800ms
429
+ ```
430
+
431
+ ### 3.4. Integration with Rate Limiting
432
+
433
+ **Critical:** Retries count toward rate limits (see Conflict #14 in CONFLICT-ANALYSIS.md).
434
+
435
+ ```ruby
436
+ # lib/e11y/reliability/retry_handler.rb (extended)
437
+ def with_retry(adapter, event, &block)
438
+ attempt = 0
439
+
440
+ loop do
441
+ # Check rate limit BEFORE each attempt (including retries)
442
+ unless E11y::RateLimiter.allow?(event, adapter)
443
+ E11y::Metrics.increment('e11y.retry.rate_limited', {
444
+ adapter: adapter.name,
445
+ attempt: attempt
446
+ })
447
+
448
+ # Rate limited → save to DLQ (bypass retries)
449
+ raise RateLimitExceeded.new("Rate limit exceeded for #{adapter.name}")
450
+ end
451
+
452
+ begin
453
+ return yield
454
+ rescue => error
455
+ attempt += 1
456
+ # ... retry logic ...
457
+ end
458
+ end
459
+ rescue RateLimitExceeded => e
460
+ # DLQ filter will decide if this should be saved
461
+ raise RetryExhausted.new(e, permanent: true)
462
+ end
463
+ ```
464
+
465
+ ### 3.5. Retry Rate Limiting (C06 Resolution)
466
+
467
+ > **⚠️ CRITICAL: C06 Conflict Resolution - Retry Storm Prevention**
468
+ > **See:** [CONFLICT-ANALYSIS.md C06](researches/CONFLICT-ANALYSIS.md#c06-rate-limiting--retry-storm-thundering-herd) for detailed analysis
469
+ > **Problem:** Retry storms overwhelm adapters after recovery (thundering herd)
470
+ > **Solution:** Separate rate limiter for retries with staged batching + jitter
471
+
472
+ #### 3.5.1. The Problem: Thundering Herd on Adapter Recovery
473
+
474
+ **When an adapter (Loki/Sentry) goes down and comes back up:**
475
+
476
+ ```ruby
477
+ # Scenario: Loki down for 10 seconds
478
+ # - 1000 events buffered during outage
479
+ # - Each event configured for 3 retries
480
+ # - Loki recovers at t=10s
481
+
482
+ # WITHOUT retry rate limiting:
483
+ # t=10s: Loki comes back online
484
+ # t=10.1s: Retry wave 1 → 1000 events × 3 retries = 3000 requests
485
+ # t=10.2s: Retry wave 2 (backoff) → another 3000 requests
486
+ # Result: 6000 requests in 1 second → Loki CRASHES AGAIN!
487
+
488
+ # Problem: Main rate limiter (UC-011) applies to NEW events only
489
+ # Retries bypass rate limiter (happen after adapter write failure)
490
+ # → Adapter overload, cascade failure, longer downtime
491
+ ```
492
+
493
+ **Architectural Trade-off:**
494
+ - ✅ **Retries needed** for reliability (zero event loss)
495
+ - ❌ **Retry storm** overwhelms recovering adapter
496
+ - ⚠️ **Main rate limiter ineffective** (retries bypass pipeline step 3)
497
+
498
+ #### 3.5.2. Decision: Separate Retry Rate Limiter with Staged Batching
499
+
500
+ **Approved Solution:**
501
+ Implement **separate rate limiter** for retries with **staged batching** and **jitter**.
502
+
503
+ ```ruby
504
+ # lib/e11y/reliability/retry_rate_limiter.rb
505
+ module E11y
506
+ module Reliability
507
+ class RetryRateLimiter
508
+ # Token bucket algorithm for retry rate limiting
509
+ def initialize(limit:, on_limit_exceeded:, jitter_range: 1.0)
510
+ @limit = limit # Max retries/sec (e.g., 100)
511
+ @on_limit_exceeded = on_limit_exceeded # :delay, :drop, :dlq
512
+ @jitter_range = jitter_range # ±100% jitter (spread timing)
513
+
514
+ @bucket = TokenBucket.new(
515
+ capacity: limit,
516
+ refill_rate: limit, # Refill per second
517
+ refill_interval: 1.0 # 1 second
518
+ )
519
+ end
520
+
521
+ # Check if retry is allowed
522
+ def allow?(event, adapter)
523
+ if @bucket.take(1)
524
+ # Retry allowed
525
+ E11y::Metrics.increment('e11y.retry.rate_limiter.allowed', {
526
+ adapter: adapter.name
527
+ })
528
+ true
529
+ else
530
+ # Rate limit exceeded
531
+ E11y::Metrics.increment('e11y.retry.rate_limiter.limited', {
532
+ adapter: adapter.name,
533
+ action: @on_limit_exceeded
534
+ })
535
+ false
536
+ end
537
+ end
538
+
539
+ # Calculate retry delay with exponential backoff + jitter
540
+ def calculate_delay(attempt)
541
+ # Base delay: 2^attempt seconds
542
+ base_delay = 2 ** attempt
543
+
544
+ # Add jitter: ±jitter_range, expressed as a fraction of the base delay (1.0 = ±100%)
545
+ jitter = rand(-@jitter_range..@jitter_range)
546
+ final_delay = base_delay * (1 + jitter)
547
+
548
+ # Cap at max delay (e.g., 60 seconds)
549
+ [final_delay, 60.0].min
550
+ end
551
+ end
552
+ end
553
+ end
554
+ ```
555
+
556
+ #### 3.5.3. RetryHandler with Rate Limiting
557
+
558
+ **Updated RetryHandler implementation:**
559
+
560
+ ```ruby
561
+ # lib/e11y/reliability/retry_handler.rb
562
+ module E11y
563
+ module Reliability
564
+ class RetryHandler
565
+ def initialize(config)
566
+ @config = config
567
+
568
+ # NEW: Separate retry rate limiter
569
+ @retry_rate_limiter = RetryRateLimiter.new(
570
+ limit: config.retry_rate_limit.limit,
571
+ on_limit_exceeded: config.retry_rate_limit.on_limit_exceeded,
572
+ jitter_range: config.retry_rate_limit.jitter_range
573
+ )
574
+ end
575
+
576
+ def with_retry(adapter, event, &block)
577
+ attempt = 0
578
+
579
+ loop do
580
+ # NEW: Check retry rate limit BEFORE attempting
581
+ unless @retry_rate_limiter.allow?(event, adapter)
582
+ # Rate limit exceeded → handle according to strategy
583
+ handle_rate_limit_exceeded(event, adapter, attempt)
584
+ return # Exit retry loop
585
+ end
586
+
587
+ begin
588
+ # Attempt delivery
589
+ result = yield
590
+
591
+ # Success → track metrics
592
+ E11y::Metrics.increment('e11y.retry.success', {
593
+ adapter: adapter.name,
594
+ attempt: attempt
595
+ })
596
+
597
+ return result
598
+ rescue => error
599
+ attempt += 1
600
+
601
+ # Max retries exceeded?
602
+ if attempt >= @config.max_retries
603
+ handle_retry_exhausted(event, adapter, error, attempt)
604
+ return
605
+ end
606
+
607
+ # Calculate backoff delay with jitter
608
+ delay = @retry_rate_limiter.calculate_delay(attempt)
609
+
610
+ # Log retry
611
+ E11y.logger.warn(
612
+ "Retry attempt #{attempt}/#{@config.max_retries} " \
613
+ "for adapter #{adapter.name} after #{delay}s",
614
+ error: error.class.name,
615
+ event_type: event.type
616
+ )
617
+
618
+ # Wait before retry (with jitter)
619
+ sleep(delay)
620
+ end
621
+ end
622
+ end
623
+
624
+ private
625
+
626
+ def handle_rate_limit_exceeded(event, adapter, attempt)
627
+ case @config.retry_rate_limit.on_limit_exceeded
628
+ when :delay
629
+ # Add to delayed retry queue (process later)
630
+ delay = @retry_rate_limiter.calculate_delay(attempt)
631
+
632
+ E11y::Reliability::DelayedRetryQueue.add(
633
+ event: event,
634
+ adapter: adapter,
635
+ delay: delay,
636
+ attempt: attempt
637
+ )
638
+
639
+ E11y.logger.warn(
640
+ "Retry rate limited, delaying retry by #{delay}s",
641
+ adapter: adapter.name,
642
+ event_type: event.type
643
+ )
644
+
645
+ when :drop
646
+ # Drop event (don't retry, don't save to DLQ)
647
+ E11y.logger.error(
648
+ "Retry rate limited, dropping event",
649
+ adapter: adapter.name,
650
+ event_type: event.type
651
+ )
652
+
653
+ E11y::Metrics.increment('e11y.retry.dropped', {
654
+ adapter: adapter.name,
655
+ reason: 'retry_rate_limited'
656
+ })
657
+
658
+ when :dlq
659
+ # Send to DLQ (manual replay later)
660
+ E11y::Reliability::DLQ.save(
661
+ event,
662
+ reason: 'retry_rate_limited',
663
+ adapter: adapter.name,
664
+ attempt: attempt
665
+ )
666
+
667
+ E11y.logger.warn(
668
+ "Retry rate limited, saving to DLQ",
669
+ adapter: adapter.name,
670
+ event_type: event.type
671
+ )
672
+ end
673
+ end
674
+
675
+ def handle_retry_exhausted(event, adapter, error, attempt)
676
+ # All retries exhausted → send to DLQ
677
+ E11y::Reliability::DLQ.save(
678
+ event,
679
+ reason: 'retry_exhausted',
680
+ adapter: adapter.name,
681
+ attempt: attempt,
682
+ last_error: error.message
683
+ )
684
+
685
+ E11y.logger.error(
686
+ "Retry exhausted after #{attempt} attempts",
687
+ adapter: adapter.name,
688
+ event_type: event.type,
689
+ error: error.message
690
+ )
691
+
692
+ E11y::Metrics.increment('e11y.retry.exhausted', {
693
+ adapter: adapter.name
694
+ })
695
+ end
696
+ end
697
+ end
698
+ end
699
+ ```
700
+
701
+ #### 3.5.4. Configuration
702
+
703
+ **Retry rate limiting configuration:**
704
+
705
+ ```ruby
706
+ # config/initializers/e11y.rb
707
+ E11y.configure do |config|
708
+ config.error_handling.retry_policy do
709
+ enabled true
710
+ max_retries 3
711
+
712
+ # NEW: Retry rate limiting (C06 resolution)
713
+ retry_rate_limit do
714
+ enabled true
715
+
716
+ # Max retries per second (separate from main rate limit)
717
+ limit 100 # ← 100 retries/sec (prevents thundering herd)
718
+
719
+ # What to do when retry rate limit exceeded:
720
+ # - :delay (default) → Add to delayed retry queue (process later)
721
+ # - :drop → Drop event immediately (no DLQ)
722
+ # - :dlq → Save to DLQ for manual replay
723
+ on_limit_exceeded :delay
724
+
725
+ # Jitter range for spreading retry timing (avoid synchronized retries)
726
+ # 1.0 = ±100% jitter (retry delay varies from 0x to 2x base delay)
727
+ jitter_range 1.0
728
+ end
729
+ end
730
+ end
731
+ ```
732
+
733
+ **Example configuration for different environments:**
734
+
735
+ ```ruby
736
+ # Production: Conservative (prevent adapter overload)
737
+ config.error_handling.retry_policy.retry_rate_limit do
738
+ limit 50 # Low limit (50 retries/sec)
739
+ on_limit_exceeded :delay # Delay retries (don't drop!)
740
+ jitter_range 1.5 # High jitter (spread retries widely)
741
+ end
742
+
743
+ # Staging: Balanced (test retry behavior)
744
+ config.error_handling.retry_policy.retry_rate_limit do
745
+ limit 100 # Medium limit
746
+ on_limit_exceeded :delay
747
+ jitter_range 1.0 # Standard jitter
748
+ end
749
+
750
+ # Development: Permissive (fast feedback)
751
+ config.error_handling.retry_policy.retry_rate_limit do
752
+ limit 1000 # High limit (rarely hit)
753
+ on_limit_exceeded :dlq # Save to DLQ (inspect manually)
754
+ jitter_range 0.5 # Low jitter (faster retries)
755
+ end
756
+ ```
757
+
758
+ #### 3.5.5. Staged Retry Batching
759
+
760
+ **Alternative approach: Batch retries over time window:**
761
+
762
+ ```ruby
763
+ # lib/e11y/reliability/staged_retry_batcher.rb
764
+ module E11y
765
+ module Reliability
766
+ class StagedRetryBatcher
767
+ # Retry events in controlled batches
768
+ def initialize(batch_size:, batch_interval:)
769
+ @batch_size = batch_size # Max events per batch (e.g., 100)
770
+ @batch_interval = batch_interval # Time between batches (e.g., 1s)
771
+ @queue = Queue.new
772
+
773
+ start_background_processor
774
+ end
775
+
776
+ # Add event to retry queue
777
+ def enqueue(event, adapter, attempt)
778
+ @queue << { event: event, adapter: adapter, attempt: attempt }
779
+
780
+ E11y::Metrics.gauge('e11y.retry.queue_size', @queue.size)
781
+ end
782
+
783
+ private
784
+
785
+ def start_background_processor
786
+ Thread.new do
787
+ loop do
788
+ # Wait for batch interval
789
+ sleep(@batch_interval)
790
+
791
+ # Process batch
792
+ process_batch
793
+ end
794
+ end
795
+ end
796
+
797
+ def process_batch
798
+ batch = []
799
+
800
+ # Collect up to batch_size events
801
+ @batch_size.times do
802
+ break if @queue.empty?
803
+ batch << @queue.pop
804
+ end
805
+
806
+ return if batch.empty?
807
+
808
+ # Process the batch one item at a time (sequentially, not in parallel)
809
+ batch.each do |item|
810
+ # Attempt retry (with rate limiting)
811
+ retry_handler.with_retry(item[:adapter], item[:event]) do
812
+ item[:adapter].write(item[:event])
813
+ end
814
+ end
815
+
816
+ E11y::Metrics.increment('e11y.retry.batch_processed', {
817
+ batch_size: batch.size
818
+ })
819
+ end
820
+ end
821
+ end
822
+ end
823
+
824
+ # Configuration:
825
+ config.error_handling.retry_policy.staged_retry do
826
+ enabled true
827
+ batch_size 100 # Process 100 events per batch
828
+ batch_interval 1.0 # 1 batch per second → 100 events/sec
829
+ end
830
+ ```
831
+
832
+ #### 3.5.6. Retry Storm Scenario (Without Rate Limiting)
833
+
834
+ **Concrete example showing the problem:**
835
+
836
+ ```ruby
837
+ # Scenario: Loki outage + recovery
838
+ # Timeline:
839
+
840
+ # t=0s: Loki goes DOWN
841
+ # - Events start buffering (in-memory ring buffer)
842
+ # - Buffer size: 10,000 events capacity
843
+ # - Incoming rate: 100 events/sec
844
+
845
+ # t=0s to t=10s: Loki is DOWN
846
+ # - 1000 events buffered (100 events/sec × 10s)
847
+ # - All retries failing → circuit breaker OPEN
848
+
849
+ # t=10s: Loki comes BACK UP
850
+ # - Circuit breaker detects recovery → state: HALF_OPEN
851
+ # - Retry wave begins for 1000 buffered events
852
+
853
+ # WITHOUT retry rate limiting (C06):
854
+ # t=10.000s: Retry attempt #1 for 1000 events → 1000 requests in parallel
855
+ # t=10.100s: Retry attempt #2 (backoff 0.1s) → 1000 requests
856
+ # t=10.300s: Retry attempt #3 (backoff 0.2s) → 1000 requests
857
+ # Result: 3000 requests in 1 second
858
+ # → Loki: Connection pool exhausted! 502 Bad Gateway
859
+ # → Loki CRASHES AGAIN at t=10.5s (thundering herd!)
860
+
861
+ # WITH retry rate limiting (C06 resolution):
862
+ # t=10.000s: Retry rate limiter allows 100 events/sec
863
+ # t=10.000s: Batch 1 (100 events) → 100 requests (ALLOWED)
864
+ # t=11.000s: Batch 2 (100 events) → 100 requests (ALLOWED)
865
+ # t=12.000s: Batch 3 (100 events) → 100 requests (ALLOWED)
866
+ # ... continues for 10 seconds ...
867
+ # t=20.000s: All 1000 events retried (100 events/sec × 10s)
868
+ # Result: Controlled recovery, Loki STABLE! ✅
869
+
870
+ # Metrics comparison:
871
+ # Without rate limiting:
872
+ # - Peak throughput: 3000 req/sec (spike!)
873
+ # - Loki downtime: 10s (initial) + 60s (cascade) = 70s total
874
+ # - Events lost: 0 (but long recovery)
875
+
876
+ # With rate limiting:
877
+ # - Peak throughput: 100 req/sec (smooth)
878
+ # - Loki downtime: 10s (initial only)
879
+ # - Events lost: 0
880
+ # - Recovery time: 10s (staged batching)
881
+ ```
882
+
883
+ #### 3.5.7. Monitoring Metrics
884
+
885
+ **Key metrics for retry rate limiting:**
886
+
887
+ ```ruby
888
+ # Retry rate limiter metrics
889
+ e11y.retry.rate_limiter.allowed (counter)
890
+ # Retry allowed by rate limiter
891
+ labels: adapter
892
+
893
+ e11y.retry.rate_limiter.limited (counter)
894
+ # Retry blocked by rate limiter
895
+ labels: adapter, action (:delay/:drop/:dlq)
896
+
897
+ e11y.retry.queue_size (gauge)
898
+ # Number of retries waiting in delayed queue
899
+
900
+ e11y.retry.batch_processed (counter)
901
+ # Number of retry batches processed
902
+ labels: batch_size
903
+
904
+ e11y.retry.delay_seconds (histogram)
905
+ # Actual delay before retry (with jitter)
906
+ labels: adapter, attempt
907
+
908
+ e11y.retry.dropped (counter)
909
+ # Retries dropped due to rate limiting
910
+ labels: adapter, reason
911
+ ```
912
+
913
+ **Grafana dashboard queries:**
914
+
915
+ ```promql
916
+ # Retry rate (retries/sec)
917
+ rate(e11y_retry_rate_limiter_allowed_total[1m])
918
+
919
+ # Retry rate limit hit ratio (0–1; multiply by 100 for %)
920
+ rate(e11y_retry_rate_limiter_limited_total[1m])
921
+ /
922
+ (rate(e11y_retry_rate_limiter_allowed_total[1m]) + rate(e11y_retry_rate_limiter_limited_total[1m]))
923
+
924
+ # Retry queue backlog
925
+ e11y_retry_queue_size
926
+
927
+ # Average retry delay (seconds)
928
+ rate(e11y_retry_delay_seconds_sum[1m]) / rate(e11y_retry_delay_seconds_count[1m])
929
+ ```
930
+
931
+ #### 3.5.8. Trade-offs (C06 Resolution)
932
+
933
+ | Aspect | With Retry Rate Limiting | Without Retry Rate Limiting |
934
+ |--------|--------------------------|------------------------------|
935
+ | **Adapter Protection** | ✅ Protected (smooth recovery) | ❌ Vulnerable (thundering herd) |
936
+ | **Recovery Time** | ⚠️ Slower (staged batching) | ✅ Faster (if adapter survives) |
937
+ | **Cascade Failure Risk** | ✅ Low (controlled load) | ❌ High (retry storm → crash) |
938
+ | **Complexity** | ⚠️ Higher (separate limiter) | ✅ Lower (no limiter) |
939
+ | **Event Loss Risk** | ✅ Zero (delayed, not dropped) | ⚠️ Medium (cascade → longer outage) |
940
+ | **Memory Footprint** | ⚠️ Higher (delayed queue) | ✅ Lower (immediate retry) |
941
+ | **Configuration Tuning** | ⚠️ Required (limit, jitter) | ✅ None needed |
942
+
943
+ **Why Retry Rate Limiting is Default:**
944
+ 1. ✅ **Prevents cascade failures** - Adapter recovers smoothly without crashing again
945
+ 2. ✅ **Zero event loss** - Retries delayed, not dropped (`:delay` strategy)
946
+ 3. ✅ **Configurable** - Can adjust `limit` per environment (prod: 50, staging: 100)
947
+ 4. ✅ **Jitter prevents synchronization** - Retries spread over time (avoid lock-step)
948
+ 5. ⚠️ **Trade-off: Slower recovery** - But SAFER recovery (no cascade)
949
+
950
+ **Related Conflicts:**
951
+ - **C02:** Rate Limiting × DLQ Filter (see §4.6 below)
952
+ - **C18:** Non-failing event tracking in jobs (see §3.6 below)
953
+ - **UC-011:** Rate Limiting (main rate limiter)
954
+ - **UC-021:** DLQ Replay interaction
955
+
956
+ ### 3.6. Event Tracking in Background Jobs (C18 Resolution)
957
+
958
+ > **⚠️ CRITICAL: C18 Conflict Resolution - Event Tracking Should Not Fail Jobs**
959
+ > **See:** [CONFLICT-ANALYSIS.md C18](researches/CONFLICT-ANALYSIS.md#c18-background-job-tracking--circuit-breaker-open) for detailed analysis
960
+ > **Problem:** Event tracking failures cause background jobs to fail (business logic blocked)
961
+ > **Solution:** Disable `fail_on_error` for Sidekiq/ActiveJob contexts, wrap tracking in rescue
962
+
963
+ #### 3.6.1. The Problem: Observability Blocking Business Logic
964
+
965
+ **When circuit breaker opens during a background job:**
966
+
967
+ ```ruby
968
+ # Scenario: Loki circuit breaker OPEN (adapter down)
969
+ # Background job attempts to track event
970
+
971
+ class SendOrderEmailJob < ApplicationJob
972
+ def perform(order_id)
973
+ order = Order.find(order_id)
974
+
975
+ # Send email (business logic)
976
+ OrderMailer.confirmation(order).deliver_now # ✅ SUCCESS
977
+
978
+ # Track event (observability)
979
+ Events::EmailSent.track(order_id: order_id) # ❌ FAILS!
980
+ # → Circuit breaker open → raises CircuitBreakerOpen exception
981
+ # → Job FAILS and retries!
982
+
983
+ # Problem:
984
+ # - Email ALREADY SENT (business logic succeeded)
985
+ # - But job is marked as FAILED (due to event tracking)
986
+ # - Job retries → sends DUPLICATE EMAIL!
987
+ end
988
+ end
989
+
990
+ # Result:
991
+ # - Customer receives 3 duplicate emails (3 retries)
992
+ # - Job fails despite email sent successfully
993
+ # - Observability failure blocks business logic!
994
+ ```
995
+
996
+ **Architectural Conflict:**
997
+ - ✅ **Circuit breaker needed** for adapter protection
998
+ - ❌ **Circuit breaker blocks jobs** when event tracking fails
999
+ - ⚠️ **Business logic success ≠ observability success**
1000
+
1001
+ #### 3.6.2. Decision: Non-Failing Event Tracking in Jobs
1002
+
1003
+ **Approved Solution:**
1004
+ Event tracking failures should **NOT** fail background jobs. Observability is **secondary** to business logic.
1005
+
1006
+ ```ruby
1007
+ # lib/e11y/config.rb
1008
+ module E11y
1009
+ class Config
1010
+ attr_accessor :error_handling
1011
+
1012
+ def initialize
1013
+ @error_handling = ErrorHandlingConfig.new
1014
+ end
1015
+ end
1016
+
1017
+ class ErrorHandlingConfig
1018
+ # Should event tracking errors raise exceptions?
1019
+ # Default: true (for web requests - fast feedback)
1020
+ # Exception: false (for background jobs - don't fail business logic)
1021
+ attr_accessor :fail_on_error
1022
+
1023
+ def initialize
1024
+ @fail_on_error = true # Default: raise errors
1025
+ end
1026
+ end
1027
+ end
1028
+ ```
1029
+
1030
+ #### 3.6.3. SidekiqErrorHandlingMiddleware
1031
+
1032
+ **Sidekiq server middleware to disable `fail_on_error`:**
1033
+
1034
+ ```ruby
1035
+ # lib/e11y/middleware/sidekiq_error_handling_middleware.rb
1036
+ module E11y
1037
+ module Middleware
1038
+ class SidekiqErrorHandlingMiddleware
1039
+ def call(worker, job, queue)
1040
+ # Save original setting
1041
+ original_fail_on_error = E11y.config.error_handling.fail_on_error
1042
+
1043
+ # Disable failing on errors for this job
1044
+ # Observability should NOT block business logic!
1045
+ E11y.config.error_handling.fail_on_error = false
1046
+
1047
+ E11y.logger.debug(
1048
+ "Sidekiq job starting with fail_on_error=false",
1049
+ job_class: worker.class.name,
1050
+ job_id: job['jid']
1051
+ )
1052
+
1053
+ yield
1054
+ ensure
1055
+ # Restore original setting
1056
+ E11y.config.error_handling.fail_on_error = original_fail_on_error
1057
+ end
1058
+ end
1059
+ end
1060
+ end
1061
+
1062
+ # Configure Sidekiq server
1063
+ Sidekiq.configure_server do |config|
1064
+ config.server_middleware do |chain|
1065
+ # Add BEFORE E11y tracing middleware
1066
+ chain.add E11y::Middleware::SidekiqErrorHandlingMiddleware
1067
+ end
1068
+ end
1069
+ ```
1070
+
1071
+ #### 3.6.4. Event Tracking with Error Handling
1072
+
1073
+ **Updated Event.track method with conditional error handling:**
1074
+
1075
+ ```ruby
1076
+ # lib/e11y/event.rb
1077
+ module E11y
1078
+ class Event
1079
+ def self.track(attributes = {})
1080
+ event = new(attributes)
1081
+
1082
+ # Validate event
1083
+ unless event.valid?
1084
+ handle_error(ValidationError.new(event.errors))
1085
+ return nil
1086
+ end
1087
+
1088
+ # Send through pipeline
1089
+ begin
1090
+ E11y::Pipeline.process(event)
1091
+ rescue CircuitBreakerOpen, AdapterTimeout, RateLimitExceeded => e
1092
+ # Adapter failure → handle according to fail_on_error setting
1093
+ handle_error(e)
1094
+ return nil
1095
+ rescue => e
1096
+ # Unexpected error → always log
1097
+ E11y.logger.error(
1098
+ "Unexpected error in E11y event tracking",
1099
+ error: e.class.name,
1100
+ message: e.message,
1101
+ event_type: event.type
1102
+ )
1103
+
1104
+ handle_error(e)
1105
+ return nil
1106
+ end
1107
+
1108
+ event
1109
+ end
1110
+
1111
+ private
1112
+
1113
+ def self.handle_error(error)
1114
+ # Should we raise or swallow?
1115
+ if E11y.config.error_handling.fail_on_error
1116
+ # Web request context: RAISE (fast feedback!)
1117
+ raise error
1118
+ else
1119
+ # Background job context: SWALLOW (don't fail job!)
1120
+ E11y.logger.warn(
1121
+ "E11y event tracking failed (fail_on_error=false)",
1122
+ error: error.class.name,
1123
+ message: error.message,
1124
+ context: :background_job
1125
+ )
1126
+
1127
+ # Send to DLQ for later replay
1128
+ E11y::Reliability::DLQ.save(
1129
+ event,
1130
+ reason: 'event_tracking_failed_in_job',
1131
+ error: error.message
1132
+ )
1133
+
1134
+ # Track metric
1135
+ E11y::Metrics.increment('e11y.event.tracking_failed_silent', {
1136
+ error_class: error.class.name,
1137
+ context: :background_job
1138
+ })
1139
+
1140
+ # DON'T re-raise!
1141
+ nil
1142
+ end
1143
+ end
1144
+ end
1145
+ end
1146
+ ```
1147
+
1148
+ #### 3.6.5. Configuration
1149
+
1150
+ **Global configuration:**
1151
+
1152
+ ```ruby
1153
+ # config/initializers/e11y.rb
1154
+ E11y.configure do |config|
1155
+ config.error_handling do |error_handling|
1156
+ # Default: raise errors in web requests (fast feedback)
1157
+ # Automatically disabled in Sidekiq/ActiveJob (via middleware)
1158
+ error_handling.fail_on_error = true
1159
+
1160
+ # Alternative: Manually control per context
1161
+ # error_handling.fail_on_error_contexts do
1162
+ # web_requests true # Raise errors
1163
+ # background_jobs false # Swallow errors (don't fail job)
1164
+ # rake_tasks true # Raise errors (CLI feedback)
1165
+ # end
1166
+ end
1167
+ end
1168
+ ```
1169
+
1170
+ **Per-job override (if needed):**
1171
+
1172
+ ```ruby
1173
+ # app/jobs/critical_job.rb
1174
+ class CriticalReportJob < ApplicationJob
1175
+ # Override: This job SHOULD fail if event tracking fails
1176
+ # (because tracking is part of compliance requirements)
1177
+
1178
+ def perform(report_id)
1179
+ # Temporarily enable fail_on_error
1180
+ E11y.config.error_handling.fail_on_error = true
1181
+
1182
+ # Generate report
1183
+ report = Report.generate(report_id)
1184
+
1185
+ # Track event (MUST succeed for compliance!)
1186
+ Events::ReportGenerated.track(
1187
+ report_id: report_id,
1188
+ user_id: report.user_id
1189
+ )
1190
+ # ↑ If this fails, job SHOULD fail (retry later)
1191
+ ensure
1192
+ # Restore default (will be restored by middleware anyway)
1193
+ E11y.config.error_handling.fail_on_error = false
1194
+ end
1195
+ end
1196
+ ```
1197
+
1198
+ #### 3.6.6. Job Success Despite Event Tracking Failure
1199
+
1200
+ **Example showing job succeeding even when circuit breaker open:**
1201
+
1202
+ ```ruby
1203
+ # Scenario: Loki circuit breaker OPEN
1204
+
1205
+ class ProcessPaymentJob < ApplicationJob
1206
+ def perform(payment_id)
1207
+ payment = Payment.find(payment_id)
1208
+
1209
+ # 1. Charge credit card (business logic)
1210
+ result = StripeGateway.charge(
1211
+ amount: payment.amount,
1212
+ token: payment.token
1213
+ )
1214
+ # ✅ SUCCESS: $100 charged
1215
+
1216
+ # 2. Update payment status (business logic)
1217
+ payment.update!(
1218
+ status: :completed,
1219
+ charged_at: Time.current,
1220
+ stripe_charge_id: result.id
1221
+ )
1222
+ # ✅ SUCCESS: Database updated
1223
+
1224
+ # 3. Track event (observability)
1225
+ Events::PaymentCompleted.track(
1226
+ payment_id: payment.id,
1227
+ amount: payment.amount,
1228
+ stripe_charge_id: result.id
1229
+ )
1230
+ # ❌ FAILS: Circuit breaker OPEN
1231
+ # BUT: Error swallowed! (fail_on_error = false)
1232
+ # Event saved to DLQ for later replay
1233
+
1234
+ # 4. Send confirmation email (business logic)
1235
+ PaymentMailer.confirmation(payment).deliver_now
1236
+ # ✅ SUCCESS: Email sent
1237
+
1238
+ # JOB RESULT: ✅ SUCCESS!
1239
+ # - Payment charged
1240
+ # - Database updated
1241
+ # - Email sent
1242
+ # - Event tracking failed BUT job did NOT fail
1243
+ # - Event in DLQ (will replay when Loki recovers)
1244
+ end
1245
+ end
1246
+
1247
+ # Job metrics:
1248
+ # - Job status: ✅ SUCCESS
1249
+ # - Business logic: 100% success rate
1250
+ # - Observability: ⚠️ Event missed (in DLQ)
1251
+ # - Customer experience: ✅ Perfect (no issues)
1252
+ ```
1253
+
1254
+ #### 3.6.7. Alternative Approach: Separate Event Tracking Job
1255
+
1256
+ **Decouple business logic from event tracking:**
1257
+
1258
+ ```ruby
1259
+ # Don't track events inline in jobs
1260
+ # Instead: Enqueue separate event tracking job
1261
+
1262
+ class SendEmailJob < ApplicationJob
1263
+ def perform(order_id)
1264
+ order = Order.find(order_id)
1265
+
1266
+ # Business logic: Send email
1267
+ OrderMailer.confirmation(order).deliver_now
1268
+
1269
+ # DON'T track event here!
1270
+ # Instead: Enqueue event tracking job (can fail independently)
1271
+ TrackEventJob.perform_async(
1272
+ event_type: 'email.sent',
1273
+ payload: {
1274
+ order_id: order.id,
1275
+ email: order.email,
1276
+ sent_at: Time.current.iso8601
1277
+ }
1278
+ )
1279
+ end
1280
+ end
1281
+
1282
+ # Separate job for event tracking (lower priority)
1283
+ class TrackEventJob < ApplicationJob
1284
+ queue_as :events # ← Separate queue (lower priority than business logic)
1285
+
1286
+ # Fewer retries (observability less critical than business logic)
1287
+ retry_on StandardError, wait: :exponentially_longer, attempts: 3
1288
+
1289
+ def perform(event_type, payload)
1290
+ # This job CAN fail without affecting business logic
1291
+ Events::Base.track(event_type, **payload)
1292
+ end
1293
+ end
1294
+ ```
1295
+
1296
+ **Trade-offs of separate job approach:**
1297
+
1298
+ | Aspect | Inline Tracking (§3.6.4) | Separate Job (§3.6.7) |
1299
+ |--------|---------------------------|------------------------|
1300
+ | **Coupling** | ⚠️ Coupled (same job) | ✅ Decoupled (separate jobs) |
1301
+ | **Latency** | ✅ Immediate (sync) | ⚠️ Delayed (async) |
1302
+ | **Failure Isolation** | ✅ Silent failure (DLQ) | ✅ Complete isolation (separate queue) |
1303
+ | **Complexity** | ✅ Simple (one job) | ⚠️ Higher (two jobs) |
1304
+ | **Queue Management** | ✅ Same queue | ⚠️ Must manage events queue |
1305
+ | **Cost** | ✅ Lower (one job) | ⚠️ Higher (two jobs) |
1306
+
1307
+ **Recommendation:**
1308
+ Use **inline tracking with `fail_on_error=false`** (§3.6.4) as default. Reserve **separate job** (§3.6.7) for:
1309
+ - High-volume jobs (millions/day)
1310
+ - Critical jobs where ANY failure is unacceptable
1311
+ - Jobs with strict latency requirements
1312
+
1313
+ #### 3.6.8. Monitoring & Alerting
1314
+
1315
+ **Key metrics for background job event tracking:**
1316
+
1317
+ ```ruby
1318
+ # Silent failures (event tracking failed but job succeeded)
1319
+ e11y.event.tracking_failed_silent (counter)
1320
+ # Event tracking failed in background job (error swallowed)
1321
+ labels: error_class, context
1322
+
1323
+ # Job context detection
1324
+ e11y.event.tracked_in_job (counter)
1325
+ # Event tracked successfully in background job
1326
+ labels: job_class
1327
+
1328
+ e11y.event.dlq_from_job (counter)
1329
+ # Event sent to DLQ from background job
1330
+ labels: job_class, reason
1331
+ ```
1332
+
1333
+ **Alert rules:**
1334
+
1335
+ ```yaml
1336
+ # Alert: High rate of silent failures in jobs
1337
+ - alert: E11yJobTrackingFailureHigh
1338
+ expr: |
1339
+ rate(e11y_event_tracking_failed_silent_total{context="background_job"}[5m]) > 10
1340
+ for: 5m
1341
+ annotations:
1342
+ summary: "E11y event tracking failing in background jobs"
1343
+ description: "{{ $value }} events/sec failing silently in jobs"
1344
+
1345
+ # Alert: Circuit breaker blocking job events
1346
+ - alert: E11yJobEventsBlockedByCircuitBreaker
1347
+ expr: |
1348
+ rate(e11y_event_tracking_failed_silent_total{error_class="CircuitBreakerOpen",context="background_job"}[5m]) > 5
1349
+ for: 2m
1350
+ annotations:
1351
+ summary: "E11y circuit breaker blocking job events"
1352
+ description: "Circuit breaker open - job events going to DLQ"
1353
+ ```
1354
+
1355
+ #### 3.6.9. Trade-offs (C18 Resolution)
1356
+
1357
+ | Aspect | fail_on_error=false (Jobs) | fail_on_error=true (Default) |
1358
+ |--------|----------------------------|------------------------------|
1359
+ | **Job Success Rate** | ✅ High (observability doesn't block) | ❌ Low (event failures → job failures) |
1360
+ | **Business Logic** | ✅ Never blocked | ❌ Blocked by event tracking issues |
1361
+ | **Fast Feedback** | ⚠️ Silent failures (no exception) | ✅ Immediate exceptions |
1362
+ | **Event Loss Risk** | ✅ Zero (DLQ saves all) | ⚠️ Medium (job retries may skip) |
1363
+ | **Debugging** | ⚠️ Harder (check DLQ + logs) | ✅ Easier (job fails immediately) |
1364
+ | **Customer Impact** | ✅ None (business logic succeeds) | ❌ High (duplicate emails, failed orders) |
1365
+ | **Observability Gaps** | ⚠️ Possible (events in DLQ) | ⚠️ Possible (job doesn't retry events) |
1366
+
1367
+ **Why fail_on_error=false for Jobs:**
1368
+ 1. ✅ **Business logic > observability** - Payment success > event tracking
1369
+ 2. ✅ **Prevents duplicate actions** - No duplicate emails/charges on retry
1370
+ 3. ✅ **Circuit breaker doesn't block jobs** - Jobs succeed during adapter outage
1371
+ 4. ✅ **Events preserved in DLQ** - Can replay when adapter recovers
1372
+ 5. ⚠️ **Trade-off: Silent failures** - But business logic succeeds (acceptable)
1373
+
1374
+ **Related Conflicts:**
1375
+ - **C06:** Retry rate limiting (see §3.5 above)
1376
+ - **C17:** Background job tracing (see ADR-005 §8.3)
1377
+ - **UC-010:** Background Job Tracking
1378
+ - **UC-021:** DLQ Replay
1379
+
1380
+ ---
1381
+
1382
+ ## 4. Dead Letter Queue (DLQ)
1383
+
1384
+ ### 4.1. DLQ Storage Interface
1385
+
1386
+ ```ruby
1387
+ # lib/e11y/reliability/dlq/base.rb
1388
+ module E11y
1389
+ module Reliability
1390
+ module DLQ
1391
+ class Base
1392
+ def save(event_data, metadata)
1393
+ raise NotImplementedError
1394
+ end
1395
+
1396
+ def list(limit: 100, offset: 0, filters: {})
1397
+ raise NotImplementedError
1398
+ end
1399
+
1400
+ def replay(event_id)
1401
+ raise NotImplementedError
1402
+ end
1403
+
1404
+ def replay_batch(event_ids)
1405
+ raise NotImplementedError
1406
+ end
1407
+
1408
+ def delete(event_id)
1409
+ raise NotImplementedError
1410
+ end
1411
+
1412
+ def stats
1413
+ raise NotImplementedError
1414
+ end
1415
+ end
1416
+ end
1417
+ end
1418
+ end
1419
+ ```
1420
+
1421
+ ### 4.2. File-based DLQ (Default)
1422
+
1423
+ ```ruby
1424
+ # lib/e11y/reliability/dlq/file_storage.rb
1425
+ module E11y
1426
+ module Reliability
1427
+ module DLQ
1428
+ class FileStorage < Base
1429
+ def initialize(config)
1430
+ @directory = config.directory || Rails.root.join('tmp', 'e11y', 'dlq')
1431
+ @max_file_size = config.max_file_size || 10.megabytes
1432
+ @retention_days = config.retention_days || 30
1433
+
1434
+ FileUtils.mkdir_p(@directory)
1435
+ end
1436
+
1437
+ def save(event_data, metadata)
1438
+ event_id = SecureRandom.uuid
1439
+ timestamp = Time.now.utc
1440
+
1441
+ dlq_entry = {
1442
+ id: event_id,
1443
+ timestamp: timestamp.iso8601,
1444
+ event_name: event_data[:event_name],
1445
+ event_data: event_data,
1446
+ metadata: metadata.merge(
1447
+ failed_at: timestamp.iso8601,
1448
+ retry_count: metadata[:retry_count] || 0,
1449
+ last_error: metadata[:error]&.message,
1450
+ error_class: metadata[:error]&.class&.name
1451
+ )
1452
+ }
1453
+
1454
+ # Partition by date for efficient cleanup
1455
+ date_partition = timestamp.strftime('%Y-%m-%d')
1456
+ partition_dir = @directory.join(date_partition)
1457
+ FileUtils.mkdir_p(partition_dir)
1458
+
1459
+ # Append to daily file
1460
+ file_path = partition_dir.join('events.jsonl')
1461
+
1462
+ File.open(file_path, 'a') do |f|
1463
+ f.flock(File::LOCK_EX)
1464
+ f.puts(JSON.generate(dlq_entry))
1465
+ end
1466
+
1467
+ # Track metric
1468
+ E11y::Metrics.increment('e11y.dlq.saved', {
1469
+ event_name: event_data[:event_name],
1470
+ adapter: metadata[:adapter]
1471
+ })
1472
+
1473
+ event_id
1474
+ end
1475
+
1476
+ def list(limit: 100, offset: 0, filters: {})
1477
+ entries = []
1478
+
1479
+ Dir.glob(@directory.join('*', 'events.jsonl')).sort.reverse.each do |file|
1480
+ File.readlines(file).drop(offset).first(limit).each do |line|
1481
+ entry = JSON.parse(line, symbolize_names: true)
1482
+
1483
+ # Apply filters
1484
+ next if filters[:event_name] && entry[:event_name] != filters[:event_name]
1485
+ next if filters[:after] && Time.parse(entry[:timestamp]) < filters[:after]
1486
+
1487
+ entries << entry
1488
+ break if entries.size >= limit
1489
+ end
1490
+
1491
+ break if entries.size >= limit
1492
+ end
1493
+
1494
+ entries
1495
+ end
1496
+
1497
+ def replay(event_id)
1498
+ entry = find_entry(event_id)
1499
+ return nil unless entry
1500
+
1501
+ # Re-dispatch event through normal pipeline
1502
+ E11y::Pipeline.dispatch(
1503
+ entry[:event_data],
1504
+ metadata: entry[:metadata].merge(replayed: true)
1505
+ )
1506
+
1507
+ # Delete from DLQ after successful replay
1508
+ delete(event_id)
1509
+
1510
+ E11y::Metrics.increment('e11y.dlq.replayed', {
1511
+ event_name: entry[:event_name]
1512
+ })
1513
+
1514
+ true
1515
+ rescue => error
1516
+ # Replay failed → keep in DLQ, increment retry count
1517
+ E11y::Metrics.increment('e11y.dlq.replay_failed', {
1518
+ event_name: entry[:event_name],
1519
+ error: error.class.name
1520
+ })
1521
+
1522
+ false
1523
+ end
1524
+
1525
+ def stats
1526
+ total_events = 0
1527
+ oldest_event = nil
1528
+ newest_event = nil
1529
+
1530
+ Dir.glob(@directory.join('*', 'events.jsonl')).each do |file|
1531
+ lines = File.readlines(file)
1532
+ total_events += lines.size
1533
+
1534
+ first_entry = JSON.parse(lines.first, symbolize_names: true) rescue nil
1535
+ last_entry = JSON.parse(lines.last, symbolize_names: true) rescue nil
1536
+
1537
+ oldest_event ||= first_entry[:timestamp] if first_entry
1538
+ newest_event = last_entry[:timestamp] if last_entry
1539
+ end
1540
+
1541
+ {
1542
+ total_events: total_events,
1543
+ oldest_event: oldest_event,
1544
+ newest_event: newest_event,
1545
+ storage_path: @directory.to_s,
1546
+ disk_usage: disk_usage_mb
1547
+ }
1548
+ end
1549
+
1550
+ def cleanup_old_entries!
1551
+ cutoff_date = @retention_days.days.ago.to_date
1552
+
1553
+ Dir.glob(@directory.join('*')).each do |partition_dir|
1554
+ partition_date = Date.parse(File.basename(partition_dir)) rescue nil
1555
+ next unless partition_date
1556
+
1557
+ if partition_date < cutoff_date
1558
+ FileUtils.rm_rf(partition_dir)
1559
+ E11y::Metrics.increment('e11y.dlq.partition_deleted', {
1560
+ date: partition_date.to_s
1561
+ })
1562
+ end
1563
+ end
1564
+ end
1565
+
1566
+ private
1567
+
1568
+ def find_entry(event_id)
1569
+ Dir.glob(@directory.join('*', 'events.jsonl')).each do |file|
1570
+ File.readlines(file).each do |line|
1571
+ entry = JSON.parse(line, symbolize_names: true)
1572
+ return entry if entry[:id] == event_id
1573
+ end
1574
+ end
1575
+
1576
+ nil
1577
+ end
1578
+
1579
+ def disk_usage_mb
1580
+ total_size = Dir.glob(@directory.join('**', '*'))
1581
+ .select { |f| File.file?(f) }
1582
+ .sum { |f| File.size(f) }
1583
+
1584
+ (total_size / 1.megabyte.to_f).round(2)
1585
+ end
1586
+ end
1587
+ end
1588
+ end
1589
+ end
1590
+ ```
1591
+
1592
+ ### 4.3. Redis-based DLQ (Optional)
1593
+
1594
+ ```ruby
1595
+ # lib/e11y/reliability/dlq/redis_storage.rb
1596
+ module E11y
1597
+ module Reliability
1598
+ module DLQ
1599
+ class RedisStorage < Base
1600
+ DLQ_KEY = 'e11y:dlq:events'
1601
+ DLQ_INDEX_KEY = 'e11y:dlq:index'
1602
+
1603
+ def initialize(config)
1604
+ @redis = config.redis_client || Redis.new(url: config.redis_url)
1605
+ @ttl = config.retention_days.days.to_i
1606
+ end
1607
+
1608
+ def save(event_data, metadata)
1609
+ event_id = SecureRandom.uuid
1610
+ timestamp = Time.now.utc.to_i
1611
+
1612
+ dlq_entry = {
1613
+ id: event_id,
1614
+ timestamp: timestamp,
1615
+ event_name: event_data[:event_name],
1616
+ event_data: event_data,
1617
+ metadata: metadata
1618
+ }
1619
+
1620
+ @redis.pipelined do |pipeline|
1621
+ # Store event data (with TTL)
1622
+ pipeline.setex(
1623
+ "#{DLQ_KEY}:#{event_id}",
1624
+ @ttl,
1625
+ JSON.generate(dlq_entry)
1626
+ )
1627
+
1628
+ # Add to sorted set (by timestamp)
1629
+ pipeline.zadd(DLQ_INDEX_KEY, timestamp, event_id)
1630
+ end
1631
+
1632
+ E11y::Metrics.increment('e11y.dlq.saved', {
1633
+ event_name: event_data[:event_name],
1634
+ storage: 'redis'
1635
+ })
1636
+
1637
+ event_id
1638
+ end
1639
+
1640
+ def list(limit: 100, offset: 0, filters: {})
1641
+ # Get event IDs from sorted set (newest first)
1642
+ event_ids = @redis.zrevrange(DLQ_INDEX_KEY, offset, offset + limit - 1)
1643
+
1644
+ return [] if event_ids.empty?
1645
+
1646
+ # Fetch event data in bulk
1647
+ entries = @redis.mget(*event_ids.map { |id| "#{DLQ_KEY}:#{id}" })
1648
+ .compact
1649
+ .map { |json| JSON.parse(json, symbolize_names: true) }
1650
+
1651
+ # Apply filters
1652
+ if filters[:event_name]
1653
+ entries.select! { |e| e[:event_name] == filters[:event_name] }
1654
+ end
1655
+
1656
+ entries
1657
+ end
1658
+
1659
+ def replay(event_id)
1660
+ json = @redis.get("#{DLQ_KEY}:#{event_id}")
1661
+ return nil unless json
1662
+
1663
+ entry = JSON.parse(json, symbolize_names: true)
1664
+
1665
+ # Re-dispatch
1666
+ E11y::Pipeline.dispatch(
1667
+ entry[:event_data],
1668
+ metadata: entry[:metadata].merge(replayed: true)
1669
+ )
1670
+
1671
+ # Delete from DLQ
1672
+ delete(event_id)
1673
+
1674
+ true
1675
+ rescue => error
1676
+ E11y::Metrics.increment('e11y.dlq.replay_failed', {
1677
+ error: error.class.name
1678
+ })
1679
+
1680
+ false
1681
+ end
1682
+
1683
+ def delete(event_id)
1684
+ @redis.pipelined do |pipeline|
1685
+ pipeline.del("#{DLQ_KEY}:#{event_id}")
1686
+ pipeline.zrem(DLQ_INDEX_KEY, event_id)
1687
+ end
1688
+ end
1689
+
1690
+ def stats
1691
+ {
1692
+ total_events: @redis.zcard(DLQ_INDEX_KEY),
1693
+ storage: 'redis',
1694
+ redis_memory: @redis.info('memory')['used_memory_human']
1695
+ }
1696
+ end
1697
+ end
1698
+ end
1699
+ end
1700
+ end
1701
+ ```
1702
+
1703
+ ### 4.4. DLQ Filter (Selective Storage)
1704
+
1705
+ ```ruby
1706
+ # lib/e11y/reliability/dlq/filter.rb
1707
+ module E11y
1708
+ module Reliability
1709
+ module DLQ
1710
+ class Filter
1711
+ def initialize(config)
1712
+ @always_save_patterns = config.always_save_patterns || []
1713
+ @never_save_patterns = config.never_save_patterns || []
1714
+ @save_if_block = config.save_if_block
1715
+ end
1716
+
1717
+ def should_save?(event_data, metadata)
1718
+ event_name = event_data[:event_name]
1719
+
1720
+ # Priority 1: Never save (explicit exclusion)
1721
+ return false if matches_any?(@never_save_patterns, event_name)
1722
+
1723
+ # Priority 2: Always save (explicit inclusion)
1724
+ return true if matches_any?(@always_save_patterns, event_name)
1725
+
1726
+ # Priority 3: Custom filter block
1727
+ if @save_if_block
1728
+ context = FilterContext.new(event_data, metadata)
1729
+ return @save_if_block.call(context)
1730
+ end
1731
+
1732
+ # Default: save all failed events
1733
+ true
1734
+ end
1735
+
1736
+ private
1737
+
1738
+ def matches_any?(patterns, event_name)
1739
+ patterns.any? do |pattern|
1740
+ case pattern
1741
+ when String
1742
+ event_name == pattern
1743
+ when Regexp
1744
+ event_name =~ pattern
1745
+ when Proc
1746
+ pattern.call(event_name)
1747
+ end
1748
+ end
1749
+ end
1750
+
1751
+ class FilterContext
1752
+ attr_reader :event_data, :metadata
1753
+
1754
+ def initialize(event_data, metadata)
1755
+ @event_data = event_data
1756
+ @metadata = metadata
1757
+ end
1758
+
1759
+ def event_name
1760
+ @event_data[:event_name]
1761
+ end
1762
+
1763
+ def payload
1764
+ @event_data[:payload]
1765
+ end
1766
+
1767
+ def error
1768
+ @metadata[:error]
1769
+ end
1770
+
1771
+ def adapter
1772
+ @metadata[:adapter]
1773
+ end
1774
+
1775
+ def retry_count
1776
+ @metadata[:retry_count] || 0
1777
+ end
1778
+ end
1779
+ end
1780
+ end
1781
+ end
1782
+ end
1783
+ ```
1784
+
1785
+ ### 4.5. DLQ Configuration
1786
+
1787
+ ```ruby
1788
+ # config/initializers/e11y.rb
1789
+ E11y.configure do |config|
1790
+ config.error_handling.dead_letter_queue do
1791
+ # Storage backend
1792
+ storage :file # or :redis
1793
+
1794
+ # File storage options
1795
+ file_storage do
1796
+ directory Rails.root.join('tmp', 'e11y', 'dlq')
1797
+ max_file_size 10.megabytes
1798
+ retention_days 30
1799
+ end
1800
+
1801
+ # Redis storage options (alternative)
1802
+ redis_storage do
1803
+ redis_url ENV['REDIS_URL']
1804
+ retention_days 7 # Shorter for Redis (expensive)
1805
+ end
1806
+
1807
+ # DLQ Filter: which events to save
1808
+ filter do
1809
+ # Always save critical events
1810
+ always_save_patterns [
1811
+ /^payment\./,
1812
+ /^order\./,
1813
+ /^audit\./,
1814
+ 'Events::CriticalEvent'
1815
+ ]
1816
+
1817
+ # Never save health checks or metrics
1818
+ never_save_patterns [
1819
+ /^health_check\./,
1820
+ /^metrics\./,
1821
+ 'Events::DebugEvent'
1822
+ ]
1823
+
1824
+ # Custom logic
1825
+ save_if do |context|
1826
+ # Save if payment amount > $100
1827
+ if context.event_name.include?('payment')
1828
+ (context.payload[:amount] || 0) > 100
1829
+ else
1830
+ # Save if > 2 retries (indicates persistent issue)
1831
+ context.retry_count > 2
1832
+ end
1833
+ end
1834
+ end
1835
+
1836
+ # Auto-cleanup old entries
1837
+ auto_cleanup do
1838
+ enabled true
1839
+ schedule '0 2 * * *' # Daily at 2 AM
1840
+ end
1841
+ end
1842
+ end
1843
+ ```
1844
+
1845
+ ### 4.6. Rate Limiting × DLQ Filter Interaction (C02 Resolution)
1846
+
1847
+ > **⚠️ CRITICAL: C02 Conflict Resolution - Rate Limiting Respects DLQ Filter**
1848
+ > **See:** [CONFLICT-ANALYSIS.md C02](researches/CONFLICT-ANALYSIS.md#c02-rate-limiting--dlq-filter-uc-021) for detailed analysis
1849
+ > **Problem:** Rate-limited critical events dropped despite `always_save` DLQ filter
1850
+ > **Solution:** Rate limiter checks DLQ filter before dropping, critical events bypass to DLQ
1851
+
1852
+ #### 4.6.1. The Problem: Critical Events Silently Dropped
1853
+
1854
+ **When rate limiting interacts with DLQ filter:**
1855
+
1856
+ ```ruby
1857
+ # Configuration:
1858
+ config.rate_limiting do
1859
+ enabled true
1860
+ limit 1000 # events/sec
1861
+ end
1862
+
1863
+ config.error_handling.dead_letter_queue.filter do
1864
+ always_save_patterns [/^payment\./] # ← Always save payment events!
1865
+ end
1866
+
1867
+ # Scenario: Traffic spike (1500 events/sec)
1868
+ 1500.times do |i|
1869
+ Events::PaymentFailed.track(
1870
+ order_id: i,
1871
+ amount: 100,
1872
+ error: 'card_declined'
1873
+ )
1874
+ end
1875
+
1876
+ # WITHOUT C02 resolution:
1877
+ # - First 1000 events: PROCESSED ✅
1878
+ # - Next 500 events: RATE LIMITED → DROPPED ❌
1879
+ # Problem: Payment events DROPPED despite `always_save` filter!
1880
+ # DLQ filter never sees rate-limited events (rate limiting happens BEFORE DLQ)
1881
+ ```
1882
+
1883
+ **Architectural Conflict:**
1884
+ - ✅ **Rate limiting needed** for adapter protection
1885
+ - ✅ **DLQ filter needed** for critical event preservation
1886
+ - ❌ **Pipeline order:** Rate limiting (step 3) → DLQ (step 7, after adapter failure)
1887
+ - ⚠️ **Rate-limited events never reach DLQ filter!**
1888
+
1889
+ #### 4.6.2. Decision: Rate Limiter Respects DLQ Filter
1890
+
1891
+ **Approved Solution:**
1892
+ Rate limiter must **check DLQ filter** before dropping events. Critical events go to DLQ instead of being dropped.
1893
+
1894
+ ```ruby
1895
+ # lib/e11y/middleware/rate_limiter.rb
1896
+ module E11y
1897
+ module Middleware
1898
+ class RateLimiter < Base
1899
+ def call(event, next_middleware)
1900
+ # Check rate limit
1901
+ unless @limiter.allow?(event)
1902
+ # Rate limit exceeded!
1903
+ E11y::Metrics.increment('e11y.rate_limiter.limited', {
1904
+ event_type: event.type
1905
+ })
1906
+
1907
+ # NEW: Check DLQ filter BEFORE dropping
1908
+ if should_save_to_dlq?(event)
1909
+ # Critical event → Send to DLQ (don't drop!)
1910
+ E11y::Reliability::DLQ.save(
1911
+ event,
1912
+ reason: 'rate_limited',
1913
+ metadata: {
1914
+ rate_limit: @config.limit,
1915
+ exceeded_at: Time.current.iso8601
1916
+ }
1917
+ )
1918
+
1919
+ E11y.logger.warn(
1920
+ "Rate-limited critical event saved to DLQ",
1921
+ event_type: event.type,
1922
+ rate_limit: @config.limit
1923
+ )
1924
+
1925
+ E11y::Metrics.increment('e11y.rate_limiter.dlq_saved', {
1926
+ event_type: event.type
1927
+ })
1928
+
1929
+ # Do NOT pass the event further down the pipeline (rate limited); it has already been saved to DLQ
1930
+ return nil
1931
+ else
1932
+ # Non-critical event → Drop (recorded only via debug log and the `dropped` metric)
1933
+ E11y.logger.debug(
1934
+ "Rate-limited event dropped",
1935
+ event_type: event.type
1936
+ )
1937
+
1938
+ E11y::Metrics.increment('e11y.rate_limiter.dropped', {
1939
+ event_type: event.type
1940
+ })
1941
+
1942
+ # Event dropped (not saved)
1943
+ return nil
1944
+ end
1945
+ end
1946
+
1947
+ # Rate limit OK → continue pipeline
1948
+ next_middleware.call(event)
1949
+ end
1950
+
1951
+ private
1952
+
1953
+ def should_save_to_dlq?(event)
1954
+ # Check DLQ filter (if enabled)
1955
+ return false unless E11y.config.error_handling.dead_letter_queue.enabled
1956
+
1957
+ # Get DLQ filter
1958
+ dlq_filter = E11y::Reliability::DLQ::Filter.new(
1959
+ E11y.config.error_handling.dead_letter_queue.filter_config
1960
+ )
1961
+
1962
+ # Ask filter: Should this event be saved?
1963
+ dlq_filter.should_save?(
1964
+ event.to_h,
1965
+ { reason: 'rate_limited' }
1966
+ )
1967
+ end
1968
+ end
1969
+ end
1970
+ end
1971
+ ```
1972
+
1973
+ #### 4.6.3. Configuration: Bypass Rate Limiting for Critical Events
1974
+
1975
+ **Alternative approach: Bypass rate limiting entirely for critical events:**
1976
+
1977
+ ```ruby
1978
+ # config/initializers/e11y.rb
1979
+ E11y.configure do |config|
1980
+ config.rate_limiting do
1981
+ enabled true
1982
+ limit 1000 # events/sec
1983
+
1984
+ # NEW: Bypass rate limiting for critical patterns (C02 resolution)
1985
+ bypass_for do
1986
+ # Bypass by severity
1987
+ severity [:error, :fatal]
1988
+
1989
+ # Bypass by event pattern
1990
+ event_patterns [
1991
+ /^payment\./, # All payment events
1992
+ /^audit\./, # All audit events
1993
+ /^order\.failed/ # Specific critical events
1994
+ ]
1995
+
1996
+ # Bypass by custom logic
1997
+ custom_check do |event|
1998
+ # Example: High-value payments
1999
+ event.type.start_with?('payment.') &&
2000
+ event.payload[:amount].to_i > 10_000
2001
+ end
2002
+ end
2003
+ end
2004
+
2005
+ config.error_handling.dead_letter_queue do
2006
+ enabled true
2007
+
2008
+ filter do
2009
+ # Always save patterns (aligned with bypass_for above)
2010
+ always_save_patterns [/^payment\./, /^audit\./]
2011
+ end
2012
+ end
2013
+ end
2014
+ ```
2015
+
2016
+ #### 4.6.4. Rate-Limited Payment Event Example
2017
+
2018
+ **Concrete example showing critical event preservation:**
2019
+
2020
+ ```ruby
2021
+ # Scenario: Payment service under DDoS attack
2022
+ # Normal rate: 100 payments/sec
2023
+ # Attack rate: 5000 req/sec (mixed: 1000 payments + 4000 junk)
2024
+
2025
+ # Rate limit: 1000 events/sec total
2026
+ config.rate_limiting.limit = 1000
2027
+
2028
+ # Events arriving:
2029
+ # - 1000 legitimate payment events
2030
+ # - 4000 junk events (bots, scrapers)
2031
+
2032
+ # Processing (WITH C02 resolution):
2033
+
2034
+ # Batch 1 (first 1000 events in a given second, within the rate limit):
2035
+ # - 600 legitimate payments → PROCESSED ✅
2036
+ # - 400 junk events → PROCESSED ✅ (consumed rate limit)
2037
+
2038
+ # Batch 2 (remaining 4000 events in that same second, over the rate limit):
2039
+ # - 400 legitimate payments → RATE LIMITED
2040
+ # → Check DLQ filter → Match `/^payment\./` pattern
2041
+ # → SAVED TO DLQ ✅ (not dropped!)
2042
+ # - 3600 junk events → RATE LIMITED → DROPPED ❌ (not critical)
2043
+
2044
+ # Result:
2045
+ # - Legitimate payments: 1000 total
2046
+ # → 600 processed immediately
2047
+ # → 400 saved to DLQ (can replay later)
2048
+ # → ZERO payment data loss! ✅
2049
+ # - Junk events: 4000 total
2050
+ # → 400 processed (wasted capacity)
2051
+ # → 3600 dropped (correct behavior)
2052
+
2053
+ # WITHOUT C02 resolution:
2054
+ # - 400 legitimate payments LOST! ❌
2055
+ # - No forensics, no replay, compliance violation
2056
+ ```
2057
+
2058
+ #### 4.6.5. Trade-offs: Bypass vs DLQ
2059
+
2060
+ | Approach | Bypass Rate Limiting (§4.6.3) | Save to DLQ (§4.6.2) |
2061
+ |----------|-------------------------------|----------------------|
2062
+ | **Adapter Protection** | ⚠️ Partial (critical events can overload) | ✅ Full (all events rate-limited) |
2063
+ | **Critical Event Loss** | ✅ Zero (always processed) | ✅ Zero (saved to DLQ) |
2064
+ | **Attack Surface** | ❌ High (attacker can fake critical events) | ✅ Low (all events limited) |
2065
+ | **Latency** | ✅ Immediate (no delay) | ⚠️ Delayed (replay from DLQ) |
2066
+ | **Complexity** | ⚠️ Higher (bypass logic) | ⚠️ Higher (DLQ check) |
2067
+ | **Cost** | ⚠️ Higher (more events processed) | ✅ Lower (fewer events processed) |
2068
+
2069
+ **Recommendation:**
2070
+ Use **Save to DLQ** (§4.6.2) as default. Reserve **Bypass** (§4.6.3) for:
2071
+ - Audit events (compliance requirement - MUST process immediately)
2072
+ - High-severity alerts (operational safety)
2073
+ - Events with authentication (can't be faked by attackers)
2074
+
2075
+ **Warning about Bypass:**
2076
+ ```ruby
2077
+ # ⚠️ SECURITY RISK: Bypass can be exploited!
2078
+ config.rate_limiting.bypass_for.event_patterns [/^payment\./]
2079
+
2080
+ # Attack scenario:
2081
+ # Attacker sends 10,000 fake payment events whose names match the bypass pattern:
2082
+ Events::PaymentProcessed.track(amount: 1) # ← Matches bypass pattern!
2083
+ # → ALL events bypass rate limiting
2084
+ # → Adapter OVERLOADED despite rate limiter enabled
2085
+ # → DDoS successful! ❌
2086
+
2087
+ # Solution: Require authentication for bypass
2088
+ config.rate_limiting.bypass_for.custom_check do |event|
2089
+ event.type.start_with?('payment.') &&
2090
+ event.metadata[:authenticated] == true # ← Verify authenticity
2091
+ end
2092
+ ```
2093
+
2094
+ #### 4.6.6. Monitoring Metrics
2095
+
2096
+ **Key metrics for rate limiting × DLQ interaction:**
2097
+
2098
+ ```ruby
2099
+ # Rate limiter saved to DLQ (instead of dropped)
2100
+ e11y.rate_limiter.dlq_saved (counter)
2101
+ # Critical event rate-limited but saved to DLQ
2102
+ labels: event_type
2103
+
2104
+ # Rate limiter dropped events
2105
+ e11y.rate_limiter.dropped (counter)
2106
+ # Non-critical event rate-limited and dropped
2107
+ labels: event_type
2108
+
2109
+ # DLQ entries by reason
2110
+ e11y.dlq.entries_by_reason (counter)
2111
+ # Track why events went to DLQ
2112
+ labels: reason # 'rate_limited', 'retry_exhausted', 'circuit_breaker_open'
2113
+ ```
2114
+
2115
+ **Alert rules:**
2116
+
2117
+ ```yaml
2118
+ # Alert: Critical events being rate-limited
2119
+ - alert: E11yCriticalEventsRateLimited
2120
+ expr: |
2121
+ rate(e11y_rate_limiter_dlq_saved_total[5m]) > 10
2122
+ for: 2m
2123
+ annotations:
2124
+ summary: "E11y rate limiting critical events"
2125
+ description: "{{ $value }} critical events/sec rate-limited (saved to DLQ)"
2126
+
2127
+ # Alert: High DLQ entries from rate limiting
2128
+ - alert: E11yDlqRateLimitedHigh
2129
+ expr: |
2130
+ rate(e11y_dlq_entries_by_reason_total{reason="rate_limited"}[5m]) > 50
2131
+ for: 5m
2132
+ annotations:
2133
+ summary: "E11y DLQ filling from rate-limited events"
2134
+ description: "{{ $value }} events/sec going to DLQ due to rate limiting"
2135
+ ```
2136
+
2137
+ #### 4.6.7. Trade-offs (C02 Resolution)
2138
+
2139
+ | Aspect | With DLQ Filter Check | Without DLQ Filter Check |
2140
+ |--------|------------------------|---------------------------|
2141
+ | **Critical Event Loss** | ✅ Zero (saved to DLQ) | ❌ High (dropped silently) |
2142
+ | **DLQ Filter Guarantee** | ✅ Respected (`always_save` works) | ❌ Violated (filter never sees events) |
2143
+ | **Adapter Protection** | ✅ Full (rate limiting works) | ✅ Full (rate limiting works) |
2144
+ | **Complexity** | ⚠️ Higher (filter check in rate limiter) | ✅ Lower (simple drop) |
2145
+ | **Performance** | ⚠️ Slower (filter evaluation) | ✅ Faster (immediate drop) |
2146
+ | **Forensics** | ✅ Available (DLQ replay) | ❌ Lost (no record) |
2147
+ | **Compliance** | ✅ Meets (audit trail preserved) | ❌ Fails (data loss) |
2148
+
2149
+ **Why DLQ Filter Check is Default:**
2150
+ 1. ✅ **Zero critical data loss** - Payment/audit events never lost
2151
+ 2. ✅ **DLQ filter works as documented** - `always_save` guarantee respected
2152
+ 3. ✅ **Forensics preserved** - Can replay rate-limited events when load drops
2153
+ 4. ✅ **Compliance met** - Audit trail complete (no silent drops)
2154
+ 5. ⚠️ **Trade-off: Complexity** - But SAFETY > simplicity for critical events
2155
+
2156
+ **Related Conflicts and Use Cases:**
2157
+ - **C06:** Retry rate limiting (see §3.5 above)
2158
+ - **C18:** Non-failing job tracking (see §3.6 above)
2159
+ - **UC-011:** Rate Limiting
2160
+ - **UC-021:** DLQ & Replay
2161
+
2162
+ ---
2163
+
2164
+ ## 5. Circuit Breaker
2165
+
2166
+ ### 5.1. Circuit Breaker Implementation
2167
+
2168
+ ```ruby
2169
+ # lib/e11y/reliability/circuit_breaker.rb
2170
+ module E11y
2171
+ module Reliability
2172
+ class CircuitBreaker
2173
+ STATE_CLOSED = :closed
2174
+ STATE_OPEN = :open
2175
+ STATE_HALF_OPEN = :half_open
2176
+
2177
+ def initialize(adapter_name, config)
2178
+ @adapter_name = adapter_name
2179
+ @threshold = config.failure_threshold
2180
+ @timeout = config.timeout_seconds
2181
+ @half_open_attempts = config.half_open_attempts
2182
+
2183
+ @state = STATE_CLOSED
2184
+ @failure_count = 0
2185
+ @success_count = 0
2186
+ @last_failure_time = nil
2187
+ @opened_at = nil
2188
+ @mutex = Mutex.new
2189
+ end
2190
+
2191
+ def call
2192
+ check_state_transition
2193
+
2194
+ case @state
2195
+ when STATE_CLOSED
2196
+ execute_with_closed_circuit { yield }
2197
+ when STATE_OPEN
2198
+ handle_open_circuit
2199
+ when STATE_HALF_OPEN
2200
+ execute_with_half_open_circuit { yield }
2201
+ end
2202
+ end
2203
+
2204
+ def healthy?
2205
+ @state == STATE_CLOSED
2206
+ end
2207
+
2208
+ def stats
2209
+ {
2210
+ adapter: @adapter_name,
2211
+ state: @state,
2212
+ failure_count: @failure_count,
2213
+ success_count: @success_count,
2214
+ last_failure: @last_failure_time,
2215
+ opened_at: @opened_at
2216
+ }
2217
+ end
2218
+
2219
+ private
2220
+
2221
+ def execute_with_closed_circuit
2222
+ begin
2223
+ result = yield
2224
+ on_success
2225
+ result
2226
+ rescue => error
2227
+ on_failure(error)
2228
+ raise
2229
+ end
2230
+ end
2231
+
2232
+ def execute_with_half_open_circuit
2233
+ begin
2234
+ result = yield
2235
+ on_half_open_success
2236
+ result
2237
+ rescue => error
2238
+ on_half_open_failure(error)
2239
+ raise
2240
+ end
2241
+ end
2242
+
2243
+ def handle_open_circuit
2244
+ E11y::Metrics.increment('e11y.circuit_breaker.rejected', {
2245
+ adapter: @adapter_name
2246
+ })
2247
+
2248
+ raise CircuitOpenError.new(@adapter_name, @opened_at)
2249
+ end
2250
+
2251
+ def on_success
2252
+ @mutex.synchronize do
2253
+ @failure_count = 0
2254
+ @success_count += 1
2255
+ @last_failure_time = nil
2256
+ end
2257
+ end
2258
+
2259
+ def on_failure(error)
2260
+ @mutex.synchronize do
2261
+ @failure_count += 1
2262
+ @last_failure_time = Time.now
2263
+
2264
+ if @failure_count >= @threshold
2265
+ transition_to_open
2266
+ end
2267
+
2268
+ E11y::Metrics.increment('e11y.circuit_breaker.failure', {
2269
+ adapter: @adapter_name,
2270
+ count: @failure_count
2271
+ })
2272
+ end
2273
+ end
2274
+
2275
+ def on_half_open_success
2276
+ @mutex.synchronize do
2277
+ @success_count += 1
2278
+
2279
+ if @success_count >= @half_open_attempts
2280
+ transition_to_closed
2281
+ end
2282
+ end
2283
+ end
2284
+
2285
+ def on_half_open_failure(error)
2286
+ @mutex.synchronize do
2287
+ transition_to_open
2288
+ end
2289
+ end
2290
+
2291
+ def transition_to_open
2292
+ @state = STATE_OPEN
2293
+ @opened_at = Time.now
2294
+ @failure_count = 0
2295
+ @success_count = 0
2296
+
2297
+ E11y::Metrics.gauge('e11y.circuit_breaker.state', 2, {
2298
+ adapter: @adapter_name,
2299
+ state: 'open'
2300
+ })
2301
+
2302
+ # Log warning
2303
+ E11y.logger.warn(
2304
+ "Circuit breaker OPENED for adapter: #{@adapter_name}"
2305
+ )
2306
+ end
2307
+
2308
+ def transition_to_half_open
2309
+ @state = STATE_HALF_OPEN
2310
+ @success_count = 0
2311
+
2312
+ E11y::Metrics.gauge('e11y.circuit_breaker.state', 1, {
2313
+ adapter: @adapter_name,
2314
+ state: 'half_open'
2315
+ })
2316
+
2317
+ E11y.logger.info(
2318
+ "Circuit breaker HALF-OPEN for adapter: #{@adapter_name}"
2319
+ )
2320
+ end
2321
+
2322
+ def transition_to_closed
2323
+ @state = STATE_CLOSED
2324
+ @opened_at = nil
2325
+ @failure_count = 0
2326
+ @success_count = 0
2327
+
2328
+ E11y::Metrics.gauge('e11y.circuit_breaker.state', 0, {
2329
+ adapter: @adapter_name,
2330
+ state: 'closed'
2331
+ })
2332
+
2333
+ E11y.logger.info(
2334
+ "Circuit breaker CLOSED for adapter: #{@adapter_name}"
2335
+ )
2336
+ end
2337
+
2338
+ def check_state_transition
2339
+ return unless @state == STATE_OPEN
2340
+
2341
+ @mutex.synchronize do
2342
+ if Time.now - @opened_at >= @timeout
2343
+ transition_to_half_open
2344
+ end
2345
+ end
2346
+ end
2347
+
2348
+ class CircuitOpenError < StandardError
2349
+ attr_reader :adapter_name, :opened_at
2350
+
2351
+ def initialize(adapter_name, opened_at)
2352
+ @adapter_name = adapter_name
2353
+ @opened_at = opened_at
2354
+
2355
+ super("Circuit breaker is OPEN for adapter '#{adapter_name}' (opened at #{opened_at})")
2356
+ end
2357
+ end
2358
+ end
2359
+ end
2360
+ end
2361
+ ```
2362
+
2363
+ ### 5.2. Circuit Breaker Configuration
2364
+
2365
+ ```ruby
2366
+ # config/initializers/e11y.rb
2367
+ E11y.configure do |config|
2368
+ config.error_handling.circuit_breaker do
2369
+ # Enable circuit breaker
2370
+ enabled true
2371
+
2372
+ # Failure threshold (consecutive failures to trip)
2373
+ failure_threshold 5
2374
+
2375
+ # Timeout before attempting recovery (seconds)
2376
+ timeout_seconds 60
2377
+
2378
+ # Successful attempts in HALF_OPEN before closing
2379
+ half_open_attempts 3
2380
+
2381
+ # Per-adapter overrides
2382
+ adapter_overrides do
2383
+ adapter :loki do
2384
+ failure_threshold 10 # More tolerant for Loki
2385
+ timeout_seconds 120 # Longer recovery time
2386
+ end
2387
+
2388
+ adapter :sentry do
2389
+ failure_threshold 3 # Less tolerant for Sentry
2390
+ timeout_seconds 30 # Faster recovery
2391
+ end
2392
+ end
2393
+ end
2394
+ end
2395
+ ```
2396
+
2397
+ ### 5.3. Circuit Breaker State Diagram
2398
+
2399
+ ```mermaid
2400
+ stateDiagram-v2
2401
+ [*] --> CLOSED
2402
+
2403
+ CLOSED --> OPEN : Failures >= threshold
2404
+ OPEN --> HALF_OPEN : Timeout elapsed
2405
+ HALF_OPEN --> CLOSED : Success count >= attempts
2406
+ HALF_OPEN --> OPEN : Any failure
2407
+
2408
+ note right of CLOSED
2409
+ Requests: Allowed
2410
+ Action: Execute normally
2411
+ end note
2412
+
2413
+ note right of OPEN
2414
+ Requests: Fast-fail
2415
+ Action: Save to DLQ
2416
+ end note
2417
+
2418
+ note right of HALF_OPEN
2419
+ Requests: Limited
2420
+ Action: Health check
2421
+ end note
2422
+ ```
2423
+
2424
+ ---
2425
+
2426
+ ## 6. Graceful Degradation
2427
+
2428
+ ### 6.1. Partial Delivery Strategy
2429
+
2430
+ **Design Decision:** If one adapter fails, others should still succeed.
2431
+
2432
+ ```ruby
2433
+ # lib/e11y/pipeline/dispatcher.rb (extended)
2434
+ module E11y
2435
+ module Pipeline
2436
+ class Dispatcher
2437
+ def dispatch_to_adapters(event_data, adapters)
2438
+ results = {}
2439
+ errors = {}
2440
+
2441
+ # Dispatch to all adapters in parallel (Thread pool)
2442
+ futures = adapters.map do |adapter|
2443
+ Concurrent::Future.execute do
2444
+ begin
2445
+ circuit_breaker = CircuitBreaker.for(adapter.name)
2446
+
2447
+ circuit_breaker.call do
2448
+ retry_handler.with_retry(adapter, event_data) do
2449
+ adapter.send(event_data)
2450
+ end
2451
+ end
2452
+
2453
+ [adapter.name, :success]
2454
+ rescue CircuitBreaker::CircuitOpenError => e
2455
+ # Circuit open → fast fail to DLQ
2456
+ [adapter.name, :circuit_open]
2457
+ rescue RetryHandler::RetryExhausted => e
2458
+ # Retries exhausted → save to DLQ
2459
+ [adapter.name, :retry_exhausted]
2460
+ rescue => e
2461
+ # Unexpected error → save to DLQ
2462
+ [adapter.name, :error, e]
2463
+ end
2464
+ end
2465
+ end
2466
+
2467
+ # Wait for all futures (with timeout)
2468
+ futures.each_with_index do |future, index|
2469
+ adapter_name, status, error = future.value!(5.seconds) rescue [:timeout]
2470
+
2471
+ case status
2472
+ when :success
2473
+ results[adapter_name] = :ok
2474
+ when :circuit_open, :retry_exhausted, :error, :timeout
2475
+ errors[adapter_name] = error || status
2476
+
2477
+ # Save to DLQ if filter allows
2478
+ save_to_dlq_if_allowed(event_data, adapter_name, error)
2479
+ end
2480
+ end
2481
+
2482
+ # Track metrics
2483
+ E11y::Metrics.histogram('e11y.dispatch.success_rate',
2484
+ results.size.to_f / adapters.size,
2485
+ { total_adapters: adapters.size }
2486
+ )
2487
+
2488
+ # Partial success is still considered success
2489
+ # (Graceful degradation)
2490
+ {
2491
+ success: results.size > 0,
2492
+ results: results,
2493
+ errors: errors
2494
+ }
2495
+ end
2496
+
2497
+ private
2498
+
2499
+ def save_to_dlq_if_allowed(event_data, adapter_name, error)
2500
+ metadata = {
2501
+ adapter: adapter_name,
2502
+ error: error,
2503
+ retry_count: 3, # Assume max retries
2504
+ timestamp: Time.now.utc
2505
+ }
2506
+
2507
+ if dlq_filter.should_save?(event_data, metadata)
2508
+ dlq_storage.save(event_data, metadata)
2509
+ else
2510
+ E11y::Metrics.increment('e11y.dlq.filtered_out', {
2511
+ event_name: event_data[:event_name],
2512
+ adapter: adapter_name
2513
+ })
2514
+ end
2515
+ end
2516
+ end
2517
+ end
2518
+ end
2519
+ ```
2520
+
2521
+ ---
2522
+
2523
+ ## 7. Self-Healing
2524
+
2525
+ ### 7.1. Background Health Checker
2526
+
2527
+ ```ruby
2528
+ # lib/e11y/reliability/health_checker.rb
2529
+ module E11y
2530
+ module Reliability
2531
+ class HealthChecker
2532
+ def initialize(config)
2533
+ @interval = config.check_interval_seconds
2534
+ @thread = nil
2535
+ @running = false
2536
+ end
2537
+
2538
+ def start!
2539
+ return if @running
2540
+
2541
+ @running = true
2542
+ @thread = Thread.new { run_health_checks }
2543
+ end
2544
+
2545
+ def stop!
2546
+ @running = false
2547
+ @thread&.join(5.seconds)
2548
+ end
2549
+
2550
+ private
2551
+
2552
+ def run_health_checks
2553
+ loop do
2554
+ break unless @running
2555
+
2556
+ check_all_adapters
2557
+ sleep(@interval)
2558
+ end
2559
+ rescue => error
2560
+ E11y.logger.error("Health checker error: #{error.message}")
2561
+ retry
2562
+ end
2563
+
2564
+ def check_all_adapters
2565
+ E11y::Adapters::Registry.all.each do |adapter|
2566
+ circuit_breaker = CircuitBreaker.for(adapter.name)
2567
+
2568
+ # Only check adapters with open/half-open circuits
2569
+ next if circuit_breaker.healthy?
2570
+
2571
+ begin
2572
+ adapter.health_check
2573
+
2574
+ E11y::Metrics.increment('e11y.health_check.success', {
2575
+ adapter: adapter.name
2576
+ })
2577
+ rescue => error
2578
+ E11y::Metrics.increment('e11y.health_check.failure', {
2579
+ adapter: adapter.name,
2580
+ error: error.class.name
2581
+ })
2582
+ end
2583
+ end
2584
+ end
2585
+ end
2586
+ end
2587
+ end
2588
+ ```
2589
+
2590
+ ### 7.2. Automatic DLQ Replay (Optional)
2591
+
2592
+ ```ruby
2593
+ # lib/e11y/reliability/auto_replayer.rb
2594
+ module E11y
2595
+ module Reliability
2596
+ class AutoReplayer
2597
+ def initialize(config)
2598
+ @enabled = config.enabled
2599
+ @schedule = config.schedule # Cron expression
2600
+ @batch_size = config.batch_size
2601
+ @max_age = config.max_age_hours
2602
+ end
2603
+
2604
+ def replay_old_events
2605
+ return unless @enabled
2606
+
2607
+ cutoff_time = @max_age.hours.ago
2608
+
2609
+ events = DLQ.list(
2610
+ limit: @batch_size,
2611
+ filters: { after: cutoff_time }
2612
+ )
2613
+
2614
+ success_count = 0
2615
+ failure_count = 0
2616
+
2617
+ events.each do |event|
2618
+ if DLQ.replay(event[:id])
2619
+ success_count += 1
2620
+ else
2621
+ failure_count += 1
2622
+ end
2623
+ end
2624
+
2625
+ E11y::Metrics.histogram('e11y.dlq.auto_replay.success_rate',
2626
+ success_count.to_f / (success_count + failure_count),
2627
+ { batch_size: events.size }
2628
+ )
2629
+
2630
+ {
2631
+ total: events.size,
2632
+ success: success_count,
2633
+ failure: failure_count
2634
+ }
2635
+ end
2636
+ end
2637
+ end
2638
+ end
2639
+ ```
2640
+
2641
+ ---
2642
+
2643
+ ## 8. Monitoring & Alerting
2644
+
2645
+ ### 8.1. Key Metrics
2646
+
2647
+ ```ruby
2648
+ # Self-monitoring metrics for reliability
2649
+ E11y::Metrics.define do
2650
+ # Retry metrics
2651
+ counter 'e11y.retry.attempt', 'Retry attempt', [:adapter, :attempt, :delay_ms]
2652
+ counter 'e11y.retry.success', 'Retry succeeded', [:adapter, :attempt]
2653
+ counter 'e11y.retry.exhausted', 'Retry exhausted', [:adapter, :attempts]
2654
+ counter 'e11y.retry.permanent_failure', 'Permanent failure', [:adapter, :error_class]
2655
+
2656
+ # Circuit breaker metrics
2657
+ gauge 'e11y.circuit_breaker.state', 'Circuit state (0=closed, 1=half_open, 2=open)', [:adapter]
2658
+ counter 'e11y.circuit_breaker.opened', 'Circuit opened', [:adapter]
2659
+ counter 'e11y.circuit_breaker.closed', 'Circuit closed', [:adapter]
2660
+ counter 'e11y.circuit_breaker.rejected', 'Requests rejected (circuit open)', [:adapter]
2661
+
2662
+ # DLQ metrics
2663
+ counter 'e11y.dlq.saved', 'Events saved to DLQ', [:event_name, :adapter]
2664
+ counter 'e11y.dlq.replayed', 'Events replayed from DLQ', [:event_name]
2665
+ counter 'e11y.dlq.replay_failed', 'DLQ replay failed', [:event_name, :error]
2666
+ counter 'e11y.dlq.filtered_out', 'Events filtered out (not saved)', [:event_name, :adapter]
2667
+ gauge 'e11y.dlq.size', 'Total events in DLQ', []
2668
+
2669
+ # Health check metrics
2670
+ counter 'e11y.health_check.success', 'Health check succeeded', [:adapter]
2671
+ counter 'e11y.health_check.failure', 'Health check failed', [:adapter, :error]
2672
+
2673
+ # Dispatch metrics
2674
+ histogram 'e11y.dispatch.success_rate', 'Adapter success rate', [:total_adapters]
2675
+ end
2676
+ ```
2677
+
2678
+ ### 8.2. Alert Rules (Prometheus/Grafana)
2679
+
2680
+ ```yaml
2681
+ # Alert when circuit breaker opens
2682
+ - alert: E11yCircuitBreakerOpen
2683
+ expr: e11y_circuit_breaker_state == 2
2684
+ for: 1m
2685
+ labels:
2686
+ severity: warning
2687
+ annotations:
2688
+ summary: "E11y circuit breaker open for {{ $labels.adapter }}"
2689
+
2690
+ # Alert when DLQ grows too large
2691
+ - alert: E11yDLQSizeHigh
2692
+ expr: e11y_dlq_size > 10000
2693
+ for: 5m
2694
+ labels:
2695
+ severity: critical
2696
+ annotations:
2697
+ summary: "E11y DLQ has {{ $value }} events (threshold: 10000)"
2698
+
2699
+ # Alert when retry rate is high
2700
+ - alert: E11yHighRetryRate
2701
+ expr: rate(e11y_retry_attempt_total[5m]) > 100
2702
+ for: 2m
2703
+ labels:
2704
+ severity: warning
2705
+ annotations:
2706
+ summary: "E11y retry rate is {{ $value }}/s (threshold: 100/s)"
2707
+ ```
2708
+
2709
+ ---
2710
+
2711
+ ## 9. Trade-offs
2712
+
2713
+ ### 9.1. Key Decisions
2714
+
2715
+ | Decision | Pro | Con | Rationale |
2716
+ |----------|-----|-----|-----------|
2717
+ | **Exponential backoff** | Adaptive recovery | Longer delays | Industry best practice |
2718
+ | **Jitter** | Avoid thundering herd | Complexity | Prevent simultaneous retries |
2719
+ | **Per-adapter circuit breaker** | Isolation | Memory overhead | Independent failure domains |
2720
+ | **File-based DLQ (default)** | No dependencies | Slower | Simple, reliable |
2721
+ | **Redis DLQ (optional)** | Faster | Requires Redis | For high-volume |
2722
+ | **DLQ filter** | Cost control | Event loss risk | Critical events prioritized |
2723
+ | **Graceful degradation** | Partial success | Complexity | Availability > consistency |
2724
+ | **Retries count toward rate limit** | Prevent abuse | May discard events | DLQ safety net |
2725
+ | **Retry rate limiting (C06)** ⚠️ | Prevents cascade failures | Slower recovery | Protects recovering adapters |
2726
+ | **Non-failing job tracking (C18)** ⚠️ | Business logic never blocked | Silent failures | Observability < business logic |
2727
+ | **Rate limiter checks DLQ (C02)** ⚠️ | Zero critical data loss | Higher complexity | `always_save` guarantee respected |
2728
+
2729
+ ### 9.2. Alternatives Considered
2730
+
2731
+ **A) No retry policy**
2732
+ - ❌ Rejected: Too many transient failures
2733
+
2734
+ **B) Fixed backoff delay**
2735
+ - ❌ Rejected: Not adaptive to failure severity
2736
+
2737
+ **C) Global circuit breaker**
2738
+ - ❌ Rejected: One adapter failure affects all
2739
+
2740
+ **D) Database-backed DLQ**
2741
+ - ❌ Rejected: Added dependency, slower writes
2742
+
2743
+ **E) Immediate DLQ replay**
2744
+ - ❌ Rejected: Could overwhelm recovering adapters
2745
+
2746
+ ---
2747
+
2748
+ **Status:** ✅ Draft Complete
2749
+ **Next:** ADR-011 (Testing Strategy) or ADR-005 (Tracing & Context)
2750
+ **Estimated Implementation:** 3 weeks