e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,956 @@
1
+ # UC-021: Error Handling, Retry Policy & Dead Letter Queue
2
+
3
+ **Status:** Reliability Feature (MVP)
4
+ **Complexity:** Intermediate
5
+ **Setup Time:** 20-30 minutes
6
+ **Target Users:** DevOps, SRE, Platform Engineers
7
+
8
+ ---
9
+
10
+ ## 📋 Overview
11
+
12
+ ### Problem Statement
13
+
14
+ **Current Pain Points:**
15
+
16
+ 1. **Events lost on transient failures**
17
+ - Network timeout → event dropped
18
+ - Elasticsearch temporarily down → no retry
19
+ - Loki 503 error → data loss
20
+
21
+ 2. **No retry mechanism**
22
+ - Single attempt to send event
23
+ - If adapter fails → event lost forever
24
+ - No visibility into failed sends
25
+
26
+ 3. **No dead letter queue**
27
+ - Failed events disappear
28
+ - Can't replay failed events
29
+ - No forensics for why events failed
30
+
31
+ ### E11y Solution
32
+
33
+ **Robust Error Handling Pipeline:**
34
+
35
+ - **Retry Policy:** Exponential backoff with jitter
36
+ - **Dead Letter Queue:** Failed events stored for later analysis/replay
37
+ - **Circuit Breaker:** Prevent cascading failures (already covered in UC-011)
38
+ - **Observability:** Metrics for failures, retries, DLQ size
39
+
40
+ **Result:** Zero data loss, resilient to transient failures.
41
+
42
+ ---
43
+
44
+ ## 🎯 Use Case Scenarios
45
+
46
+ ### Scenario 1: Transient Network Failure
47
+
48
+ **Problem:** Loki temporarily unavailable (30s downtime)
49
+
50
+ **Without retry (DATA LOSS!):**
51
+ ```ruby
52
+ Events::OrderCreated.track(order_id: '123')
53
+ # → Send to Loki
54
+ # → Network timeout (30s)
55
+ # → ❌ Event dropped! No retry!
56
+ ```
57
+
58
+ **With retry (RESILIENT!):**
59
+ ```ruby
60
+ Events::OrderCreated.track(order_id: '123')
61
+ # → Send to Loki
62
+ # → Network timeout (30s)
63
+ # → Retry #1 after 100ms → Still timeout
64
+ # → Retry #2 after 200ms → Still timeout
65
+ # → Retry #3 after 400ms → Success! ✅
66
+ # Event delivered successfully
67
+ ```
68
+
69
+ ---
70
+
71
+ ### Scenario 2: Persistent Failure → Dead Letter Queue
72
+
73
+ **Problem:** Elasticsearch down for maintenance (2 hours)
74
+
75
+ **Without DLQ (DATA LOSS!):**
76
+ ```ruby
77
+ # 1000 events during 2-hour maintenance window
78
+ 1000.times do
79
+ Events::OrderCreated.track(...)
80
+ # → All retries exhausted
81
+ # → ❌ All 1000 events lost!
82
+ end
83
+ ```
84
+
85
+ **With DLQ (NO DATA LOSS!):**
86
+ ```ruby
87
+ # Config:
88
+ E11y.configure do |config|
89
+ config.error_handling.dead_letter_queue do
90
+ enabled true
91
+ adapter :dlq_file # Write to local file
92
+ end
93
+ end
94
+
95
+ # During ES maintenance:
96
+ 1000.times do
97
+ Events::OrderCreated.track(...)
98
+ # → All retries exhausted
99
+ # → ✅ Event written to DLQ!
100
+ end
101
+
102
+ # After ES maintenance:
103
+ # Replay DLQ events
104
+ E11y::DeadLetterQueue.replay_all
105
+ # → All 1000 events successfully sent!
106
+ ```
107
+
108
+ ---
109
+
110
+ ### Scenario 3: Partial Adapter Failure
111
+
112
+ **Problem:** Sentry down, but Loki working
113
+
114
+ ```ruby
115
+ class CriticalError < E11y::Event::Base
116
+ adapters [:loki, :sentry, :file]
117
+ end
118
+
119
+ Events::CriticalError.track(error: 'Something went wrong')
120
+
121
+ # Loki: ✅ Success
122
+ # Sentry: ❌ Timeout (retries exhausted)
123
+ # File: ✅ Success
124
+
125
+ # Result:
126
+ # - Event in Loki ✅
127
+ # - Event in File ✅
128
+ # - Event in DLQ (for Sentry) ✅
129
+ #
130
+ # Later: Replay DLQ → Send to Sentry when it's back up
131
+ ```
132
+
133
+ ---
134
+
135
+ ### Scenario 4: DLQ Filter (Critical vs. Non-Critical Events)
136
+
137
+ **Problem:** DLQ fills with unimportant events (health checks, metrics).
138
+
139
+ **Without DLQ filter (BAD!):**
140
+ ```ruby
141
+ # Health checks fill DLQ
142
+ 1000.times do
143
+ Events::HealthCheck.track(status: 'ok')
144
+ # Loki down → All 1000 health checks in DLQ!
145
+ end
146
+
147
+ # DLQ full of unimportant events 😞
148
+ E11y::DeadLetterQueue.size # => 1000 (mostly garbage)
149
+ ```
150
+
151
+ **With DLQ filter (GOOD!):**
152
+ ```ruby
153
+ # Config:
154
+ E11y.configure do |config|
155
+ config.error_handling.dead_letter_queue.filter do
156
+ # Don't save health checks to DLQ
157
+ never_save do
158
+ event_patterns ['health_check.*', 'ping.*']
159
+ end
160
+
161
+ # Always save payments
162
+ always_save do
163
+ event_patterns ['payment.*', 'order.*']
164
+ end
165
+ end
166
+ end
167
+
168
+ # Health checks (not saved to DLQ):
169
+ 1000.times do
170
+ Events::HealthCheck.track(status: 'ok')
171
+ # Loki down → ❌ Retries exhausted → Dropped (not in DLQ)
172
+ end
173
+
174
+ # Payment (saved to DLQ):
175
+ Events::PaymentFailed.track(order_id: '123', amount: 500)
176
+ # Loki down → ❌ Retries exhausted → ✅ Saved to DLQ!
177
+
178
+ # DLQ only contains critical events
179
+ E11y::DeadLetterQueue.size # => 1 (only payment)
180
+ ```
181
+
182
+ ---
183
+
184
+ ## 🏗️ Architecture
185
+
186
+ > **Implementation:** See [ADR-013: Reliability & Error Handling](../ADR-013-reliability-error-handling.md) for complete error handling architecture, including retry policy with exponential backoff and jitter, circuit breaker pattern, Dead Letter Queue (DLQ) storage strategies, and self-monitoring metrics.
187
+
188
+ ### Retry Pipeline
189
+
190
+ ```
191
+ ┌─────────────────────────────────────────────────────────────────┐
192
+ │ Event Flow with Retry & DLQ                                     │
193
+ │                                                                 │
194
+ │ Event.track(...)                                                │
195
+ │   ↓                                                             │
196
+ │ Main Buffer                                                     │
197
+ │   ↓                                                             │
198
+ │ Flush (every 200ms)                                             │
199
+ │   ↓                                                             │
200
+ │ ┌────────────────────────────────────────┐                      │
201
+ │ │ Try: Send to Adapter                   │                      │
202
+ │ └────────────────────────────────────────┘                      │
203
+ │   ↓                                                             │
204
+ │ Success? ──YES──→ ✅ Done                                       │
205
+ │   │                                                             │
206
+ │   NO (Error)                                                    │
207
+ │   ↓                                                             │
208
+ │ ┌────────────────────────────────────────┐                      │
209
+ │ │ Retry Policy: Exponential Backoff      │                      │
210
+ │ │ - Retry #1 after 100ms                 │                      │
211
+ │ │ - Retry #2 after 200ms (×2)            │                      │
212
+ │ │ - Retry #3 after 400ms (×2)            │                      │
213
+ │ │ - Max 3 retries                        │                      │
214
+ │ └────────────────────────────────────────┘                      │
215
+ │   ↓                                                             │
216
+ │ All retries exhausted?                                          │
217
+ │   ↓                                                             │
218
+ │ ┌────────────────────────────────────────┐                      │
219
+ │ │ Dead Letter Queue (DLQ)                │                      │
220
+ │ │ - Store failed event                   │                      │
221
+ │ │ - Store error details                  │                      │
222
+ │ │ - Store retry attempts                 │                      │
223
+ │ │ - Allow replay later                   │                      │
224
+ │ └────────────────────────────────────────┘                      │
225
+ │   ↓                                                             │
226
+ │ ✅ Event preserved for later replay                             │
227
+ └─────────────────────────────────────────────────────────────────┘
228
+ ```
229
+
230
+ ### Exponential Backoff with Jitter
231
+
232
+ ```
233
+ Retry Delays (with jitter):
234
+
235
+ Retry #1: 100ms + random(0-50ms) = 100-150ms
236
+ Retry #2: 200ms + random(0-100ms) = 200-300ms
237
+ Retry #3: 400ms + random(0-200ms) = 400-600ms
238
+
239
+ Max delay: 5 seconds (configurable)
240
+
241
+ Jitter prevents "thundering herd" problem
242
+ ```
243
+
244
+ ---
245
+
246
+ ## 🔧 Configuration
247
+
248
+ ### Basic Setup
249
+
250
+ ```ruby
251
+ # config/initializers/e11y.rb
252
+ E11y.configure do |config|
253
+ config.error_handling do
254
+ # === Retry Policy ===
255
+ retry_policy do
256
+ enabled true
257
+ max_retries 3
258
+ initial_delay 0.1.seconds # 100ms
259
+ max_delay 5.seconds
260
+ multiplier 2 # Exponential: 100ms, 200ms, 400ms
261
+ jitter true # Add randomness to prevent thundering herd
262
+ end
263
+
264
+ # === Dead Letter Queue ===
265
+ dead_letter_queue do
266
+ enabled true
267
+
268
+ # Where to store failed events
269
+ adapter :dlq_file # Reference to registered adapter
270
+
271
+ # Or use specific DLQ adapter
272
+ # adapter E11y::Adapters::FileAdapter.new(
273
+ # path: Rails.root.join('log', 'e11y_dlq'),
274
+ # rotation: :daily
275
+ # )
276
+
277
+ # Max events in DLQ before alerting
278
+ max_size 10_000
279
+
280
+ # Alert when DLQ grows
281
+ alert_on_size 1000 # Alert at 1000 events
282
+ end
283
+
284
+ # What to do after max_retries exhausted
285
+ on_max_retries_exceeded :send_to_dlq # :send_to_dlq, :drop, :log
286
+
287
+ # Which errors are retryable
288
+ retryable_errors [
289
+ Errno::ETIMEDOUT,
290
+ Errno::ECONNREFUSED,
291
+ Errno::ECONNRESET,
292
+ Net::OpenTimeout,
293
+ Net::ReadTimeout,
294
+ HTTP::TimeoutError
295
+ ]
296
+
297
+ # Which errors are NOT retryable (fail immediately)
298
+ non_retryable_errors [
299
+ E11y::ValidationError, # Schema validation failed
300
+ E11y::RateLimitError # Rate limit exceeded
301
+ ]
302
+ end
303
+ end
304
+ ```
305
+
306
+ ### Advanced Configuration
307
+
308
+ ```ruby
309
+ E11y.configure do |config|
310
+ config.error_handling do
311
+ retry_policy do
312
+ enabled true
313
+ max_retries 5 # More retries for critical systems
314
+
315
+ # Adaptive retry delays
316
+ delays [0.1, 0.2, 0.5, 1, 2] # Custom delays in seconds
317
+
318
+ # Or exponential with custom params
319
+ initial_delay 0.05.seconds
320
+ max_delay 10.seconds
321
+ multiplier 2.5 # Faster exponential growth
322
+ jitter_range 0.5 # ±50% jitter
323
+
324
+ # Per-adapter retry configuration
325
+ per_adapter do
326
+ adapter :loki do
327
+ max_retries 3
328
+ initial_delay 0.1
329
+ end
330
+
331
+ adapter :sentry do
332
+ max_retries 5 # More retries for Sentry
333
+ initial_delay 0.5
334
+ end
335
+ end
336
+
337
+ # Retry predicate (custom logic)
338
+ retry_if do |error, attempt|
339
+ # Custom logic: retry only for specific errors
340
+ error.is_a?(Net::ReadTimeout) && attempt < 5
341
+ end
342
+ end
343
+
344
+ dead_letter_queue do
345
+ enabled true
346
+ adapter :dlq_file
347
+
348
+ # DLQ retention
349
+ retention 7.days # Auto-delete old DLQ events
350
+
351
+ # DLQ partitioning (for large volumes)
352
+ partition_by :adapter # Separate DLQ per adapter
353
+ # log/e11y_dlq/loki/2026-01-12.jsonl
354
+ # log/e11y_dlq/sentry/2026-01-12.jsonl
355
+
356
+ # Compression
357
+ compression :gzip # Compress DLQ files
358
+
359
+ # Metadata
360
+ include_metadata true # Store error details, retry count
361
+
362
+ # ===== DLQ FILTER (Critical!) =====
363
+ # Control which events are saved to DLQ vs. dropped
364
+ filter do
365
+ # Always save critical events to DLQ (never drop!)
366
+ always_save do
367
+ severity [:error, :fatal] # All errors must be preserved
368
+ event_patterns [
369
+ 'payment.*', # Payment events are critical
370
+ 'order.*', # Order events are critical
371
+ 'audit.*', # Audit events must never be lost
372
+ 'security.*', # Security events are critical
373
+ 'fraud.*' # Fraud detection events
374
+ ]
375
+ end
376
+
377
+ # Never save to DLQ (drop after max retries)
378
+ never_save do
379
+ severity [:debug] # Debug events can be dropped
380
+ event_patterns [
381
+ 'metrics.*', # Metrics can be dropped (regenerated)
382
+ 'health_check.*', # Health checks not critical
383
+ 'ping.*' # Ping events not important
384
+ ]
385
+ end
386
+
387
+ # Custom filter function
388
+ save_if do |event|
389
+ # Example: Save high-value payments only
390
+ if event.name.include?('payment') && event.payload[:amount]
391
+ event.payload[:amount] > 100 # Only save payments >$100
392
+ else
393
+ true # Save all other events by default
394
+ end
395
+ end
396
+ end
397
+ end
398
+
399
+ # Fallback chain
400
+ fallback_chain do
401
+ # If primary adapter fails after retries:
402
+ # 1. Try fallback adapter
403
+ # 2. If fallback fails → DLQ
404
+
405
+ adapter :loki do
406
+ fallback :file # Loki fails → write to file
407
+ end
408
+
409
+ adapter :sentry do
410
+ fallback nil # Sentry fails → DLQ directly
411
+ end
412
+ end
413
+ end
414
+ end
415
+ ```
416
+
417
+ ---
418
+
419
+ ## DLQ Management
420
+
421
+ ### Replay DLQ Events
422
+
423
+ ```ruby
424
+ # Replay all DLQ events
425
+ E11y::DeadLetterQueue.replay_all
426
+
427
+ # Replay specific adapter's DLQ
428
+ E11y::DeadLetterQueue.replay(adapter: :loki)
429
+
430
+ # Replay with filtering
431
+ E11y::DeadLetterQueue.replay do |event|
432
+ # Only replay events from last hour
433
+ event.timestamp > 1.hour.ago
434
+ end
435
+
436
+ # Replay with rate limiting
437
+ E11y::DeadLetterQueue.replay(
438
+ rate_limit: 100, # 100 events/sec
439
+ batch_size: 50
440
+ )
441
+ ```
442
+
443
+ ---
444
+
445
+ ### DLQ Replay with PII & Schema Considerations (C07, C15)
446
+
447
+ > **⚠️ CRITICAL:** DLQ replay requires special handling for PII filtering and schema migrations.
448
+ > **See:** [ADR-006 Section 5.6](../ADR-006-security-compliance.md#56-pii-handling-for-event-replay-from-dlq-c07-resolution) for C07 (PII double-hashing), [ADR-012 Section 8](../ADR-012-event-evolution.md#8-schema-migrations-and-dlq-replay-c15-resolution--critical) for C15 (schema migrations).
449
+
450
+ **Problem 1: PII Double-Hashing on Replay (C07)**
451
+
452
+ When replaying events from DLQ, PII filtering middleware runs again, causing double-hashing:
453
+
454
+ ```ruby
455
+ # ❌ BAD: Double-hashing PII on replay
456
+ # Original event (first processing):
457
+ Events::UserLogin.track(
458
+ email: 'user@example.com', # ← Original PII
459
+ ip: '192.168.1.1' # ← Original PII
460
+ )
461
+
462
+ # Pipeline step 2: PII Filtering
463
+ # → email: 'user@example.com' → SHA256 hash → 'a1b2c3d4...'
464
+ # → ip: '192.168.1.1' → SHA256 hash → 'e5f6g7h8...'
465
+
466
+ # Event sent, but Loki fails → goes to DLQ
467
+
468
+ # DLQ Replay:
469
+ E11y::DeadLetterQueue.replay_all
470
+
471
+ # Pipeline step 2: PII Filtering runs AGAIN!
472
+ # → email: 'a1b2c3d4...' (already hashed!) → SHA256 hash → 'x9y8z7w6...'
473
+ # ❌ DOUBLE-HASHED! Original: a1b2c3d4, Replay: x9y8z7w6
474
+
475
+ # Result: DATA CORRUPTION!
476
+ # - Same user, DIFFERENT hashes!
477
+ # - Audit trail broken
478
+ # - GDPR data deletion impossible
479
+ ```
480
+
481
+ **Solution: Metadata Flags to Skip PII Filtering**
482
+
483
+ ```ruby
484
+ # ✅ GOOD: Mark replayed events to skip PII filtering
485
+ # config/initializers/e11y.rb
486
+ E11y.configure do |config|
487
+ config.error_handling.dead_letter_queue do
488
+ enabled true
489
+ adapter :dlq_file
490
+
491
+ # === CRITICAL: Enable replay metadata (C07) ===
492
+ # Replay service automatically adds flags:
493
+ # - :replayed => true (skip transformations)
494
+ # - :pii_filtered => true (already filtered)
495
+ mark_replayed_events true # ← Default: true
496
+ end
497
+ end
498
+
499
+ # Replay service implementation:
500
+ module E11y
501
+ module DLQ
502
+ class ReplayService
503
+ def replay_event(dlq_event)
504
+ event_data = dlq_event[:event_data]
505
+
506
+ # ✅ CRITICAL: Add replay metadata flags
507
+ event_data[:metadata] ||= {}
508
+ event_data[:metadata][:replayed] = true
509
+ event_data[:metadata][:pii_filtered] = true # Already filtered!
510
+ event_data[:metadata][:replayed_at] = Time.now.utc.iso8601
511
+ event_data[:metadata][:original_event_id] = event_data[:event_id]
512
+
513
+ # Send through pipeline
514
+ # PII filter middleware will skip (checks :replayed flag)
515
+ E11y::Pipeline.process(event_data)
516
+ end
517
+ end
518
+ end
519
+ end
520
+
521
+ # PiiFilter middleware checks flags:
522
+ class PiiFilter < Base
523
+ def call(event_data)
524
+ # ✅ Skip PII filtering for replayed events
525
+ if already_filtered?(event_data)
526
+ E11y.logger.debug "[E11y] Skipping PII filtering for replayed event"
527
+ return event_data
528
+ end
529
+
530
+ # Apply PII filtering for new events
531
+ filter_pii(event_data)
532
+ end
533
+
534
+ private
535
+
536
+ def already_filtered?(event_data)
537
+ metadata = event_data[:metadata] || {}
538
+ metadata[:replayed] || metadata[:pii_filtered]
539
+ end
540
+ end
541
+
542
+ # Replay with idempotency guarantee:
543
+ E11y::DeadLetterQueue.replay_all
544
+ # → All events processed correctly
545
+ # → PII hashes preserved (no double-hashing)
546
+ # → Audit trail intact ✅
547
+ ```
548
+
549
+ **Problem 2: Schema Migrations & DLQ Replay (C15) ⚠️ User Responsibility**
550
+
551
+ > **Decision:** Schema migrations are the **user's responsibility**, not E11y's. This is an edge case for poorly managed DLQs.
552
+
553
+ **Scenario:**
554
+
555
+ ```ruby
556
+ # v1.0: Order event schema (old)
557
+ class OrderCreated < E11y::Event::Base
558
+ schema do
559
+ required(:order_id).filled(:string)
560
+ required(:amount).filled(:float)
561
+ end
562
+ end
563
+
564
+ # Events tracked with v1.0 schema
565
+ Events::OrderCreated.track(order_id: '123', amount: 99.99)
566
+ # → Loki fails → Event goes to DLQ
567
+
568
+ # v2.0: Order event schema (new - added required field)
569
+ class OrderCreated < E11y::Event::Base
570
+ schema do
571
+ required(:order_id).filled(:string)
572
+ required(:amount).filled(:float)
573
+ required(:currency).filled(:string) # ← NEW REQUIRED FIELD!
574
+ end
575
+ end
576
+
577
+ # DLQ Replay (after schema change):
578
+ E11y::DeadLetterQueue.replay_all
579
+ # → Old event: { order_id: '123', amount: 99.99 }
580
+ # → ❌ Schema validation fails (missing :currency)!
581
+ # → Event REJECTED!
582
+ ```
583
+
584
+ **Recommendation: User Responsibility**
585
+
586
+ 1. **Clear DLQ before schema changes** (best practice):
587
+ ```ruby
588
+ # Before deploying v2.0:
589
+ # 1. Replay all DLQ events (under v1.0 schema)
590
+ E11y::DeadLetterQueue.replay_all
591
+
592
+ # 2. Verify DLQ is empty
593
+ E11y::DeadLetterQueue.size # => 0
594
+
595
+ # 3. Deploy v2.0 with new schema
596
+ ```
597
+
598
+ 2. **Use lenient validation for DLQ replay** (optional - user-implemented):
599
+ ```ruby
600
+ # config/initializers/e11y.rb
601
+ E11y.configure do |config|
602
+ config.validation do
603
+ # Lenient validation for replayed events
604
+ # (user chooses to allow old schema)
605
+ lenient_mode_if do |event_data|
606
+ event_data.dig(:metadata, :replayed) == true
607
+ end
608
+ end
609
+ end
610
+ ```
611
+
612
+ 3. **Separate DLQ processing for old events** (optional - user-implemented):
613
+ ```ruby
614
+ # Replay old events with schema migration logic
615
+ E11y::DeadLetterQueue.replay do |event|
616
+ # User-implemented migration
617
+ if event.version == '1.0' && event.name == 'order.created'
618
+ # Add missing :currency field
619
+ event.payload[:currency] = 'USD' # Default value
620
+ end
621
+
622
+ true # Replay this event
623
+ end
624
+ ```
625
+
626
+ **Key Takeaways:**
627
+
628
+ | Aspect | E11y Responsibility | User Responsibility |
629
+ |--------|---------------------|---------------------|
630
+ | **PII Double-Hashing** | ✅ Handled by E11y (metadata flags) | None - automatic |
631
+ | **Schema Migrations** | ❌ NOT handled by E11y | ✅ User must clear DLQ before schema changes OR implement lenient validation |
632
+ | **Idempotency** | ✅ Guaranteed by E11y (replay flags) | None - automatic |
633
+ | **DLQ Management** | ❌ NOT handled by E11y | ✅ User must clear old events periodically |
634
+
635
+ **Trade-offs (C07):**
636
+
637
+ | Decision | Pro | Con | Mitigation |
638
+ |----------|-----|-----|------------|
639
+ | **Metadata flags** | Simple, automatic | Metadata size +24 bytes | Acceptable overhead |
640
+ | **`:replayed` flag** | Clear intent | None | ✅ Best practice |
641
+ | **Skip PII filter** | Prevents double-hashing | Must trust DLQ integrity | DLQ stored securely (encrypted) |
642
+
643
+ **Trade-offs (C15):**
644
+
645
+ | Decision | Pro | Con | Mitigation |
646
+ |----------|-----|-----|------------|
647
+ | **User responsibility** | E11y stays simple | User must manage DLQ lifecycle | Document best practices (clear DLQ before schema changes) |
648
+ | **No auto-migration** | No complex migration logic in E11y | Old events may fail validation | User implements lenient validation OR pre-replay migration |
649
+ | **Edge case** | Rare in well-managed systems | May surprise users with large DLQs | Clear warnings in docs |
650
+
651
+ ---
652
+
653
+ ### Inspect DLQ
654
+
655
+ ```ruby
656
+ # Count events in DLQ
657
+ E11y::DeadLetterQueue.size
658
+ # => 1234
659
+
660
+ # Peek at DLQ (first 10 events)
661
+ E11y::DeadLetterQueue.peek(limit: 10)
662
+ # => [<Event>, <Event>, ...]
663
+
664
+ # Get DLQ stats
665
+ E11y::DeadLetterQueue.stats
666
+ # => {
667
+ # total: 1234,
668
+ # by_adapter: { loki: 1000, sentry: 234 },
669
+ # oldest: 2.hours.ago,
670
+ # newest: 5.minutes.ago
671
+ # }
672
+
673
+ # Find specific events
674
+ E11y::DeadLetterQueue.find do |event|
675
+ event.name == 'order.paid' && event.payload[:amount] > 1000
676
+ end
677
+ ```
678
+
679
+ ### Clean DLQ
680
+
681
+ ```ruby
682
+ # Clear all DLQ events
683
+ E11y::DeadLetterQueue.clear!
684
+
685
+ # Clear old events (older than 7 days)
686
+ E11y::DeadLetterQueue.clear_old!(7.days)
687
+
688
+ # Clear by adapter
689
+ E11y::DeadLetterQueue.clear!(adapter: :loki)
690
+ ```
691
+
692
+ ---
693
+
694
+ ## 💡 Best Practices
695
+
696
+ ### ✅ DO
697
+
698
+ **1. Enable retry for transient errors**
699
+ ```ruby
700
+ # ✅ GOOD: Retry on network errors
701
+ config.error_handling.retry_policy do
702
+ enabled true
703
+ retryable_errors [
704
+ Errno::ETIMEDOUT,
705
+ Net::ReadTimeout,
706
+ HTTP::TimeoutError
707
+ ]
708
+ end
709
+ ```
710
+
711
+ **2. Use DLQ for critical events**
712
+ ```ruby
713
+ # ✅ GOOD: DLQ enabled for zero data loss
714
+ config.error_handling.dead_letter_queue do
715
+ enabled true
716
+ adapter :dlq_file
717
+ end
718
+ ```
719
+
720
+ **3. Monitor DLQ size**
721
+ ```ruby
722
+ # ✅ GOOD: Alert when DLQ grows
723
+ config.error_handling.dead_letter_queue do
724
+ max_size 10_000
725
+ alert_on_size 1000
726
+ end
727
+
728
+ # Set up Prometheus alert:
729
+ # alert: DLQSizeHigh
730
+ # expr: e11y_dlq_size > 1000
731
+ ```
732
+
733
+ **4. Replay DLQ regularly**
734
+ ```ruby
735
+ # ✅ GOOD: Schedule DLQ replay
736
+ # config/schedule.rb (whenever gem)
737
+ every 10.minutes do
738
+ runner "E11y::DeadLetterQueue.replay_all"
739
+ end
740
+
741
+ # Or Sidekiq job:
742
+ class E11yDlqReplayJob
743
+ include Sidekiq::Job
744
+
745
+ def perform
746
+ E11y::DeadLetterQueue.replay_all
747
+ end
748
+ end
749
+
750
+ # Schedule every 10 minutes
751
+ ```
752
+
753
+ ---
754
+
755
+ ### ❌ DON'T
756
+
757
+ **1. Don't retry non-retryable errors**
758
+ ```ruby
759
+ # ❌ BAD: Retrying validation errors (will always fail)
760
+ config.error_handling.retry_policy do
761
+ retryable_errors [
762
+ E11y::ValidationError # ← Will NEVER succeed!
763
+ ]
764
+ end
765
+
766
+ # ✅ GOOD: Skip retry for validation errors
767
+ config.error_handling.non_retryable_errors [
768
+ E11y::ValidationError,
769
+ E11y::RateLimitError
770
+ ]
771
+ ```
772
+
773
+ **2. Don't set too many retries**
774
+ ```ruby
775
+ # ❌ BAD: Too many retries (adds latency)
776
+ config.error_handling.retry_policy do
777
+ max_retries 20 # ← Too many! Total delay: minutes
778
+ end
779
+
780
+ # ✅ GOOD: Reasonable retry count
781
+ config.error_handling.retry_policy do
782
+ max_retries 3 # ← Enough for transient errors
783
+ # Total delay: ~700ms (acceptable)
784
+ end
785
+ ```
786
+
787
+ **3. Don't ignore DLQ growth**
788
+ ```ruby
789
+ # ❌ BAD: No monitoring, DLQ grows indefinitely
790
+ config.error_handling.dead_letter_queue do
791
+ enabled true
792
+ # No max_size, no alerts!
793
+ end
794
+
795
+ # ✅ GOOD: Monitor and alert
796
+ config.error_handling.dead_letter_queue do
797
+ enabled true
798
+ max_size 10_000
799
+ alert_on_size 1000
800
+
801
+ # Auto-cleanup old events
802
+ retention 7.days
803
+ end
804
+ ```
805
+
806
+ ---
807
+
808
+ ## 📊 Monitoring & Metrics
809
+
810
+ ### Self-Monitoring Metrics
811
+
812
+ ```ruby
813
+ # E11y automatically exports these metrics:
814
+
815
+ # Retries
816
+ e11y_retries_total{adapter, error_type} # Counter
817
+ e11y_retry_attempts{adapter} # Histogram (how many retries before success)
818
+
819
+ # DLQ
820
+ e11y_dlq_size{adapter} # Gauge (current DLQ size)
821
+ e11y_dlq_events_added_total{adapter, error_type} # Counter
822
+ e11y_dlq_events_replayed_total{adapter, status} # Counter (status: success/failure)
823
+
824
+ # Errors
825
+ e11y_adapter_errors_total{adapter, error_type, retryable} # Counter
826
+ e11y_max_retries_exceeded_total{adapter} # Counter
827
+ ```
828
+
829
+ ### Prometheus Alerts
830
+
831
+ ```yaml
832
+ groups:
833
+ - name: e11y_error_handling
834
+ rules:
835
+ # DLQ growing
836
+ - alert: E11yDLQSizeHigh
837
+ expr: e11y_dlq_size > 1000
838
+ for: 5m
839
+ annotations:
840
+ summary: "E11y DLQ has >1000 events"
841
+
842
+ # High retry rate
843
+ - alert: E11yHighRetryRate
844
+ expr: rate(e11y_retries_total[5m]) > 10
845
+ for: 5m
846
+ annotations:
847
+ summary: "E11y retrying >10 events/sec"
848
+
849
+ # Max retries exceeded
850
+ - alert: E11yMaxRetriesExceeded
851
+ expr: rate(e11y_max_retries_exceeded_total[5m]) > 1
852
+ for: 5m
853
+ annotations:
854
+ summary: "E11y events failing after max retries"
855
+ ```
856
+
857
+ ---
858
+
859
+ ## 🧪 Testing
860
+
861
+ ### RSpec Examples
862
+
863
+ ```ruby
864
+ RSpec.describe 'E11y Error Handling' do
865
+ describe 'Retry Policy' do
866
+ it 'retries on transient errors' do
867
+ adapter = instance_double(E11y::Adapters::LokiAdapter)
868
+
869
+ # First 2 attempts fail, 3rd succeeds
870
+ allow(adapter).to receive(:write_batch)
871
+ .and_raise(Net::ReadTimeout).twice
872
+ allow(adapter).to receive(:write_batch)
873
+ .and_return(E11y::Result.success).once
874
+
875
+ Events::OrderCreated.track(order_id: '123')
876
+
877
+ # Should retry twice, then succeed
878
+ expect(adapter).to have_received(:write_batch).exactly(3).times
879
+ end
880
+
881
+ it 'does not retry non-retryable errors' do
882
+ adapter = instance_double(E11y::Adapters::LokiAdapter)
883
+
884
+ allow(adapter).to receive(:write_batch)
885
+ .and_raise(E11y::ValidationError)
886
+
887
+ Events::OrderCreated.track(order_id: '123')
888
+
889
+ # Should try once, then give up (no retry)
890
+ expect(adapter).to have_received(:write_batch).once
891
+ end
892
+ end
893
+
894
+ describe 'Dead Letter Queue' do
895
+ it 'sends to DLQ after max retries' do
896
+ adapter = instance_double(E11y::Adapters::LokiAdapter)
897
+
898
+ # All retries fail
899
+ allow(adapter).to receive(:write_batch)
900
+ .and_raise(Net::ReadTimeout)
901
+
902
+ expect {
903
+ Events::OrderCreated.track(order_id: '123')
904
+ }.to change { E11y::DeadLetterQueue.size }.by(1)
905
+ end
906
+
907
+ it 'replays DLQ events' do
908
+ # Add event to DLQ
909
+ E11y::DeadLetterQueue.add(
910
+ event: build_event(name: 'order.created'),
911
+ adapter: :loki,
912
+ error: 'Network timeout'
913
+ )
914
+
915
+ adapter = instance_double(E11y::Adapters::LokiAdapter)
916
+ allow(adapter).to receive(:write_batch).and_return(E11y::Result.success)
917
+
918
+ # Replay DLQ
919
+ E11y::DeadLetterQueue.replay_all
920
+
921
+ # DLQ should be empty
922
+ expect(E11y::DeadLetterQueue.size).to eq(0)
923
+
924
+ # Event should be sent
925
+ expect(adapter).to have_received(:write_batch).once
926
+ end
927
+ end
928
+ end
929
+ ```
930
+
931
+ ---
932
+
933
+ ## 🔗 Related Use Cases
934
+
935
+ - **[UC-011: Rate Limiting](./UC-011-rate-limiting.md)** - Protect system from overload
936
+ - **[UC-015: Cost Optimization](./UC-015-cost-optimization.md)** - Sampling and compression for cost reduction
937
+ - **[CONFLICT-ANALYSIS](../CONFLICT-ANALYSIS.md)** - Circuit Breaker interaction
938
+
939
+ ---
940
+
941
+ ## 🚀 Quick Start Checklist
942
+
943
+ - [ ] Enable retry policy in config
944
+ - [ ] Configure max_retries (recommend: 3)
945
+ - [ ] Enable dead letter queue
946
+ - [ ] Configure DLQ adapter (file or database)
947
+ - [ ] Set up DLQ replay job (every 10 minutes)
948
+ - [ ] Configure Prometheus alerts for DLQ size
949
+ - [ ] Test retry behavior in staging
950
+ - [ ] Monitor retry rate and DLQ growth
951
+
952
+ ---
953
+
954
+ **Status:** ✅ Reliability Feature
955
+ **Priority:** High (zero data loss)
956
+ **Complexity:** Intermediate