e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,956 @@
|
|
|
1
|
+
# UC-021: Error Handling, Retry Policy & Dead Letter Queue
|
|
2
|
+
|
|
3
|
+
**Status:** Reliability Feature (MVP)
|
|
4
|
+
**Complexity:** Intermediate
|
|
5
|
+
**Setup Time:** 20-30 minutes
|
|
6
|
+
**Target Users:** DevOps, SRE, Platform Engineers
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## ๐ Overview
|
|
11
|
+
|
|
12
|
+
### Problem Statement
|
|
13
|
+
|
|
14
|
+
**Current Pain Points:**
|
|
15
|
+
|
|
16
|
+
1. **Events lost on transient failures**
|
|
17
|
+
- Network timeout → event dropped
|
|
18
|
+
- Elasticsearch temporarily down → no retry
|
|
19
|
+
- Loki 503 error → data loss
|
|
20
|
+
|
|
21
|
+
2. **No retry mechanism**
|
|
22
|
+
- Single attempt to send event
|
|
23
|
+
- If adapter fails → event lost forever
|
|
24
|
+
- No visibility into failed sends
|
|
25
|
+
|
|
26
|
+
3. **No dead letter queue**
|
|
27
|
+
- Failed events disappear
|
|
28
|
+
- Can't replay failed events
|
|
29
|
+
- No forensics for why events failed
|
|
30
|
+
|
|
31
|
+
### E11y Solution
|
|
32
|
+
|
|
33
|
+
**Robust Error Handling Pipeline:**
|
|
34
|
+
|
|
35
|
+
- **Retry Policy:** Exponential backoff with jitter
|
|
36
|
+
- **Dead Letter Queue:** Failed events stored for later analysis/replay
|
|
37
|
+
- **Circuit Breaker:** Prevent cascading failures (already covered in UC-011)
|
|
38
|
+
- **Observability:** Metrics for failures, retries, DLQ size
|
|
39
|
+
|
|
40
|
+
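**Result:** No loss of critical events and resilience to transient failures: events that exhaust their retries are preserved in the DLQ for later replay (unless explicitly excluded by a DLQ filter).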
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 🎯 Use Case Scenarios
|
|
45
|
+
|
|
46
|
+
### Scenario 1: Transient Network Failure
|
|
47
|
+
|
|
48
|
+
**Problem:** Loki temporarily unavailable (30s downtime)
|
|
49
|
+
|
|
50
|
+
**Without retry (DATA LOSS!):**
|
|
51
|
+
```ruby
|
|
52
|
+
Events::OrderCreated.track(order_id: '123')
|
|
53
|
+
# โ Send to Loki
|
|
54
|
+
# โ Network timeout (30s)
|
|
55
|
+
# โ โ Event dropped! No retry!
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**With retry (RESILIENT!):**
|
|
59
|
+
```ruby
|
|
60
|
+
Events::OrderCreated.track(order_id: '123')
|
|
61
|
+
# โ Send to Loki
|
|
62
|
+
# โ Network timeout (30s)
|
|
63
|
+
# โ Retry #1 after 100ms โ Still timeout
|
|
64
|
+
# โ Retry #2 after 200ms โ Still timeout
|
|
65
|
+
# โ Retry #3 after 400ms โ Success! โ
|
|
66
|
+
# Event delivered successfully
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
### Scenario 2: Persistent Failure → Dead Letter Queue
|
|
72
|
+
|
|
73
|
+
**Problem:** Elasticsearch down for maintenance (2 hours)
|
|
74
|
+
|
|
75
|
+
**Without DLQ (DATA LOSS!):**
|
|
76
|
+
```ruby
|
|
77
|
+
# 1000 events during 2-hour maintenance window
|
|
78
|
+
1000.times do
|
|
79
|
+
Events::OrderCreated.track(...)
|
|
80
|
+
# โ All retries exhausted
|
|
81
|
+
# โ โ All 1000 events lost!
|
|
82
|
+
end
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**With DLQ (NO DATA LOSS!):**
|
|
86
|
+
```ruby
|
|
87
|
+
# Config:
|
|
88
|
+
E11y.configure do |config|
|
|
89
|
+
config.error_handling.dead_letter_queue do
|
|
90
|
+
enabled true
|
|
91
|
+
adapter :dlq_file # Write to local file
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# During ES maintenance:
|
|
96
|
+
1000.times do
|
|
97
|
+
Events::OrderCreated.track(...)
|
|
98
|
+
# โ All retries exhausted
|
|
99
|
+
# → ✅ Event written to DLQ!
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
# After ES maintenance:
|
|
103
|
+
# Replay DLQ events
|
|
104
|
+
E11y::DeadLetterQueue.replay_all
|
|
105
|
+
# โ All 1000 events successfully sent!
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
### Scenario 3: Partial Adapter Failure
|
|
111
|
+
|
|
112
|
+
**Problem:** Sentry down, but Loki working
|
|
113
|
+
|
|
114
|
+
```ruby
|
|
115
|
+
class CriticalError < E11y::Event::Base
|
|
116
|
+
adapters [:loki, :sentry, :file]
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
Events::CriticalError.track(error: 'Something went wrong')
|
|
120
|
+
|
|
121
|
+
# Loki: ✅ Success
|
|
122
|
+
# Sentry: โ Timeout (retries exhausted)
|
|
123
|
+
# File: ✅ Success
|
|
124
|
+
|
|
125
|
+
# Result:
|
|
126
|
+
# - Event in Loki โ
|
|
127
|
+
# - Event in File โ
|
|
128
|
+
# - Event in DLQ (for Sentry) โ
|
|
129
|
+
#
|
|
130
|
+
# Later: Replay DLQ โ Send to Sentry when it's back up
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
### Scenario 4: DLQ Filter (Critical vs. Non-Critical Events)
|
|
136
|
+
|
|
137
|
+
**Problem:** DLQ fills with unimportant events (health checks, metrics).
|
|
138
|
+
|
|
139
|
+
**Without DLQ filter (BAD!):**
|
|
140
|
+
```ruby
|
|
141
|
+
# Health checks fill DLQ
|
|
142
|
+
1000.times do
|
|
143
|
+
Events::HealthCheck.track(status: 'ok')
|
|
144
|
+
# Loki down โ All 1000 health checks in DLQ!
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
# DLQ full of unimportant events ๐
|
|
148
|
+
E11y::DeadLetterQueue.size # => 1000 (mostly garbage)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
**With DLQ filter (GOOD!):**
|
|
152
|
+
```ruby
|
|
153
|
+
# Config:
|
|
154
|
+
E11y.configure do |config|
|
|
155
|
+
config.error_handling.dead_letter_queue.filter do
|
|
156
|
+
# Don't save health checks to DLQ
|
|
157
|
+
never_save do
|
|
158
|
+
event_patterns ['health_check.*', 'ping.*']
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Always save payments
|
|
162
|
+
always_save do
|
|
163
|
+
event_patterns ['payment.*', 'order.*']
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Health checks (not saved to DLQ):
|
|
169
|
+
1000.times do
|
|
170
|
+
Events::HealthCheck.track(status: 'ok')
|
|
171
|
+
# Loki down โ โ Retries exhausted โ Dropped (not in DLQ)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
# Payment (saved to DLQ):
|
|
175
|
+
Events::PaymentFailed.track(order_id: '123', amount: 500)
|
|
176
|
+
# Loki down → ❌ Retries exhausted → ✅ Saved to DLQ!
|
|
177
|
+
|
|
178
|
+
# DLQ only contains critical events
|
|
179
|
+
E11y::DeadLetterQueue.size # => 1 (only payment)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## 🏗️ Architecture
|
|
185
|
+
|
|
186
|
+
> **Implementation:** See [ADR-013: Reliability & Error Handling](../ADR-013-reliability-error-handling.md) for complete error handling architecture, including retry policy with exponential backoff and jitter, circuit breaker pattern, Dead Letter Queue (DLQ) storage strategies, and self-monitoring metrics.
|
|
187
|
+
|
|
188
|
+
### Retry Pipeline
|
|
189
|
+
|
|
190
|
+
```
|
|
191
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
192
|
+
โ Event Flow with Retry & DLQ โ
|
|
193
|
+
โ โ
|
|
194
|
+
โ Event.track(...) โ
|
|
195
|
+
โ โ โ
|
|
196
|
+
โ Main Buffer โ
|
|
197
|
+
โ โ โ
|
|
198
|
+
โ Flush (every 200ms) โ
|
|
199
|
+
โ โ โ
|
|
200
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
201
|
+
โ โ Try: Send to Adapter โ โ
|
|
202
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
203
|
+
โ โ โ
|
|
204
|
+
โ Success? โโYESโโโ โ
Done โ
|
|
205
|
+
โ โ โ
|
|
206
|
+
โ NO (Error) โ
|
|
207
|
+
โ โ โ
|
|
208
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
209
|
+
โ โ Retry Policy: Exponential Backoff โ โ
|
|
210
|
+
โ โ - Retry #1 after 100ms โ โ
|
|
211
|
+
โ โ - Retry #2 after 200ms (ร2) โ โ
|
|
212
|
+
โ โ - Retry #3 after 400ms (ร2) โ โ
|
|
213
|
+
โ โ - Max 3 retries โ โ
|
|
214
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
215
|
+
โ โ โ
|
|
216
|
+
โ All retries exhausted? โ
|
|
217
|
+
โ โ โ
|
|
218
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
219
|
+
โ โ Dead Letter Queue (DLQ) โ โ
|
|
220
|
+
โ โ - Store failed event โ โ
|
|
221
|
+
โ โ - Store error details โ โ
|
|
222
|
+
โ โ - Store retry attempts โ โ
|
|
223
|
+
โ โ - Allow replay later โ โ
|
|
224
|
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
|
225
|
+
โ โ โ
|
|
226
|
+
โ โ
Event preserved for later replay โ
|
|
227
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Exponential Backoff with Jitter
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
Retry Delays (with jitter):
|
|
234
|
+
|
|
235
|
+
Retry #1: 100ms + random(0-50ms) = 100-150ms
|
|
236
|
+
Retry #2: 200ms + random(0-100ms) = 200-300ms
|
|
237
|
+
Retry #3: 400ms + random(0-200ms) = 400-600ms
|
|
238
|
+
|
|
239
|
+
Max delay: 5 seconds (configurable)
|
|
240
|
+
|
|
241
|
+
Jitter prevents "thundering herd" problem
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## 🔧 Configuration
|
|
247
|
+
|
|
248
|
+
### Basic Setup
|
|
249
|
+
|
|
250
|
+
```ruby
|
|
251
|
+
# config/initializers/e11y.rb
|
|
252
|
+
E11y.configure do |config|
|
|
253
|
+
config.error_handling do
|
|
254
|
+
# === Retry Policy ===
|
|
255
|
+
retry_policy do
|
|
256
|
+
enabled true
|
|
257
|
+
max_retries 3
|
|
258
|
+
initial_delay 0.1.seconds # 100ms
|
|
259
|
+
max_delay 5.seconds
|
|
260
|
+
multiplier 2 # Exponential: 100ms, 200ms, 400ms
|
|
261
|
+
jitter true # Add randomness to prevent thundering herd
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# === Dead Letter Queue ===
|
|
265
|
+
dead_letter_queue do
|
|
266
|
+
enabled true
|
|
267
|
+
|
|
268
|
+
# Where to store failed events
|
|
269
|
+
adapter :dlq_file # Reference to registered adapter
|
|
270
|
+
|
|
271
|
+
# Or use specific DLQ adapter
|
|
272
|
+
# adapter E11y::Adapters::FileAdapter.new(
|
|
273
|
+
# path: Rails.root.join('log', 'e11y_dlq'),
|
|
274
|
+
# rotation: :daily
|
|
275
|
+
# )
|
|
276
|
+
|
|
277
|
+
# Max events in DLQ before alerting
|
|
278
|
+
max_size 10_000
|
|
279
|
+
|
|
280
|
+
# Alert when DLQ grows
|
|
281
|
+
alert_on_size 1000 # Alert at 1000 events
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# What to do after max_retries exhausted
|
|
285
|
+
on_max_retries_exceeded :send_to_dlq # :send_to_dlq, :drop, :log
|
|
286
|
+
|
|
287
|
+
# Which errors are retryable
|
|
288
|
+
retryable_errors [
|
|
289
|
+
Errno::ETIMEDOUT,
|
|
290
|
+
Errno::ECONNREFUSED,
|
|
291
|
+
Errno::ECONNRESET,
|
|
292
|
+
Net::OpenTimeout,
|
|
293
|
+
Net::ReadTimeout,
|
|
294
|
+
HTTP::TimeoutError
|
|
295
|
+
]
|
|
296
|
+
|
|
297
|
+
# Which errors are NOT retryable (fail immediately)
|
|
298
|
+
non_retryable_errors [
|
|
299
|
+
E11y::ValidationError, # Schema validation failed
|
|
300
|
+
E11y::RateLimitError # Rate limit exceeded
|
|
301
|
+
]
|
|
302
|
+
end
|
|
303
|
+
end
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Advanced Configuration
|
|
307
|
+
|
|
308
|
+
```ruby
|
|
309
|
+
E11y.configure do |config|
|
|
310
|
+
config.error_handling do
|
|
311
|
+
retry_policy do
|
|
312
|
+
enabled true
|
|
313
|
+
max_retries 5 # More retries for critical systems
|
|
314
|
+
|
|
315
|
+
# Adaptive retry delays
|
|
316
|
+
delays [0.1, 0.2, 0.5, 1, 2] # Custom delays in seconds
|
|
317
|
+
|
|
318
|
+
# Or exponential with custom params
|
|
319
|
+
initial_delay 0.05.seconds
|
|
320
|
+
max_delay 10.seconds
|
|
321
|
+
multiplier 2.5 # Faster exponential growth
|
|
322
|
+
jitter_range 0.5 # ±50% jitter
|
|
323
|
+
|
|
324
|
+
# Per-adapter retry configuration
|
|
325
|
+
per_adapter do
|
|
326
|
+
adapter :loki do
|
|
327
|
+
max_retries 3
|
|
328
|
+
initial_delay 0.1
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
adapter :sentry do
|
|
332
|
+
max_retries 5 # More retries for Sentry
|
|
333
|
+
initial_delay 0.5
|
|
334
|
+
end
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
# Retry predicate (custom logic)
|
|
338
|
+
retry_if do |error, attempt|
|
|
339
|
+
# Custom logic: retry only for specific errors
|
|
340
|
+
error.is_a?(Net::ReadTimeout) && attempt < 5
|
|
341
|
+
end
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
dead_letter_queue do
|
|
345
|
+
enabled true
|
|
346
|
+
adapter :dlq_file
|
|
347
|
+
|
|
348
|
+
# DLQ retention
|
|
349
|
+
retention 7.days # Auto-delete old DLQ events
|
|
350
|
+
|
|
351
|
+
# DLQ partitioning (for large volumes)
|
|
352
|
+
partition_by :adapter # Separate DLQ per adapter
|
|
353
|
+
# log/e11y_dlq/loki/2026-01-12.jsonl
|
|
354
|
+
# log/e11y_dlq/sentry/2026-01-12.jsonl
|
|
355
|
+
|
|
356
|
+
# Compression
|
|
357
|
+
compression :gzip # Compress DLQ files
|
|
358
|
+
|
|
359
|
+
# Metadata
|
|
360
|
+
include_metadata true # Store error details, retry count
|
|
361
|
+
|
|
362
|
+
# ===== DLQ FILTER (Critical!) =====
|
|
363
|
+
# Control which events are saved to DLQ vs. dropped
|
|
364
|
+
filter do
|
|
365
|
+
# Always save critical events to DLQ (never drop!)
|
|
366
|
+
always_save do
|
|
367
|
+
severity [:error, :fatal] # All errors must be preserved
|
|
368
|
+
event_patterns [
|
|
369
|
+
'payment.*', # Payment events are critical
|
|
370
|
+
'order.*', # Order events are critical
|
|
371
|
+
'audit.*', # Audit events must never be lost
|
|
372
|
+
'security.*', # Security events are critical
|
|
373
|
+
'fraud.*' # Fraud detection events
|
|
374
|
+
]
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
# Never save to DLQ (drop after max retries)
|
|
378
|
+
never_save do
|
|
379
|
+
severity [:debug] # Debug events can be dropped
|
|
380
|
+
event_patterns [
|
|
381
|
+
'metrics.*', # Metrics can be dropped (regenerated)
|
|
382
|
+
'health_check.*', # Health checks not critical
|
|
383
|
+
'ping.*' # Ping events not important
|
|
384
|
+
]
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
# Custom filter function
|
|
388
|
+
save_if do |event|
|
|
389
|
+
# Example: Save high-value payments only
|
|
390
|
+
if event.name.include?('payment') && event.payload[:amount]
|
|
391
|
+
event.payload[:amount] > 100 # Only save payments >$100
|
|
392
|
+
else
|
|
393
|
+
true # Save all other events by default
|
|
394
|
+
end
|
|
395
|
+
end
|
|
396
|
+
end
|
|
397
|
+
end
|
|
398
|
+
|
|
399
|
+
# Fallback chain
|
|
400
|
+
fallback_chain do
|
|
401
|
+
# If primary adapter fails after retries:
|
|
402
|
+
# 1. Try fallback adapter
|
|
403
|
+
# 2. If fallback fails โ DLQ
|
|
404
|
+
|
|
405
|
+
adapter :loki do
|
|
406
|
+
fallback :file # Loki fails โ write to file
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
adapter :sentry do
|
|
410
|
+
fallback nil # Sentry fails โ DLQ directly
|
|
411
|
+
end
|
|
412
|
+
end
|
|
413
|
+
end
|
|
414
|
+
end
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
---
|
|
418
|
+
|
|
419
|
+
## ๐ DLQ Management
|
|
420
|
+
|
|
421
|
+
### Replay DLQ Events
|
|
422
|
+
|
|
423
|
+
```ruby
|
|
424
|
+
# Replay all DLQ events
|
|
425
|
+
E11y::DeadLetterQueue.replay_all
|
|
426
|
+
|
|
427
|
+
# Replay specific adapter's DLQ
|
|
428
|
+
E11y::DeadLetterQueue.replay(adapter: :loki)
|
|
429
|
+
|
|
430
|
+
# Replay with filtering
|
|
431
|
+
E11y::DeadLetterQueue.replay do |event|
|
|
432
|
+
# Only replay events from last hour
|
|
433
|
+
event.timestamp > 1.hour.ago
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
# Replay with rate limiting
|
|
437
|
+
E11y::DeadLetterQueue.replay(
|
|
438
|
+
rate_limit: 100, # 100 events/sec
|
|
439
|
+
batch_size: 50
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
---
|
|
444
|
+
|
|
445
|
+
### DLQ Replay with PII & Schema Considerations (C07, C15)
|
|
446
|
+
|
|
447
|
+
> **⚠️ CRITICAL:** DLQ replay requires special handling for PII filtering and schema migrations.
|
|
448
|
+
> **See:** [ADR-006 Section 5.6](../ADR-006-security-compliance.md#56-pii-handling-for-event-replay-from-dlq-c07-resolution) for C07 (PII double-hashing), [ADR-012 Section 8](../ADR-012-event-evolution.md#8-schema-migrations-and-dlq-replay-c15-resolution--critical) for C15 (schema migrations).
|
|
449
|
+
|
|
450
|
+
**Problem 1: PII Double-Hashing on Replay (C07)**
|
|
451
|
+
|
|
452
|
+
When replaying events from DLQ, PII filtering middleware runs again, causing double-hashing:
|
|
453
|
+
|
|
454
|
+
```ruby
|
|
455
|
+
# โ BAD: Double-hashing PII on replay
|
|
456
|
+
# Original event (first processing):
|
|
457
|
+
Events::UserLogin.track(
|
|
458
|
+
email: 'user@example.com', # โ Original PII
|
|
459
|
+
ip: '192.168.1.1' # โ Original PII
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
# Pipeline step 2: PII Filtering
|
|
463
|
+
# โ email: 'user@example.com' โ SHA256 hash โ 'a1b2c3d4...'
|
|
464
|
+
# โ ip: '192.168.1.1' โ SHA256 hash โ 'e5f6g7h8...'
|
|
465
|
+
|
|
466
|
+
# Event sent, but Loki fails โ goes to DLQ
|
|
467
|
+
|
|
468
|
+
# DLQ Replay:
|
|
469
|
+
E11y::DeadLetterQueue.replay_all
|
|
470
|
+
|
|
471
|
+
# Pipeline step 2: PII Filtering runs AGAIN!
|
|
472
|
+
# โ email: 'a1b2c3d4...' (already hashed!) โ SHA256 hash โ 'x9y8z7w6...'
|
|
473
|
+
# โ DOUBLE-HASHED! Original: a1b2c3d4, Replay: x9y8z7w6
|
|
474
|
+
|
|
475
|
+
# Result: DATA CORRUPTION!
|
|
476
|
+
# - Same user, DIFFERENT hashes!
|
|
477
|
+
# - Audit trail broken
|
|
478
|
+
# - GDPR data deletion impossible
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
**Solution: Metadata Flags to Skip PII Filtering**
|
|
482
|
+
|
|
483
|
+
```ruby
|
|
484
|
+
# ✅ GOOD: Mark replayed events to skip PII filtering
|
|
485
|
+
# config/initializers/e11y.rb
|
|
486
|
+
E11y.configure do |config|
|
|
487
|
+
config.error_handling.dead_letter_queue do
|
|
488
|
+
enabled true
|
|
489
|
+
adapter :dlq_file
|
|
490
|
+
|
|
491
|
+
# === CRITICAL: Enable replay metadata (C07) ===
|
|
492
|
+
# Replay service automatically adds flags:
|
|
493
|
+
# - :replayed => true (skip transformations)
|
|
494
|
+
# - :pii_filtered => true (already filtered)
|
|
495
|
+
mark_replayed_events true # โ Default: true
|
|
496
|
+
end
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
# Replay service implementation:
|
|
500
|
+
module E11y
|
|
501
|
+
module DLQ
|
|
502
|
+
class ReplayService
|
|
503
|
+
def replay_event(dlq_event)
|
|
504
|
+
event_data = dlq_event[:event_data]
|
|
505
|
+
|
|
506
|
+
# ✅ CRITICAL: Add replay metadata flags
|
|
507
|
+
event_data[:metadata] ||= {}
|
|
508
|
+
event_data[:metadata][:replayed] = true
|
|
509
|
+
event_data[:metadata][:pii_filtered] = true # Already filtered!
|
|
510
|
+
event_data[:metadata][:replayed_at] = Time.now.utc.iso8601
|
|
511
|
+
event_data[:metadata][:original_event_id] = event_data[:event_id]
|
|
512
|
+
|
|
513
|
+
# Send through pipeline
|
|
514
|
+
# PII filter middleware will skip (checks :replayed flag)
|
|
515
|
+
E11y::Pipeline.process(event_data)
|
|
516
|
+
end
|
|
517
|
+
end
|
|
518
|
+
end
|
|
519
|
+
end
|
|
520
|
+
|
|
521
|
+
# PiiFilter middleware checks flags:
|
|
522
|
+
class PiiFilter < Base
|
|
523
|
+
def call(event_data)
|
|
524
|
+
# ✅ Skip PII filtering for replayed events
|
|
525
|
+
if already_filtered?(event_data)
|
|
526
|
+
E11y.logger.debug "[E11y] Skipping PII filtering for replayed event"
|
|
527
|
+
return event_data
|
|
528
|
+
end
|
|
529
|
+
|
|
530
|
+
# Apply PII filtering for new events
|
|
531
|
+
filter_pii(event_data)
|
|
532
|
+
end
|
|
533
|
+
|
|
534
|
+
private
|
|
535
|
+
|
|
536
|
+
def already_filtered?(event_data)
|
|
537
|
+
metadata = event_data[:metadata] || {}
|
|
538
|
+
metadata[:replayed] || metadata[:pii_filtered]
|
|
539
|
+
end
|
|
540
|
+
end
|
|
541
|
+
|
|
542
|
+
# Replay with idempotency guarantee:
|
|
543
|
+
E11y::DeadLetterQueue.replay_all
|
|
544
|
+
# โ All events processed correctly
|
|
545
|
+
# โ PII hashes preserved (no double-hashing)
|
|
546
|
+
# โ Audit trail intact โ
|
|
547
|
+
```
|
|
548
|
+
|
|
549
|
+
**Problem 2: Schema Migrations & DLQ Replay (C15) ⚠️ User Responsibility**
|
|
550
|
+
|
|
551
|
+
> **Decision:** Schema migrations are the **user's responsibility**, not E11y's. This is an edge case for poorly managed DLQs.
|
|
552
|
+
|
|
553
|
+
**Scenario:**
|
|
554
|
+
|
|
555
|
+
```ruby
|
|
556
|
+
# v1.0: Order event schema (old)
|
|
557
|
+
class OrderCreated < E11y::Event::Base
|
|
558
|
+
schema do
|
|
559
|
+
required(:order_id).filled(:string)
|
|
560
|
+
required(:amount).filled(:float)
|
|
561
|
+
end
|
|
562
|
+
end
|
|
563
|
+
|
|
564
|
+
# Events tracked with v1.0 schema
|
|
565
|
+
Events::OrderCreated.track(order_id: '123', amount: 99.99)
|
|
566
|
+
# โ Loki fails โ Event goes to DLQ
|
|
567
|
+
|
|
568
|
+
# v2.0: Order event schema (new - added required field)
|
|
569
|
+
class OrderCreated < E11y::Event::Base
|
|
570
|
+
schema do
|
|
571
|
+
required(:order_id).filled(:string)
|
|
572
|
+
required(:amount).filled(:float)
|
|
573
|
+
required(:currency).filled(:string) # โ NEW REQUIRED FIELD!
|
|
574
|
+
end
|
|
575
|
+
end
|
|
576
|
+
|
|
577
|
+
# DLQ Replay (after schema change):
|
|
578
|
+
E11y::DeadLetterQueue.replay_all
|
|
579
|
+
# โ Old event: { order_id: '123', amount: 99.99 }
|
|
580
|
+
# โ โ Schema validation fails (missing :currency)!
|
|
581
|
+
# โ Event REJECTED!
|
|
582
|
+
```
|
|
583
|
+
|
|
584
|
+
**Recommendation: User Responsibility**
|
|
585
|
+
|
|
586
|
+
1. **Clear DLQ before schema changes** (best practice):
|
|
587
|
+
```ruby
|
|
588
|
+
# Before deploying v2.0:
|
|
589
|
+
# 1. Replay all DLQ events (under v1.0 schema)
|
|
590
|
+
E11y::DeadLetterQueue.replay_all
|
|
591
|
+
|
|
592
|
+
# 2. Verify DLQ is empty
|
|
593
|
+
E11y::DeadLetterQueue.size # => 0
|
|
594
|
+
|
|
595
|
+
# 3. Deploy v2.0 with new schema
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
2. **Use lenient validation for DLQ replay** (optional - user-implemented):
|
|
599
|
+
```ruby
|
|
600
|
+
# config/initializers/e11y.rb
|
|
601
|
+
E11y.configure do |config|
|
|
602
|
+
config.validation do
|
|
603
|
+
# Lenient validation for replayed events
|
|
604
|
+
# (user chooses to allow old schema)
|
|
605
|
+
lenient_mode_if do |event_data|
|
|
606
|
+
event_data.dig(:metadata, :replayed) == true
|
|
607
|
+
end
|
|
608
|
+
end
|
|
609
|
+
end
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
3. **Separate DLQ processing for old events** (optional - user-implemented):
|
|
613
|
+
```ruby
|
|
614
|
+
# Replay old events with schema migration logic
|
|
615
|
+
E11y::DeadLetterQueue.replay do |event|
|
|
616
|
+
# User-implemented migration
|
|
617
|
+
if event.version == '1.0' && event.name == 'order.created'
|
|
618
|
+
# Add missing :currency field
|
|
619
|
+
event.payload[:currency] = 'USD' # Default value
|
|
620
|
+
end
|
|
621
|
+
|
|
622
|
+
true # Replay this event
|
|
623
|
+
end
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
**Key Takeaways:**
|
|
627
|
+
|
|
628
|
+
| Aspect | E11y Responsibility | User Responsibility |
|
|
629
|
+
|--------|---------------------|---------------------|
|
|
630
|
+
| **PII Double-Hashing** | ✅ Handled by E11y (metadata flags) | None - automatic |
|
|
631
|
+
| **Schema Migrations** | ❌ NOT handled by E11y | ✅ User must clear DLQ before schema changes OR implement lenient validation |
|
|
632
|
+
| **Idempotency** | ✅ Guaranteed by E11y (replay flags) | None - automatic |
|
|
633
|
+
| **DLQ Management** | ❌ NOT handled by E11y | ✅ User must clear old events periodically |
|
|
634
|
+
|
|
635
|
+
**Trade-offs (C07):**
|
|
636
|
+
|
|
637
|
+
| Decision | Pro | Con | Mitigation |
|
|
638
|
+
|----------|-----|-----|------------|
|
|
639
|
+
| **Metadata flags** | Simple, automatic | Metadata size +24 bytes | Acceptable overhead |
|
|
640
|
+
| **`:replayed` flag** | Clear intent | None | ✅ Best practice |
|
|
641
|
+
| **Skip PII filter** | Prevents double-hashing | Must trust DLQ integrity | DLQ stored securely (encrypted) |
|
|
642
|
+
|
|
643
|
+
**Trade-offs (C15):**
|
|
644
|
+
|
|
645
|
+
| Decision | Pro | Con | Mitigation |
|
|
646
|
+
|----------|-----|-----|------------|
|
|
647
|
+
| **User responsibility** | E11y stays simple | User must manage DLQ lifecycle | Document best practices (clear DLQ before schema changes) |
|
|
648
|
+
| **No auto-migration** | No complex migration logic in E11y | Old events may fail validation | User implements lenient validation OR pre-replay migration |
|
|
649
|
+
| **Edge case** | Rare in well-managed systems | May surprise users with large DLQs | Clear warnings in docs |
|
|
650
|
+
|
|
651
|
+
---
|
|
652
|
+
|
|
653
|
+
### Inspect DLQ
|
|
654
|
+
|
|
655
|
+
```ruby
|
|
656
|
+
# Count events in DLQ
|
|
657
|
+
E11y::DeadLetterQueue.size
|
|
658
|
+
# => 1234
|
|
659
|
+
|
|
660
|
+
# Peek at DLQ (first 10 events)
|
|
661
|
+
E11y::DeadLetterQueue.peek(limit: 10)
|
|
662
|
+
# => [<Event>, <Event>, ...]
|
|
663
|
+
|
|
664
|
+
# Get DLQ stats
|
|
665
|
+
E11y::DeadLetterQueue.stats
|
|
666
|
+
# => {
|
|
667
|
+
# total: 1234,
|
|
668
|
+
# by_adapter: { loki: 1000, sentry: 234 },
|
|
669
|
+
# oldest: 2.hours.ago,
|
|
670
|
+
# newest: 5.minutes.ago
|
|
671
|
+
# }
|
|
672
|
+
|
|
673
|
+
# Find specific events
|
|
674
|
+
E11y::DeadLetterQueue.find do |event|
|
|
675
|
+
event.name == 'order.paid' && event.payload[:amount] > 1000
|
|
676
|
+
end
|
|
677
|
+
```
|
|
678
|
+
|
|
679
|
+
### Clean DLQ
|
|
680
|
+
|
|
681
|
+
```ruby
|
|
682
|
+
# Clear all DLQ events
|
|
683
|
+
E11y::DeadLetterQueue.clear!
|
|
684
|
+
|
|
685
|
+
# Clear old events (older than 7 days)
|
|
686
|
+
E11y::DeadLetterQueue.clear_old!(7.days)
|
|
687
|
+
|
|
688
|
+
# Clear by adapter
|
|
689
|
+
E11y::DeadLetterQueue.clear!(adapter: :loki)
|
|
690
|
+
```
|
|
691
|
+
|
|
692
|
+
---
|
|
693
|
+
|
|
694
|
+
## 💡 Best Practices
|
|
695
|
+
|
|
696
|
+
### ✅ DO
|
|
697
|
+
|
|
698
|
+
**1. Enable retry for transient errors**
|
|
699
|
+
```ruby
|
|
700
|
+
# ✅ GOOD: Retry on network errors
|
|
701
|
+
config.error_handling.retry_policy do
|
|
702
|
+
enabled true
|
|
703
|
+
retryable_errors [
|
|
704
|
+
Errno::ETIMEDOUT,
|
|
705
|
+
Net::ReadTimeout,
|
|
706
|
+
HTTP::TimeoutError
|
|
707
|
+
]
|
|
708
|
+
end
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
**2. Use DLQ for critical events**
|
|
712
|
+
```ruby
|
|
713
|
+
# ✅ GOOD: DLQ enabled for zero data loss
|
|
714
|
+
config.error_handling.dead_letter_queue do
|
|
715
|
+
enabled true
|
|
716
|
+
adapter :dlq_file
|
|
717
|
+
end
|
|
718
|
+
```
|
|
719
|
+
|
|
720
|
+
**3. Monitor DLQ size**
|
|
721
|
+
```ruby
|
|
722
|
+
# ✅ GOOD: Alert when DLQ grows
|
|
723
|
+
config.error_handling.dead_letter_queue do
|
|
724
|
+
max_size 10_000
|
|
725
|
+
alert_on_size 1000
|
|
726
|
+
end
|
|
727
|
+
|
|
728
|
+
# Set up Prometheus alert:
|
|
729
|
+
# alert: DLQSizeHigh
|
|
730
|
+
# expr: e11y_dlq_size > 1000
|
|
731
|
+
```
|
|
732
|
+
|
|
733
|
+
**4. Replay DLQ regularly**
|
|
734
|
+
```ruby
|
|
735
|
+
# ✅ GOOD: Schedule DLQ replay
|
|
736
|
+
# config/schedule.rb (whenever gem)
|
|
737
|
+
every 10.minutes do
|
|
738
|
+
runner "E11y::DeadLetterQueue.replay_all"
|
|
739
|
+
end
|
|
740
|
+
|
|
741
|
+
# Or Sidekiq job:
|
|
742
|
+
class E11yDlqReplayJob
|
|
743
|
+
include Sidekiq::Job
|
|
744
|
+
|
|
745
|
+
def perform
|
|
746
|
+
E11y::DeadLetterQueue.replay_all
|
|
747
|
+
end
|
|
748
|
+
end
|
|
749
|
+
|
|
750
|
+
# Schedule every 10 minutes
|
|
751
|
+
```
|
|
752
|
+
|
|
753
|
+
---
|
|
754
|
+
|
|
755
|
+
### ❌ DON'T
|
|
756
|
+
|
|
757
|
+
**1. Don't retry non-retryable errors**
|
|
758
|
+
```ruby
|
|
759
|
+
# ❌ BAD: Retrying validation errors (will always fail)
|
|
760
|
+
config.error_handling.retry_policy do
|
|
761
|
+
retryable_errors [
|
|
762
|
+
E11y::ValidationError # ← Will NEVER succeed!
|
|
763
|
+
]
|
|
764
|
+
end
|
|
765
|
+
|
|
766
|
+
# ✅ GOOD: Skip retry for validation errors
|
|
767
|
+
config.error_handling.non_retryable_errors [
|
|
768
|
+
E11y::ValidationError,
|
|
769
|
+
E11y::RateLimitError
|
|
770
|
+
]
|
|
771
|
+
```
|
|
772
|
+
|
|
773
|
+
**2. Don't set too many retries**
|
|
774
|
+
```ruby
|
|
775
|
+
# ❌ BAD: Too many retries (adds latency)
|
|
776
|
+
config.error_handling.retry_policy do
|
|
777
|
+
max_retries 20 # ← Too many! Total delay: minutes
|
|
778
|
+
end
|
|
779
|
+
|
|
780
|
+
# ✅ GOOD: Reasonable retry count
|
|
781
|
+
config.error_handling.retry_policy do
|
|
782
|
+
max_retries 3 # ← Enough for transient errors
|
|
783
|
+
# Total delay: ~700ms (acceptable)
|
|
784
|
+
end
|
|
785
|
+
```
|
|
786
|
+
|
|
787
|
+
**3. Don't ignore DLQ growth**
|
|
788
|
+
```ruby
|
|
789
|
+
# ❌ BAD: No monitoring, DLQ grows indefinitely
|
|
790
|
+
config.error_handling.dead_letter_queue do
|
|
791
|
+
enabled true
|
|
792
|
+
# No max_size, no alerts!
|
|
793
|
+
end
|
|
794
|
+
|
|
795
|
+
# ✅ GOOD: Monitor and alert
|
|
796
|
+
config.error_handling.dead_letter_queue do
|
|
797
|
+
enabled true
|
|
798
|
+
max_size 10_000
|
|
799
|
+
alert_on_size 1000
|
|
800
|
+
|
|
801
|
+
# Auto-cleanup old events
|
|
802
|
+
retention 7.days
|
|
803
|
+
end
|
|
804
|
+
```
|
|
805
|
+
|
|
806
|
+
---
|
|
807
|
+
|
|
808
|
+
## 📊 Monitoring & Metrics
|
|
809
|
+
|
|
810
|
+
### Self-Monitoring Metrics
|
|
811
|
+
|
|
812
|
+
```ruby
|
|
813
|
+
# E11y automatically exports these metrics:
|
|
814
|
+
|
|
815
|
+
# Retries
|
|
816
|
+
e11y_retries_total{adapter, error_type} # Counter
|
|
817
|
+
e11y_retry_attempts{adapter} # Histogram (how many retries before success)
|
|
818
|
+
|
|
819
|
+
# DLQ
|
|
820
|
+
e11y_dlq_size{adapter} # Gauge (current DLQ size)
|
|
821
|
+
e11y_dlq_events_added_total{adapter, error_type} # Counter
|
|
822
|
+
e11y_dlq_events_replayed_total{adapter, status} # Counter (status: success/failure)
|
|
823
|
+
|
|
824
|
+
# Errors
|
|
825
|
+
e11y_adapter_errors_total{adapter, error_type, retryable} # Counter
|
|
826
|
+
e11y_max_retries_exceeded_total{adapter} # Counter
|
|
827
|
+
```
|
|
828
|
+
|
|
829
|
+
### Prometheus Alerts
|
|
830
|
+
|
|
831
|
+
```yaml
|
|
832
|
+
groups:
|
|
833
|
+
- name: e11y_error_handling
|
|
834
|
+
rules:
|
|
835
|
+
# DLQ growing
|
|
836
|
+
- alert: E11yDLQSizeHigh
|
|
837
|
+
expr: e11y_dlq_size > 1000
|
|
838
|
+
for: 5m
|
|
839
|
+
annotations:
|
|
840
|
+
summary: "E11y DLQ has >1000 events"
|
|
841
|
+
|
|
842
|
+
# High retry rate
|
|
843
|
+
- alert: E11yHighRetryRate
|
|
844
|
+
expr: rate(e11y_retries_total[5m]) > 10
|
|
845
|
+
for: 5m
|
|
846
|
+
annotations:
|
|
847
|
+
summary: "E11y retrying >10 events/sec"
|
|
848
|
+
|
|
849
|
+
# Max retries exceeded
|
|
850
|
+
- alert: E11yMaxRetriesExceeded
|
|
851
|
+
expr: rate(e11y_max_retries_exceeded_total[5m]) > 1
|
|
852
|
+
for: 5m
|
|
853
|
+
annotations:
|
|
854
|
+
summary: "E11y events failing after max retries"
|
|
855
|
+
```
|
|
856
|
+
|
|
857
|
+
---
|
|
858
|
+
|
|
859
|
+
## 🧪 Testing
|
|
860
|
+
|
|
861
|
+
### RSpec Examples
|
|
862
|
+
|
|
863
|
+
```ruby
|
|
864
|
+
RSpec.describe 'E11y Error Handling' do
|
|
865
|
+
describe 'Retry Policy' do
|
|
866
|
+
it 'retries on transient errors' do
|
|
867
|
+
adapter = instance_double(E11y::Adapters::LokiAdapter)
|
|
868
|
+
|
|
869
|
+
# First 2 attempts fail, 3rd succeeds
|
|
870
|
+
allow(adapter).to receive(:write_batch)
|
|
871
|
+
.and_raise(Net::ReadTimeout).twice
|
|
872
|
+
allow(adapter).to receive(:write_batch)
|
|
873
|
+
.and_return(E11y::Result.success).once
|
|
874
|
+
|
|
875
|
+
Events::OrderCreated.track(order_id: '123')
|
|
876
|
+
|
|
877
|
+
# Should retry twice, then succeed
|
|
878
|
+
expect(adapter).to have_received(:write_batch).exactly(3).times
|
|
879
|
+
end
|
|
880
|
+
|
|
881
|
+
it 'does not retry non-retryable errors' do
|
|
882
|
+
adapter = instance_double(E11y::Adapters::LokiAdapter)
|
|
883
|
+
|
|
884
|
+
allow(adapter).to receive(:write_batch)
|
|
885
|
+
.and_raise(E11y::ValidationError)
|
|
886
|
+
|
|
887
|
+
Events::OrderCreated.track(order_id: '123')
|
|
888
|
+
|
|
889
|
+
# Should try once, then give up (no retry)
|
|
890
|
+
expect(adapter).to have_received(:write_batch).once
|
|
891
|
+
end
|
|
892
|
+
end
|
|
893
|
+
|
|
894
|
+
describe 'Dead Letter Queue' do
|
|
895
|
+
it 'sends to DLQ after max retries' do
|
|
896
|
+
adapter = instance_double(E11y::Adapters::LokiAdapter)
|
|
897
|
+
|
|
898
|
+
# All retries fail
|
|
899
|
+
allow(adapter).to receive(:write_batch)
|
|
900
|
+
.and_raise(Net::ReadTimeout)
|
|
901
|
+
|
|
902
|
+
expect {
|
|
903
|
+
Events::OrderCreated.track(order_id: '123')
|
|
904
|
+
}.to change { E11y::DeadLetterQueue.size }.by(1)
|
|
905
|
+
end
|
|
906
|
+
|
|
907
|
+
it 'replays DLQ events' do
|
|
908
|
+
# Add event to DLQ
|
|
909
|
+
E11y::DeadLetterQueue.add(
|
|
910
|
+
event: build_event(name: 'order.created'),
|
|
911
|
+
adapter: :loki,
|
|
912
|
+
error: 'Network timeout'
|
|
913
|
+
)
|
|
914
|
+
|
|
915
|
+
adapter = instance_double(E11y::Adapters::LokiAdapter)
|
|
916
|
+
allow(adapter).to receive(:write_batch).and_return(E11y::Result.success)
|
|
917
|
+
|
|
918
|
+
# Replay DLQ
|
|
919
|
+
E11y::DeadLetterQueue.replay_all
|
|
920
|
+
|
|
921
|
+
# DLQ should be empty
|
|
922
|
+
expect(E11y::DeadLetterQueue.size).to eq(0)
|
|
923
|
+
|
|
924
|
+
# Event should be sent
|
|
925
|
+
expect(adapter).to have_received(:write_batch).once
|
|
926
|
+
end
|
|
927
|
+
end
|
|
928
|
+
end
|
|
929
|
+
```
|
|
930
|
+
|
|
931
|
+
---
|
|
932
|
+
|
|
933
|
+
## 🔗 Related Use Cases
|
|
934
|
+
|
|
935
|
+
- **[UC-011: Rate Limiting](./UC-011-rate-limiting.md)** - Protect system from overload
|
|
936
|
+
- **[UC-015: Cost Optimization](./UC-015-cost-optimization.md)** - Sampling and compression for cost reduction
|
|
937
|
+
- **[CONFLICT-ANALYSIS](../CONFLICT-ANALYSIS.md)** - Circuit Breaker interaction
|
|
938
|
+
|
|
939
|
+
---
|
|
940
|
+
|
|
941
|
+
## 🚀 Quick Start Checklist
|
|
942
|
+
|
|
943
|
+
- [ ] Enable retry policy in config
|
|
944
|
+
- [ ] Configure max_retries (recommend: 3)
|
|
945
|
+
- [ ] Enable dead letter queue
|
|
946
|
+
- [ ] Configure DLQ adapter (file or database)
|
|
947
|
+
- [ ] Set up DLQ replay job (every 10 minutes)
|
|
948
|
+
- [ ] Configure Prometheus alerts for DLQ size
|
|
949
|
+
- [ ] Test retry behavior in staging
|
|
950
|
+
- [ ] Monitor retry rate and DLQ growth
|
|
951
|
+
|
|
952
|
+
---
|
|
953
|
+
|
|
954
|
+
**Status:** ✅ Reliability Feature
|
|
955
|
+
**Priority:** High (zero data loss)
|
|
956
|
+
**Complexity:** Intermediate
|