e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,1906 @@
1
+ # UC-011: Rate Limiting (DoS Protection)
2
+
3
+ **Status:** MVP Feature (Critical for Production)
4
+ **Complexity:** Intermediate
5
+ **Setup Time:** 20-30 minutes
6
+ **Target Users:** Security Engineers, SRE, Backend Developers
7
+
8
+ ---
9
+
10
+ ## 📋 Overview
11
+
12
+ ### Problem Statement
13
+
14
+ **The production incident:**
15
+ ```ruby
16
+ # ❌ NO RATE LIMITING: Infinite retry storm
17
+ begin
18
+ process_payment(order)
19
+ rescue PaymentError => e
20
+ # Retry immediately (bad idea!)
21
+ 3.times do
22
+ Events::PaymentRetry.track(order_id: order.id, attempt: _1)
23
+ end
24
+ end
25
+
26
+ # What happened:
27
+ # - 1000 failed payments
28
+ # - 3000 retry events
29
+ # - × 100 fields per event
30
+ # - = 300,000 field values in 10 seconds
31
+ # → Buffer overflow
32
+ # → Loki API rate limit hit (429)
33
+ # → All observability lost during incident! 😱
34
+ ```
35
+
36
+ **Real incident impact:**
37
+ - **09:00 AM**: Payment gateway down
38
+ - **09:01 AM**: 50k retry events/sec flooding E11y
39
+ - **09:02 AM**: Loki returns 429 (rate limit)
40
+ - **09:03 AM**: E11y buffer full, events dropped
41
+ - **09:05 AM**: **No observability** - blind during incident
42
+ - **09:30 AM**: Incident resolved, but root cause unclear (no logs!)
43
+
44
+ ### E11y Solution
45
+
46
+ **3-Layer Rate Limiting (Global + Per-Event + Per-Context):**
47
+ ```ruby
48
+ # ✅ PROTECTED: Multi-layer rate limiting
49
+ E11y.configure do |config|
50
+ config.rate_limiting do
51
+ # Layer 1: Global limit (protect buffer)
52
+ global limit: 10_000, window: 1.minute
53
+
54
+ # Layer 2: Per-event limit (prevent retry storms)
55
+ per_event 'payment.retry', limit: 100, window: 1.minute
56
+
57
+ # Layer 3: Per-context limit (per user/IP)
58
+ per_context :user_id, limit: 1_000, window: 1.minute
59
+ per_context :ip_address, limit: 500, window: 1.minute
60
+
61
+ # What happens when limit exceeded:
62
+ on_exceeded :sample # Keep 10%, drop 90%
63
+ sample_rate 0.1
64
+
65
+ # Alert on rate limiting
66
+ alert_on_limit true
67
+ alert_channel '#observability'
68
+ end
69
+ end
70
+
71
+ # Result during incident:
72
+ # - Global limit: 10k/min enforced
73
+ # - Payment retry: 100/min enforced
74
+ # - Per user: 1k/min enforced
75
+ # → Observability maintained ✅
76
+ # → Root cause identified quickly ✅
77
+ ```
78
+
79
+ ---
80
+
81
+ ## 🎯 Event-Level Rate Limiting (NEW - v1.1)
82
+
83
+ > **🎯 CONTRADICTION_01 Resolution:** Move rate limiting config from global initializer to event classes.
84
+
85
+ **Event-level rate limiting DSL:**
86
+
87
+ ```ruby
88
+ # app/events/payment_retry.rb
89
+ module Events
90
+ class PaymentRetry < E11y::Event::Base
91
+ schema do
92
+ required(:order_id).filled(:string)
93
+ required(:attempt).filled(:integer)
94
+ end
95
+
96
+ # ✨ Event-level rate limiting (right next to schema!)
97
+ rate_limit 100, window: 1.minute # Max 100 retries/min
98
+ on_exceeded :drop # Drop retry logs (not critical)
99
+ end
100
+ end
101
+
102
+ # app/events/user_login_failed.rb
103
+ module Events
104
+ class UserLoginFailed < E11y::Event::Base
105
+ schema do
106
+ required(:user_id).filled(:string)
107
+ required(:ip_address).filled(:string)
108
+ end
109
+
110
+ # ✨ Event-level rate limiting
111
+ rate_limit 50, window: 1.minute # Max 50 failures/min
112
+ on_exceeded :sample # Keep 20% (flattened syntax!)
113
+ sample_rate 0.2
114
+ end
115
+ end
116
+ ```
117
+
118
+ **Inheritance for rate limiting:**
119
+
120
+ ```ruby
121
+ # Base class with common rate limiting
122
+ module Events
123
+ class BaseDebugEvent < E11y::Event::Base
124
+ # Common for ALL debug events
125
+ severity :debug
126
+ rate_limit 100, window: 1.minute # Low limit
127
+ on_exceeded :drop # Drop debug logs (not critical)
128
+ sample_rate 0.01 # 1% sampling
129
+ end
130
+ end
131
+
132
+ # Inherit from base
133
+ class Events::DebugSqlQuery < Events::BaseDebugEvent
134
+ schema do; required(:query).filled(:string); end
135
+ # ← Inherits: rate_limit 100 + on_exceeded :drop
136
+ end
137
+
138
+ class Events::DebugApiCall < Events::BaseDebugEvent
139
+ schema do; required(:endpoint).filled(:string); end
140
+ # ← Inherits: rate_limit 100 + on_exceeded :drop
141
+ end
142
+ ```
143
+
144
+ **Preset modules for rate limiting:**
145
+
146
+ ```ruby
147
+ # lib/e11y/presets/high_value_event.rb
148
+ module E11y
149
+ module Presets
150
+ module HighValueEvent
151
+ extend ActiveSupport::Concern
152
+ included do
153
+ rate_limit 10_000 # High limit
154
+ on_exceeded :throttle # Slow down, don't drop
155
+ sample_rate 1.0 # Never sample
156
+ end
157
+ end
158
+
159
+ module DebugEvent
160
+ extend ActiveSupport::Concern
161
+ included do
162
+ rate_limit 100 # Low limit
163
+ on_exceeded :drop # Drop debug logs
164
+ sample_rate 0.01 # 1% sampling
165
+ end
166
+ end
167
+ end
168
+ end
169
+
170
+ # Usage:
171
+ class Events::PaymentProcessed < E11y::Event::Base
172
+ include E11y::Presets::HighValueEvent # ← Rate limit inherited!
173
+ schema do; required(:transaction_id).filled(:string); end
174
+ end
175
+ ```
176
+
177
+ **Conventions for rate limiting (sensible defaults):**
178
+
179
+ ```ruby
180
+ # Convention: Default rate limit = 1000 events/sec
181
+ # Override only for high-volume or low-volume events
182
+
183
+ # Zero-config event (uses convention):
184
+ class Events::OrderCreated < E11y::Event::Base
185
+ schema do; required(:order_id).filled(:string); end
186
+ # ← Auto: rate_limit = 1000 (default)
187
+ end
188
+
189
+ # Override for high-volume:
190
+ class Events::PageView < E11y::Event::Base
191
+ rate_limit 10_000 # ← Override: high-volume
192
+ schema do; required(:page).filled(:string); end
193
+ end
194
+
195
+ # Override for low-volume:
196
+ class Events::DebugQuery < E11y::Event::Base
197
+ rate_limit 100 # ← Override: low-volume
198
+ schema do; required(:query).filled(:string); end
199
+ end
200
+ ```
201
+
202
+ **Precedence (event-level overrides global):**
203
+
204
+ ```ruby
205
+ # Global config (infrastructure):
206
+ E11y.configure do |config|
207
+ config.rate_limiting do
208
+ global limit: 10_000, window: 1.minute
209
+ default_per_event_limit 1000 # Default for events
210
+ end
211
+ end
212
+
213
+ # Event-level config (overrides global):
214
+ class Events::PaymentRetry < E11y::Event::Base
215
+ rate_limit 100 # ← Override: 100 (not 1000)
216
+ on_exceeded :drop # ← Override: drop (not default)
217
+ end
218
+ ```
219
+
220
+ **Benefits:**
221
+ - ✅ Locality of behavior (rate limit next to schema)
222
+ - ✅ DRY via inheritance/presets
223
+ - ✅ Sensible defaults (1000/sec)
224
+ - ✅ Easy to override when needed
225
+
226
+ ---
227
+
228
+ ## 🎯 The 3-Layer Rate Limiting System
229
+
230
+ ### Layer 1: Global Rate Limiting
231
+
232
+ **Protect E11y infrastructure from flooding:**
233
+
234
+ ```ruby
235
+ E11y.configure do |config|
236
+ config.rate_limiting do
237
+ # === GLOBAL LIMIT ===
238
+ # Across ALL events, ALL sources
239
+ global limit: 10_000, # Max 10k events
240
+ window: 1.minute, # Per minute
241
+ algorithm: :sliding_window # OR :token_bucket, :fixed_window
242
+
243
+ # What happens when exceeded:
244
+ on_exceeded :sample # Options: :drop, :sample, :throttle
245
+ sample_rate 0.1 # Keep 10% when over limit
246
+
247
+ # Track dropped events
248
+ track_drops true
249
+ end
250
+ end
251
+
252
+ # How it works:
253
+ # - Counts events across entire system
254
+ # - If > 10k/min → apply sample_rate (90% dropped)
255
+ # - Metrics: e11y_rate_limit_global_hits_total
256
+ ```
257
+
258
+ **Algorithms:**
259
+
260
+ | Algorithm | Behavior | Use Case |
261
+ |-----------|----------|----------|
262
+ | `:sliding_window` | Smooth rate control | **Default** (best for most cases) |
263
+ | `:token_bucket` | Allows bursts | APIs with bursty traffic |
264
+ | `:fixed_window` | Simple but has edge cases | Low-volume scenarios |
265
+
266
+ ---
267
+
268
+ ### Layer 2: Per-Event Rate Limiting
269
+
270
+ **Prevent specific events from flooding:**
271
+
272
+ ```ruby
273
+ E11y.configure do |config|
274
+ config.rate_limiting do
275
+ # === PER-EVENT LIMITS ===
276
+
277
+ # Retry events (common culprit)
278
+ per_event 'payment.retry',
279
+ limit: 100,
280
+ window: 1.minute,
281
+ on_exceeded: :drop # Drop retry logs (not critical)
282
+
283
+ # Login failures (security)
284
+ per_event 'user.login.failed',
285
+ limit: 50,
286
+ window: 1.minute,
287
+ on_exceeded: :sample,
288
+ sample_rate: 0.2 # Keep 20%
289
+
290
+ # API errors (debugging)
291
+ per_event 'api.error',
292
+ limit: 200,
293
+ window: 1.minute,
294
+ on_exceeded: :throttle # Slow down, don't drop
295
+
296
+ # Background job failures
297
+ per_event 'job.failed',
298
+ limit: 500,
299
+ window: 5.minutes,
300
+ on_exceeded: :sample,
301
+ sample_rate: 0.1
302
+ end
303
+ end
304
+
305
+ # Usage:
306
+ Events::PaymentRetry.track(order_id: '123', attempt: 1)
307
+ Events::PaymentRetry.track(order_id: '456', attempt: 1)
308
+ # ... 99 more in same minute → All tracked
309
+
310
+ Events::PaymentRetry.track(order_id: '789', attempt: 1)
311
+ # → 101st event in minute → DROPPED (limit: 100)
312
+ # → Metric: e11y_rate_limit_per_event_hits_total{event="payment.retry"}
313
+ ```
314
+
315
+ ---
316
+
317
+ ### Layer 3: Per-Context Rate Limiting
318
+
319
+ **Prevent single user/IP/tenant from flooding:**
320
+
321
+ ```ruby
322
+ E11y.configure do |config|
323
+ config.rate_limiting do
324
+ # === PER-CONTEXT LIMITS ===
325
+
326
+ # Per user (prevent single user abuse)
327
+ per_context :user_id,
328
+ limit: 1_000,
329
+ window: 1.minute,
330
+ on_exceeded: :sample,
331
+ sample_rate: 0.1
332
+
333
+ # Per IP address (prevent DDoS)
334
+ per_context :ip_address,
335
+ limit: 500,
336
+ window: 1.minute,
337
+ on_exceeded: :drop
338
+
339
+ # Per tenant (multi-tenant apps)
340
+ per_context :tenant_id,
341
+ limit: 5_000,
342
+ window: 1.minute,
343
+ on_exceeded: :throttle
344
+
345
+ # Per session (prevent session replay attacks)
346
+ per_context :session_id,
347
+ limit: 200,
348
+ window: 1.minute,
349
+ on_exceeded: :drop
350
+ end
351
+ end
352
+
353
+ # How it works:
354
+ # User A: 1000 events/min → OK
355
+ # User A: 1001st event → 90% dropped (sample_rate 0.1)
356
+ # User B: 1000 events/min → OK (separate limit)
357
+ ```
358
+
359
+ **Context extraction:**
360
+ ```ruby
361
+ # E11y automatically extracts context from:
362
+ # 1. Event payload: event.payload[:user_id]
363
+ # 2. Event context: event.context[:user_id]
364
+ # 3. Rails Current: Current.user_id
365
+ # 4. Custom extractor:
366
+
367
+ E11y.configure do |config|
368
+ config.rate_limiting do
369
+ per_context :user_id,
370
+ limit: 1_000,
371
+ window: 1.minute,
372
+ extractor: ->(event) {
373
+ # Custom logic to extract user_id
374
+ event.payload[:user_id] || event.context[:current_user]&.id
375
+ }
376
+ end
377
+ end
378
+ ```
379
+
380
+ ---
381
+
382
+ ### Layer 4: DLQ Filter Integration (C02 Resolution) ⚠️
383
+
384
+ > **Reference:** See [ADR-013 §4.6: Rate Limiting × DLQ Filter](../ADR-013-reliability-error-handling.md#46-rate-limiting--dlq-filter-interaction-c02-resolution) for full architecture.
385
+
386
+ **Problem:** Rate limiting drops events BEFORE they reach DLQ filter. Critical events (e.g., payments) may be lost during traffic spikes, even though DLQ filter says "always save payments".
387
+
388
+ **Solution:** Rate limiter respects DLQ `always_save` filter - critical events bypass rate limits.
389
+
390
+ ```ruby
391
+ E11y.configure do |config|
392
+ config.rate_limiting do
393
+ enabled true
394
+ global limit: 10_000, window: 1.minute
395
+
396
+ # ✅ Respect DLQ filter for critical events
397
+ respect_dlq_filter true # Critical events bypass rate limits!
398
+
399
+ # Alternative: Explicit bypass patterns
400
+ bypass_for do
401
+ event_patterns ['payment.*', 'order.*', 'audit.*']
402
+ severities [:error, :fatal]
403
+ end
404
+ end
405
+
406
+ # DLQ filter configuration
407
+ config.error_handling.dead_letter_queue.filter do
408
+ always_save do
409
+ event_patterns ['payment.*', 'order.*']
410
+ end
411
+ end
412
+ end
413
+
414
+ # Scenario: Traffic spike (15,000 payment failures/sec)
415
+ 15_000.times do
416
+ Events::PaymentFailed.track(order_id: '123', amount: 500)
417
+ end
418
+
419
+ # Result:
420
+ # - Rate limit: 10,000/min
421
+ # - Excess: 5,000 events over limit
422
+ # - ❌ WITHOUT C02 fix: 5,000 critical payment events DROPPED!
423
+ # - ✅ WITH C02 fix: ALL payment events processed (bypass rate limit!)
424
+ # → Why? DLQ filter says "always_save payment.*"
425
+ # → Rate limiter checks: dlq_filter.always_save?(event) → true
426
+ # → Bypass rate limit → event goes to buffer → success!
427
+ ```
428
+
429
+ **Flow Diagram:**
430
+
431
+ ```
432
+ Event → Rate Limiter
433
+ ├─ Check: dlq_filter.always_save?(event)?
434
+ │ ├─ YES (critical event) → ✅ BYPASS rate limit → Buffer
435
+ │ └─ NO (non-critical)
436
+ │ ├─ Under limit? → ✅ PASS → Buffer
437
+ │ └─ Over limit? → ❌ DROP (or sample)
438
+ └─ Buffer → Adapter → DLQ (if adapter fails)
439
+ ```
440
+
441
+ **Configuration Options:**
442
+
443
+ ```ruby
444
+ # Option 1: Auto-respect DLQ filter (recommended)
445
+ config.rate_limiting.respect_dlq_filter = true
446
+
447
+ # Option 2: Explicit bypass patterns (more control)
448
+ config.rate_limiting.bypass_for do
449
+ event_patterns ['payment.*', 'fraud.*', 'security.*']
450
+ severities [:error, :fatal]
451
+ custom_check { |event| event[:vip_customer] == true }
452
+ end
453
+
454
+ # Option 3: Hybrid (DLQ filter + extra patterns)
455
+ config.rate_limiting do
456
+ respect_dlq_filter true # Respect DLQ always_save
457
+ bypass_for do
458
+ event_patterns ['audit.*'] # Additional patterns
459
+ end
460
+ end
461
+ ```
462
+
463
+ **Trade-offs:**
464
+
465
+ | Aspect | Pro | Con | Decision |
466
+ |--------|-----|-----|----------|
467
+ | **Bypass critical events** | Zero data loss for payments | Rate limit less effective during attacks | Critical events > rate limits |
468
+ | **respect_dlq_filter** | DRY (single source of truth) | Tight coupling to DLQ config | Worth it for simplicity |
469
+ | **bypass_for patterns** | Flexible custom rules | Need to maintain bypass list | Use for edge cases only |
470
+
471
+ ---
472
+
473
+ ### Layer 5: Retry Rate Limiting (C06 Resolution) ⚠️
474
+
475
+ > **Reference:** See [ADR-013 §3.5: Retry Rate Limiting](../ADR-013-reliability-error-handling.md#35-retry-rate-limiting-c06-resolution) for full architecture.
476
+
477
+ **Problem:** Adapter failures trigger retries. If 1000 events fail → 3000 retry attempts (thundering herd) → buffer overflow.
478
+
479
+ **Solution:** Separate rate limit for RETRIES (staged retry with jitter).
480
+
481
+ ```ruby
482
+ E11y.configure do |config|
483
+ config.error_handling do
484
+ retry_policy do
485
+ max_attempts 3
486
+ base_delay 100 # ms
487
+ max_delay 5000 # ms
488
+ exponential_backoff true
489
+ jitter true
490
+
491
+ # ✅ Retry rate limiting (separate from main rate limit!)
492
+ retry_rate_limit do
493
+ enabled true
494
+ limit 1000 # Max 1000 retries/minute (not 10k!)
495
+ window 1.minute
496
+
497
+ # When retry is rate-limited:
498
+ on_limit_exceeded :delay # Options: :drop, :delay, :dlq
499
+
500
+ # Delay strategy (staged retry)
501
+ delay_strategy do
502
+ base_delay 1000 # 1 sec
503
+ max_delay 60_000 # 60 sec
504
+ backoff_multiplier 2 # 1s → 2s → 4s → 8s → 16s → 32s → 60s
505
+ jitter_range 0.2 # ±20% randomization
506
+ end
507
+ end
508
+ end
509
+ end
510
+ end
511
+
512
+ # Scenario: Loki down for 5 minutes (adapter fails)
513
+ # - 10,000 events/min attempted
514
+ # - Adapter fails → 10,000 events need retry
515
+ # - Without retry rate limiting:
516
+ # - 10,000 × 3 attempts = 30,000 retries
517
+ # - Immediate retry storm (buffer overflow!)
518
+ # - With retry rate limiting:
519
+ # - First retry: 1,000 events (rate limit enforced)
520
+ # - Next 9,000 events: delayed (staged retry)
521
+ # - Retry schedule:
522
+ # - 00:00 - 1,000 retries (immediate)
523
+ # - 00:01 - 1,000 retries (delayed 1s)
524
+ # - 00:02 - 1,000 retries (delayed 2s)
525
+ # - 00:04 - 1,000 retries (delayed 4s)
526
+ # - ... (exponential backoff)
527
+ # - 01:00 - Last batch (delayed 60s)
528
+ # - Result: ✅ No buffer overflow! Smooth retry spread over time.
529
+ ```
530
+
531
+ **Retry Timeline Comparison:**
532
+
533
+ ```
534
+ WITHOUT retry rate limiting:
535
+ 00:00 Loki down
536
+ 00:00 10,000 events fail → 30,000 immediate retries ❌ BUFFER OVERFLOW!
537
+ 00:01 All retries exhausted, 10,000 events lost
538
+
539
+ WITH retry rate limiting:
540
+ 00:00 Loki down
541
+ 00:00 10,000 events fail → 1,000 immediate retries (rate limited)
542
+ 00:01 1,000 retries (delayed)
543
+ 00:02 1,000 retries (delayed)
544
+ 00:04 1,000 retries (delayed)
545
+ 00:08 1,000 retries (delayed)
546
+ 00:16 1,000 retries (delayed)
547
+ 00:32 1,000 retries (delayed)
548
+ 01:00 1,000 retries (delayed)
549
+ 02:00 1,000 retries (delayed)
550
+ 04:00 1,000 retries (delayed)
551
+ 05:00 Loki back online → delayed retries complete → ✅ All 10,000 events saved!
552
+ ```
553
+
554
+ **Configuration: Main Rate Limit vs Retry Rate Limit**
555
+
556
+ ```ruby
557
+ E11y.configure do |config|
558
+ # Main rate limiting (for NEW events)
559
+ config.rate_limiting do
560
+ global limit: 10_000, window: 1.minute
561
+ on_exceeded :sample # Sample excess events
562
+ end
563
+
564
+ # Retry rate limiting (for FAILED events)
565
+ config.error_handling.retry_policy.retry_rate_limit do
566
+ limit 1_000 # 10× LOWER than main limit!
+ window 1.minute
567
+ on_limit_exceeded :delay # Delay excess retries (don't drop!)
568
+ end
569
+ end
570
+
571
+ # Why separate limits?
572
+ # 1. Retries are MORE expensive (adapter already failed once)
573
+ # 2. Lower retry rate prevents cascading failures (give adapter time to recover)
574
+ # 3. Main limit protects ingestion, retry limit protects adapter
575
+ ```
576
+
577
+ **Trade-offs:**
578
+
579
+ | Aspect | Pro | Con | Decision |
580
+ |--------|-----|-----|----------|
581
+ | **Retry rate limit 10× lower** | Prevents adapter overload | Slower retry | Adapter stability > speed |
582
+ | **:delay (not :drop)** | No data loss | Memory for delayed queue | Worth it for reliability |
583
+ | **Exponential backoff + jitter** | Smooth recovery | Complex timing | Industry best practice |
584
+ | **Separate from main limit** | Fine-grained control | More config | Flexibility > simplicity |
585
+
586
+ ---
587
+
588
+ ## 💻 Rate Limiting Strategies
589
+
590
+ ### Strategy 1: Drop
591
+
592
+ **Discard excess events:**
593
+ ```ruby
594
+ on_exceeded :drop
595
+
596
+ # Use when:
597
+ # - Non-critical events (retry logs, debug events)
598
+ # - High volume, low value events
599
+ # - Already have enough signal
600
+
601
+ # Example:
602
+ per_event 'debug.log', limit: 100, window: 1.minute, on_exceeded: :drop
603
+ ```
604
+
605
+ ---
606
+
607
+ ### Strategy 2: Sample
608
+
609
+ **Keep percentage of excess events:**
610
+ ```ruby
611
+ on_exceeded :sample
612
+ sample_rate 0.1 # Keep 10%
613
+
614
+ # Use when:
615
+ # - Want SOME signal during flood
616
+ # - Statistical analysis OK (don't need every event)
617
+ # - Moderate volume
618
+
619
+ # Example:
620
+ per_event 'user.action', limit: 1000, window: 1.minute,
621
+ on_exceeded: :sample
622
+ # → Sample rate: 0.1 (10%)
623
+ # → First 1000: all kept
624
+ # → Next 9000: 10% kept (900 events)
625
+ # → Total: 1900 events (vs 10,000 without rate limiting)
626
+ ```
627
+
628
+ ---
629
+
630
+ ### Strategy 3: Backpressure
631
+
632
+ **Slow down event production:**
633
+ ```ruby
634
+ on_exceeded :throttle
635
+
636
+ # Use when:
637
+ # - Events MUST be tracked (critical)
638
+ # - Can afford latency increase
639
+ # - Low to moderate volume
640
+
641
+ # How it works:
642
+ # 1. Limit exceeded
643
+ # 2. Sleep 10ms before tracking next event
644
+ # 3. Gradual slow down (not sudden drop)
645
+
646
+ # Example:
647
+ per_event 'order.created', limit: 100, window: 1.minute,
648
+ on_exceeded: :throttle,
649
+ backpressure_delay: 10.milliseconds
650
+ ```
651
+
652
+ ---
653
+
654
+ ### Strategy 4: Aggregate
655
+
656
+ **Combine events into summary:**
657
+ ```ruby
658
+ on_exceeded :aggregate
659
+
660
+ # Use when:
661
+ # - Many similar events
662
+ # - Summary is sufficient
663
+ # - High volume
664
+
665
+ # How it works:
666
+ # 1. First 100 events: tracked individually
667
+ # 2. Next 900 events: aggregated into 1 summary event
668
+ # 3. Summary includes: count, min/max/avg, sample
669
+
670
+ # Example:
671
+ per_event 'api.slow_request', limit: 100, window: 1.minute,
672
+ on_exceeded: :aggregate,
673
+ aggregate_fields: [:duration_ms, :endpoint]
674
+ # → First 100: individual events
675
+ # → Next 900: Summary event:
676
+ # {
677
+ # event_name: 'api.slow_request.aggregated',
678
+ # count: 900,
679
+ # duration_ms_min: 501,
680
+ # duration_ms_max: 5000,
681
+ # duration_ms_avg: 1200,
682
+ # endpoints: ['/api/users', '/api/orders']
683
+ # }
684
+ ```
685
+
686
+ ---
687
+
688
+ ## 🚫 Bypass Rules (Allowlists)
689
+
690
+ **Always allow critical events:**
691
+
692
+ ```ruby
693
+ E11y.configure do |config|
694
+ config.rate_limiting do
695
+ # Global rate limiting
696
+ global limit: 10_000, window: 1.minute
697
+
698
+ # === BYPASS RULES ===
699
+
700
+ # Bypass by event type
701
+ bypass_for event_types: [
702
+ 'system.critical', # System-critical events
703
+ 'security.alert', # Security alerts
704
+ 'payment.fraud', # Fraud detection
705
+ 'data.corruption' # Data integrity issues
706
+ ]
707
+
708
+ # Bypass by severity
709
+ bypass_for severities: [:fatal, :error]
710
+
711
+ # Bypass by context
712
+ bypass_for contexts: {
713
+ env: 'production', # Only production
714
+ user_role: 'admin' # Admin users
715
+ }
716
+
717
+ # Bypass for specific users (VIPs)
718
+ bypass_for_users ['vip_user_1', 'vip_user_2']
719
+
720
+ # Custom bypass logic
721
+ bypass_if ->(event) {
722
+ # Always track events with high order amounts
723
+ event.payload[:amount].to_i > 10_000
724
+ }
725
+ end
726
+ end
727
+
728
+ # Result:
729
+ # - Normal events: rate limited
730
+ # - Critical events: ALWAYS tracked (bypass)
731
+ ```
732
+
733
+ ---
734
+
735
+ ## 📊 Implementation with Redis
736
+
737
+ **Production-ready implementation using Redis:**
738
+
739
+ ```ruby
740
+ # lib/e11y/processing/rate_limiter.rb
741
+ module E11y
742
+ module Processing
743
+ class RateLimiter
744
+ def initialize(redis: Redis.new)
745
+ @redis = redis
746
+ @config = E11y.config.rate_limiting
747
+ end
748
+
749
+ def allowed?(event)
750
+ # Check bypass rules first
751
+ return true if bypassed?(event)
752
+
753
+ # Check global limit
754
+ return false unless check_global_limit(event)
755
+
756
+ # Check per-event limit
757
+ return false unless check_per_event_limit(event)
758
+
759
+ # Check per-context limits
760
+ return false unless check_per_context_limits(event)
761
+
762
+ true
763
+ end
764
+
765
+ private
766
+
767
+ def check_global_limit(event)
768
+ key = 'e11y:rate_limit:global'
769
+ limit = @config.global_limit
770
+ window = @config.global_window
771
+
772
+ check_limit(key, limit, window)
773
+ end
774
+
775
+ def check_per_event_limit(event)
776
+ limit_config = @config.per_event_limits[event.event_name]
777
+ return true unless limit_config
778
+
779
+ key = "e11y:rate_limit:event:#{event.event_name}"
780
+ check_limit(key, limit_config[:limit], limit_config[:window])
781
+ end
782
+
783
+ def check_per_context_limits(event)
784
+ @config.per_context_limits.all? do |field, limit_config|
785
+ value = extract_context_value(event, field, limit_config[:extractor])
786
+ next true unless value
787
+
788
+ key = "e11y:rate_limit:context:#{field}:#{value}"
789
+ check_limit(key, limit_config[:limit], limit_config[:window])
790
+ end
791
+ end
792
+
793
+ def check_limit(key, limit, window)
794
+ # Sliding window counter using Redis sorted sets
795
+ now = Time.now.to_f
796
+ window_start = now - window
797
+
798
+ # Remove old entries (outside window)
799
+ @redis.zremrangebyscore(key, 0, window_start)
800
+
801
+ # Count current entries
802
+ current_count = @redis.zcard(key)
803
+
804
+ if current_count < limit
805
+ # Add new entry
806
+ @redis.zadd(key, now, "#{now}-#{SecureRandom.hex(8)}")
807
+ @redis.expire(key, window.to_i + 60) # TTL = window + buffer
808
+ true
809
+ else
810
+ # Limit exceeded
811
+ handle_exceeded(key, current_count, limit)
812
+ false
813
+ end
814
+ end
815
+
816
+ def handle_exceeded(key, current, limit)
817
+ # Track metric
818
+ Yabeda.e11y_internal.rate_limit_hits_total.increment(
819
+ limit_type: extract_limit_type(key),
820
+ key: key
821
+ )
822
+
823
+ # Log warning
824
+ E11y.logger.warn(
825
+ "[E11y] Rate limit exceeded: #{key} (#{current}/#{limit})"
826
+ )
827
+
828
+ # Alert if configured
829
+ if @config.alert_on_limit
830
+ alert_rate_limit_exceeded(key, current, limit)
831
+ end
832
+ end
833
+
834
+ def bypassed?(event)
835
+ # Check bypass rules
836
+ @config.bypass_rules.any? do |rule|
837
+ case rule[:type]
838
+ when :event_types
839
+ rule[:values].include?(event.event_name)
840
+ when :severities
841
+ rule[:values].include?(event.severity)
842
+ when :contexts
843
+ rule[:values].all? { |k, v| event.context[k] == v }
844
+ when :custom
845
+ rule[:condition].call(event)
846
+ end
847
+ end
848
+ end
849
+ end
850
+ end
851
+ end
852
+ ```
853
+
854
+ ---
855
+
856
+ ## 🔧 Implementation Details
857
+
858
+ > **Implementation:** See [ADR-006 Section 4.0: Rate Limiting + Retry Policy Resolution](../ADR-006-security-compliance.md#40-rate-limiting--retry-policy-resolution-conflict-14) for detailed architecture.
859
+
860
+ ### Middleware Flow
861
+
862
+ E11y rate limiting is implemented as **middleware** in the event processing pipeline. Understanding the flow helps debug rate limiting behavior and optimize performance.
863
+
864
+ **Pipeline Order:**
865
+ ```
866
+ Event.track()
867
+ → Schema Validation
868
+ → Context Enrichment
869
+ → Rate Limiting Middleware ← YOU ARE HERE
870
+ → Adaptive Sampling
871
+ → PII Filtering
872
+ → Audit Signing
873
+ → Adapter Routing
874
+ → Write to Adapters
875
+ ```
876
+
877
+ **Why Rate Limiting Before PII Filtering?**
878
+ - ✅ **Efficiency:** Drop events early (no wasted CPU on PII filtering)
879
+ - ✅ **Security:** Rate limiter sees original event (can detect patterns)
880
+ - ✅ **Accuracy:** Count real events (not filtered versions)
881
+
882
+ ---
883
+
884
+ ### Middleware Implementation
885
+
886
+ ```ruby
887
+ # lib/e11y/middleware/rate_limiter.rb
888
+ module E11y
889
+ module Middleware
890
+ class RateLimiter < Base
891
+ def call(event_data)
892
+ # 1. Check bypass rules first (critical events)
893
+ if bypassed?(event_data)
894
+ return super(event_data) # Pass to next middleware
895
+ end
896
+
897
+ # 2. Check global limit
898
+ unless check_global_limit(event_data)
899
+ handle_rate_limited(event_data, :global)
900
+ return false # Stop pipeline
901
+ end
902
+
903
+ # 3. Check per-event limit
904
+ unless check_per_event_limit(event_data)
905
+ handle_rate_limited(event_data, :per_event)
906
+ return false
907
+ end
908
+
909
+ # 4. Check per-context limits
910
+ unless check_per_context_limits(event_data)
911
+ handle_rate_limited(event_data, :per_context)
912
+ return false
913
+ end
914
+
915
+ # 5. All checks passed → continue pipeline
916
+ super(event_data)
917
+ end
918
+
919
+ private
920
+
921
+ def handle_rate_limited(event_data, limit_type)
922
+ # Apply configured strategy
923
+ case config.on_exceeded
924
+ when :drop
925
+ drop_event(event_data, limit_type)
926
+ when :sample
927
+ sample_event(event_data, limit_type)
928
+ when :throttle
929
+ apply_backpressure(event_data, limit_type)
930
+ when :aggregate
931
+ aggregate_event(event_data, limit_type)
932
+ end
933
+
934
+ # Track metric
935
+ Yabeda.e11y_internal.rate_limit_hits_total.increment(
936
+ limit_type: limit_type,
937
+ event_name: event_data[:event_name]
938
+ )
939
+
940
+ # Log warning
941
+ E11y.logger.warn(
942
+ "[E11y RateLimit] Event rate limited: #{event_data[:event_name]} (#{limit_type})"
943
+ )
944
+ end
945
+
946
+ def drop_event(event_data, limit_type)
947
+ Yabeda.e11y_internal.rate_limit_dropped_events_total.increment(
948
+ limit_type: limit_type
949
+ )
950
+ end
951
+
952
+ def sample_event(event_data, limit_type)
953
+ # Random sampling
954
+ if rand < config.sample_rate
955
+ super(event_data) # Keep this one
956
+ else
957
+ drop_event(event_data, limit_type)
958
+ end
959
+ end
960
+
961
+ def apply_backpressure(event_data, limit_type)
962
+ # Slow down production
963
+ sleep(config.backpressure_delay)
964
+ super(event_data)
965
+ end
966
+
967
+ def aggregate_event(event_data, limit_type)
968
+ # Add to aggregation buffer
969
+ aggregation_buffer.add(event_data)
970
+
971
+ # Flush aggregated event periodically
972
+ if aggregation_buffer.should_flush?
973
+ flush_aggregated_events(limit_type)
974
+ end
975
+ end
976
+ end
977
+ end
978
+ end
979
+ ```
980
+
981
+ ---
982
+
983
+ ### Redis-Based Rate Limiting
984
+
985
+ E11y uses **Redis sorted sets** for distributed rate limiting across multiple application instances.
986
+
987
+ **Algorithm: Sliding Window Counter**
988
+
989
+ ```ruby
990
+ def check_limit(key, limit, window)
991
+ now = Time.now.to_f
992
+ window_start = now - window
993
+
994
+ # 1. Remove expired entries (outside window)
995
+ redis.zremrangebyscore(key, 0, window_start)
996
+
997
+ # 2. Count current entries
998
+ current_count = redis.zcard(key)
999
+
1000
+ # 3. Check limit
1001
+ if current_count < limit
1002
+ # Add new entry (score = timestamp, member = unique ID)
1003
+ redis.zadd(key, now, "#{now}-#{SecureRandom.hex(8)}")
1004
+ redis.expire(key, window.to_i + 60) # TTL cleanup
1005
+ true # Allowed
1006
+ else
1007
+ false # Rate limited
1008
+ end
1009
+ end
1010
+ ```
1011
+
1012
+ **Why Sorted Sets?**
1013
+ - ✅ **Sliding window:** Accurate counting (no edge cases like fixed window)
1014
+ - ✅ **Distributed:** Works across multiple app instances
1015
+ - ✅ **Efficient:** O(log N) for add/remove operations
1016
+ - ✅ **Automatic cleanup:** Redis TTL handles old entries
1017
+
1018
+ **Redis Keys:**
1019
+ ```ruby
1020
+ # Global limit
1021
+ "e11y:rate_limit:global"
1022
+
1023
+ # Per-event limit
1024
+ "e11y:rate_limit:event:payment.retry"
1025
+
1026
+ # Per-context limit
1027
+ "e11y:rate_limit:context:user_id:user-123"
1028
+ "e11y:rate_limit:context:ip_address:192.168.1.100"
1029
+ ```
1030
+
1031
+ ---
1032
+
1033
+ ### Retry Policy Integration
1034
+
1035
+ **Critical Decision:** Retries DO count toward rate limits (prevent retry amplification).
1036
+
1037
+ ```ruby
1038
+ # config/initializers/e11y.rb
1039
+ E11y.configure do |config|
1040
+ # Rate limiting
1041
+ config.rate_limiting do
1042
+ per_event 'payment.retry', limit: 100, window: 1.minute
1043
+ end
1044
+
1045
+ # Retry policy
1046
+ config.error_handling do
1047
+ retry_policy do
1048
+ # ✅ Retries respect rate limits
1049
+ respect_rate_limits true
1050
+
1051
+ # If retry is rate limited → send to DLQ
1052
+ on_retry_rate_limited :send_to_dlq
1053
+ end
1054
+ end
1055
+ end
1056
+ ```
1057
+
1058
+ **Flow with Retries:**
1059
+ ```
1060
+ 1. Event.track() → Rate limited
1061
+ 2. Retry logic triggered
1062
+ 3. Retry attempt → Check rate limit AGAIN
1063
+ 4. If still rate limited → Send to DLQ (not dropped)
1064
+ 5. DLQ processed later (outside rate limit window)
1065
+ ```
1066
+
1067
+ **Why This Matters:**
1068
+ - ✅ **Prevents retry amplification:** 1 failure → 1000 retries → 1000 rate limit hits
1069
+ - ✅ **DLQ safety net:** Rate-limited retries not lost (processed later)
1070
+ - ✅ **Observability preserved:** Can see retry patterns in metrics
1071
+
1072
+ ---
1073
+
1074
+ ### Performance Characteristics
1075
+
1076
+ **Latency:**
1077
+ ```ruby
1078
+ # Benchmark: Rate limiter overhead
1079
+ Benchmark.ips do |x|
1080
+ x.report('No rate limiting') do
1081
+ Events::TestEvent.track(foo: 'bar') # Baseline
1082
+ end
1083
+
1084
+ x.report('With rate limiting') do
1085
+ # Rate limiter enabled
1086
+ Events::TestEvent.track(foo: 'bar')
1087
+ end
1088
+
1089
+ x.compare!
1090
+ end
1091
+
1092
+ # Results:
1093
+ # No rate limiting: 100,000 i/s (10μs per event)
1094
+ # With rate limiting: 95,000 i/s (10.5μs per event)
1095
+ # Overhead: ~0.5μs (5% increase)
1096
+ ```
1097
+
1098
+ **Redis Latency:**
1099
+ ```ruby
1100
+ # Redis operations per event (within limit):
1101
+ # 1. ZREMRANGEBYSCORE (cleanup) ~0.1ms
1102
+ # 2. ZCARD (count) ~0.05ms
1103
+ # 3. ZADD (add entry) ~0.05ms
1104
+ # 4. EXPIRE (set TTL) ~0.05ms
1105
+ # Total: ~0.25ms per event
1106
+
1107
+ # When rate limited:
1108
+ # 1. ZREMRANGEBYSCORE ~0.1ms
1109
+ # 2. ZCARD ~0.05ms
1110
+ # Total: ~0.15ms (no write)
1111
+ ```
1112
+
1113
+ **Scaling:**
1114
+ ```ruby
1115
+ # Redis memory usage:
1116
+ # - Global limit (10k events/min): ~500KB
1117
+ # - Per-event limit (100/min): ~5KB per event type
1118
+ # - Per-context limit (1k/min): ~50KB per user
1119
+ #
1120
+ # Example: 1000 users × 50KB = 50MB
1121
+ # → Acceptable for most deployments
1122
+ ```
1123
+
1124
+ ---
1125
+
1126
+ ### Troubleshooting
1127
+
1128
+ **Problem: Events dropped unexpectedly**
1129
+
1130
+ ```ruby
1131
+ # Check rate limit metrics
1132
+ rate_limit_hits = Yabeda.e11y_internal.rate_limit_hits_total.values
1133
+ # => { limit_type: 'per_event', event_name: 'payment.retry' } => 42
1134
+
1135
+ # Check Redis keys
1136
+ redis.keys('e11y:rate_limit:*')
1137
+ # => ["e11y:rate_limit:event:payment.retry"]
1138
+
1139
+ redis.zcard('e11y:rate_limit:event:payment.retry')
1140
+ # => 100 (at limit!)
1141
+
1142
+ # Check TTL
1143
+ redis.ttl('e11y:rate_limit:event:payment.retry')
1144
+ # => 45 (45 seconds until window resets)
1145
+ ```
1146
+
1147
+ **Problem: Rate limiter not working**
1148
+
1149
+ ```ruby
1150
+ # 1. Check middleware order
1151
+ E11y.config.middleware.list
1152
+ # => [SchemaValidator, ContextEnricher, RateLimiter, ...]
1153
+
1154
+ # 2. Check rate limiting enabled
1155
+ E11y.config.rate_limiting.enabled?
1156
+ # => true
1157
+
1158
+ # 3. Check bypass rules
1159
+ E11y.config.rate_limiting.bypass_rules
1160
+ # => [{ type: :severities, values: [:fatal] }]
1161
+
1162
+ # 4. Check event matches bypass
1163
+ event = { severity: :fatal }
1164
+ E11y::Middleware::RateLimiter.new.bypassed?(event)
1165
+ # => true (bypassed!)
1166
+ ```
1167
+
1168
+ **Problem: High Redis latency**
1169
+
1170
+ ```ruby
1171
+ # 1. Check Redis connection pool
1172
+ E11y.config.redis.pool_size
1173
+ # => 5 (default)
1174
+
1175
+ # Increase if needed
1176
+ E11y.configure do |config|
1177
+ config.redis do
1178
+ pool_size 20 # For high-concurrency
1179
+ end
1180
+ end
1181
+
1182
+ # 2. Use Redis pipelining for multiple checks
1183
+ redis.pipelined do
1184
+ redis.zremrangebyscore(key, 0, window_start)
1185
+ redis.zcard(key)
1186
+ redis.zadd(key, now, id)
1187
+ end
1188
+
1189
+ # 3. Consider local caching (for read-heavy workloads)
1190
+ E11y.configure do |config|
1191
+ config.rate_limiting do
1192
+ cache_limit_checks true # Cache for 1s
1193
+ end
1194
+ end
1195
+ ```
1196
+
1197
+ ---
1198
+
1199
+ ## 📊 Self-Monitoring & Metrics
1200
+
1201
+ > **Implementation:** See [ADR-006 Section 4: Rate Limiting](../ADR-006-security-compliance.md#4-rate-limiting) for detailed architecture.
1202
+
1203
+ E11y provides comprehensive self-monitoring metrics for rate limiting. These metrics help you understand rate limit behavior, detect attacks, and optimize limits.
1204
+
1205
+ ### Core Metrics
1206
+
1207
+ **1. `e11y_rate_limit_hits_total` (Counter)**
1208
+ - **Description:** Total number of times a rate limit was hit (event attempted but limit reached).
1209
+ - **Labels:**
1210
+ - `limit_type`: Type of limit (`global`, `per_event`, `per_context`)
1211
+ - `event_name`: Event type that hit the limit
1212
+ - `key`: Specific limit key (e.g., `user_id:123`, `ip:192.168.1.1`)
1213
+ - `strategy`: How event was handled (`drop`, `sample`, `backpressure`, `aggregate`)
1214
+ - **Monitoring:**
1215
+ ```prometheus
1216
+ # Rate limit hit rate (events/sec)
1217
+ rate(e11y_rate_limit_hits_total[5m])
1218
+
1219
+ # Which events hit limits most often?
1220
+ topk(10, sum by (event_name) (rate(e11y_rate_limit_hits_total[5m])))
1221
+
1222
+ # Per-context abuse detection
1223
+ topk(10, sum by (key) (e11y_rate_limit_hits_total{limit_type="per_context"}))
1224
+ ```
1225
+ - **Grafana Panel:**
1226
+ - **Title:** Rate Limit Hits by Type
1227
+ - **Query:** `sum by (limit_type) (rate(e11y_rate_limit_hits_total[5m]))`
1228
+ - **Visualization:** Time series graph
1229
+ - **Description:** Shows which rate limit type (global/per-event/per-context) is hit most frequently.
1230
+
1231
+ **2. `e11y_rate_limit_dropped_events_total` (Counter)**
1232
+ - **Description:** Total number of events dropped due to rate limiting.
1233
+ - **Labels:**
1234
+ - `limit_type`: Type of limit that caused drop
1235
+ - `event_name`: Event type that was dropped
1236
+ - **Monitoring:**
1237
+ ```prometheus
1238
+ # Drop rate (events/sec)
1239
+ rate(e11y_rate_limit_dropped_events_total[5m])
1240
+
1241
+ # Total dropped in last hour
1242
+ sum(increase(e11y_rate_limit_dropped_events_total[1h]))
1243
+
1244
+ # Drop ratio (% of total events)
1245
+ rate(e11y_rate_limit_dropped_events_total[5m])
1246
+ / rate(e11y_events_tracked_total[5m])
1247
+ ```
1248
+ - **Grafana Panel:**
1249
+ - **Title:** Event Drop Rate
1250
+ - **Query:** `rate(e11y_rate_limit_dropped_events_total[5m])`
1251
+ - **Visualization:** Time series graph with threshold line
1252
+ - **Alert Threshold:** > 100 drops/sec (high drop rate)
1253
+
1254
+ **3. `e11y_rate_limit_sampled_events_total` (Counter)**
1255
+ - **Description:** Total number of events sampled (kept) when limit exceeded with `sample` strategy.
1256
+ - **Labels:**
1257
+ - `limit_type`, `event_name`, `sample_rate`
1258
+ - **Monitoring:**
1259
+ ```prometheus
1260
+ # Sampling effectiveness
1261
+ e11y_rate_limit_sampled_events_total / e11y_rate_limit_hits_total
1262
+
1263
+ # Events saved by sampling (vs full drop)
1264
+ increase(e11y_rate_limit_sampled_events_total[1h])
1265
+ ```
1266
+ - **Grafana Panel:**
1267
+ - **Title:** Sampled vs Dropped Events
1268
+ - **Query:**
1269
+ ```prometheus
1270
+ sum(rate(e11y_rate_limit_sampled_events_total[5m])) /
1271
+ sum(rate(e11y_rate_limit_hits_total[5m]))
1272
+ ```
1273
+ - **Description:** Shows percentage of events retained during rate limiting (sampling effectiveness).
1274
+
1275
+ **4. `e11y_rate_limit_current` (Gauge)**
1276
+ - **Description:** Current number of events in the rate limit window.
1277
+ - **Labels:**
1278
+ - `limit_type`, `key`
1279
+ - **Monitoring:**
1280
+ ```prometheus
1281
+ # Current utilization (% of limit)
1282
+ e11y_rate_limit_current / e11y_rate_limit_threshold
1283
+
1284
+ # Max utilization in last hour
1285
+ max_over_time(e11y_rate_limit_current[1h])
1286
+ ```
1287
+ - **Grafana Panel:**
1288
+ - **Title:** Rate Limit Utilization
1289
+ - **Query:**
1290
+ ```prometheus
1291
+ (e11y_rate_limit_current / e11y_rate_limit_threshold) * 100
1292
+ ```
1293
+ - **Visualization:** Gauge (0-100%)
1294
+ - **Thresholds:**
1295
+ - Green: 0-70%
1296
+ - Yellow: 70-90%
1297
+ - Red: 90-100%
1298
+
1299
+ **5. `e11y_rate_limit_threshold` (Gauge)**
1300
+ - **Description:** Configured rate limit threshold.
1301
+ - **Labels:**
1302
+ - `limit_type`, `key`
1303
+ - **Monitoring:**
1304
+ ```prometheus
1305
+ # View configured limits
1306
+ e11y_rate_limit_threshold
1307
+
1308
+ # Check if limits need adjustment
1309
+ e11y_rate_limit_current / e11y_rate_limit_threshold > 0.8
1310
+ ```
1311
+
1312
+ **6. `e11y_rate_limit_bypass_total` (Counter)**
1313
+ - **Description:** Total number of events that bypassed rate limiting (critical events).
1314
+ - **Labels:**
1315
+ - `event_name`, `bypass_reason` (`severity`, `event_type`, `custom`)
1316
+ - **Monitoring:**
1317
+ ```prometheus
1318
+ # Bypass rate
1319
+ rate(e11y_rate_limit_bypass_total[5m])
1320
+
1321
+ # Which events bypass most?
1322
+ topk(10, sum by (event_name) (e11y_rate_limit_bypass_total))
1323
+ ```
1324
+
1325
+ ---
1326
+
1327
+ ### Monitoring Dashboard
1328
+
1329
+ **Grafana Dashboard: E11y Rate Limiting**
1330
+
1331
+ ```yaml
1332
+ # dashboard.json structure
1333
+ {
1334
+ "title": "E11y Rate Limiting",
1335
+ "panels": [
1336
+ {
1337
+ "title": "Rate Limit Hits (by type)",
1338
+ "query": "sum by (limit_type) (rate(e11y_rate_limit_hits_total[5m]))",
1339
+ "type": "graph"
1340
+ },
1341
+ {
1342
+ "title": "Drop Rate",
1343
+ "query": "rate(e11y_rate_limit_dropped_events_total[5m])",
1344
+ "type": "graph",
1345
+ "alert": {
1346
+ "threshold": 100,
1347
+ "severity": "warning"
1348
+ }
1349
+ },
1350
+ {
1351
+ "title": "Top Rate-Limited Events",
1352
+ "query": "topk(10, sum by (event_name) (e11y_rate_limit_hits_total))",
1353
+ "type": "table"
1354
+ },
1355
+ {
1356
+ "title": "Global Limit Utilization",
1357
+ "query": "(e11y_rate_limit_current{limit_type='global'} / e11y_rate_limit_threshold{limit_type='global'}) * 100",
1358
+ "type": "gauge",
1359
+ "thresholds": [70, 90]
1360
+ },
1361
+ {
1362
+ "title": "Per-Context Abuse Detection",
1363
+ "query": "topk(10, sum by (key) (e11y_rate_limit_hits_total{limit_type='per_context'}))",
1364
+ "type": "table"
1365
+ }
1366
+ ]
1367
+ }
1368
+ ```
1369
+
1370
+ ---
1371
+
1372
+ ### Alerting Thresholds
1373
+
1374
+ | Metric | Threshold | Severity | Rationale |
1375
+ |--------|-----------|----------|-----------|
1376
+ | **Rate limit hits** | > 10/sec | Warning | Frequent rate limiting indicates high load or attack |
1377
+ | **Drop rate** | > 100/sec | Critical | High drop rate means observability loss |
1378
+ | **Global utilization** | > 80% | Warning | Approaching global limit, may need increase |
1379
+ | **Global utilization** | > 95% | Critical | Nearly at limit, immediate action needed |
1380
+ | **Per-event hits** | > 50/min | Warning | Specific event flooding (retry storm, bug) |
1381
+ | **Per-context hits** | > 100/min | Warning | Single user/IP abusing system |
1382
+
1383
+ ---
1384
+
1385
+ ### Prometheus Alerts
1386
+
1387
+ ```yaml
1388
+ # config/prometheus/alerts.yml
1389
+ groups:
1390
+ - name: e11y_rate_limiting
1391
+ rules:
1392
+ # Alert on frequent rate limiting
1393
+ - alert: E11yRateLimitHit
1394
+ expr: rate(e11y_rate_limit_hits_total[5m]) > 10
1395
+ for: 2m
1396
+ annotations:
1397
+ summary: "Rate limit hit frequently ({{ $value }} hits/sec)"
1398
+ description: "Check for retry storms or attacks"
1399
+
1400
+ # Alert on high drop rate
1401
+ - alert: E11yHighDropRate
1402
+ expr: rate(e11y_rate_limit_dropped_events_total[5m]) > 100
1403
+ for: 1m
1404
+ annotations:
1405
+ summary: "High event drop rate ({{ $value }} events/sec)"
1406
+ description: "Increase limits or investigate flood source"
1407
+
1408
+ # Alert on global limit approached
1409
+ - alert: E11yGlobalLimitApproached
1410
+ expr: |
1411
+ e11y_rate_limit_current{limit_type="global"}
1412
+ / e11y_rate_limit_threshold{limit_type="global"} > 0.8
1413
+ for: 1m
1414
+ annotations:
1415
+ summary: "Global rate limit at {{ $value | humanizePercentage }} utilization"
1416
+ ```
1417
+
1418
+ ---
1419
+
1420
+ ## 💻 Usage Examples
1421
+
1422
+ ### Example 1: Retry Storm Protection
1423
+
1424
+ ```ruby
1425
+ # app/services/payment_processor.rb
1426
+ class PaymentProcessor
1427
+ MAX_RETRIES = 3
1428
+
1429
+ def process(order)
1430
+ Events::PaymentAttempt.track(order_id: order.id)
1431
+
1432
+ begin
1433
+ result = PaymentGateway.charge(order)
1434
+ Events::PaymentSuccess.track(order_id: order.id, severity: :success)
1435
+ result
1436
+ rescue PaymentGateway::TemporaryError => e
1437
+ retry_with_rate_limit(order, e)
1438
+ end
1439
+ end
1440
+
1441
+ private
1442
+
1443
+ def retry_with_rate_limit(order, error)
1444
+ MAX_RETRIES.times do |attempt|
1445
+ # Track retry (rate limited!)
1446
+ Events::PaymentRetry.track(
1447
+ order_id: order.id,
1448
+ attempt: attempt + 1,
1449
+ error: error.message
1450
+ )
1451
+
1452
+ sleep(2 ** attempt) # Exponential backoff
1453
+
1454
+ begin
1455
+ return PaymentGateway.charge(order)
1456
+ rescue => e
1457
+ error = e
1458
+ end
1459
+ end
1460
+
1461
+ # All retries failed
1462
+ Events::PaymentFailed.track(
1463
+ order_id: order.id,
1464
+ error: error.message,
1465
+ severity: :error
1466
+ )
1467
+
1468
+ raise error
1469
+ end
1470
+ end
1471
+
1472
+ # Rate limiting config:
1473
+ E11y.configure do |config|
1474
+ config.rate_limiting do
1475
+ # Limit retries to 100/min globally
1476
+ per_event 'payment.retry',
1477
+ limit: 100,
1478
+ window: 1.minute,
1479
+ on_exceeded: :sample,
1480
+ sample_rate: 0.1
1481
+ end
1482
+ end
1483
+
1484
+ # Result:
1485
+ # - Normal operation: All retries tracked
1486
+ # - Gateway outage: 100 retries/min tracked + 10% sampled
1487
+ # - Observability maintained during incident ✅
1488
+ ```
1489
+
1490
+ ---
1491
+
1492
+ ### Example 2: Login Failure Protection
1493
+
1494
+ ```ruby
1495
+ # app/controllers/sessions_controller.rb
1496
+ class SessionsController < ApplicationController
1497
+ def create
1498
+ user = User.find_by(email: params[:email])
1499
+
1500
+ if user&.authenticate(params[:password])
1501
+ # Success
1502
+ Events::UserLoggedIn.track(
1503
+ user_id: user.id,
1504
+ ip_address: request.remote_ip,
1505
+ severity: :success
1506
+ )
1507
+
1508
+ session[:user_id] = user.id
1509
+ redirect_to root_path
1510
+ else
1511
+ # Failure (rate limited per IP)
1512
+ Events::LoginFailed.track(
1513
+ email: params[:email], # Filtered by PII filter
1514
+ ip_address: request.remote_ip,
1515
+ reason: 'invalid_credentials',
1516
+ severity: :warn
1517
+ )
1518
+
1519
+ flash[:error] = 'Invalid credentials'
1520
+ render :new
1521
+ end
1522
+ end
1523
+ end
1524
+
1525
+ # Rate limiting config:
1526
+ E11y.configure do |config|
1527
+ config.rate_limiting do
1528
+ # Limit login failures per IP
1529
+ per_context :ip_address,
1530
+ limit: 50,
1531
+ window: 5.minutes,
1532
+ on_exceeded: :drop
1533
+
1534
+ # Also limit per event
1535
+ per_event 'login.failed',
1536
+ limit: 200,
1537
+ window: 1.minute,
1538
+ on_exceeded: :sample,
1539
+ sample_rate: 0.2
1540
+ end
1541
+ end
1542
+
1543
+ # Result:
1544
+ # - Brute force attack: Max 50 events/IP/5min
1545
+ # - Global flood: Max 200 events/min
1546
+ # - Observability maintained, attacker data not logged ✅
1547
+ ```
1548
+
1549
+ ---
1550
+
1551
+ ## 🧪 Testing
1552
+
1553
+ ```ruby
1554
+ # spec/e11y/rate_limiting_spec.rb
1555
+ RSpec.describe 'E11y Rate Limiting' do
1556
+ before do
1557
+ E11y.configure do |config|
1558
+ config.rate_limiting do
1559
+ global limit: 100, window: 1.minute
1560
+ per_event 'test.event', limit: 10, window: 1.minute
1561
+ end
1562
+ end
1563
+ end
1564
+
1565
+ describe 'global rate limiting' do
1566
+ it 'allows events under limit' do
1567
+ 50.times do
1568
+ result = Events::TestEvent.track(foo: 'bar')
1569
+ expect(result).to be_success
1570
+ end
1571
+ end
1572
+
1573
+ it 'rate limits after threshold' do
1574
+ # Track 100 events (at limit)
1575
+ 100.times { Events::TestEvent.track(foo: 'bar') }
1576
+
1577
+ # 101st event should be rate limited
1578
+ result = Events::TestEvent.track(foo: 'bar')
1579
+ expect(result).to be_rate_limited
1580
+
1581
+ # Metric incremented
1582
+ metric = Yabeda.e11y_internal.rate_limit_hits_total
1583
+ expect(metric.values[{ limit_type: 'global' }]).to be > 0
1584
+ end
1585
+ end
1586
+
1587
+ describe 'per-event rate limiting' do
1588
+ it 'rate limits specific event type' do
1589
+ # Track 10 test.event (at limit)
1590
+ 10.times { Events::TestEvent.track(foo: 'bar') }
1591
+
1592
+ # 11th should be rate limited
1593
+ result = Events::TestEvent.track(foo: 'bar')
1594
+ expect(result).to be_rate_limited
1595
+
1596
+ # But other events still work
1597
+ result = Events::OtherEvent.track(baz: 'qux')
1598
+ expect(result).to be_success
1599
+ end
1600
+ end
1601
+
1602
+ describe 'bypass rules' do
1603
+ before do
1604
+ E11y.configure do |config|
1605
+ config.rate_limiting do
1606
+ global limit: 10, window: 1.minute
1607
+ bypass_for severities: [:fatal]
1608
+ end
1609
+ end
1610
+ end
1611
+
1612
+ it 'bypasses rate limiting for critical events' do
1613
+ # Fill up limit
1614
+ 10.times { Events::TestEvent.track(severity: :info) }
1615
+
1616
+ # Fatal event should bypass
1617
+ result = Events::CriticalError.track(severity: :fatal)
1618
+ expect(result).to be_success # Not rate limited!
1619
+ end
1620
+ end
1621
+ end
1622
+ ```
1623
+
1624
+ ---
1625
+
1626
+ ## 💡 Best Practices
1627
+
1628
+ ### ✅ DO
1629
+
1630
+ **1. Set conservative limits initially**
1631
+ ```ruby
1632
+ # ✅ GOOD: Start low, increase if needed
1633
+ global limit: 5_000, window: 1.minute
1634
+ ```
1635
+
1636
+ **2. Use per-context limits for abuse prevention**
1637
+ ```ruby
1638
+ # ✅ GOOD: Prevent single user flooding
1639
+ per_context :user_id, limit: 1_000, window: 1.minute
1640
+ per_context :ip_address, limit: 500, window: 1.minute
1641
+ ```
1642
+
1643
+ **3. Always bypass critical events**
1644
+ ```ruby
1645
+ # ✅ GOOD: Never rate limit security/system events
1646
+ bypass_for event_types: ['security.alert', 'system.critical']
1647
+ bypass_for severities: [:fatal]
1648
+ ```
1649
+
1650
+ **4. Monitor rate limit hits**
1651
+ ```ruby
1652
+ # ✅ GOOD: Alert on frequent rate limiting
1653
+ # Alert: rate_limit_hits_total > 10/sec (matches the alerting thresholds table and Prometheus rule above)
1654
+ ```
1655
+
1656
+ ---
1657
+
1658
+ ### ❌ DON'T
1659
+
1660
+ **1. Don't set limits too high (defeats purpose)**
1661
+ ```ruby
1662
+ # ❌ BAD: Limit too high to be effective
1663
+ global limit: 1_000_000, window: 1.minute # Useless!
1664
+ ```
1665
+
1666
+ **2. Don't rate limit critical events**
1667
+ ```ruby
1668
+ # ❌ BAD: Rate limiting errors
1669
+ per_event 'system.error', limit: 10 # You WANT to know about ALL errors!
1670
+ ```
1671
+
1672
+ **3. Don't ignore rate limit alerts**
1673
+ ```ruby
1674
+ # ❌ BAD: Rate limits hitting frequently
1675
+ # → Investigate! Could be attack or misconfiguration
1676
+ ```
1677
+
1678
+ ---
1679
+
1680
+ ## 🔒 Validations (NEW - v1.1)
1681
+
1682
+ > **🎯 Pattern:** Validate rate limiting configuration at class load time.
1683
+
1684
+ ### Rate Limit Value Validation
1685
+
1686
+ **Problem:** Invalid rate limit values → runtime errors.
1687
+
1688
+ **Solution:** Validate rate limit is positive integer:
1689
+
1690
+ ```ruby
1691
+ # Gem implementation (automatic):
1692
+ def self.rate_limit(limit, window: 1.minute)
1693
+ unless limit.is_a?(Integer) && limit > 0
1694
+ raise ArgumentError, "rate_limit must be positive integer, got: #{limit.inspect}"
1695
+ end
1696
+ unless window.is_a?(ActiveSupport::Duration) && window > 0
1697
+ raise ArgumentError, "window must be positive duration, got: #{window.inspect}"
1698
+ end
1699
+ self._rate_limit = limit
1700
+ self._rate_limit_window = window
1701
+ end
1702
+
1703
+ # Result:
1704
+ class Events::ApiRequest < E11y::Event::Base
1705
+ rate_limit -100 # ← ERROR: "rate_limit must be positive integer, got: -100"
1706
+ end
1707
+ ```
1708
+
1709
+ ### On Exceeded Strategy Validation
1710
+
1711
+ **Problem:** Invalid on_exceeded strategies → silent failures.
1712
+
1713
+ **Solution:** Validate strategy against whitelist:
1714
+
1715
+ ```ruby
1716
+ # Gem implementation (automatic):
1717
+ VALID_ON_EXCEEDED = [:drop, :sample, :throttle, :aggregate]
1718
+
1719
+ def self.on_exceeded(strategy)
1720
+ unless VALID_ON_EXCEEDED.include?(strategy)
1721
+ raise ArgumentError, "Invalid on_exceeded: #{strategy}. Valid: #{VALID_ON_EXCEEDED.join(', ')}"
1722
+ end
1723
+ self._on_exceeded = strategy
1724
+ end
1725
+
1726
+ # Result:
1727
+ class Events::ApiRequest < E11y::Event::Base
1728
+ on_exceeded :backpressure # ← ERROR: "Invalid on_exceeded: backpressure. Valid: drop, sample, throttle, aggregate"
1729
+ end
1730
+ ```
1731
+
1732
+ ### Audit Event Rate Limiting Validation (LOCKED)
1733
+
1734
+ **Problem:** Attempting to rate limit audit events → compliance violations.
1735
+
1736
+ **Solution:** Lock rate_limiting for audit events:
1737
+
1738
+ ```ruby
1739
+ # Gem implementation (automatic):
1740
+ def self.rate_limiting(enabled)
1741
+ if self._audit_event && enabled
1742
+ raise ArgumentError, "Cannot enable rate_limiting for audit events! Audit events must never be rate limited."
1743
+ end
1744
+ self._rate_limiting = enabled
1745
+ end
1746
+
1747
+ # Result:
1748
+ class Events::UserDeleted < E11y::Event::Base
1749
+ audit_event true
1750
+ rate_limiting true # ← ERROR: "Cannot enable rate_limiting for audit events!"
1751
+ end
1752
+ ```
1753
+
1754
+ ---
1755
+
1756
+ ## 🌍 Environment-Specific Rate Limiting (NEW - v1.1)
1757
+
1758
+ > **🎯 Pattern:** Different rate limits per environment.
1759
+
1760
+ ### Example 1: Higher Limits in Production
1761
+
1762
+ ```ruby
1763
+ class Events::ApiRequest < E11y::Event::Base
1764
+ schema do
1765
+ required(:endpoint).filled(:string)
1766
+ required(:status).filled(:integer)
1767
+ end
1768
+
1769
+ # Environment-specific rate limits
1770
+ rate_limit case Rails.env
1771
+ when 'production' then 10_000
1772
+ when 'staging' then 1_000
1773
+ else 100 # Dev/test
1774
+ end
1775
+
1776
+ on_exceeded Rails.env.production? ? :sample : :drop
1777
+ sample_rate 0.1 if Rails.env.production?
1778
+ end
1779
+ ```
1780
+
1781
+ ### Example 2: Debug Events (Strict in Prod, Loose in Dev)
1782
+
1783
+ ```ruby
1784
+ class Events::DebugQuery < E11y::Event::Base
1785
+ schema do
1786
+ required(:query).filled(:string)
1787
+ required(:duration_ms).filled(:integer)
1788
+ end
1789
+
1790
+ # Strict rate limiting in production
1791
+ rate_limit Rails.env.production? ? 100 : 10_000
1792
+ on_exceeded :drop # Always drop debug logs
1793
+ end
1794
+ ```
1795
+
1796
+ ### Example 3: Feature Flag for Rate Limiting
1797
+
1798
+ ```ruby
1799
+ class Events::ExperimentalFeature < E11y::Event::Base
1800
+ schema do
1801
+ required(:feature_name).filled(:string)
1802
+ end
1803
+
1804
+ # Enable rate limiting only when feature flag is on
1805
+ if ENV['ENABLE_RATE_LIMITING'] == 'true'
1806
+ rate_limit 1_000
1807
+ on_exceeded :sample
1808
+ sample_rate 0.1
1809
+ end
1810
+ end
1811
+ ```
1812
+
1813
+ ---
1814
+
1815
+ ## 📊 Precedence Rules for Rate Limiting (NEW - v1.1)
1816
+
1817
+ > **🎯 Pattern:** Rate limiting configuration precedence (most specific wins).
1818
+
1819
+ ### Precedence Order (Highest to Lowest)
1820
+
1821
+ ```
1822
+ 1. Event-level explicit config (highest priority)
1823
+
1824
+ 2. Preset module config
1825
+
1826
+ 3. Base class config (inheritance)
1827
+
1828
+ 4. Convention-based defaults (1000/sec)
1829
+
1830
+ 5. Global config (lowest priority)
1831
+ ```
1832
+
1833
+ ### Example: Mixing Inheritance + Presets for Rate Limiting
1834
+
1835
+ ```ruby
1836
+ # Global config (lowest priority)
1837
+ E11y.configure do |config|
1838
+ config.rate_limiting do
1839
+ global limit: 10_000, window: 1.minute # Default for all events
1840
+ on_exceeded :drop
1841
+ end
1842
+ end
1843
+
1844
+ # Base class (medium priority)
1845
+ class Events::BaseDebugEvent < E11y::Event::Base
1846
+ severity :debug
1847
+ rate_limit 100, window: 1.minute # Override global (stricter)
1848
+ on_exceeded :drop
1849
+ end
1850
+
1851
+ # Preset module (higher priority)
1852
+ module E11y::Presets::HighValueEvent
1853
+ extend ActiveSupport::Concern
1854
+ included do
1855
+ rate_limit 10_000, window: 1.minute # Override base (looser)
1856
+ on_exceeded :throttle # Never drop high-value events
1857
+ end
1858
+ end
1859
+
1860
+ # Event (highest priority)
1861
+ class Events::CriticalPayment < Events::BaseDebugEvent
1862
+ include E11y::Presets::HighValueEvent
1863
+
1864
+ rate_limit 50_000, window: 1.minute # Override preset (even looser)
1865
+
1866
+ # Final config:
1867
+ # - severity: :debug (from base)
1868
+ # - rate_limit: 50_000/min (event-level override)
1869
+ # - on_exceeded: :throttle (from preset)
1870
+ end
1871
+ ```
1872
+
1873
+ ### Precedence Rules Table (the `sample_rate` values are illustrative and not set in the example above)
1874
+
1875
+ | Config | Global | Convention | Base Class | Preset | Event-Level | Winner |
1876
+ |--------|--------|------------|------------|--------|-------------|--------|
1877
+ | `rate_limit` | `10_000` | `1_000` | `100` | `10_000` | `50_000` | **`50_000`** (event) |
1878
+ | `on_exceeded` | `:drop` | - | `:drop` | `:throttle` | - | **`:throttle`** (preset) |
1879
+ | `sample_rate` | `0.1` | - | - | `0.5` | - | **`0.5`** (preset) |
1880
+
1881
+ ### Convention-Based Defaults
1882
+
1883
+ **Convention:** If no `rate_limit` is specified, a default of `1000/sec` applies:
1884
+
1885
+ ```ruby
1886
+ class Events::ApiRequest < E11y::Event::Base
1887
+ schema do
1888
+ required(:endpoint).filled(:string)
1889
+ end
1890
+ # ← Auto: rate_limit = 1000 (convention!)
1891
+ end
1892
+ ```
1893
+
1894
+ ---
1895
+
1896
+ ## 📚 Related Use Cases
1897
+
1898
+ - **[UC-002: Business Event Tracking](./UC-002-business-event-tracking.md)** - Event definitions
1899
+ - **[UC-007: PII Filtering](./UC-007-pii-filtering.md)** - Prevent PII leaks
1900
+ - **[UC-013: High Cardinality Protection](./UC-013-high-cardinality-protection.md)** - Cost control
1901
+
1902
+ ---
1903
+
1904
+ **Document Version:** 1.1 (Unified DSL)
1905
+ **Last Updated:** January 16, 2026
1906
+ **Status:** ✅ Complete - Consistent with DSL-SPECIFICATION.md v1.1.0