e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
@@ -0,0 +1,1906 @@
# UC-011: Rate Limiting (DoS Protection)

**Status:** MVP Feature (Critical for Production)
**Complexity:** Intermediate
**Setup Time:** 20-30 minutes
**Target Users:** Security Engineers, SRE, Backend Developers

---

## 📋 Overview

### Problem Statement

**The production incident:**
```ruby
# ❌ NO RATE LIMITING: Infinite retry storm
begin
  process_payment(order)
rescue PaymentError => e
  # Retry immediately (bad idea!)
  3.times do
    Events::PaymentRetry.track(order_id: order.id, attempt: _1)
  end
end

# What happened:
# - 1000 failed payments
# - 3000 retry events
# - × 100 fields per event
# - = 300,000 field values in 10 seconds
# → Buffer overflow
# → Loki API rate limit hit (429)
# → All observability lost during incident! 😱
```

**Real incident impact:**
- **09:00 AM**: Payment gateway down
- **09:01 AM**: 50k retry events/sec flooding E11y
- **09:02 AM**: Loki returns 429 (rate limit)
- **09:03 AM**: E11y buffer full, events dropped
- **09:05 AM**: **No observability** - blind during incident
- **09:30 AM**: Incident resolved, but root cause unclear (no logs!)

### E11y Solution

**3-Layer Rate Limiting (Global + Per-Event + Per-Context):**
```ruby
# ✅ PROTECTED: Multi-layer rate limiting
E11y.configure do |config|
  config.rate_limiting do
    # Layer 1: Global limit (protect buffer)
    global limit: 10_000, window: 1.minute

    # Layer 2: Per-event limit (prevent retry storms)
    per_event 'payment.retry', limit: 100, window: 1.minute

    # Layer 3: Per-context limit (per user/IP)
    per_context :user_id, limit: 1_000, window: 1.minute
    per_context :ip_address, limit: 500, window: 1.minute

    # What happens when limit exceeded:
    on_exceeded :sample # Keep 10%, drop 90%
    sample_rate 0.1

    # Alert on rate limiting
    alert_on_limit true
    alert_channel '#observability'
  end
end

# Result during incident:
# - Global limit: 10k/min enforced
# - Payment retry: 100/min enforced
# - Per user: 1k/min enforced
# → Observability maintained ✅
# → Root cause identified quickly ✅
```
---

## 🎯 Event-Level Rate Limiting (NEW - v1.1)

> **🎯 CONTRADICTION_01 Resolution:** Move rate limiting config from global initializer to event classes.

**Event-level rate limiting DSL:**

```ruby
# app/events/payment_retry.rb
module Events
  class PaymentRetry < E11y::Event::Base
    schema do
      required(:order_id).filled(:string)
      required(:attempt).filled(:integer)
    end

    # ✨ Event-level rate limiting (right next to schema!)
    rate_limit 100, window: 1.minute # Max 100 retries/min
    on_exceeded :drop                # Drop retry logs (not critical)
  end
end

# app/events/user_login_failed.rb
module Events
  class UserLoginFailed < E11y::Event::Base
    schema do
      required(:user_id).filled(:string)
      required(:ip_address).filled(:string)
    end

    # ✨ Event-level rate limiting
    rate_limit 50, window: 1.minute # Max 50 failures/min
    on_exceeded :sample             # Keep 20% (flattened syntax!)
    sample_rate 0.2
  end
end
```

**Inheritance for rate limiting:**

```ruby
# Base class with common rate limiting
module Events
  class BaseDebugEvent < E11y::Event::Base
    # Common for ALL debug events
    severity :debug
    rate_limit 100, window: 1.minute # Low limit
    on_exceeded :drop                # Drop debug logs (not critical)
    sample_rate 0.01                 # 1% sampling
  end
end

# Inherit from base
class Events::DebugSqlQuery < Events::BaseDebugEvent
  schema do; required(:query).filled(:string); end
  # ← Inherits: rate_limit 100 + on_exceeded :drop
end

class Events::DebugApiCall < Events::BaseDebugEvent
  schema do; required(:endpoint).filled(:string); end
  # ← Inherits: rate_limit 100 + on_exceeded :drop
end
```

**Preset modules for rate limiting:**

```ruby
# lib/e11y/presets/high_value_event.rb
module E11y
  module Presets
    module HighValueEvent
      extend ActiveSupport::Concern
      included do
        rate_limit 10_000     # High limit
        on_exceeded :throttle # Slow down, don't drop
        sample_rate 1.0       # Never sample
      end
    end

    module DebugEvent
      extend ActiveSupport::Concern
      included do
        rate_limit 100    # Low limit
        on_exceeded :drop # Drop debug logs
        sample_rate 0.01  # 1% sampling
      end
    end
  end
end

# Usage:
class Events::PaymentProcessed < E11y::Event::Base
  include E11y::Presets::HighValueEvent # ← Rate limit inherited!
  schema do; required(:transaction_id).filled(:string); end
end
```

**Conventions for rate limiting (sensible defaults):**

```ruby
# Convention: Default rate limit = 1000 events/sec
# Override only for high-volume or low-volume events

# Zero-config event (uses convention):
class Events::OrderCreated < E11y::Event::Base
  schema do; required(:order_id).filled(:string); end
  # ← Auto: rate_limit = 1000 (default)
end

# Override for high-volume:
class Events::PageView < E11y::Event::Base
  rate_limit 10_000 # ← Override: high-volume
  schema do; required(:page).filled(:string); end
end

# Override for low-volume:
class Events::DebugQuery < E11y::Event::Base
  rate_limit 100 # ← Override: low-volume
  schema do; required(:query).filled(:string); end
end
```

**Precedence (event-level overrides global):**

```ruby
# Global config (infrastructure):
E11y.configure do |config|
  config.rate_limiting do
    global limit: 10_000, window: 1.minute
    default_per_event_limit 1000 # Default for events
  end
end

# Event-level config (overrides global):
class Events::PaymentRetry < E11y::Event::Base
  rate_limit 100    # ← Override: 100 (not 1000)
  on_exceeded :drop # ← Override: drop (not default)
end
```

**Benefits:**
- ✅ Locality of behavior (rate limit next to schema)
- ✅ DRY via inheritance/presets
- ✅ Sensible defaults (1000/sec)
- ✅ Easy to override when needed
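
For intuition, here is a minimal sketch of how a class-level `rate_limit` DSL with inherited, overridable settings can be built in plain Ruby. The `EventBase` name, `rate_limit_config` reader, and `DEFAULT_LIMIT` constant are illustrative assumptions, not the gem's actual internals:

```ruby
# Hypothetical sketch: class-level DSL whose settings are inherited by
# subclasses and can be overridden per class.
class EventBase
  DEFAULT_LIMIT = 1_000 # convention: 1000 events/sec unless overridden

  class << self
    def rate_limit(limit, window: 60)
      @rate_limit_config = { limit: limit, window: window }
    end

    def rate_limit_config
      # Walk up the ancestry so subclasses inherit their parent's setting.
      @rate_limit_config ||
        (superclass.respond_to?(:rate_limit_config) ? superclass.rate_limit_config : nil) ||
        { limit: DEFAULT_LIMIT, window: 60 }
    end
  end
end

class DebugEvent < EventBase
  rate_limit 100 # low limit shared by all debug events
end

class DebugSqlQuery < DebugEvent; end

DebugSqlQuery.rate_limit_config # => { limit: 100, window: 60 }
```
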
---

## 🎯 The 3-Layer Rate Limiting System

### Layer 1: Global Rate Limiting

**Protect E11y infrastructure from flooding:**

```ruby
E11y.configure do |config|
  config.rate_limiting do
    # === GLOBAL LIMIT ===
    # Across ALL events, ALL sources
    global limit: 10_000,             # Max 10k events
           window: 1.minute,          # Per minute
           algorithm: :sliding_window # OR :token_bucket, :fixed_window

    # What happens when exceeded:
    on_exceeded :sample # Options: :drop, :sample, :throttle
    sample_rate 0.1     # Keep 10% when over limit

    # Track dropped events
    track_drops true
  end
end

# How it works:
# - Counts events across entire system
# - If > 10k/min → apply sample_rate (90% dropped)
# - Metrics: e11y_rate_limit_global_hits_total
```

**Algorithms:**

| Algorithm | Behavior | Use Case |
|-----------|----------|----------|
| `:sliding_window` | Smooth rate control | **Default** (best for most cases) |
| `:token_bucket` | Allows bursts | APIs with bursty traffic |
| `:fixed_window` | Simple but has edge cases | Low-volume scenarios |
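
Since the table mentions `:token_bucket`, here is a minimal single-process sketch of that algorithm for intuition. This is illustrative only (the gem's production path is Redis-backed, as shown later); the `TokenBucket` class and its parameters are assumptions:

```ruby
# Hypothetical sketch: a token bucket allows short bursts up to `capacity`
# while enforcing an average rate of `refill_rate` events per second.
class TokenBucket
  def initialize(capacity:, refill_rate:)
    @capacity    = capacity
    @refill_rate = refill_rate
    @tokens      = capacity.to_f
    @last_refill = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  def allow?
    refill
    return false if @tokens < 1

    @tokens -= 1
    true
  end

  private

  def refill
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    @tokens = [@tokens + (now - @last_refill) * @refill_rate, @capacity].min
    @last_refill = now
  end
end

bucket = TokenBucket.new(capacity: 100, refill_rate: 10) # burst 100, avg 10/sec
bucket.allow? # => true while tokens remain
```
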
---

### Layer 2: Per-Event Rate Limiting

**Prevent specific events from flooding:**

```ruby
E11y.configure do |config|
  config.rate_limiting do
    # === PER-EVENT LIMITS ===

    # Retry events (common culprit)
    per_event 'payment.retry',
              limit: 100,
              window: 1.minute,
              on_exceeded: :drop # Drop retry logs (not critical)

    # Login failures (security)
    per_event 'user.login.failed',
              limit: 50,
              window: 1.minute,
              on_exceeded: :sample,
              sample_rate: 0.2 # Keep 20%

    # API errors (debugging)
    per_event 'api.error',
              limit: 200,
              window: 1.minute,
              on_exceeded: :throttle # Slow down, don't drop

    # Background job failures
    per_event 'job.failed',
              limit: 500,
              window: 5.minutes,
              on_exceeded: :sample,
              sample_rate: 0.1
  end
end

# Usage:
Events::PaymentRetry.track(order_id: '123', attempt: 1)
Events::PaymentRetry.track(order_id: '456', attempt: 1)
# ... 99 more in same minute → All tracked

Events::PaymentRetry.track(order_id: '789', attempt: 1)
# → 101st event in minute → DROPPED (limit: 100)
# → Metric: e11y_rate_limit_per_event_hits_total{event="payment.retry"}
```

---

### Layer 3: Per-Context Rate Limiting

**Prevent a single user/IP/tenant from flooding:**

```ruby
E11y.configure do |config|
  config.rate_limiting do
    # === PER-CONTEXT LIMITS ===

    # Per user (prevent single user abuse)
    per_context :user_id,
                limit: 1_000,
                window: 1.minute,
                on_exceeded: :sample,
                sample_rate: 0.1

    # Per IP address (prevent DDoS)
    per_context :ip_address,
                limit: 500,
                window: 1.minute,
                on_exceeded: :drop

    # Per tenant (multi-tenant apps)
    per_context :tenant_id,
                limit: 5_000,
                window: 1.minute,
                on_exceeded: :throttle

    # Per session (prevent session replay attacks)
    per_context :session_id,
                limit: 200,
                window: 1.minute,
                on_exceeded: :drop
  end
end

# How it works:
# User A: 1000 events/min → OK
# User A: 1001st event → 90% dropped (sample_rate 0.1)
# User B: 1000 events/min → OK (separate limit)
```

**Context extraction:**
```ruby
# E11y automatically extracts context from:
# 1. Event payload: event.payload[:user_id]
# 2. Event context: event.context[:user_id]
# 3. Rails Current: Current.user_id
# 4. Custom extractor:

E11y.configure do |config|
  config.rate_limiting do
    per_context :user_id,
                limit: 1_000,
                window: 1.minute,
                extractor: ->(event) {
                  # Custom logic to extract user_id
                  event.payload[:user_id] || event.context[:current_user]&.id
                }
  end
end
```

---

### Layer 4: DLQ Filter Integration (C02 Resolution) ⚠️

> **Reference:** See [ADR-013 §4.6: Rate Limiting × DLQ Filter](../ADR-013-reliability-error-handling.md#46-rate-limiting--dlq-filter-interaction-c02-resolution) for full architecture.

**Problem:** Rate limiting drops events BEFORE they reach the DLQ filter. Critical events (e.g., payments) may be lost during traffic spikes, even though the DLQ filter says "always save payments".

**Solution:** The rate limiter respects the DLQ `always_save` filter - critical events bypass rate limits.

```ruby
E11y.configure do |config|
  config.rate_limiting do
    enabled true
    global limit: 10_000, window: 1.minute

    # ✅ Respect DLQ filter for critical events
    respect_dlq_filter true # Critical events bypass rate limits!

    # Alternative: Explicit bypass patterns
    bypass_for do
      event_patterns ['payment.*', 'order.*', 'audit.*']
      severities [:error, :fatal]
    end
  end

  # DLQ filter configuration
  config.error_handling.dead_letter_queue.filter do
    always_save do
      event_patterns ['payment.*', 'order.*']
    end
  end
end

# Scenario: Traffic spike (15,000 payment failures in one minute)
15_000.times do
  Events::PaymentFailed.track(order_id: '123', amount: 500)
end

# Result:
# - Rate limit: 10,000/min
# - Excess: 5,000 events over limit
# - ❌ WITHOUT C02 fix: 5,000 critical payment events DROPPED!
# - ✅ WITH C02 fix: ALL payment events processed (bypass rate limit!)
# → Why? DLQ filter says "always_save payment.*"
# → Rate limiter checks: dlq_filter.always_save?(event) → true
# → Bypass rate limit → event goes to buffer → success!
```

**Flow Diagram:**

```
Event → Rate Limiter
  ├─ Check: dlq_filter.always_save?(event)?
  │   ├─ YES (critical event) → ✅ BYPASS rate limit → Buffer
  │   └─ NO (non-critical)
  │       ├─ Under limit? → ✅ PASS → Buffer
  │       └─ Over limit? → ❌ DROP (or sample)
  └─ Buffer → Adapter → DLQ (if adapter fails)
```
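
The decision in the diagram reads as a simple guard in front of the limit checks. A minimal sketch, assuming a `dlq_filter` collaborator exposing the `always_save?` predicate used above and a `within_limits?` helper standing in for the global/per-event/per-context checks:

```ruby
# Hypothetical sketch of the rate limiter's entry point from the diagram.
def allowed?(event)
  # Critical events (matching DLQ always_save patterns) bypass every limit.
  return true if @config.respect_dlq_filter && dlq_filter.always_save?(event)

  within_limits?(event) # global + per-event + per-context checks
end
```
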
**Configuration Options:**

```ruby
# Option 1: Auto-respect DLQ filter (recommended)
config.rate_limiting.respect_dlq_filter = true

# Option 2: Explicit bypass patterns (more control)
config.rate_limiting.bypass_for do
  event_patterns ['payment.*', 'fraud.*', 'security.*']
  severities [:error, :fatal]
  custom_check { |event| event[:vip_customer] == true }
end

# Option 3: Hybrid (DLQ filter + extra patterns)
config.rate_limiting do
  respect_dlq_filter true # Respect DLQ always_save
  bypass_for do
    event_patterns ['audit.*'] # Additional patterns
  end
end
```

**Trade-offs:**

| Aspect | Pro | Con | Decision |
|--------|-----|-----|----------|
| **Bypass critical events** | Zero data loss for payments | Rate limit less effective during attacks | Critical events > rate limits |
| **respect_dlq_filter** | DRY (single source of truth) | Tight coupling to DLQ config | Worth it for simplicity |
| **bypass_for patterns** | Flexible custom rules | Need to maintain bypass list | Use for edge cases only |

---

### Layer 5: Retry Rate Limiting (C06 Resolution) ⚠️

> **Reference:** See [ADR-013 §3.5: Retry Rate Limiting](../ADR-013-reliability-error-handling.md#35-retry-rate-limiting-c06-resolution) for full architecture.

**Problem:** Adapter failures trigger retries. If 1000 events fail → 3000 retry attempts (thundering herd) → buffer overflow.

**Solution:** A separate rate limit for RETRIES (staged retry with jitter).

```ruby
E11y.configure do |config|
  config.error_handling do
    retry_policy do
      max_attempts 3
      base_delay 100 # ms
      max_delay 5000 # ms
      exponential_backoff true
      jitter true

      # ✅ Retry rate limiting (separate from main rate limit!)
      retry_rate_limit do
        enabled true
        limit 1000 # Max 1000 retries/minute (not 10k!)
        window 1.minute

        # When retry is rate-limited:
        on_limit_exceeded :delay # Options: :drop, :delay, :dlq

        # Delay strategy (staged retry)
        delay_strategy do
          base_delay 1000      # 1 sec
          max_delay 60_000     # 60 sec
          backoff_multiplier 2 # 1s → 2s → 4s → 8s → 16s → 32s → 60s
          jitter_range 0.2     # ±20% randomization
        end
      end
    end
  end
end

# Scenario: Loki down for 5 minutes (adapter fails)
# - 10,000 events/min attempted
# - Adapter fails → 10,000 events need retry
# - Without retry rate limiting:
#   - 10,000 × 3 attempts = 30,000 retries
#   - Immediate retry storm (buffer overflow!)
# - With retry rate limiting:
#   - First retry: 1,000 events (rate limit enforced)
#   - Next 9,000 events: delayed (staged retry)
#   - Retry schedule:
#     - 00:00 - 1,000 retries (immediate)
#     - 00:01 - 1,000 retries (delayed 1s)
#     - 00:02 - 1,000 retries (delayed 2s)
#     - 00:04 - 1,000 retries (delayed 4s)
#     - ... (exponential backoff)
#     - 01:00 - Last batch (delayed 60s)
#   - Result: ✅ No buffer overflow! Smooth retry spread over time.
```

**Retry Timeline Comparison:**

```
WITHOUT retry rate limiting:
00:00  Loki down
00:00  10,000 events fail → 30,000 immediate retries ❌ BUFFER OVERFLOW!
00:01  All retries exhausted, 10,000 events lost

WITH retry rate limiting:
00:00  Loki down
00:00  10,000 events fail → 1,000 immediate retries (rate limited)
00:01  1,000 retries (delayed)
00:02  1,000 retries (delayed)
00:04  1,000 retries (delayed)
00:08  1,000 retries (delayed)
00:16  1,000 retries (delayed)
00:32  1,000 retries (delayed)
01:00  1,000 retries (delayed)
02:00  1,000 retries (delayed)
04:00  1,000 retries (delayed)
05:00  Loki back online → remaining retries succeed → ✅ All 10,000 events saved!
```

**Configuration: Main Rate Limit vs Retry Rate Limit**

```ruby
E11y.configure do |config|
  # Main rate limiting (for NEW events)
  config.rate_limiting do
    global limit: 10_000, window: 1.minute
    on_exceeded :sample # Sample excess events
  end

  # Retry rate limiting (for FAILED events)
  config.error_handling.retry_policy.retry_rate_limit do
    limit 1_000 # 10× LOWER than main limit!
    window 1.minute
    on_limit_exceeded :delay # Delay excess retries (don't drop!)
  end
end

# Why separate limits?
# 1. Retries are MORE expensive (adapter already failed once)
# 2. Lower retry rate prevents cascading failures (give adapter time to recover)
# 3. Main limit protects ingestion, retry limit protects adapter
```
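
The schedule above follows exponential backoff with jitter. A minimal sketch of how a per-attempt delay could be derived from the `delay_strategy` settings (the helper name and defaults are illustrative, not the gem's internals):

```ruby
# Hypothetical sketch: staged-retry delay with exponential backoff and ±jitter.
def retry_delay_ms(attempt, base: 1_000, max: 60_000, multiplier: 2, jitter_range: 0.2)
  delay  = [base * (multiplier**attempt), max].min # 1s → 2s → 4s → ... capped at 60s
  jitter = 1 + (rand * 2 - 1) * jitter_range       # uniform factor in [1-j, 1+j]
  (delay * jitter).round
end

retry_delay_ms(0) # => ~1000 ms (±20%)
retry_delay_ms(3) # => ~8000 ms (±20%)
```
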
**Trade-offs:**

| Aspect | Pro | Con | Decision |
|--------|-----|-----|----------|
| **Retry rate limit 10× lower** | Prevents adapter overload | Slower retry | Adapter stability > speed |
| **:delay (not :drop)** | No data loss | Memory for delayed queue | Worth it for reliability |
| **Exponential backoff + jitter** | Smooth recovery | Complex timing | Industry best practice |
| **Separate from main limit** | Fine-grained control | More config | Flexibility > simplicity |

---

## 💻 Rate Limiting Strategies

### Strategy 1: Drop

**Discard excess events:**
```ruby
on_exceeded :drop

# Use when:
# - Non-critical events (retry logs, debug events)
# - High volume, low value events
# - Already have enough signal

# Example:
per_event 'debug.log', limit: 100, window: 1.minute, on_exceeded: :drop
```

---

### Strategy 2: Sample

**Keep a percentage of excess events:**
```ruby
on_exceeded :sample
sample_rate 0.1 # Keep 10%

# Use when:
# - Want SOME signal during flood
# - Statistical analysis OK (don't need every event)
# - Moderate volume

# Example:
per_event 'user.action', limit: 1000, window: 1.minute,
          on_exceeded: :sample
# → Sample rate: 0.1 (10%)
# → First 1000: all kept
# → Next 9000: 10% kept (900 events)
# → Total: 1900 events (vs 10,000 without rate limiting)
```

---

### Strategy 3: Backpressure

**Slow down event production:**
```ruby
on_exceeded :throttle

# Use when:
# - Events MUST be tracked (critical)
# - Can afford latency increase
# - Low to moderate volume

# How it works:
# 1. Limit exceeded
# 2. Sleep 10ms before tracking next event
# 3. Gradual slow down (not sudden drop)

# Example:
per_event 'order.created', limit: 100, window: 1.minute,
          on_exceeded: :throttle,
          backpressure_delay: 10.milliseconds
```

---

### Strategy 4: Aggregate

**Combine events into a summary:**
```ruby
on_exceeded :aggregate

# Use when:
# - Many similar events
# - Summary is sufficient
# - High volume

# How it works:
# 1. First 100 events: tracked individually
# 2. Next 900 events: aggregated into 1 summary event
# 3. Summary includes: count, min/max/avg, sample

# Example:
per_event 'api.slow_request', limit: 100, window: 1.minute,
          on_exceeded: :aggregate,
          aggregate_fields: [:duration_ms, :endpoint]
# → First 100: individual events
# → Next 900: Summary event:
#   {
#     event_name: 'api.slow_request.aggregated',
#     count: 900,
#     duration_ms_min: 501,
#     duration_ms_max: 5000,
#     duration_ms_avg: 1200,
#     endpoints: ['/api/users', '/api/orders']
#   }
```
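
A minimal sketch of how such a summary could be built from buffered raw events. The field names mirror the example output above; the helper itself is an assumption, not the gem's internal aggregation buffer:

```ruby
# Hypothetical sketch: collapse buffered events into one summary payload.
def build_summary(event_name, events, field: :duration_ms)
  values = events.map { |e| e[field] }
  {
    event_name: "#{event_name}.aggregated",
    count: events.size,
    "#{field}_min": values.min,
    "#{field}_max": values.max,
    "#{field}_avg": values.sum / values.size,
    endpoints: events.map { |e| e[:endpoint] }.uniq
  }
end
```
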
---

## 🚫 Bypass Rules (Allowlists)

**Always allow critical events:**

```ruby
E11y.configure do |config|
  config.rate_limiting do
    # Global rate limiting
    global limit: 10_000, window: 1.minute

    # === BYPASS RULES ===

    # Bypass by event type
    bypass_for event_types: [
      'system.critical', # System-critical events
      'security.alert',  # Security alerts
      'payment.fraud',   # Fraud detection
      'data.corruption'  # Data integrity issues
    ]

    # Bypass by severity
    bypass_for severities: [:fatal, :error]

    # Bypass by context
    bypass_for contexts: {
      env: 'production', # Only production
      user_role: 'admin' # Admin users
    }

    # Bypass for specific users (VIPs)
    bypass_for_users ['vip_user_1', 'vip_user_2']

    # Custom bypass logic
    bypass_if ->(event) {
      # Always track events with high order amounts
      event.payload[:amount].to_i > 10_000
    }
  end
end

# Result:
# - Normal events: rate limited
# - Critical events: ALWAYS tracked (bypass)
```

---

## 📊 Implementation with Redis

**Production-ready implementation using Redis:**

```ruby
# lib/e11y/processing/rate_limiter.rb
module E11y
  module Processing
    class RateLimiter
      def initialize(redis: Redis.new)
        @redis = redis
        @config = E11y.config.rate_limiting
      end

      def allowed?(event)
        # Check bypass rules first
        return true if bypassed?(event)

        # Check global limit
        return false unless check_global_limit(event)

        # Check per-event limit
        return false unless check_per_event_limit(event)

        # Check per-context limits
        return false unless check_per_context_limits(event)

        true
      end

      private

      def check_global_limit(event)
        key = 'e11y:rate_limit:global'
        limit = @config.global_limit
        window = @config.global_window

        check_limit(key, limit, window)
      end

      def check_per_event_limit(event)
        limit_config = @config.per_event_limits[event.event_name]
        return true unless limit_config

        key = "e11y:rate_limit:event:#{event.event_name}"
        check_limit(key, limit_config[:limit], limit_config[:window])
      end

      def check_per_context_limits(event)
        @config.per_context_limits.all? do |field, limit_config|
          value = extract_context_value(event, field, limit_config[:extractor])
          next true unless value

          key = "e11y:rate_limit:context:#{field}:#{value}"
          check_limit(key, limit_config[:limit], limit_config[:window])
        end
      end

      def check_limit(key, limit, window)
        # Sliding window counter using Redis sorted sets
        now = Time.now.to_f
        window_start = now - window

        # Remove old entries (outside window)
        @redis.zremrangebyscore(key, 0, window_start)

        # Count current entries
        current_count = @redis.zcard(key)

        if current_count < limit
          # Add new entry
          @redis.zadd(key, now, "#{now}-#{SecureRandom.hex(8)}")
          @redis.expire(key, window.to_i + 60) # TTL = window + buffer
          true
        else
          # Limit exceeded
          handle_exceeded(key, current_count, limit)
          false
        end
      end

      def handle_exceeded(key, current, limit)
        # Track metric
        Yabeda.e11y_internal.rate_limit_hits_total.increment(
          limit_type: extract_limit_type(key),
          key: key
        )

        # Log warning
        E11y.logger.warn(
          "[E11y] Rate limit exceeded: #{key} (#{current}/#{limit})"
        )

        # Alert if configured
        if @config.alert_on_limit
          alert_rate_limit_exceeded(key, current, limit)
        end
      end

      def bypassed?(event)
        # Check bypass rules
        @config.bypass_rules.any? do |rule|
          case rule[:type]
          when :event_types
            rule[:values].include?(event.event_name)
          when :severities
            rule[:values].include?(event.severity)
          when :contexts
            rule[:values].all? { |k, v| event.context[k] == v }
          when :custom
            rule[:condition].call(event)
          end
        end
      end
    end
  end
end
```
---

## 🔧 Implementation Details

> **Implementation:** See [ADR-006 Section 4.0: Rate Limiting + Retry Policy Resolution](../ADR-006-security-compliance.md#40-rate-limiting--retry-policy-resolution-conflict-14) for detailed architecture.

### Middleware Flow

E11y rate limiting is implemented as **middleware** in the event processing pipeline. Understanding the flow helps debug rate limiting behavior and optimize performance.

**Pipeline Order:**
```
Event.track()
  → Schema Validation
  → Context Enrichment
  → Rate Limiting Middleware ← YOU ARE HERE
  → Adaptive Sampling
  → PII Filtering
  → Audit Signing
  → Adapter Routing
  → Write to Adapters
```

**Why Rate Limiting Before PII Filtering?**
- ✅ **Efficiency:** Drop events early (no wasted CPU on PII filtering)
- ✅ **Security:** Rate limiter sees original event (can detect patterns)
- ✅ **Accuracy:** Count real events (not filtered versions)
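
The middleware code below relies on each stage inheriting a `Base#call` that forwards the event to the next stage, which is what its `super(event_data)` calls invoke. A minimal sketch of that chaining contract (an assumed shape, not the gem's exact `Base`):

```ruby
# Hypothetical sketch of the middleware chaining contract.
module E11y
  module Middleware
    class Base
      def initialize(next_middleware = nil)
        @next_middleware = next_middleware
      end

      # Subclasses do their work, then call `super` to pass the event on;
      # the last stage in the chain simply returns the event.
      def call(event_data)
        @next_middleware ? @next_middleware.call(event_data) : event_data
      end
    end
  end
end
```
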
---

### Middleware Implementation

```ruby
# lib/e11y/middleware/rate_limiter.rb
module E11y
  module Middleware
    class RateLimiter < Base
      def call(event_data)
        # 1. Check bypass rules first (critical events)
        if bypassed?(event_data)
          return super(event_data) # Pass to next middleware
        end

        # 2. Check global limit; continue only if the strategy kept the event
        unless check_global_limit(event_data)
          return handle_rate_limited(event_data, :global) && super(event_data)
        end

        # 3. Check per-event limit
        unless check_per_event_limit(event_data)
          return handle_rate_limited(event_data, :per_event) && super(event_data)
        end

        # 4. Check per-context limits
        unless check_per_context_limits(event_data)
          return handle_rate_limited(event_data, :per_context) && super(event_data)
        end

        # 5. All checks passed → continue pipeline
        super(event_data)
      end

      private

      # Applies the configured strategy. Returns true when the event should
      # still continue down the pipeline (kept sample / throttled), false
      # when it stops here (dropped / folded into an aggregate).
      def handle_rate_limited(event_data, limit_type)
        # Track metric
        Yabeda.e11y_internal.rate_limit_hits_total.increment(
          limit_type: limit_type,
          event_name: event_data[:event_name]
        )

        # Log warning
        E11y.logger.warn(
          "[E11y RateLimit] Event rate limited: #{event_data[:event_name]} (#{limit_type})"
        )

        # Apply configured strategy
        case config.on_exceeded
        when :drop
          drop_event(event_data, limit_type)
        when :sample
          sample_event(event_data, limit_type)
        when :throttle
          apply_backpressure
          true
        when :aggregate
          aggregate_event(event_data, limit_type)
        end
      end

      def drop_event(event_data, limit_type)
        Yabeda.e11y_internal.rate_limit_dropped_events_total.increment(
          limit_type: limit_type
        )
        false
      end

      # Random sampling: returns true for the kept fraction.
      def sample_event(event_data, limit_type)
        keep = rand < config.sample_rate
        drop_event(event_data, limit_type) unless keep
        keep
      end

      # Slow down production, then let the event through.
      def apply_backpressure
        sleep(config.backpressure_delay)
      end

      def aggregate_event(event_data, limit_type)
        # Add to aggregation buffer
        aggregation_buffer.add(event_data)

        # Flush aggregated event periodically
        flush_aggregated_events(limit_type) if aggregation_buffer.should_flush?
        false # the individual event is replaced by the summary
      end
    end
  end
end
```
---

### Redis-Based Rate Limiting

E11y uses **Redis sorted sets** for distributed rate limiting across multiple application instances.

**Algorithm: Sliding Window Counter**

```ruby
def check_limit(key, limit, window)
  now = Time.now.to_f
  window_start = now - window

  # 1. Remove expired entries (outside window)
  redis.zremrangebyscore(key, 0, window_start)

  # 2. Count current entries
  current_count = redis.zcard(key)

  # 3. Check limit
  if current_count < limit
    # Add new entry (score = timestamp, member = unique ID)
    redis.zadd(key, now, "#{now}-#{SecureRandom.hex(8)}")
    redis.expire(key, window.to_i + 60) # TTL cleanup
    true # Allowed
  else
    false # Rate limited
  end
end
```

**Why Sorted Sets?**
- ✅ **Sliding window:** Accurate counting (no edge cases like fixed window)
- ✅ **Distributed:** Works across multiple app instances
- ✅ **Efficient:** O(log N) for add/remove operations
- ✅ **Automatic cleanup:** Redis TTL handles old entries
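
One caveat with the snippet above: `ZCARD` and `ZADD` run as separate round trips, so two app instances can both pass the check right at the limit boundary. A sketch of an atomic variant using a Redis Lua script (an assumed helper, not confirmed as the gem's implementation):

```ruby
require 'redis'
require 'securerandom'

# Hypothetical sketch: cleanup, count, and conditional add in one atomic step.
SLIDING_WINDOW_LUA = <<~LUA
  local key    = KEYS[1]
  local now    = tonumber(ARGV[1])
  local window = tonumber(ARGV[2])
  local limit  = tonumber(ARGV[3])

  redis.call('ZREMRANGEBYSCORE', key, 0, now - window)
  if redis.call('ZCARD', key) < limit then
    redis.call('ZADD', key, now, ARGV[4])
    redis.call('EXPIRE', key, window + 60)
    return 1
  end
  return 0
LUA

def check_limit_atomic(redis, key, limit, window)
  now = Time.now.to_f
  member = "#{now}-#{SecureRandom.hex(8)}"
  redis.eval(SLIDING_WINDOW_LUA, keys: [key], argv: [now, window.to_i, limit, member]) == 1
end
```
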
**Redis Keys:**
```ruby
# Global limit
"e11y:rate_limit:global"

# Per-event limit
"e11y:rate_limit:event:payment.retry"

# Per-context limit
"e11y:rate_limit:context:user_id:user-123"
"e11y:rate_limit:context:ip_address:192.168.1.100"
```

---

### Retry Policy Integration

**Critical Decision:** Retries DO count toward rate limits (prevent retry amplification).

```ruby
# config/initializers/e11y.rb
E11y.configure do |config|
  # Rate limiting
  config.rate_limiting do
    per_event 'payment.retry', limit: 100, window: 1.minute
  end

  # Retry policy
  config.error_handling do
    retry_policy do
      # ✅ Retries respect rate limits
      respect_rate_limits true

      # If retry is rate limited → send to DLQ
      on_retry_rate_limited :send_to_dlq
    end
  end
end
```

**Flow with Retries:**
```
1. Event.track() → Rate limited
2. Retry logic triggered
3. Retry attempt → Check rate limit AGAIN
4. If still rate limited → Send to DLQ (not dropped)
5. DLQ processed later (outside rate limit window)
```

**Why This Matters:**
- ✅ **Prevents retry amplification:** 1 failure → 1000 retries → 1000 rate limit hits
- ✅ **DLQ safety net:** Rate-limited retries not lost (processed later)
- ✅ **Observability preserved:** Can see retry patterns in metrics
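
A minimal sketch of the decision at steps 3-4 of the flow above (the `rate_limiter`, `adapter`, and `dead_letter_queue` collaborators are assumed names):

```ruby
# Hypothetical sketch: a retry re-checks the limit; rate-limited retries
# are parked in the DLQ instead of being dropped.
def retry_write(event_data)
  if rate_limiter.allowed?(event_data)
    adapter.write(event_data)
  else
    dead_letter_queue.save(event_data, reason: :retry_rate_limited)
  end
end
```
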
---

### Performance Characteristics

**Latency:**
```ruby
# Benchmark: Rate limiter overhead
Benchmark.ips do |x|
  x.report('No rate limiting') do
    Events::TestEvent.track(foo: 'bar') # Baseline
  end

  x.report('With rate limiting') do
    # Rate limiter enabled
    Events::TestEvent.track(foo: 'bar')
  end

  x.compare!
end

# Results:
# No rate limiting:   100,000 i/s (10μs per event)
# With rate limiting:  95,000 i/s (10.5μs per event)
# Overhead: ~0.5μs (5% increase)
```

**Redis Latency:**
```ruby
# Redis operations per event (within limit):
# 1. ZREMRANGEBYSCORE (cleanup) ~0.1ms
# 2. ZCARD (count)              ~0.05ms
# 3. ZADD (add entry)           ~0.05ms
# 4. EXPIRE (set TTL)           ~0.05ms
# Total: ~0.25ms per event

# When rate limited:
# 1. ZREMRANGEBYSCORE ~0.1ms
# 2. ZCARD            ~0.05ms
# Total: ~0.15ms (no write)
```

**Scaling:**
```ruby
# Redis memory usage:
# - Global limit (10k events/min): ~500KB
# - Per-event limit (100/min):     ~5KB per event type
# - Per-context limit (1k/min):    ~50KB per user
#
# Example: 1000 users × 50KB = 50MB
# → Acceptable for most deployments
```

---

### Troubleshooting

**Problem: Events dropped unexpectedly**

```ruby
# Check rate limit metrics
rate_limit_hits = Yabeda.e11y_internal.rate_limit_hits_total.values
# => { limit_type: 'per_event', event_name: 'payment.retry' } => 42

# Check Redis keys
redis.keys('e11y:rate_limit:*')
# => ["e11y:rate_limit:event:payment.retry"]

redis.zcard('e11y:rate_limit:event:payment.retry')
# => 100 (at limit!)

# Check TTL
redis.ttl('e11y:rate_limit:event:payment.retry')
# => 45 (45 seconds until window resets)
```

**Problem: Rate limiter not working**

```ruby
# 1. Check middleware order
E11y.config.middleware.list
# => [SchemaValidator, ContextEnricher, RateLimiter, ...]

# 2. Check rate limiting enabled
E11y.config.rate_limiting.enabled?
# => true

# 3. Check bypass rules
E11y.config.rate_limiting.bypass_rules
# => [{ type: :severities, values: [:fatal] }]

# 4. Check event matches bypass (bypassed? is a private helper, so use send)
event = { severity: :fatal }
E11y::Middleware::RateLimiter.new.send(:bypassed?, event)
# => true (bypassed!)
```

**Problem: High Redis latency**

```ruby
# 1. Check Redis connection pool
E11y.config.redis.pool_size
# => 5 (default)

# Increase if needed
E11y.configure do |config|
  config.redis do
    pool_size 20 # For high-concurrency
  end
end

# 2. Use Redis pipelining for multiple checks
redis.pipelined do |pipeline|
  pipeline.zremrangebyscore(key, 0, window_start)
  pipeline.zcard(key)
  pipeline.zadd(key, now, id)
end

# 3. Consider local caching (for read-heavy workloads)
E11y.configure do |config|
  config.rate_limiting do
    cache_limit_checks true # Cache for 1s
  end
end
```
---

## 📊 Self-Monitoring & Metrics

> **Implementation:** See [ADR-006 Section 4: Rate Limiting](../ADR-006-security-compliance.md#4-rate-limiting) for detailed architecture.

E11y provides comprehensive self-monitoring metrics for rate limiting. These metrics help you understand rate limit behavior, detect attacks, and optimize limits.

### Core Metrics

**1. `e11y_rate_limit_hits_total` (Counter)**
- **Description:** Total number of times a rate limit was hit (event attempted but limit reached).
- **Labels:**
  - `limit_type`: Type of limit (`global`, `per_event`, `per_context`)
  - `event_name`: Event type that hit the limit
  - `key`: Specific limit key (e.g., `user_id:123`, `ip:192.168.1.1`)
  - `strategy`: How event was handled (`drop`, `sample`, `backpressure`, `aggregate`)
- **Monitoring:**
  ```prometheus
  # Rate limit hit rate (events/sec)
  rate(e11y_rate_limit_hits_total[5m])

  # Which events hit limits most often?
  topk(10, sum by (event_name) (rate(e11y_rate_limit_hits_total[5m])))

  # Per-context abuse detection
  topk(10, sum by (key) (e11y_rate_limit_hits_total{limit_type="per_context"}))
  ```
- **Grafana Panel:**
  - **Title:** Rate Limit Hits by Type
  - **Query:** `sum by (limit_type) (rate(e11y_rate_limit_hits_total[5m]))`
  - **Visualization:** Time series graph
  - **Description:** Shows which rate limit type (global/per-event/per-context) is hit most frequently.

**2. `e11y_rate_limit_dropped_events_total` (Counter)**
- **Description:** Total number of events dropped due to rate limiting.
- **Labels:**
  - `limit_type`: Type of limit that caused the drop
  - `event_name`: Event type that was dropped
- **Monitoring:**
  ```prometheus
  # Drop rate (events/sec)
  rate(e11y_rate_limit_dropped_events_total[5m])

  # Total dropped in last hour
  sum(increase(e11y_rate_limit_dropped_events_total[1h]))

  # Drop ratio (% of total events)
  rate(e11y_rate_limit_dropped_events_total[5m])
    / rate(e11y_events_tracked_total[5m])
  ```
- **Grafana Panel:**
  - **Title:** Event Drop Rate
  - **Query:** `rate(e11y_rate_limit_dropped_events_total[5m])`
  - **Visualization:** Time series graph with threshold line
  - **Alert Threshold:** > 100 drops/sec (high drop rate)

**3. `e11y_rate_limit_sampled_events_total` (Counter)**
- **Description:** Total number of events sampled (kept) when a limit is exceeded with the `sample` strategy.
- **Labels:**
  - `limit_type`, `event_name`, `sample_rate`
- **Monitoring:**
  ```prometheus
  # Sampling effectiveness
  e11y_rate_limit_sampled_events_total / e11y_rate_limit_hits_total

  # Events saved by sampling (vs full drop)
  increase(e11y_rate_limit_sampled_events_total[1h])
  ```
- **Grafana Panel:**
  - **Title:** Sampled vs Dropped Events
  - **Query:**
    ```prometheus
    sum(rate(e11y_rate_limit_sampled_events_total[5m])) /
      sum(rate(e11y_rate_limit_hits_total[5m]))
    ```
  - **Description:** Shows percentage of events retained during rate limiting (sampling effectiveness).

**4. `e11y_rate_limit_current` (Gauge)**
- **Description:** Current number of events in the rate limit window.
- **Labels:**
  - `limit_type`, `key`
- **Monitoring:**
  ```prometheus
  # Current utilization (% of limit)
  e11y_rate_limit_current / e11y_rate_limit_threshold

  # Max utilization in last hour
  max_over_time(e11y_rate_limit_current[1h])
  ```
- **Grafana Panel:**
  - **Title:** Rate Limit Utilization
  - **Query:**
    ```prometheus
    (e11y_rate_limit_current / e11y_rate_limit_threshold) * 100
    ```
  - **Visualization:** Gauge (0-100%)
  - **Thresholds:**
    - Green: 0-70%
    - Yellow: 70-90%
    - Red: 90-100%

**5. `e11y_rate_limit_threshold` (Gauge)**
- **Description:** Configured rate limit threshold.
- **Labels:**
  - `limit_type`, `key`
- **Monitoring:**
  ```prometheus
  # View configured limits
  e11y_rate_limit_threshold

  # Check if limits need adjustment
  e11y_rate_limit_current / e11y_rate_limit_threshold > 0.8
  ```

**6. `e11y_rate_limit_bypass_total` (Counter)**
- **Description:** Total number of events that bypassed rate limiting (critical events).
- **Labels:**
  - `event_name`, `bypass_reason` (`severity`, `event_type`, `custom`)
- **Monitoring:**
  ```prometheus
  # Bypass rate
  rate(e11y_rate_limit_bypass_total[5m])

  # Which events bypass most?
  topk(10, sum by (event_name) (e11y_rate_limit_bypass_total))
  ```
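
For reference, here is a sketch of how these instruments could be declared with the Yabeda DSL. The group and metric names match the list above, though the gem's actual registration may differ:

```ruby
# Hypothetical sketch of the metric registration for the instruments above.
Yabeda.configure do
  group :e11y_internal do
    counter :rate_limit_hits_total,
            comment: 'Times a rate limit was hit',
            tags: %i[limit_type event_name key strategy]
    counter :rate_limit_dropped_events_total,
            comment: 'Events dropped by rate limiting',
            tags: %i[limit_type event_name]
    counter :rate_limit_sampled_events_total,
            comment: 'Events kept by the :sample strategy',
            tags: %i[limit_type event_name sample_rate]
    gauge :rate_limit_current,
          comment: 'Events currently in the window',
          tags: %i[limit_type key]
    gauge :rate_limit_threshold,
          comment: 'Configured limit',
          tags: %i[limit_type key]
    counter :rate_limit_bypass_total,
            comment: 'Events that bypassed rate limiting',
            tags: %i[event_name bypass_reason]
  end
end
```
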
|
|
1324
|
+
|
|
1325
|
+

---

### Monitoring Dashboard

**Grafana Dashboard: E11y Rate Limiting**

```yaml
# dashboard.json structure
{
  "title": "E11y Rate Limiting",
  "panels": [
    {
      "title": "Rate Limit Hits (by type)",
      "query": "sum by (limit_type) (rate(e11y_rate_limit_hits_total[5m]))",
      "type": "graph"
    },
    {
      "title": "Drop Rate",
      "query": "rate(e11y_rate_limit_dropped_events_total[5m])",
      "type": "graph",
      "alert": {
        "threshold": 100,
        "severity": "warning"
      }
    },
    {
      "title": "Top Rate-Limited Events",
      "query": "topk(10, sum by (event_name) (e11y_rate_limit_hits_total))",
      "type": "table"
    },
    {
      "title": "Global Limit Utilization",
      "query": "(e11y_rate_limit_current{limit_type='global'} / e11y_rate_limit_threshold{limit_type='global'}) * 100",
      "type": "gauge",
      "thresholds": [70, 90]
    },
    {
      "title": "Per-Context Abuse Detection",
      "query": "topk(10, sum by (key) (e11y_rate_limit_hits_total{limit_type='per_context'}))",
      "type": "table"
    }
  ]
}
```

---

### Alerting Thresholds

| Metric | Threshold | Severity | Rationale |
|--------|-----------|----------|-----------|
| **Rate limit hits** | > 10/sec | Warning | Frequent rate limiting indicates high load or attack |
| **Drop rate** | > 100/sec | Critical | High drop rate means observability loss |
| **Global utilization** | > 80% | Warning | Approaching global limit, may need increase |
| **Global utilization** | > 95% | Critical | Nearly at limit, immediate action needed |
| **Per-event hits** | > 50/min | Warning | Specific event flooding (retry storm, bug) |
| **Per-context hits** | > 100/min | Warning | Single user/IP abusing system |

---

### Prometheus Alerts

```yaml
# config/prometheus/alerts.yml
groups:
  - name: e11y_rate_limiting
    rules:
      # Alert on frequent rate limiting
      - alert: E11yRateLimitHit
        expr: rate(e11y_rate_limit_hits_total[5m]) > 10
        for: 2m
        annotations:
          summary: "Rate limit hit frequently ({{ $value }} hits/sec)"
          description: "Check for retry storms or attacks"

      # Alert on high drop rate
      - alert: E11yHighDropRate
        expr: rate(e11y_rate_limit_dropped_events_total[5m]) > 100
        for: 1m
        annotations:
          summary: "High event drop rate ({{ $value }} events/sec)"
          description: "Increase limits or investigate flood source"

      # Alert when the global limit is approached
      - alert: E11yGlobalLimitApproached
        expr: |
          e11y_rate_limit_current{limit_type="global"}
            / e11y_rate_limit_threshold{limit_type="global"} > 0.8
        for: 1m
        annotations:
          summary: "Global rate limit at {{ $value | humanizePercentage }}"
```

---

## 💻 Usage Examples

### Example 1: Retry Storm Protection

```ruby
# app/services/payment_processor.rb
class PaymentProcessor
  MAX_RETRIES = 3

  def process(order)
    Events::PaymentAttempt.track(order_id: order.id)

    begin
      result = PaymentGateway.charge(order)
      Events::PaymentSuccess.track(order_id: order.id, severity: :success)
      result
    rescue PaymentGateway::TemporaryError => e
      retry_with_rate_limit(order, e)
    end
  end

  private

  def retry_with_rate_limit(order, error)
    MAX_RETRIES.times do |attempt|
      # Track the retry (rate limited!)
      Events::PaymentRetry.track(
        order_id: order.id,
        attempt: attempt + 1,
        error: error.message
      )

      sleep(2**attempt) # Exponential backoff

      begin
        return PaymentGateway.charge(order)
      rescue PaymentGateway::TemporaryError => e
        error = e
      end
    end

    # All retries failed
    Events::PaymentFailed.track(
      order_id: order.id,
      error: error.message,
      severity: :error
    )

    raise error
  end
end

# Rate limiting config:
E11y.configure do |config|
  config.rate_limiting do
    # Limit retries to 100/min globally
    per_event 'payment.retry',
              limit: 100,
              window: 1.minute,
              on_exceeded: :sample,
              sample_rate: 0.1
  end
end

# Result:
# - Normal operation: all retries tracked
# - Gateway outage: first 100 retries/min tracked in full, overflow sampled at 10%
# - Observability maintained during the incident ✅
```
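
The return value of `track` can also be inspected when a caller cares whether an event was actually delivered. A small sketch, run inside `PaymentProcessor` above; the `rate_limited?` predicate mirrors the `be_rate_limited` matcher used in the Testing section below, so treat it as an assumption about the result API:

```ruby
# Hypothetical: inspecting a track result during a rate-limited burst.
result = Events::PaymentRetry.track(order_id: order.id, attempt: 1, error: 'timeout')

if result.rate_limited?
  # The hit was counted in e11y_rate_limit_hits_total, but the event
  # itself was not delivered; business logic continues unaffected.
  Rails.logger.debug('payment.retry suppressed by rate limiter')
end
```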

---

### Example 2: Login Failure Protection

```ruby
# app/controllers/sessions_controller.rb
class SessionsController < ApplicationController
  def create
    user = User.find_by(email: params[:email])

    if user&.authenticate(params[:password])
      # Success
      Events::UserLoggedIn.track(
        user_id: user.id,
        ip_address: request.remote_ip,
        severity: :success
      )

      session[:user_id] = user.id
      redirect_to root_path
    else
      # Failure (rate limited per IP)
      Events::LoginFailed.track(
        email: params[:email], # Redacted by the PII filter
        ip_address: request.remote_ip,
        reason: 'invalid_credentials',
        severity: :warn
      )

      flash[:error] = 'Invalid credentials'
      render :new
    end
  end
end

# Rate limiting config:
E11y.configure do |config|
  config.rate_limiting do
    # Limit login failures per IP
    per_context :ip_address,
                limit: 50,
                window: 5.minutes,
                on_exceeded: :drop

    # Also limit per event
    per_event 'login.failed',
              limit: 200,
              window: 1.minute,
              on_exceeded: :sample,
              sample_rate: 0.2
  end
end

# Result:
# - Brute-force attack: max 50 events per IP per 5 minutes
# - Global flood: 200 login.failed events/min tracked in full, then 20% sampled
# - Observability maintained; attacker traffic cannot flood event storage ✅
```
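
How the per-context bucket is derived deserves a word: `per_context :ip_address` groups events by the value of that attribute, so each distinct IP gets its own window. An illustrative sketch of the bucketing (the key format and `event.payload` as an attributes hash are assumptions, not the gem's internals):

```ruby
# Illustrative: one sliding-window counter per distinct attribute value.
def per_context_key(event, attribute)
  "e11y:rate_limit:per_context:#{attribute}:#{event.payload[attribute]}"
end

# per_context_key(event, :ip_address)
# # => "e11y:rate_limit:per_context:ip_address:203.0.113.7"
```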

---

## 🧪 Testing

```ruby
# spec/e11y/rate_limiting_spec.rb
RSpec.describe 'E11y Rate Limiting' do
  before do
    E11y.configure do |config|
      config.rate_limiting do
        global limit: 100, window: 1.minute
        per_event 'test.event', limit: 10, window: 1.minute
      end
    end
  end

  describe 'global rate limiting' do
    # OtherEvent avoids the stricter per-event limit on 'test.event',
    # so only the global limit applies in these examples.
    it 'allows events under limit' do
      50.times do
        result = Events::OtherEvent.track(foo: 'bar')
        expect(result).to be_success
      end
    end

    it 'rate limits after threshold' do
      # Track 100 events (at the global limit)
      100.times { Events::OtherEvent.track(foo: 'bar') }

      # The 101st event should be rate limited
      result = Events::OtherEvent.track(foo: 'bar')
      expect(result).to be_rate_limited

      # Metric incremented
      metric = Yabeda.e11y_internal.rate_limit_hits_total
      expect(metric.values[{ limit_type: 'global' }]).to be > 0
    end
  end

  describe 'per-event rate limiting' do
    it 'rate limits specific event type' do
      # Track 10 test.event (at the per-event limit)
      10.times { Events::TestEvent.track(foo: 'bar') }

      # The 11th should be rate limited
      result = Events::TestEvent.track(foo: 'bar')
      expect(result).to be_rate_limited

      # But other events still work
      result = Events::OtherEvent.track(baz: 'qux')
      expect(result).to be_success
    end
  end

  describe 'bypass rules' do
    before do
      E11y.configure do |config|
        config.rate_limiting do
          global limit: 10, window: 1.minute
          bypass_for severities: [:fatal]
        end
      end
    end

    it 'bypasses rate limiting for critical events' do
      # Fill up the limit
      10.times { Events::TestEvent.track(severity: :info) }

      # A fatal event should bypass
      result = Events::CriticalError.track(severity: :fatal)
      expect(result).to be_success # Not rate limited!
    end
  end
end
```
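
Rate-limit windows are stateful, so counts leak across examples unless they are reset. A sketch of a spec helper; `E11y.reset!` is an assumed hook, guarded in case the gem names it differently:

```ruby
# spec/support/e11y.rb - hypothetical reset between examples.
RSpec.configure do |config|
  config.before(:each) do
    E11y.reset! if E11y.respond_to?(:reset!)
  end
end
```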

---

## 💡 Best Practices

### ✅ DO

**1. Set conservative limits initially**

```ruby
# ✅ GOOD: Start low, increase if needed
global limit: 5_000, window: 1.minute
```

**2. Use per-context limits for abuse prevention**

```ruby
# ✅ GOOD: Prevent a single user from flooding
per_context :user_id, limit: 1_000, window: 1.minute
per_context :ip_address, limit: 500, window: 1.minute
```

**3. Always bypass critical events**

```ruby
# ✅ GOOD: Never rate limit security/system events
bypass_for event_types: ['security.alert', 'system.critical']
bypass_for severities: [:fatal]
```

**4. Monitor rate limit hits**

```ruby
# ✅ GOOD: Alert on frequent rate limiting, e.g.
#   rate(e11y_rate_limit_hits_total[5m]) > 10
# (see the Prometheus Alerts section above)
```

---

### ❌ DON'T

**1. Don't set limits too high (it defeats the purpose)**

```ruby
# ❌ BAD: Limit too high to be effective
global limit: 1_000_000, window: 1.minute # Useless!
```

**2. Don't rate limit critical events**

```ruby
# ❌ BAD: Rate limiting errors
per_event 'system.error', limit: 10 # You WANT to know about ALL errors!
```

**3. Don't ignore rate limit alerts**

```ruby
# ❌ BAD: Rate limits hitting frequently and nobody looking
# → Investigate! It could be an attack or a misconfiguration
```

---

## 🔒 Validations (NEW - v1.1)

> **🎯 Pattern:** Validate rate limiting configuration at class load time.

### Rate Limit Value Validation

**Problem:** Invalid rate limit values → runtime errors.

**Solution:** Validate that the rate limit is a positive integer:

```ruby
# Gem implementation (automatic):
def self.rate_limit(limit, window: 1.minute)
  unless limit.is_a?(Integer) && limit > 0
    raise ArgumentError, "rate_limit must be positive integer, got: #{limit.inspect}"
  end
  unless window.is_a?(ActiveSupport::Duration) && window > 0
    raise ArgumentError, "window must be positive duration, got: #{window.inspect}"
  end

  self._rate_limit = limit
  self._rate_limit_window = window
end

# Result:
class Events::ApiRequest < E11y::Event::Base
  rate_limit -100 # ← ERROR: "rate_limit must be positive integer, got: -100"
end
```

### On Exceeded Strategy Validation

**Problem:** Invalid `on_exceeded` strategies → silent failures.

**Solution:** Validate the strategy against a whitelist:

```ruby
# Gem implementation (automatic):
VALID_ON_EXCEEDED = [:drop, :sample, :throttle].freeze

def self.on_exceeded(strategy)
  unless VALID_ON_EXCEEDED.include?(strategy)
    raise ArgumentError, "Invalid on_exceeded: #{strategy}. Valid: #{VALID_ON_EXCEEDED.join(', ')}"
  end

  self._on_exceeded = strategy
end

# Result:
class Events::ApiRequest < E11y::Event::Base
  on_exceeded :backpressure # ← ERROR: "Invalid on_exceeded: backpressure. Valid: drop, sample, throttle"
end
```

### Audit Event Rate Limiting Validation (LOCKED)

**Problem:** Attempting to rate limit audit events → compliance violations.

**Solution:** Lock rate limiting for audit events, so any attempt to configure it raises:

```ruby
# Gem implementation (automatic): the guard lives in rate_limit itself,
# so declaring a limit on an audit event fails at class load time.
def self.rate_limit(limit, window: 1.minute)
  if _audit_event
    raise ArgumentError, "Cannot enable rate limiting for audit events! Audit events must never be rate limited."
  end

  self._rate_limit = limit
  self._rate_limit_window = window
end

# Result:
class Events::UserDeleted < E11y::Event::Base
  audit_event true
  rate_limit 1000 # ← ERROR: "Cannot enable rate limiting for audit events!"
end
```
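
The compliant configuration is simply to leave rate limiting off for audit events; delivery is then unconditional:

```ruby
# ✅ GOOD: audit events carry no rate_limit at all
class Events::UserDeleted < E11y::Event::Base
  audit_event true
  # No rate_limit call: audit events are always delivered.
end
```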

---

## 🌍 Environment-Specific Rate Limiting (NEW - v1.1)

> **🎯 Pattern:** Different rate limits per environment.

### Example 1: Higher Limits in Production

```ruby
class Events::ApiRequest < E11y::Event::Base
  schema do
    required(:endpoint).filled(:string)
    required(:status).filled(:integer)
  end

  # Environment-specific rate limits
  rate_limit case Rails.env
             when 'production' then 10_000
             when 'staging' then 1_000
             else 100 # Dev/test
             end

  on_exceeded Rails.env.production? ? :sample : :drop
  sample_rate 0.1 if Rails.env.production?
end
```

### Example 2: Debug Events (Strict in Prod, Loose in Dev)

```ruby
class Events::DebugQuery < E11y::Event::Base
  schema do
    required(:query).filled(:string)
    required(:duration_ms).filled(:integer)
  end

  # Strict rate limiting in production
  rate_limit Rails.env.production? ? 100 : 10_000
  on_exceeded :drop # Always drop excess debug logs
end
```

### Example 3: Feature Flag for Rate Limiting

```ruby
class Events::ExperimentalFeature < E11y::Event::Base
  schema do
    required(:feature_name).filled(:string)
  end

  # Enable rate limiting only when the feature flag is on
  if ENV['ENABLE_RATE_LIMITING'] == 'true'
    rate_limit 1_000
    on_exceeded :sample
    sample_rate 0.1
  end
end
```

Note that these DSL calls run once, at class load time: flipping `ENV['ENABLE_RATE_LIMITING']` (or `Rails.env`) in a running process has no effect until the class is reloaded.

---

## 📊 Precedence Rules for Rate Limiting (NEW - v1.1)

> **🎯 Pattern:** Rate limiting configuration precedence (the most specific setting wins).

### Precedence Order (Highest to Lowest)

```
1. Event-level explicit config (highest priority)
   ↓
2. Preset module config
   ↓
3. Base class config (inheritance)
   ↓
4. Convention-based defaults (1000/sec)
   ↓
5. Global config (lowest priority)
```

### Example: Mixing Inheritance + Presets for Rate Limiting

```ruby
# Global config (lowest priority)
E11y.configure do |config|
  config.rate_limiting do
    global limit: 10_000, window: 1.minute # Default for all events
    on_exceeded :drop
  end
end

# Base class (medium priority)
class Events::BaseDebugEvent < E11y::Event::Base
  severity :debug
  rate_limit 100, window: 1.minute # Overrides global (stricter)
  on_exceeded :drop
end

# Preset module (higher priority)
module E11y::Presets::HighValueEvent
  extend ActiveSupport::Concern

  included do
    rate_limit 10_000, window: 1.minute # Overrides base (looser)
    on_exceeded :throttle # Never drop high-value events
  end
end

# Event (highest priority)
class Events::CriticalPayment < Events::BaseDebugEvent
  include E11y::Presets::HighValueEvent

  rate_limit 50_000, window: 1.minute # Overrides preset (even looser)

  # Final config:
  # - severity: :debug (from base)
  # - rate_limit: 50_000/min (event-level override)
  # - on_exceeded: :throttle (from preset)
end
```

### Precedence Rules Table

Illustrative values (the `sample_rate` row assumes a preset that sets one):

| Config | Global | Convention | Base Class | Preset | Event-Level | Winner |
|--------|--------|------------|------------|--------|-------------|--------|
| `rate_limit` | `10_000` | `1_000` | `100` | `10_000` | `50_000` | **`50_000`** (event) |
| `on_exceeded` | `:drop` | - | `:drop` | `:throttle` | - | **`:throttle`** (preset) |
| `sample_rate` | `0.1` | - | - | `0.5` | - | **`0.5`** (preset) |
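
A simple way to internalize the table: each setting is resolved independently, most specific layer first, and the first explicit value wins. An illustrative resolver sketch (a mental model, not the gem's implementation; all method names here are assumed):

```ruby
# Illustrative only: resolve one setting across all five layers.
def resolve(setting)
  [
    event_level_config[setting],      # 1. explicit on the event class
    preset_config[setting],           # 2. included preset module
    base_class_config[setting],       # 3. inherited from the superclass
    convention_default_for(setting),  # 4. convention (e.g. rate_limit 1000)
    global_config[setting]            # 5. E11y.configure block
  ].compact.first
end
```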

### Convention-Based Defaults

**Convention:** If no `rate_limit` is specified, the default of `1000/sec` applies:

```ruby
class Events::ApiRequest < E11y::Event::Base
  schema do
    required(:endpoint).filled(:string)
  end
  # ← Auto: rate_limit = 1000 (convention!)
end
```

---

## 📚 Related Use Cases

- **[UC-002: Business Event Tracking](./UC-002-business-event-tracking.md)** - Event definitions
- **[UC-007: PII Filtering](./UC-007-pii-filtering.md)** - Prevent PII leaks
- **[UC-013: High Cardinality Protection](./UC-013-high-cardinality-protection.md)** - Cost control

---

**Document Version:** 1.1 (Unified DSL)
**Last Updated:** January 16, 2026
**Status:** ✅ Complete - Consistent with DSL-SPECIFICATION.md v1.1.0