e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,1940 @@
|
|
|
1
|
+
# UC-014: Adaptive Sampling
|
|
2
|
+
|
|
3
|
+
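**Status:** Partially Implemented (Error-Based sampling 2026-01-19; Load-Based, Value-Based and Stratified sampling 2026-01-20; the `adaptive_sampling` DSL block is still a placeholder)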
|
|
4
|
+
**Complexity:** Advanced
|
|
5
|
+
**Setup Time:** 45-60 minutes
|
|
6
|
+
**Target Users:** SRE, DevOps, Engineering Managers
|
|
7
|
+
|
|
8
|
+
**Implementation Status:**
|
|
9
|
+
- ✅ **Basic Sampling** - `E11y::Middleware::Sampling` with trace-aware logic (L2.7)
|
|
10
|
+
- ✅ **Event-level DSL** - `sample_rate` and `adaptive_sampling` in `Event::Base`
|
|
11
|
+
- ✅ **Pipeline Integration** - Sampling middleware in default pipeline
|
|
12
|
+
- ✅ **Error-Based Adaptive** - 100% sampling during error spikes (FEAT-4838)
|
|
13
|
+
- ✅ **Load-Based Adaptive** - Tiered sampling (100%/50%/10%/1%) based on load (FEAT-4842)
|
|
14
|
+
- ✅ **Value-Based Sampling** - Event DSL for sampling by payload values (FEAT-4846)
|
|
15
|
+
- ✅ **Stratified Sampling** - SLO-accurate sampling with correction (C11, FEAT-4850)
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 📋 Overview
|
|
20
|
+
|
|
21
|
+
### Problem Statement
|
|
22
|
+
|
|
23
|
+
**The resource waste problem:**
|
|
24
|
+
```ruby
|
|
25
|
+
# ❌ FIXED SAMPLING: Wastes resources, misses important events
|
|
26
|
+
E11y.configure do |config|
|
|
27
|
+
config.sampling do
|
|
28
|
+
# Fixed 10% sampling for ALL events
|
|
29
|
+
sample_rate 0.1 # 90% dropped!
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Problems during incidents:
|
|
34
|
+
# 09:00 AM: Normal load (1k events/sec)
|
|
35
|
+
# → 10% sampling = 100 events/sec tracked (OK)
|
|
36
|
+
#
|
|
37
|
+
# 09:30 AM: Error spike! (100k errors/sec)
|
|
38
|
+
# → 10% sampling = 10k events/sec tracked
|
|
39
|
+
# → But we need MORE samples during errors, not same rate!
|
|
40
|
+
#
|
|
41
|
+
# Result:
|
|
42
|
+
# - Wasted capacity during normal times (could track more)
|
|
43
|
+
# - Insufficient data during incidents (need more samples)
|
|
44
|
+
# - No signal/noise optimization
|
|
45
|
+
# - Fixed cost regardless of load
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### E11y Solution
|
|
49
|
+
|
|
50
|
+
**Dynamic sampling adapts to conditions:**
|
|
51
|
+
```ruby
|
|
52
|
+
# ✅ ADAPTIVE SAMPLING: Smart resource allocation
|
|
53
|
+
E11y.configure do |config|
|
|
54
|
+
config.adaptive_sampling do
|
|
55
|
+
# Adjust sampling based on multiple factors
|
|
56
|
+
enabled true
|
|
57
|
+
|
|
58
|
+
# Base sampling rate (adjusted dynamically)
|
|
59
|
+
base_sample_rate 0.1 # 10% default
|
|
60
|
+
|
|
61
|
+
# Increase sampling during errors
|
|
62
|
+
on_error_spike do
|
|
63
|
+
sample_rate 1.0 # 100% during errors!
|
|
64
|
+
duration 5.minutes
|
|
65
|
+
error_rate_threshold 0.05 # 5% error rate triggers
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Decrease sampling during high load
|
|
69
|
+
on_high_load do
|
|
70
|
+
sample_rate 0.01 # 1% during overload
|
|
71
|
+
load_threshold 50_000 # events/sec
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Always keep critical events (never drop them)
|
|
75
|
+
always_sample severities: [:error, :fatal],
|
|
76
|
+
event_patterns: ['payment.*', 'security.*']
|
|
77
|
+
|
|
78
|
+
# Sample by value (high-value events)
|
|
79
|
+
sample_by_value do
|
|
80
|
+
field :amount
|
|
81
|
+
threshold 1000 # Always sample >$1000 transactions
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Result:
|
|
87
|
+
# Normal: 10% sampling (1k → 100 events/sec)
|
|
88
|
+
# Error spike: 100% sampling (100k errors → 100k tracked!)
|
|
89
|
+
# High load: 1% sampling (100k → 1k events/sec)
|
|
90
|
+
# Critical: Always 100% (never dropped)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 🚀 Current Implementation (2026-01-19)
|
|
96
|
+
|
|
97
|
+
### Basic Sampling (L2.7) ✅
|
|
98
|
+
|
|
99
|
+
**What's Implemented:**
|
|
100
|
+
|
|
101
|
+
```ruby
|
|
102
|
+
# 1. Event-level sample rate (explicit)
|
|
103
|
+
class HighFrequencyEvent < E11y::Event::Base
|
|
104
|
+
sample_rate 0.01 # 1% sampling
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# 2. Severity-based defaults (automatic)
|
|
108
|
+
class ErrorEvent < E11y::Event::Base
|
|
109
|
+
severity :error # → 100% sampling (SEVERITY_SAMPLE_RATES[:error])
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
class DebugEvent < E11y::Event::Base
|
|
113
|
+
severity :debug # → 1% sampling (SEVERITY_SAMPLE_RATES[:debug])
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# 3. Trace-aware sampling (C05 resolution)
|
|
117
|
+
# All events in same trace_id get same sampling decision
|
|
118
|
+
OrderCreated.track(order_id: 123, trace_id: "abc") # Sampled
|
|
119
|
+
PaymentProcessed.track(order_id: 123, trace_id: "abc") # Also sampled (same trace)
|
|
120
|
+
|
|
121
|
+
# 4. Audit event exemption
|
|
122
|
+
class AuditEvent < E11y::Event::Base
|
|
123
|
+
audit_event true # Never sampled, always processed
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# 5. Middleware configuration
|
|
127
|
+
E11y.configure do |config|
|
|
128
|
+
# Sampling middleware is automatically added to default pipeline
|
|
129
|
+
# Can override with custom config:
|
|
130
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
131
|
+
default_sample_rate: 0.1,
|
|
132
|
+
trace_aware: true
|
|
133
|
+
end
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Error-Based Adaptive Sampling (FEAT-4838) ✅
|
|
137
|
+
|
|
138
|
+
**Implemented (2026-01-19):**
|
|
139
|
+
|
|
140
|
+
```ruby
|
|
141
|
+
# Enable error-based adaptive sampling
|
|
142
|
+
E11y.configure do |config|
|
|
143
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
144
|
+
default_sample_rate: 0.1, # 10% normal sampling
|
|
145
|
+
error_based_adaptive: true, # Enable adaptive sampling
|
|
146
|
+
error_spike_config: {
|
|
147
|
+
window: 60, # 60 seconds sliding window
|
|
148
|
+
absolute_threshold: 100, # 100 errors/min triggers spike
|
|
149
|
+
relative_threshold: 3.0, # 3x normal rate triggers spike
|
|
150
|
+
spike_duration: 300 # Keep 100% sampling for 5 minutes
|
|
151
|
+
}
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# Behavior:
|
|
155
|
+
# Normal conditions: 10% sampling (as configured)
|
|
156
|
+
# Error spike detected: Automatically increases to 100% sampling
|
|
157
|
+
# After spike ends: Returns to 10% after 5 minutes
|
|
158
|
+
|
|
159
|
+
# Example scenario:
|
|
160
|
+
# 09:00 AM: Normal load (10 errors/min)
|
|
161
|
+
# → 10% sampling (100 events/sec tracked)
|
|
162
|
+
#
|
|
163
|
+
# 09:30 AM: Error spike! (150 errors/min > 100 threshold)
|
|
164
|
+
# → 100% sampling (1000 events/sec tracked)
|
|
165
|
+
# → Better data for debugging!
|
|
166
|
+
#
|
|
167
|
+
# 09:35 AM: Errors normalized (10 errors/min)
|
|
168
|
+
# → Still 100% sampling (within spike_duration)
|
|
169
|
+
#
|
|
170
|
+
# 09:40 AM: 5 minutes passed
|
|
171
|
+
# → Back to 10% sampling
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Features:**
|
|
175
|
+
- **Absolute threshold**: Triggers when errors/min exceeds configured value
|
|
176
|
+
- **Relative threshold**: Triggers when error rate is 3x baseline
|
|
177
|
+
- **Baseline tracking**: Exponential moving average of normal error rate
|
|
178
|
+
- **Spike duration**: Maintains elevated sampling for configured period
|
|
179
|
+
- **Non-intrusive**: Errors are tracked automatically, no code changes needed
|
|
180
|
+
|
|
181
|
+
**Precedence (updated):**
|
|
182
|
+
0. **Error spike override** (100% during spike) - FEAT-4838
|
|
183
|
+
1. Explicit `sample_rate` (highest priority when no error spike is active)
|
|
184
|
+
2. Severity-based defaults (`SEVERITY_SAMPLE_RATES`)
|
|
185
|
+
3. Middleware `default_sample_rate` (fallback)
|
|
186
|
+
|
|
187
|
+
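**What's NOT Implemented Yet (as of 2026-01-19 — FEAT-4842, FEAT-4846 and FEAT-4850 have since shipped on 2026-01-20, see the sections below; only the `adaptive_sampling` DSL block remains open):**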
|
|
188
|
+
- ⏳ Load-based adaptive (FEAT-4842)
|
|
189
|
+
- ⏳ Value-based sampling (FEAT-4846)
|
|
190
|
+
- ⏳ Stratified sampling for SLO accuracy (FEAT-4850, C11)
|
|
191
|
+
- ⏳ `adaptive_sampling` DSL block (placeholder only)
|
|
192
|
+
|
|
193
|
+
**See:**
|
|
194
|
+
- Detector: `lib/e11y/sampling/error_spike_detector.rb`
|
|
195
|
+
- Middleware: `lib/e11y/middleware/sampling.rb`
|
|
196
|
+
- Event DSL: `lib/e11y/event/base.rb` (`.sample_rate`, `.resolve_sample_rate`)
|
|
197
|
+
- Tests: `spec/e11y/sampling/error_spike_detector_spec.rb`, `spec/e11y/middleware/sampling_spec.rb`
|
|
198
|
+
|
|
199
|
+
### Load-Based Adaptive Sampling (FEAT-4842) ✅
|
|
200
|
+
|
|
201
|
+
**Implemented (2026-01-20):**
|
|
202
|
+
|
|
203
|
+
```ruby
|
|
204
|
+
# Enable load-based adaptive sampling
|
|
205
|
+
E11y.configure do |config|
|
|
206
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
207
|
+
default_sample_rate: 0.1, # 10% base sampling
|
|
208
|
+
load_based_adaptive: true, # Enable adaptive sampling
|
|
209
|
+
load_monitor_config: {
|
|
210
|
+
window: 60, # 60 seconds sliding window
|
|
211
|
+
normal_threshold: 1_000, # < 1k events/sec = normal
|
|
212
|
+
high_threshold: 10_000, # 10k events/sec = high load
|
|
213
|
+
very_high_threshold: 50_000, # 50k events/sec = very high
|
|
214
|
+
overload_threshold: 100_000 # > 100k events/sec = overload
|
|
215
|
+
}
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
# Tiered sampling behavior:
|
|
219
|
+
# Normal (<1k): 100% sampling (track everything)
|
|
220
|
+
# High (1k-10k): 50% sampling (moderate reduction)
|
|
221
|
+
# Very High (10k-50k): 10% sampling (aggressive reduction)
|
|
222
|
+
# Overload (>100k): 1% sampling (extreme reduction)
|
|
223
|
+
|
|
224
|
+
# Example scenario:
|
|
225
|
+
# 09:00: 500 events/sec → 100% sampling (normal load)
|
|
226
|
+
# 09:30: 15k events/sec → 10% sampling (very high load)
|
|
227
|
+
# 10:00: 120k events/sec → 1% sampling (overload!)
|
|
228
|
+
# 10:30: 2k events/sec → 50% sampling (returning to normal)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**Features:**
|
|
232
|
+
- **Tiered sampling**: 4 load levels with different sampling rates
|
|
233
|
+
- **Sliding window**: Event rate calculated over 60-second window
|
|
234
|
+
- **Thread-safe**: Uses MonitorMixin for concurrent access
|
|
235
|
+
- **Dynamic adjustment**: Sample rate updates in real-time
|
|
236
|
+
|
|
237
|
+
**Integration with other strategies:**
|
|
238
|
+
- Works as a "base rate" that can be further restricted by event-level config
|
|
239
|
+
- Error-based adaptive overrides load-based (100% during spikes)
|
|
240
|
+
- Value-based sampling overrides load-based (100% for high-value events)
|
|
241
|
+
|
|
242
|
+
**See:**
|
|
243
|
+
- Monitor: `lib/e11y/sampling/load_monitor.rb`
|
|
244
|
+
- Tests: `spec/e11y/sampling/load_monitor_spec.rb`, `spec/e11y/middleware/sampling_stress_spec.rb`
|
|
245
|
+
|
|
246
|
+
### Value-Based Sampling (FEAT-4846) ✅
|
|
247
|
+
|
|
248
|
+
**Implemented (2026-01-20):**
|
|
249
|
+
|
|
250
|
+
```ruby
|
|
251
|
+
# Event-level value-based sampling DSL
|
|
252
|
+
class Events::OrderPaid < E11y::Event::Base
|
|
253
|
+
schema do
|
|
254
|
+
required(:order_id).filled(:string)
|
|
255
|
+
required(:amount).filled(:decimal)
|
|
256
|
+
required(:user_segment).filled(:string)
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Strategy 1: Always sample high-value orders
|
|
260
|
+
sample_by_value field: "amount",
|
|
261
|
+
operator: :greater_than,
|
|
262
|
+
threshold: 1000,
|
|
263
|
+
sample_rate: 1.0 # 100% for orders > $1000
|
|
264
|
+
|
|
265
|
+
# Strategy 2: Sample range of orders
|
|
266
|
+
sample_by_value field: "amount",
|
|
267
|
+
operator: :in_range,
|
|
268
|
+
threshold: 100..500,
|
|
269
|
+
sample_rate: 0.5 # 50% for $100-$500 orders
|
|
270
|
+
|
|
271
|
+
# Strategy 3: Sample specific segments
|
|
272
|
+
sample_by_value field: "user_segment",
|
|
273
|
+
operator: :equals,
|
|
274
|
+
threshold: "enterprise",
|
|
275
|
+
sample_rate: 1.0 # 100% for enterprise users
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Usage:
|
|
279
|
+
Events::OrderPaid.track(
|
|
280
|
+
order_id: "123",
|
|
281
|
+
amount: 5000, # > $1000 → Always sampled!
|
|
282
|
+
user_segment: "enterprise" # → Also matches the third rule (Strategy 3: enterprise segment)!
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
Events::OrderPaid.track(
|
|
286
|
+
order_id: "456",
|
|
287
|
+
amount: 250, # In range $100-$500 → 50% chance
|
|
288
|
+
user_segment: "free"
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
Events::OrderPaid.track(
|
|
292
|
+
order_id: "789",
|
|
293
|
+
amount: 50, # < $100 → Falls back to default sampling
|
|
294
|
+
user_segment: "free"
|
|
295
|
+
)
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
**Supported Operators:**
|
|
299
|
+
- `:greater_than` (>): Numeric threshold (e.g., amount > 1000)
|
|
300
|
+
- `:less_than` (<): Numeric threshold (e.g., latency < 100)
|
|
301
|
+
- `:equals` (==): Exact value match (string or numeric)
|
|
302
|
+
- `:in_range`: Range of values (e.g., 100..500)
|
|
303
|
+
|
|
304
|
+
**Features:**
|
|
305
|
+
- **Nested field extraction**: Supports dot notation (e.g., `"order.amount"`)
|
|
306
|
+
- **Type coercion**: Numeric strings automatically converted to floats
|
|
307
|
+
- **Nil handling**: Missing/nil values treated as 0.0
|
|
308
|
+
- **High priority**: Overrides load-based sampling
|
|
309
|
+
|
|
310
|
+
**See:**
|
|
311
|
+
- Extractor: `lib/e11y/sampling/value_extractor.rb`
|
|
312
|
+
- Config: `lib/e11y/event/value_sampling_config.rb`
|
|
313
|
+
- Event DSL: `lib/e11y/event/base.rb` (`.sample_by_value`)
|
|
314
|
+
- Tests: `spec/e11y/sampling/value_extractor_spec.rb`, `spec/e11y/middleware/sampling_value_based_spec.rb`
|
|
315
|
+
|
|
316
|
+
### Stratified Sampling for SLO Accuracy (FEAT-4850, C11 Resolution) ✅
|
|
317
|
+
|
|
318
|
+
**Implemented (2026-01-20):**
|
|
319
|
+
|
|
320
|
+
```ruby
|
|
321
|
+
# Automatic stratified sampling (no config needed!)
|
|
322
|
+
# E11y automatically records sample rates per severity stratum
|
|
323
|
+
|
|
324
|
+
# Example: SLO calculation with sampling correction
|
|
325
|
+
module E11y
|
|
326
|
+
module SLO
|
|
327
|
+
class Tracker
|
|
328
|
+
# Calculates success rate with sampling correction
|
|
329
|
+
def track_http_request_success(controller:, action:, status:, duration_ms:, sample_rate: 1.0)
|
|
330
|
+
# Record sampled event
|
|
331
|
+
E11y::Metrics.increment("e11y.slo.http.requests",
|
|
332
|
+
controller: controller,
|
|
333
|
+
action: action,
|
|
334
|
+
status_class: "#{status / 100}xx",
|
|
335
|
+
sample_rate: sample_rate # ← Metadata for correction
|
|
336
|
+
)
|
|
337
|
+
|
|
338
|
+
# Stratified tracker records for later correction
|
|
339
|
+
@stratified_tracker.record_sample(
|
|
340
|
+
severity: status >= 500 ? :error : :info,
|
|
341
|
+
sample_rate: sample_rate
|
|
342
|
+
)
|
|
343
|
+
end
|
|
344
|
+
|
|
345
|
+
# Calculate corrected success rate
|
|
346
|
+
def http_success_rate(window: 5.minutes)
|
|
347
|
+
# Get correction factors per severity
|
|
348
|
+
correction = @stratified_tracker.sampling_correction(:info)
|
|
349
|
+
|
|
350
|
+
# Apply correction to raw metrics
|
|
351
|
+
raw_success_rate = Yabeda.e11y_slo_http_requests
|
|
352
|
+
.with_labels(status_class: "2xx")
|
|
353
|
+
.get
|
|
354
|
+
|
|
355
|
+
corrected_success_rate = raw_success_rate * correction
|
|
356
|
+
corrected_success_rate
|
|
357
|
+
end
|
|
358
|
+
end
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
|
|
362
|
+
# Scenario: 1000 requests (950 success, 50 errors)
|
|
363
|
+
# Stratified sampling: errors 100%, success 10%
|
|
364
|
+
# Events kept: 50 + 95 = 145 (85.5% cost savings!)
|
|
365
|
+
|
|
366
|
+
# Without correction:
|
|
367
|
+
# Observed success rate: 95/145 = 65.5% ❌ WRONG
|
|
368
|
+
|
|
369
|
+
# With correction:
|
|
370
|
+
# Corrected success: 95 / 0.1 = 950
|
|
371
|
+
# Corrected errors: 50 / 1.0 = 50
|
|
372
|
+
# Corrected success rate: 950 / 1000 = 95.0% ✅ ACCURATE
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
**Features:**
|
|
376
|
+
- **Automatic tracking**: No config needed, records sample rates per event
|
|
377
|
+
- **Per-severity correction**: Different correction factors for errors vs success
|
|
378
|
+
- **Integration with SLO**: Seamlessly corrects SLO metrics
|
|
379
|
+
- **< 5% error margin**: Proven accuracy in integration tests
|
|
380
|
+
|
|
381
|
+
**See:**
|
|
382
|
+
- Tracker: `lib/e11y/sampling/stratified_tracker.rb`
|
|
383
|
+
- SLO integration: `lib/e11y/slo/tracker.rb`
|
|
384
|
+
- Tests: `spec/e11y/sampling/stratified_tracker_spec.rb`, `spec/e11y/slo/stratified_sampling_integration_spec.rb`
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
## 🎯 Adaptive Sampling Strategies (Conceptual - Future)
|
|
389
|
+
|
|
390
|
+
---
|
|
391
|
+
|
|
392
|
+
### Event-Level Sampling Configuration (NEW - v1.1)
|
|
393
|
+
|
|
394
|
+
> **🎯 CONTRADICTION_01 Resolution:** Move sampling config from global initializer to event classes.
|
|
395
|
+
|
|
396
|
+
**Event-level sampling DSL:**
|
|
397
|
+
|
|
398
|
+
```ruby
|
|
399
|
+
# app/events/order_created.rb
|
|
400
|
+
module Events
|
|
401
|
+
class OrderCreated < E11y::Event::Base
|
|
402
|
+
schema do
|
|
403
|
+
required(:order_id).filled(:string)
|
|
404
|
+
required(:amount).filled(:decimal)
|
|
405
|
+
end
|
|
406
|
+
|
|
407
|
+
# ✨ Event-level sampling (right next to schema!)
|
|
408
|
+
sample_rate 0.1 # 10% sampling
|
|
409
|
+
|
|
410
|
+
# Or adaptive sampling:
|
|
411
|
+
adaptive_sampling do
|
|
412
|
+
base_rate 0.1 # 10% default
|
|
413
|
+
|
|
414
|
+
# Increase for high-value orders
|
|
415
|
+
sample_by_value do
|
|
416
|
+
field :amount
|
|
417
|
+
threshold 1000 # Always sample >$1000
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
# Increase during errors
|
|
421
|
+
on_error_spike do
|
|
422
|
+
sample_rate 1.0 # 100% during errors
|
|
423
|
+
duration 5.minutes
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
end
|
|
427
|
+
end
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
**Inheritance for sampling:**
|
|
431
|
+
|
|
432
|
+
```ruby
|
|
433
|
+
# Base class with common sampling
|
|
434
|
+
module Events
|
|
435
|
+
class BasePaymentEvent < E11y::Event::Base
|
|
436
|
+
# Always keep payments (high-value): never drop them
|
|
437
|
+
sample_rate 1.0 # 100%
|
|
438
|
+
|
|
439
|
+
# Or adaptive:
|
|
440
|
+
adaptive_sampling do
|
|
441
|
+
base_rate 1.0 # Always sample by default
|
|
442
|
+
|
|
443
|
+
# But reduce during extreme load
|
|
444
|
+
on_high_load do
|
|
445
|
+
sample_rate 0.5 # 50% during overload
|
|
446
|
+
load_threshold 100_000 # events/sec
|
|
447
|
+
end
|
|
448
|
+
end
|
|
449
|
+
end
|
|
450
|
+
end
|
|
451
|
+
|
|
452
|
+
# Inherit from base
|
|
453
|
+
class Events::PaymentSucceeded < Events::BasePaymentEvent
|
|
454
|
+
schema do; required(:transaction_id).filled(:string); end
|
|
455
|
+
# ← Inherits: sample_rate 1.0 + adaptive rules
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
class Events::PaymentFailed < Events::BasePaymentEvent
|
|
459
|
+
# Override: ALWAYS sample errors (even during overload)
|
|
460
|
+
sample_rate 1.0
|
|
461
|
+
adaptive_sampling enabled: false # ← Disable adaptive (keyword arg!)
|
|
462
|
+
end
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
**Preset modules for sampling:**
|
|
466
|
+
|
|
467
|
+
```ruby
|
|
468
|
+
# lib/e11y/presets/high_value_event.rb
|
|
469
|
+
module E11y
|
|
470
|
+
module Presets
|
|
471
|
+
module HighValueEvent
|
|
472
|
+
extend ActiveSupport::Concern
|
|
473
|
+
included do
|
|
474
|
+
sample_rate 1.0 # Always keep (100%, never dropped)
|
|
475
|
+
|
|
476
|
+
adaptive_sampling do
|
|
477
|
+
# Only reduce during extreme load
|
|
478
|
+
on_high_load do
|
|
479
|
+
sample_rate 0.5 # 50%
|
|
480
|
+
load_threshold 100_000
|
|
481
|
+
end
|
|
482
|
+
end
|
|
483
|
+
end
|
|
484
|
+
end
|
|
485
|
+
|
|
486
|
+
module DebugEvent
|
|
487
|
+
extend ActiveSupport::Concern
|
|
488
|
+
included do
|
|
489
|
+
sample_rate 0.01 # 1% sampling
|
|
490
|
+
|
|
491
|
+
adaptive_sampling do
|
|
492
|
+
# Increase during errors
|
|
493
|
+
on_error_spike do
|
|
494
|
+
sample_rate 0.1 # 10% during errors
|
|
495
|
+
end
|
|
496
|
+
end
|
|
497
|
+
end
|
|
498
|
+
end
|
|
499
|
+
end
|
|
500
|
+
end
|
|
501
|
+
|
|
502
|
+
# Usage:
|
|
503
|
+
class Events::PaymentProcessed < E11y::Event::Base
|
|
504
|
+
include E11y::Presets::HighValueEvent # ← Sampling inherited!
|
|
505
|
+
schema do; required(:transaction_id).filled(:string); end
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
class Events::DebugQuery < E11y::Event::Base
|
|
509
|
+
include E11y::Presets::DebugEvent # ← Sampling inherited!
|
|
510
|
+
schema do; required(:query).filled(:string); end
|
|
511
|
+
end
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
**Conventions for sampling (sensible defaults):**
|
|
515
|
+
|
|
516
|
+
```ruby
|
|
517
|
+
# Convention: Severity → Sample Rate
|
|
518
|
+
# :error/:fatal → 1.0 (100%, never drop errors!)
|
|
519
|
+
# :warn → 0.5 (50%)
|
|
520
|
+
# :success/:info → 0.1 (10%)
|
|
521
|
+
# :debug → 0.01 (1%)
|
|
522
|
+
|
|
523
|
+
# Zero-config event (uses conventions):
|
|
524
|
+
class Events::OrderCreated < E11y::Event::Base
|
|
525
|
+
severity :success
|
|
526
|
+
schema do; required(:order_id).filled(:string); end
|
|
527
|
+
# ← Auto: sample_rate = 0.1 (10%, from severity!)
|
|
528
|
+
end
|
|
529
|
+
|
|
530
|
+
# Override convention:
|
|
531
|
+
class Events::OrderCreated < E11y::Event::Base
|
|
532
|
+
severity :success
|
|
533
|
+
sample_rate 1.0 # ← Override: never drop orders (always 100%)
|
|
534
|
+
schema do; required(:order_id).filled(:string); end
|
|
535
|
+
end
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
**Precedence (event-level overrides global):**
|
|
539
|
+
|
|
540
|
+
```ruby
|
|
541
|
+
# Global config (infrastructure):
|
|
542
|
+
E11y.configure do |config|
|
|
543
|
+
config.sampling do
|
|
544
|
+
base_sample_rate 0.1 # 10% default
|
|
545
|
+
|
|
546
|
+
adaptive_sampling do
|
|
547
|
+
on_error_spike do
|
|
548
|
+
sample_rate 1.0
|
|
549
|
+
end
|
|
550
|
+
end
|
|
551
|
+
end
|
|
552
|
+
end
|
|
553
|
+
|
|
554
|
+
# Event-level config (overrides global):
|
|
555
|
+
class Events::CriticalError < E11y::Event::Base
|
|
556
|
+
sample_rate 1.0 # ← Override: always 100% (not 10%)
|
|
557
|
+
adaptive_sampling enabled: false # ← Disable adaptive (keyword arg!)
|
|
558
|
+
end
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
**Benefits:**
|
|
562
|
+
- ✅ Locality of behavior (sampling next to schema)
|
|
563
|
+
- ✅ DRY via inheritance/presets
|
|
564
|
+
- ✅ Sensible defaults (90% of events need zero config)
|
|
565
|
+
- ✅ Easy to override when needed
|
|
566
|
+
|
|
567
|
+
---
|
|
568
|
+
|
|
569
|
+
### Strategy 1: Error-Based Sampling
|
|
570
|
+
|
|
571
|
+
**Increase sampling during error spikes:**
|
|
572
|
+
```ruby
|
|
573
|
+
E11y.configure do |config|
|
|
574
|
+
config.adaptive_sampling do
|
|
575
|
+
# Detect error rate increase
|
|
576
|
+
error_spike_detection do
|
|
577
|
+
enabled true
|
|
578
|
+
|
|
579
|
+
# Calculate error rate over sliding window
|
|
580
|
+
window 1.minute
|
|
581
|
+
|
|
582
|
+
# Thresholds (absolute + relative)
|
|
583
|
+
absolute_threshold 100 # >100 errors/min → spike
|
|
584
|
+
relative_threshold 3.0 # 3x normal rate → spike
|
|
585
|
+
|
|
586
|
+
# Action: increase sampling
|
|
587
|
+
on_spike do
|
|
588
|
+
sample_rate 1.0 # 100%
|
|
589
|
+
duration 5.minutes
|
|
590
|
+
exponential_decay true # Gradually return to normal
|
|
591
|
+
end
|
|
592
|
+
|
|
593
|
+
# Track baseline error rate
|
|
594
|
+
baseline_window 1.hour
|
|
595
|
+
baseline_calculation :p95 # Use p95 as baseline
|
|
596
|
+
end
|
|
597
|
+
end
|
|
598
|
+
end
|
|
599
|
+
|
|
600
|
+
# How it works:
|
|
601
|
+
# 1. Track error rate: 10 errors/min (baseline)
|
|
602
|
+
# 2. Sudden spike: 150 errors/min (15x increase!)
|
|
603
|
+
# 3. Trigger: sample_rate → 1.0 (100%)
|
|
604
|
+
# 4. Duration: 5 minutes at 100%
|
|
605
|
+
# 5. Decay: Gradually return to 10% over next 10 minutes
|
|
606
|
+
```
|
|
607
|
+
|
|
608
|
+
---
|
|
609
|
+
|
|
610
|
+
### Strategy 2: Load-Based Sampling
|
|
611
|
+
|
|
612
|
+
**Adjust sampling based on event volume:**
|
|
613
|
+
```ruby
|
|
614
|
+
E11y.configure do |config|
|
|
615
|
+
config.adaptive_sampling do
|
|
616
|
+
load_based_sampling do
|
|
617
|
+
enabled true
|
|
618
|
+
|
|
619
|
+
# Define load tiers
|
|
620
|
+
tiers [
|
|
621
|
+
{ threshold: 0, sample_rate: 1.0 }, # <1k/sec: 100%
|
|
622
|
+
{ threshold: 1_000, sample_rate: 0.5 }, # 1k-10k/sec: 50%
|
|
623
|
+
{ threshold: 10_000, sample_rate: 0.1 }, # 10k-50k/sec: 10%
|
|
624
|
+
{ threshold: 50_000, sample_rate: 0.01 } # >50k/sec: 1%
|
|
625
|
+
]
|
|
626
|
+
|
|
627
|
+
# Smooth transitions (avoid sudden jumps)
|
|
628
|
+
transition_period 30.seconds
|
|
629
|
+
|
|
630
|
+
# Hysteresis (prevent flapping)
|
|
631
|
+
hysteresis 0.2 # 20% buffer before tier change
|
|
632
|
+
end
|
|
633
|
+
end
|
|
634
|
+
end
|
|
635
|
+
|
|
636
|
+
# Example timeline:
|
|
637
|
+
# 10:00: 500 events/sec → 100% sampling
|
|
638
|
+
# 10:05: 5k events/sec → 50% sampling (gradual transition)
|
|
639
|
+
# 10:10: 60k events/sec → 1% sampling (overload!)
|
|
640
|
+
# 10:15: 8k events/sec → 10% sampling (returning to normal)
|
|
641
|
+
# 10:20: 500 events/sec → 100% sampling (normal)
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
---
|
|
645
|
+
|
|
646
|
+
### Strategy 3: Value-Based Sampling
|
|
647
|
+
|
|
648
|
+
**Always sample high-value events:**
|
|
649
|
+
```ruby
|
|
650
|
+
E11y.configure do |config|
|
|
651
|
+
config.adaptive_sampling do
|
|
652
|
+
value_based_sampling do
|
|
653
|
+
enabled true
|
|
654
|
+
|
|
655
|
+
# Sample by transaction amount
|
|
656
|
+
sample_by field: :amount,
|
|
657
|
+
threshold: 1000, # >$1000
|
|
658
|
+
sample_rate: 1.0 # Always track high-value
|
|
659
|
+
|
|
660
|
+
# Sample by user segment
|
|
661
|
+
sample_by field: :user_segment,
|
|
662
|
+
values: ['enterprise', 'vip'],
|
|
663
|
+
sample_rate: 1.0 # Always track VIP users
|
|
664
|
+
|
|
665
|
+
# Sample by event importance
|
|
666
|
+
sample_by field: :event_importance,
|
|
667
|
+
threshold: 8, # Importance score >8
|
|
668
|
+
sample_rate: 1.0
|
|
669
|
+
|
|
670
|
+
# Everything else: base sample rate
|
|
671
|
+
default_sample_rate 0.1 # 10%
|
|
672
|
+
end
|
|
673
|
+
end
|
|
674
|
+
end
|
|
675
|
+
|
|
676
|
+
# Usage:
|
|
677
|
+
Events::OrderPaid.track(
|
|
678
|
+
order_id: '123',
|
|
679
|
+
amount: 5000, # >$1000 → Always sampled!
|
|
680
|
+
user_segment: 'enterprise' # VIP → Always sampled!
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
Events::OrderPaid.track(
|
|
684
|
+
order_id: '456',
|
|
685
|
+
amount: 50, # <$1000 → 10% chance
|
|
686
|
+
user_segment: 'free' # Regular → 10% chance
|
|
687
|
+
)
|
|
688
|
+
```
|
|
689
|
+
|
|
690
|
+
---
|
|
691
|
+
|
|
692
|
+
### Strategy 4: Content-Based Sampling
|
|
693
|
+
|
|
694
|
+
**Sample based on event content/patterns:**
|
|
695
|
+
```ruby
|
|
696
|
+
E11y.configure do |config|
|
|
697
|
+
config.adaptive_sampling do
|
|
698
|
+
content_based_sampling do
|
|
699
|
+
enabled true
|
|
700
|
+
|
|
701
|
+
# Always sample specific patterns
|
|
702
|
+
always_sample patterns: [
|
|
703
|
+
'payment.*', # All payment events
|
|
704
|
+
'security.*', # All security events
|
|
705
|
+
'user.deleted', # GDPR events
|
|
706
|
+
'admin.*' # Admin actions
|
|
707
|
+
]
|
|
708
|
+
|
|
709
|
+
# Never sample (drop entirely)
|
|
710
|
+
never_sample patterns: [
|
|
711
|
+
'debug.*', # Debug events in production
|
|
712
|
+
'heartbeat.*' # Heartbeat noise
|
|
713
|
+
]
|
|
714
|
+
end
|
|
715
|
+
end
|
|
716
|
+
end
|
|
717
|
+
```
|
|
718
|
+
|
|
719
|
+
---
|
|
720
|
+
|
|
721
|
+
### Strategy 5: Tail-Based Sampling
|
|
722
|
+
|
|
723
|
+
**Sample based on final outcome (requires buffering):**
|
|
724
|
+
```ruby
|
|
725
|
+
E11y.configure do |config|
|
|
726
|
+
config.adaptive_sampling do
|
|
727
|
+
tail_based_sampling do
|
|
728
|
+
enabled true
|
|
729
|
+
|
|
730
|
+
# Buffer events for request duration
|
|
731
|
+
buffer_duration 30.seconds # Max request time
|
|
732
|
+
|
|
733
|
+
# Decision criteria (applied at request end)
|
|
734
|
+
sample_if do |events_in_request|
|
|
735
|
+
# Always sample if ANY error
|
|
736
|
+
next true if events_in_request.any? { |e| e.severity == :error }
|
|
737
|
+
|
|
738
|
+
# Always sample if slow (>1 second)
|
|
739
|
+
request_duration = events_in_request.last.timestamp - events_in_request.first.timestamp
|
|
740
|
+
next true if request_duration > 1.0
|
|
741
|
+
|
|
742
|
+
# Always sample if high-value transaction
|
|
743
|
+
next true if events_in_request.any? { |e| e.payload[:amount].to_i > 1000 }
|
|
744
|
+
|
|
745
|
+
# Otherwise: probabilistic sampling
|
|
746
|
+
rand < 0.1 # 10%
|
|
747
|
+
end
|
|
748
|
+
end
|
|
749
|
+
end
|
|
750
|
+
end
|
|
751
|
+
|
|
752
|
+
# How it works:
|
|
753
|
+
# Request starts → Buffer all events
|
|
754
|
+
# Request ends → Evaluate criteria
|
|
755
|
+
# Decision: Keep all or drop all events for this request
|
|
756
|
+
|
|
757
|
+
# Example:
|
|
758
|
+
# Request A: 10 events, no errors, 200ms → 10% chance (all or nothing)
|
|
759
|
+
# Request B: 10 events, 1 error → 100% (keep all!)
|
|
760
|
+
# Request C: 10 events, 1500ms → 100% (keep all - slow!)
|
|
761
|
+
```
|
|
762
|
+
|
|
763
|
+
---
|
|
764
|
+
|
|
765
|
+
### Strategy 6: Machine Learning-Based Sampling
|
|
766
|
+
|
|
767
|
+
**Learn optimal sampling from historical data:**
|
|
768
|
+
```ruby
|
|
769
|
+
E11y.configure do |config|
|
|
770
|
+
config.adaptive_sampling do
|
|
771
|
+
ml_based_sampling do
|
|
772
|
+
enabled true
|
|
773
|
+
|
|
774
|
+
# Train model on historical data
|
|
775
|
+
training_data window: 7.days,
|
|
776
|
+
features: [
|
|
777
|
+
:event_name,
|
|
778
|
+
:severity,
|
|
779
|
+
:error_rate,
|
|
780
|
+
:request_duration,
|
|
781
|
+
:time_of_day,
|
|
782
|
+
:day_of_week,
|
|
783
|
+
:load_level
|
|
784
|
+
]
|
|
785
|
+
|
|
786
|
+
# Model predicts event "importance"
|
|
787
|
+
importance_threshold 0.7 # >0.7 → always sample
|
|
788
|
+
|
|
789
|
+
# Update model periodically
|
|
790
|
+
retrain_interval 1.day
|
|
791
|
+
|
|
792
|
+
# Fallback if model fails
|
|
793
|
+
fallback_sample_rate 0.1
|
|
794
|
+
end
|
|
795
|
+
end
|
|
796
|
+
end
|
|
797
|
+
|
|
798
|
+
# Model learns patterns like:
|
|
799
|
+
# - Errors during peak hours → High importance
|
|
800
|
+
# - Slow requests on weekends → High importance
|
|
801
|
+
# - Normal events at 3 AM → Low importance
|
|
802
|
+
```
|
|
803
|
+
|
|
804
|
+
---
|
|
805
|
+
|
|
806
|
+
### Strategy 7: Trace-Consistent Sampling
|
|
807
|
+
|
|
808
|
+
**Problem:** Inconsistent sampling breaks distributed traces
|
|
809
|
+
|
|
810
|
+
```ruby
|
|
811
|
+
# ❌ PROBLEM: Inconsistent sampling creates incomplete traces
|
|
812
|
+
#
|
|
813
|
+
# HTTP request (trace_id: abc-123):
|
|
814
|
+
# → Sample rate: 10% → NOT sampled (90% case)
|
|
815
|
+
#
|
|
816
|
+
# Background job (trace_id: abc-123):
|
|
817
|
+
# → Sample rate: 10% → MAYBE sampled
|
|
818
|
+
#
|
|
819
|
+
# RESULT: Job event exists in Loki, but NO parent HTTP event!
|
|
820
|
+
# → Trace is INCOMPLETE (orphaned events, can't understand context)
|
|
821
|
+
```
|
|
822
|
+
|
|
823
|
+
**Solution:** Sample decision propagated across trace boundaries
|
|
824
|
+
|
|
825
|
+
```ruby
|
|
826
|
+
E11y.configure do |config|
|
|
827
|
+
config.adaptive_sampling do
|
|
828
|
+
# ✅ Trace-consistent sampling: All or nothing
|
|
829
|
+
trace_consistent do
|
|
830
|
+
enabled true
|
|
831
|
+
|
|
832
|
+
# Sample entire trace if ANY event is error/fatal
|
|
833
|
+
sample_on_error true
|
|
834
|
+
|
|
835
|
+
# Propagate sample decision to jobs/services
|
|
836
|
+
propagate_decision true
|
|
837
|
+
sample_decision_key 'e11y_sampled'
|
|
838
|
+
end
|
|
839
|
+
end
|
|
840
|
+
end
|
|
841
|
+
|
|
842
|
+
# How it works:
|
|
843
|
+
# 1. HTTP request arrives (trace_id: abc-123)
|
|
844
|
+
# 2. Sample decision made: rand < 0.1 → false (NOT sampled)
|
|
845
|
+
# 3. Decision stored in Current.sampled = false
|
|
846
|
+
# 4. Job enqueued with metadata: { e11y_sampled: false }
|
|
847
|
+
# 5. Job executes → reads e11y_sampled → skips tracking
|
|
848
|
+
#
|
|
849
|
+
# RESULT: Both HTTP and Job NOT sampled → Trace consistent!
|
|
850
|
+
```
|
|
851
|
+
|
|
852
|
+
**Trace lifecycle example:**
|
|
853
|
+
|
|
854
|
+
```ruby
|
|
855
|
+
# === REQUEST (trace_id: abc-123) ===
|
|
856
|
+
class OrdersController < ApplicationController
|
|
857
|
+
def create
|
|
858
|
+
# 1. Sample decision made at entry point
|
|
859
|
+
# → rand = 0.05 < 0.1 → SAMPLED!
|
|
860
|
+
# → Current.sampled = true
|
|
861
|
+
|
|
862
|
+
Events::OrderCreated.track(order_id: '123')
|
|
863
|
+
# → Tracked (sampled = true)
|
|
864
|
+
|
|
865
|
+
# 2. Enqueue job (sample decision propagated)
|
|
866
|
+
SendEmailJob.perform_later(
|
|
867
|
+
'123'
|
|
868
|
+
# E11y automatically adds: e11y_sampled: true
|
|
869
|
+
)
|
|
870
|
+
|
|
871
|
+
Events::OrderCompleted.track(order_id: '123')
|
|
872
|
+
# → Tracked (sampled = true)
|
|
873
|
+
end
|
|
874
|
+
end
|
|
875
|
+
|
|
876
|
+
# === BACKGROUND JOB (trace_id: abc-123, e11y_sampled: true) ===
|
|
877
|
+
class SendEmailJob < ApplicationJob
|
|
878
|
+
def perform(order_id)
|
|
879
|
+
# 3. Sample decision restored from job metadata
|
|
880
|
+
# → Current.sampled = true (from metadata)
|
|
881
|
+
|
|
882
|
+
Events::EmailSending.track(order_id: order_id)
|
|
883
|
+
# → Tracked (sampled = true, consistent with parent!)
|
|
884
|
+
|
|
885
|
+
send_email(order_id)
|
|
886
|
+
|
|
887
|
+
Events::EmailSent.track(order_id: order_id)
|
|
888
|
+
# → Tracked (sampled = true)
|
|
889
|
+
end
|
|
890
|
+
end
|
|
891
|
+
|
|
892
|
+
# RESULT: Complete trace in Loki!
|
|
893
|
+
# [abc-123] order.created (HTTP)
|
|
894
|
+
# [abc-123] order.completed (HTTP)
|
|
895
|
+
# [abc-123] email.sending (Job)
|
|
896
|
+
# [abc-123] email.sent (Job)
|
|
897
|
+
# → All events present, trace is COMPLETE!
|
|
898
|
+
```
|
|
899
|
+
|
|
900
|
+
**Exception handling:**
|
|
901
|
+
|
|
902
|
+
```ruby
|
|
903
|
+
E11y.configure do |config|
|
|
904
|
+
config.adaptive_sampling do
|
|
905
|
+
trace_consistent do
|
|
906
|
+
enabled true
|
|
907
|
+
|
|
908
|
+
# If error occurs, sample ENTIRE trace retroactively
|
|
909
|
+
sample_on_error true
|
|
910
|
+
|
|
911
|
+
# This requires buffering (see UC-001: Request-Scoped Debug Buffering)
|
|
912
|
+
# If trace was NOT sampled initially, but error occurs:
|
|
913
|
+
# → Flush buffer (contains all events)
|
|
914
|
+
# → Sample decision overridden to true
|
|
915
|
+
# → Complete trace available for debugging!
|
|
916
|
+
end
|
|
917
|
+
|
|
918
|
+
# Always sample specific patterns (override trace decision)
|
|
919
|
+
always_sample event_patterns: ['payment.*', 'security.*'],
|
|
920
|
+
severities: [:error, :fatal]
|
|
921
|
+
end
|
|
922
|
+
end
|
|
923
|
+
|
|
924
|
+
# Example:
|
|
925
|
+
# 1. HTTP request: sample decision = false (NOT sampled)
|
|
926
|
+
# 2. Order created: NOT tracked (buffer only)
|
|
927
|
+
# 3. Payment fails: ERROR!
|
|
928
|
+
# 4. sample_on_error = true → Override decision to true
|
|
929
|
+
# 5. Flush buffer → All events sent (including buffered ones)
|
|
930
|
+
# 6. Job executes with e11y_sampled: true → Tracked
|
|
931
|
+
#
|
|
932
|
+
# RESULT: Complete trace available BECAUSE of error!
|
|
933
|
+
```
|
|
934
|
+
|
|
935
|
+
**Cross-service propagation:**
|
|
936
|
+
|
|
937
|
+
```ruby
|
|
938
|
+
# Service A: API Gateway (trace_id: abc-123, sampled: true)
|
|
939
|
+
class OrdersController < ApplicationController
|
|
940
|
+
def create
|
|
941
|
+
Events::OrderReceived.track(order_id: '123')
|
|
942
|
+
# → Tracked (sampled = true)
|
|
943
|
+
|
|
944
|
+
# Call Payment Service (propagate sample decision in header)
|
|
945
|
+
response = HTTP
|
|
946
|
+
.headers(
|
|
947
|
+
'X-Trace-ID' => E11y::Current.trace_id,
|
|
948
|
+
'X-E11y-Sampled' => E11y::Current.sampled.to_s # ← Propagate!
|
|
949
|
+
)
|
|
950
|
+
.post('http://payment-service/charge', json: { order_id: '123' })
|
|
951
|
+
|
|
952
|
+
Events::OrderCreated.track(order_id: '123')
|
|
953
|
+
# → Tracked
|
|
954
|
+
end
|
|
955
|
+
end
|
|
956
|
+
|
|
957
|
+
# Service B: Payment Service (trace_id: abc-123, sampled: true from header)
|
|
958
|
+
class PaymentsController < ApplicationController
|
|
959
|
+
def charge
|
|
960
|
+
# Sample decision extracted from X-E11y-Sampled header
|
|
961
|
+
# → Current.sampled = true
|
|
962
|
+
|
|
963
|
+
Events::PaymentProcessing.track(order_id: params[:order_id])
|
|
964
|
+
# → Tracked (consistent with Service A!)
|
|
965
|
+
|
|
966
|
+
process_payment
|
|
967
|
+
|
|
968
|
+
Events::PaymentSucceeded.track(order_id: params[:order_id])
|
|
969
|
+
# → Tracked
|
|
970
|
+
end
|
|
971
|
+
end
|
|
972
|
+
|
|
973
|
+
# RESULT: Complete distributed trace!
|
|
974
|
+
# [Service A, abc-123] order.received
|
|
975
|
+
# [Service A, abc-123] order.created
|
|
976
|
+
# [Service B, abc-123] payment.processing
|
|
977
|
+
# [Service B, abc-123] payment.succeeded
|
|
978
|
+
```
|
|
979
|
+
|
|
980
|
+
**Configuration for different scenarios:**
|
|
981
|
+
|
|
982
|
+
```ruby
|
|
983
|
+
E11y.configure do |config|
|
|
984
|
+
config.adaptive_sampling do
|
|
985
|
+
# Strategy 1: Strict trace consistency (default)
|
|
986
|
+
trace_consistent do
|
|
987
|
+
enabled true
|
|
988
|
+
propagate_decision true
|
|
989
|
+
sample_on_error true
|
|
990
|
+
end
|
|
991
|
+
|
|
992
|
+
# Strategy 2: Independent sampling (simpler, but incomplete traces)
|
|
993
|
+
# trace_consistent do
|
|
994
|
+
# enabled false # Each service/job samples independently
|
|
995
|
+
# end
|
|
996
|
+
|
|
997
|
+
# Strategy 3: Always sample background jobs (practical compromise)
|
|
998
|
+
always_sample event_patterns: ['background_jobs.*']
|
|
999
|
+
# → Jobs always tracked, HTTP sampled independently
|
|
1000
|
+
# → Cost: jobs 100%, HTTP 10%
|
|
1001
|
+
# → Benefit: Never lose job events, acceptable overhead
|
|
1002
|
+
|
|
1003
|
+
# Strategy 4: Orphaned job handling
|
|
1004
|
+
# Jobs WITHOUT parent trace (cron jobs, manual triggers)
|
|
1005
|
+
orphaned_job_sampling do
|
|
1006
|
+
sample_rate 1.0 # Always sample orphaned jobs
|
|
1007
|
+
end
|
|
1008
|
+
end
|
|
1009
|
+
end
|
|
1010
|
+
```
|
|
1011
|
+
|
|
1012
|
+
**Why trace-consistent sampling matters:**
|
|
1013
|
+
|
|
1014
|
+
| Scenario | Without Trace-Consistency | With Trace-Consistency | Winner |
|
|
1015
|
+
|----------|---------------------------|------------------------|--------|
|
|
1016
|
+
| **Normal request** (sampled) | HTTP tracked, Job 10% chance | HTTP tracked, Job tracked | ✅ Same |
|
|
1017
|
+
| **Normal request** (not sampled) | HTTP dropped, Job 10% chance → orphaned! | HTTP dropped, Job dropped | ✅ Consistent |
|
|
1018
|
+
| **Error request** (initially not sampled) | HTTP buffered only, Job 10% chance | HTTP flushed + Job tracked (sample_on_error) | ✅ Complete |
|
|
1019
|
+
| **Distributed trace** | Service A sampled, Service B 10% chance → broken! | Service A sampled → Service B sampled | ✅ Complete |
|
|
1020
|
+
|
|
1021
|
+
**See also:**
|
|
1022
|
+
- **[UC-006: Trace Context Management](./UC-006-trace-context-management.md)** - Implementation details for trace propagation
|
|
1023
|
+
- **[UC-001: Request-Scoped Debug Buffering](./UC-001-request-scoped-debug-buffering.md)** - How `sample_on_error` works with buffering
|
|
1024
|
+
|
|
1025
|
+
---
|
|
1026
|
+
|
|
1027
|
+
### Strategy 8: Stratified Sampling for Accurate SLO (C11 Resolution) ⚠️
|
|
1028
|
+
|
|
1029
|
+
> **Reference:** See [ADR-009 §3.7: Stratified Sampling for SLO Accuracy](../ADR-009-cost-optimization.md#37-stratified-sampling-for-slo-accuracy-c11-resolution) for full architecture and [UC-004: SLO Tracking with Sampling Correction](./UC-004-zero-config-slo-tracking.md#sampling-correction-for-accurate-slo-c11-resolution) for SLO calculation details.
|
|
1030
|
+
|
|
1031
|
+
**Problem with Random Sampling:** Breaks SLO metrics! Errors are rare (5%) → random 10% sampling drops 90% of errors → the few errors that remain make the measured error rate noisy, so the SLO can appear better (or worse) than reality.
|
|
1032
|
+
|
|
1033
|
+
**Solution:** Stratified sampling keeps every error and samples each severity class at its own known rate; weighting each kept event by 1/sample_rate then restores the true error/success RATIO.
|
|
1034
|
+
|
|
1035
|
+
```ruby
|
|
1036
|
+
E11y.configure do |config|
|
|
1037
|
+
config.cost_optimization do
|
|
1038
|
+
sampling do
|
|
1039
|
+
# ✅ Stratified sampling (NOT random!)
|
|
1040
|
+
strategy :stratified_adaptive
|
|
1041
|
+
|
|
1042
|
+
# Cost budget
|
|
1043
|
+
cost_budget 100_000 # events/month
|
|
1044
|
+
|
|
1045
|
+
# 🎯 SIMPLE CONFIG: Sample rate by severity (C11)
|
|
1046
|
+
stratified_rates do
|
|
1047
|
+
error 1.0 # 100% - Keep ALL errors (critical for SLO!)
|
|
1048
|
+
warn 0.5 # 50% - Medium priority
|
|
1049
|
+
info 0.1 # 10% - Low priority (successful requests)
|
|
1050
|
+
debug 0.05 # 5% - Very low priority
|
|
1051
|
+
end
|
|
1052
|
+
end
|
|
1053
|
+
end
|
|
1054
|
+
|
|
1055
|
+
# SLO tracking with automatic correction (enabled by default!)
|
|
1056
|
+
config.slo do
|
|
1057
|
+
enable_sampling_correction true # ✅ Corrects metrics automatically
|
|
1058
|
+
end
|
|
1059
|
+
end
|
|
1060
|
+
```
|
|
1061
|
+
|
|
1062
|
+
**How It Works:**
|
|
1063
|
+
|
|
1064
|
+
```ruby
|
|
1065
|
+
# Scenario: 1000 requests (950 success, 50 errors) → 95% success rate
|
|
1066
|
+
|
|
1067
|
+
# Random sampling (10%):
|
|
1068
|
+
# - Sample 100 random events
|
|
1069
|
+
# - Might get: 98 success, 2 errors (luck!)
|
|
1070
|
+
# - Observed success rate: 98% ❌ WRONG! (should be 95%)
|
|
1071
|
+
|
|
1072
|
+
# Stratified sampling (errors: 100%, success: 10%):
|
|
1073
|
+
# - Sample 50 errors (100% × 50)
|
|
1074
|
+
# - Sample 95 success (10% × 950)
|
|
1075
|
+
# - Total: 145 events sampled (85.5% cost savings!)
|
|
1076
|
+
# - Observed success rate: 95/145 = 65.5%
|
|
1077
|
+
# - Corrected success rate: (95 × 1/0.1) / ((95 × 1/0.1) + (50 × 1/1.0)) = 950 / 1000 = 95% ✅ CORRECT!
|
|
1078
|
+
```
|
|
1079
|
+
|
|
1080
|
+
**Cost Savings Example:**
|
|
1081
|
+
|
|
1082
|
+
```ruby
|
|
1083
|
+
# 10M events/month (9.5M success, 500K errors)
|
|
1084
|
+
|
|
1085
|
+
# WITHOUT stratified sampling (100% all events):
|
|
1086
|
+
# Cost: 10M events × $0.001 = $10,000/month
|
|
1087
|
+
|
|
1088
|
+
# WITH stratified sampling (errors: 100%, success: 10%):
|
|
1089
|
+
# Kept: (500K × 1.0) + (9.5M × 0.1) = 500K + 950K = 1.45M events
|
|
1090
|
+
# Cost: 1.45M × $0.001 = $1,450/month
|
|
1091
|
+
# Savings: $8,550/month (85.5% reduction!) 💰
|
|
1092
|
+
# SLO accuracy: 100% (errors fully preserved)
|
|
1093
|
+
```
|
|
1094
|
+
|
|
1095
|
+
**Comparison: Random vs Stratified**
|
|
1096
|
+
|
|
1097
|
+
| Aspect | Random Sampling (10%) | Stratified Sampling (C11) |
|
|
1098
|
+
|--------|----------------------|--------------------------|
|
|
1099
|
+
| **Events kept** | 1M (10% of 10M) | 1.45M (14.5% of 10M) |
|
|
1100
|
+
| **Errors kept** | ~50K (10% × 500K) ❌ | 500K (100% × 500K) ✅ |
|
|
1101
|
+
| **Success kept** | ~950K (10% × 9.5M) | 950K (10% × 9.5M) |
|
|
1102
|
+
| **SLO accuracy** | ±5% error ❌ | 0% error ✅ |
|
|
1103
|
+
| **Cost savings** | 90% | 85.5% |
|
|
1104
|
+
| **Best for** | Non-critical apps | Production SLO tracking |
|
|
1105
|
+
|
|
1106
|
+
**StratifiedAdaptiveSampler Implementation:**
|
|
1107
|
+
|
|
1108
|
+
```ruby
|
|
1109
|
+
# lib/e11y/sampling/stratified_adaptive_sampler.rb
|
|
1110
|
+
module E11y
|
|
1111
|
+
module Sampling
|
|
1112
|
+
class StratifiedAdaptiveSampler < SimplifiedSampler
|
|
1113
|
+
def sample?(event)
|
|
1114
|
+
# Get sample rate for event severity
|
|
1115
|
+
sample_rate = stratified_rate_for(event)
|
|
1116
|
+
|
|
1117
|
+
# Random sampling within stratum
|
|
1118
|
+
rand < sample_rate
|
|
1119
|
+
end
|
|
1120
|
+
|
|
1121
|
+
def stratified_rate_for(event)
|
|
1122
|
+
case event.severity
|
|
1123
|
+
when :error, :fatal
|
|
1124
|
+
1.0 # 100% - never drop errors!
|
|
1125
|
+
when :warn
|
|
1126
|
+
0.5 # 50%
|
|
1127
|
+
when :info, :success
|
|
1128
|
+
0.1 # 10%
|
|
1129
|
+
when :debug
|
|
1130
|
+
0.05 # 5%
|
|
1131
|
+
else
|
|
1132
|
+
0.1 # Default
|
|
1133
|
+
end
|
|
1134
|
+
end
|
|
1135
|
+
|
|
1136
|
+
# Store sample rate in event metadata (for correction)
|
|
1137
|
+
def after_sample(event, sampled)
|
|
1138
|
+
if sampled
|
|
1139
|
+
event.metadata[:sample_rate] = stratified_rate_for(event)
|
|
1140
|
+
end
|
|
1141
|
+
end
|
|
1142
|
+
end
|
|
1143
|
+
end
|
|
1144
|
+
end
|
|
1145
|
+
```
|
|
1146
|
+
|
|
1147
|
+
**SLO Calculation with Correction:**
|
|
1148
|
+
|
|
1149
|
+
```ruby
|
|
1150
|
+
# lib/e11y/slo/calculator.rb
|
|
1151
|
+
module E11y
|
|
1152
|
+
module SLO
|
|
1153
|
+
class Calculator
|
|
1154
|
+
def error_rate
|
|
1155
|
+
# Query sampled events
|
|
1156
|
+
total_sampled = Event.count
|
|
1157
|
+
errors_sampled = Event.where(severity: [:error, :fatal]).count
|
|
1158
|
+
|
|
1159
|
+
# Apply sampling correction
|
|
1160
|
+
total_corrected = Event.group(:severity).count.sum { |severity, count| correct_count(count, severity: severity.to_sym) }
|
|
1161
|
+
errors_corrected = correct_count(errors_sampled, severity: :error)
|
|
1162
|
+
|
|
1163
|
+
# Calculate corrected error rate
|
|
1164
|
+
errors_corrected.to_f / total_corrected
|
|
1165
|
+
end
|
|
1166
|
+
|
|
1167
|
+
private
|
|
1168
|
+
|
|
1169
|
+
def correct_count(sampled_count, severity: nil)
|
|
1170
|
+
# Get sample rate for this severity
|
|
1171
|
+
sample_rate = case severity
|
|
1172
|
+
when :error, :fatal then 1.0
|
|
1173
|
+
when :warn then 0.5
|
|
1174
|
+
when :info, :success then 0.1
|
|
1175
|
+
else 0.1
|
|
1176
|
+
end
|
|
1177
|
+
|
|
1178
|
+
# Correct count: observed × (1 / sample_rate)
|
|
1179
|
+
sampled_count / sample_rate
|
|
1180
|
+
end
|
|
1181
|
+
end
|
|
1182
|
+
end
|
|
1183
|
+
end
|
|
1184
|
+
|
|
1185
|
+
# Usage:
|
|
1186
|
+
E11y::SLO.error_rate # ✅ Returns corrected error rate (accurate!)
|
|
1187
|
+
```
|
|
1188
|
+
|
|
1189
|
+
**Trade-offs:**
|
|
1190
|
+
|
|
1191
|
+
| Aspect | Pro | Con | Decision |
|
|
1192
|
+
|--------|-----|-----|----------|
|
|
1193
|
+
| **Stratified by severity** | SLO accuracy preserved | Slightly more complex | Accuracy > simplicity |
|
|
1194
|
+
| **100% errors, 10% success** | 85% cost savings + accurate SLO | More events than pure random | Best balance |
|
|
1195
|
+
| **Sampling correction math** | Accurate metrics | Need to apply correction | Auto-corrected by E11y |
|
|
1196
|
+
| **Simple config** | Easy to use | Less flexible than custom strata | Covers 95% of use cases |
|
|
1197
|
+
|
|
1198
|
+
---
|
|
1199
|
+
|
|
1200
|
+
## 💻 Implementation Examples
|
|
1201
|
+
|
|
1202
|
+
### Example 1: Production Incident Response (Phase 2.8 - All Strategies) ✅
|
|
1203
|
+
|
|
1204
|
+
```ruby
|
|
1205
|
+
# Scenario: Payment gateway outage
|
|
1206
|
+
# Normal: 1k events/sec, 1% errors
|
|
1207
|
+
# Incident: 50k events/sec, 80% errors
|
|
1208
|
+
|
|
1209
|
+
# Configuration with all 4 strategies
|
|
1210
|
+
E11y.configure do |config|
|
|
1211
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
1212
|
+
default_sample_rate: 0.1,
|
|
1213
|
+
|
|
1214
|
+
# Strategy 1: Error-Based Adaptive (FEAT-4838)
|
|
1215
|
+
error_based_adaptive: true,
|
|
1216
|
+
error_spike_config: {
|
|
1217
|
+
window: 60,
|
|
1218
|
+
absolute_threshold: 100, # 100 errors/min
|
|
1219
|
+
relative_threshold: 3.0, # 3x baseline
|
|
1220
|
+
spike_duration: 600 # 10 minutes
|
|
1221
|
+
},
|
|
1222
|
+
|
|
1223
|
+
# Strategy 2: Load-Based Adaptive (FEAT-4842)
|
|
1224
|
+
load_based_adaptive: true,
|
|
1225
|
+
load_monitor_config: {
|
|
1226
|
+
window: 60,
|
|
1227
|
+
normal_threshold: 1_000,
|
|
1228
|
+
high_threshold: 10_000,
|
|
1229
|
+
very_high_threshold: 50_000,
|
|
1230
|
+
overload_threshold: 100_000
|
|
1231
|
+
}
|
|
1232
|
+
end
|
|
1233
|
+
|
|
1234
|
+
# Strategy 3: Value-Based Sampling (FEAT-4846)
|
|
1235
|
+
class Events::PaymentProcessed < E11y::Event::Base
|
|
1236
|
+
# Always sample payment events (business-critical)
|
|
1237
|
+
sample_rate 1.0
|
|
1238
|
+
|
|
1239
|
+
# But still respect value-based rules
|
|
1240
|
+
sample_by_value field: "amount",
|
|
1241
|
+
operator: :greater_than,
|
|
1242
|
+
threshold: 1000,
|
|
1243
|
+
sample_rate: 1.0 # Always sample high-value
|
|
1244
|
+
end
|
|
1245
|
+
|
|
1246
|
+
# Timeline during incident:
|
|
1247
|
+
# 10:00: Normal (1k/sec, 1% errors)
|
|
1248
|
+
# → Load: normal → base rate 100%
|
|
1249
|
+
# → Error spike: NO → rate unchanged
|
|
1250
|
+
# → Payment events: 100% (event-level config)
|
|
1251
|
+
# → Final: 100% sampling
|
|
1252
|
+
#
|
|
1253
|
+
# 10:05: Incident starts (10k/sec, 40% errors)
|
|
1254
|
+
# → Load: high → base rate 50%
|
|
1255
|
+
# → Error spike: YES (40% > 5%) → override to 100%!
|
|
1256
|
+
# → Payment events: 100% (error spike overrides)
|
|
1257
|
+
# → Final: 100% sampling (debug priority!)
|
|
1258
|
+
#
|
|
1259
|
+
# 10:10: Incident peak (50k/sec, 80% errors)
|
|
1260
|
+
# → Load: very_high → base rate 10%
|
|
1261
|
+
# → Error spike: YES (still active) → override to 100%!
|
|
1262
|
+
# → Payment events: 100% (error spike overrides)
|
|
1263
|
+
# → Final: 100% sampling (all errors captured!)
|
|
1264
|
+
#
|
|
1265
|
+
# 10:20: Incident resolving (5k/sec, 10% errors)
|
|
1266
|
+
# → Load: high → base rate 50%
|
|
1267
|
+
# → Error spike: Still active (within 10min duration)
|
|
1268
|
+
# → Payment events: 100%
|
|
1269
|
+
# → Final: 100% sampling (spike duration not expired)
|
|
1270
|
+
#
|
|
1271
|
+
# 10:30: Normal (1k/sec, 1% errors)
|
|
1272
|
+
# → Load: normal → base rate 100%
|
|
1273
|
+
# → Error spike: NO (expired after 10min)
|
|
1274
|
+
# → Payment events: 100%
|
|
1275
|
+
# → Final: 100% sampling (back to normal)
|
|
1276
|
+
```
|
|
1277
|
+
|
|
1278
|
+
**Original Example (Conceptual):**
|
|
1279
|
+
|
|
1280
|
+
```ruby
|
|
1281
|
+
# Scenario: Payment gateway outage
|
|
1282
|
+
# Normal: 1k events/sec, 1% errors
|
|
1283
|
+
# Incident: 50k events/sec, 80% errors
|
|
1284
|
+
|
|
1285
|
+
E11y.configure do |config|
|
|
1286
|
+
config.adaptive_sampling do
|
|
1287
|
+
# Detect error spike
|
|
1288
|
+
error_spike_detection do
|
|
1289
|
+
enabled true
|
|
1290
|
+
window 1.minute
|
|
1291
|
+
absolute_threshold 100
|
|
1292
|
+
relative_threshold 3.0
|
|
1293
|
+
|
|
1294
|
+
on_spike do
|
|
1295
|
+
sample_rate 1.0 # 100% during errors
|
|
1296
|
+
duration 10.minutes
|
|
1297
|
+
end
|
|
1298
|
+
end
|
|
1299
|
+
|
|
1300
|
+
# But also protect from overload
|
|
1301
|
+
load_based_sampling do
|
|
1302
|
+
tiers [
|
|
1303
|
+
{ threshold: 0, sample_rate: 1.0 },
|
|
1304
|
+
{ threshold: 10_000, sample_rate: 0.5 },
|
|
1305
|
+
{ threshold: 50_000, sample_rate: 0.1 }
|
|
1306
|
+
]
|
|
1307
|
+
end
|
|
1308
|
+
|
|
1309
|
+
# Priority: Always sample payment errors
|
|
1310
|
+
always_sample event_patterns: ['payment.*'],
|
|
1311
|
+
severities: [:error, :fatal]
|
|
1312
|
+
end
|
|
1313
|
+
end
|
|
1314
|
+
|
|
1315
|
+
# Timeline during incident:
|
|
1316
|
+
# 10:00: Normal (1k/sec, 1% errors)
|
|
1317
|
+
# → Base sampling: 10%
|
|
1318
|
+
# → Errors: 100% (always_sample)
|
|
1319
|
+
#
|
|
1320
|
+
# 10:05: Incident starts (10k/sec, 40% errors)
|
|
1321
|
+
# → Error spike detected!
|
|
1322
|
+
# → All events: 100% (spike mode)
|
|
1323
|
+
#
|
|
1324
|
+
# 10:10: Incident peak (50k/sec, 80% errors)
|
|
1325
|
+
# → Load protection kicks in
|
|
1326
|
+
# → Non-payment events: 10% (load tier)
|
|
1327
|
+
# → Payment errors: 100% (always_sample)
|
|
1328
|
+
#
|
|
1329
|
+
# 10:20: Incident resolving (5k/sec, 10% errors)
|
|
1330
|
+
# → Gradual return to normal
|
|
1331
|
+
# → Sample rate: 50% → 30% → 10%
|
|
1332
|
+
#
|
|
1333
|
+
# 10:30: Normal (1k/sec, 1% errors)
|
|
1334
|
+
# → Back to base sampling: 10%
|
|
1335
|
+
```
|
|
1336
|
+
|
|
1337
|
+
---
|
|
1338
|
+
|
|
1339
|
+
### Example 2: Black Friday Traffic (Phase 2.8 - Load + Value-Based) ✅
|
|
1340
|
+
|
|
1341
|
+
```ruby
|
|
1342
|
+
# Scenario: Black Friday sale
|
|
1343
|
+
# Normal: 2k events/sec
|
|
1344
|
+
# Black Friday: 100k events/sec (50x increase!)
|
|
1345
|
+
|
|
1346
|
+
# Configuration with load-based + value-based sampling
|
|
1347
|
+
E11y.configure do |config|
|
|
1348
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
1349
|
+
default_sample_rate: 0.1,
|
|
1350
|
+
|
|
1351
|
+
# Load-Based Adaptive (FEAT-4842)
|
|
1352
|
+
load_based_adaptive: true,
|
|
1353
|
+
load_monitor_config: {
|
|
1354
|
+
window: 60,
|
|
1355
|
+
normal_threshold: 1_000,
|
|
1356
|
+
high_threshold: 10_000,
|
|
1357
|
+
very_high_threshold: 50_000,
|
|
1358
|
+
overload_threshold: 100_000 # Black Friday triggers this!
|
|
1359
|
+
},
|
|
1360
|
+
|
|
1361
|
+
# Error-Based Adaptive (for incident detection)
|
|
1362
|
+
error_based_adaptive: true,
|
|
1363
|
+
error_spike_config: {
|
|
1364
|
+
absolute_threshold: 100,
|
|
1365
|
+
relative_threshold: 3.0
|
|
1366
|
+
}
|
|
1367
|
+
end
|
|
1368
|
+
|
|
1369
|
+
# Value-Based Sampling (FEAT-4846)
|
|
1370
|
+
class Events::OrderPaid < E11y::Event::Base
|
|
1371
|
+
schema do
|
|
1372
|
+
required(:order_id).filled(:string)
|
|
1373
|
+
required(:amount).filled(:decimal)
|
|
1374
|
+
end
|
|
1375
|
+
|
|
1376
|
+
# Always sample high-value orders (even during overload!)
|
|
1377
|
+
sample_by_value field: "amount",
|
|
1378
|
+
operator: :greater_than,
|
|
1379
|
+
threshold: 500,
|
|
1380
|
+
sample_rate: 1.0 # 100% for >$500 orders
|
|
1381
|
+
end
|
|
1382
|
+
|
|
1383
|
+
# Black Friday timeline:
|
|
1384
|
+
# 00:00 (start): 100k events/sec
|
|
1385
|
+
# → Load: overload → base rate 1%
|
|
1386
|
+
# → High-value orders (5k/sec): 100% (value-based override)
|
|
1387
|
+
# → Regular orders (95k/sec): 1% = 950 events/sec
|
|
1388
|
+
# → Total tracked: 5k + 950 = 5,950 events/sec (5.95%)
|
|
1389
|
+
# vs. 10k with fixed 10% sampling → 40% savings!
|
|
1390
|
+
#
|
|
1391
|
+
# 12:00 (peak): 150k events/sec
|
|
1392
|
+
# → Load: overload → base rate 1%
|
|
1393
|
+
# → High-value orders (10k/sec): 100% (10k tracked)
|
|
1394
|
+
# → Regular orders (140k/sec): 1% = 1,400 tracked
|
|
1395
|
+
# → Total tracked: 11,400 events/sec (7.6%)
|
|
1396
|
+
# → Cost protection while capturing all high-value!
|
|
1397
|
+
#
|
|
1398
|
+
# 23:59 (end): 20k events/sec
|
|
1399
|
+
# → Load: high → base rate 50%
|
|
1400
|
+
# → High-value orders (1k/sec): 100% (1k tracked)
|
|
1401
|
+
# → Regular orders (19k/sec): 50% = 9,500 tracked
|
|
1402
|
+
# → Total tracked: 10,500 events/sec (52.5%)
|
|
1403
|
+
# → Returning to normal rates
|
|
1404
|
+
```
|
|
1405
|
+
|
|
1406
|
+
**Stratified Sampling SLO Accuracy (FEAT-4850):**
|
|
1407
|
+
|
|
1408
|
+
```ruby
|
|
1409
|
+
# During Black Friday, even with aggressive sampling (1% for regular, 100% for errors):
|
|
1410
|
+
# - Errors: 500/sec × 100% = 500 tracked
|
|
1411
|
+
# - Success: 99.5k/sec × 1% = 995 tracked
|
|
1412
|
+
# - Total: 1,495 tracked (1.495% sampling!)
|
|
1413
|
+
#
|
|
1414
|
+
# SLO Calculation (with correction):
|
|
1415
|
+
# Observed success rate: 995 / 1495 = 66.6% ❌ WRONG
|
|
1416
|
+
# Corrected success: 995 / 0.01 = 99,500
|
|
1417
|
+
# Corrected errors: 500 / 1.0 = 500
|
|
1418
|
+
# Corrected success rate: 99,500 / 100,000 = 99.5% ✅ ACCURATE
|
|
1419
|
+
#
|
|
1420
|
+
# Result: 98.5% cost savings with 100% SLO accuracy!
|
|
1421
|
+
```
|
|
1422
|
+
|
|
1423
|
+
**Original Example (Conceptual):**
|
|
1424
|
+
|
|
1425
|
+
```ruby
|
|
1426
|
+
# Scenario: Black Friday sale
|
|
1427
|
+
# Normal: 2k events/sec
|
|
1428
|
+
# Black Friday: 100k events/sec (50x increase!)
|
|
1429
|
+
|
|
1430
|
+
E11y.configure do |config|
|
|
1431
|
+
config.adaptive_sampling do
|
|
1432
|
+
# Load-based scaling
|
|
1433
|
+
load_based_sampling do
|
|
1434
|
+
tiers [
|
|
1435
|
+
{ threshold: 0, sample_rate: 1.0 }, # Normal
|
|
1436
|
+
{ threshold: 10_000, sample_rate: 0.5 }, # Busy
|
|
1437
|
+
{ threshold: 50_000, sample_rate: 0.1 }, # Very busy
|
|
1438
|
+
{ threshold: 100_000, sample_rate: 0.01 } # Black Friday!
|
|
1439
|
+
]
|
|
1440
|
+
end
|
|
1441
|
+
|
|
1442
|
+
# But always sample high-value orders
|
|
1443
|
+
value_based_sampling do
|
|
1444
|
+
sample_by field: :amount,
|
|
1445
|
+
threshold: 500,
|
|
1446
|
+
sample_rate: 1.0 # Always track >$500 orders
|
|
1447
|
+
end
|
|
1448
|
+
|
|
1449
|
+
# And always sample errors
|
|
1450
|
+
always_sample severities: [:error, :fatal]
|
|
1451
|
+
end
|
|
1452
|
+
end
|
|
1453
|
+
|
|
1454
|
+
# Result during Black Friday:
|
|
1455
|
+
# 100k events/sec total
|
|
1456
|
+
# → Regular events: 1% (1k events/sec tracked)
|
|
1457
|
+
# → High-value orders (5k/sec): 100% (5k tracked)
|
|
1458
|
+
# → Errors (500/sec): 100% (500 tracked)
|
|
1459
|
+
# Total tracked: 6.5k events/sec (6.5% effective rate)
|
|
1460
|
+
# → vs 10k with fixed 10% sampling (saves 35%!)
|
|
1461
|
+
```
|
|
1462
|
+
|
|
1463
|
+
---
|
|
1464
|
+
|
|
1465
|
+
### Example 3: Debug Session
|
|
1466
|
+
|
|
1467
|
+
```ruby
|
|
1468
|
+
# Scenario: Engineer debugging production issue for specific user
|
|
1469
|
+
|
|
1470
|
+
# Temporarily increase sampling for specific user
|
|
1471
|
+
E11y.with_sampling_override(
|
|
1472
|
+
user_id: 'debug_user_123',
|
|
1473
|
+
sample_rate: 1.0, # 100% for this user
|
|
1474
|
+
duration: 1.hour
|
|
1475
|
+
) do
|
|
1476
|
+
# All events for this user tracked at 100% for 1 hour
|
|
1477
|
+
# Other users: normal sampling rates
|
|
1478
|
+
end
|
|
1479
|
+
|
|
1480
|
+
# OR: Declaratively via configuration
|
|
1481
|
+
E11y.configure do |config|
|
|
1482
|
+
config.adaptive_sampling do
|
|
1483
|
+
# Whitelist specific users for full sampling
|
|
1484
|
+
whitelist_sampling do
|
|
1485
|
+
users ['debug_user_123', 'vip_user_456']
|
|
1486
|
+
sample_rate 1.0
|
|
1487
|
+
expires_at 1.hour.from_now
|
|
1488
|
+
end
|
|
1489
|
+
end
|
|
1490
|
+
end
|
|
1491
|
+
|
|
1492
|
+
# OR: Dynamic via Redis (supports multiple app servers)
|
|
1493
|
+
E11y::Sampling.set_override(
|
|
1494
|
+
context: { user_id: 'debug_user_123' },
|
|
1495
|
+
sample_rate: 1.0,
|
|
1496
|
+
ttl: 1.hour
|
|
1497
|
+
)
|
|
1498
|
+
```
|
|
1499
|
+
|
|
1500
|
+
---
|
|
1501
|
+
|
|
1502
|
+
## 📊 Monitoring Adaptive Sampling
|
|
1503
|
+
|
|
1504
|
+
**Track sampling effectiveness:**
|
|
1505
|
+
```ruby
|
|
1506
|
+
# Self-monitoring metrics
|
|
1507
|
+
E11y.configure do |config|
|
|
1508
|
+
config.self_monitoring do
|
|
1509
|
+
# Current sample rate (per strategy)
|
|
1510
|
+
gauge :adaptive_sampling_current_rate,
|
|
1511
|
+
tags: [:strategy]
|
|
1512
|
+
|
|
1513
|
+
# Events sampled vs dropped
|
|
1514
|
+
counter :adaptive_sampling_events_sampled_total,
|
|
1515
|
+
tags: [:strategy, :reason]
|
|
1516
|
+
counter :adaptive_sampling_events_dropped_total,
|
|
1517
|
+
tags: [:strategy, :reason]
|
|
1518
|
+
|
|
1519
|
+
# Load metrics
|
|
1520
|
+
gauge :adaptive_sampling_current_load_events_per_sec
|
|
1521
|
+
gauge :adaptive_sampling_error_rate
|
|
1522
|
+
|
|
1523
|
+
# Strategy transitions
|
|
1524
|
+
counter :adaptive_sampling_strategy_transitions_total,
|
|
1525
|
+
tags: [:from_strategy, :to_strategy, :reason]
|
|
1526
|
+
|
|
1527
|
+
# Effectiveness
|
|
1528
|
+
histogram :adaptive_sampling_resource_savings_pct,
|
|
1529
|
+
buckets: [10, 25, 50, 75, 90]
|
|
1530
|
+
end
|
|
1531
|
+
end
|
|
1532
|
+
|
|
1533
|
+
# Prometheus queries:
|
|
1534
|
+
# - Current sampling rate:
|
|
1535
|
+
# adaptive_sampling_current_rate
|
|
1536
|
+
#
|
|
1537
|
+
# - How many events dropped by adaptive sampling?
|
|
1538
|
+
# sum(increase(adaptive_sampling_events_dropped_total[1h]))
|
|
1539
|
+
#
|
|
1540
|
+
# - Resource savings:
|
|
1541
|
+
#   histogram_quantile(0.5, sum(rate(adaptive_sampling_resource_savings_pct_bucket[5m])) by (le))
|
|
1542
|
+
```
|
|
1543
|
+
|
|
1544
|
+
---
|
|
1545
|
+
|
|
1546
|
+
## 🧪 Testing
|
|
1547
|
+
|
|
1548
|
+
```ruby
|
|
1549
|
+
# spec/e11y/adaptive_sampling_spec.rb
|
|
1550
|
+
RSpec.describe 'Adaptive Sampling' do
|
|
1551
|
+
describe 'error spike detection' do
|
|
1552
|
+
it 'increases sampling during error spikes' do
|
|
1553
|
+
E11y.configure do |config|
|
|
1554
|
+
config.adaptive_sampling do
|
|
1555
|
+
error_spike_detection do
|
|
1556
|
+
enabled true
|
|
1557
|
+
absolute_threshold 10
|
|
1558
|
+
on_spike do
|
|
1559
|
+
sample_rate 1.0
|
|
1560
|
+
end
|
|
1561
|
+
end
|
|
1562
|
+
end
|
|
1563
|
+
end
|
|
1564
|
+
|
|
1565
|
+
# Normal: 10% sampling
|
|
1566
|
+
expect(E11y.current_sample_rate).to eq(0.1)
|
|
1567
|
+
|
|
1568
|
+
# Simulate error spike (20 errors in 1 minute)
|
|
1569
|
+
20.times { Events::TestError.track(severity: :error) }
|
|
1570
|
+
|
|
1571
|
+
# Should increase to 100%
|
|
1572
|
+
expect(E11y.current_sample_rate).to eq(1.0)
|
|
1573
|
+
end
|
|
1574
|
+
end
|
|
1575
|
+
|
|
1576
|
+
describe 'load-based sampling' do
|
|
1577
|
+
it 'decreases sampling under high load' do
|
|
1578
|
+
E11y.configure do |config|
|
|
1579
|
+
config.adaptive_sampling do
|
|
1580
|
+
load_based_sampling do
|
|
1581
|
+
tiers [
|
|
1582
|
+
{ threshold: 0, sample_rate: 1.0 },
|
|
1583
|
+
{ threshold: 1_000, sample_rate: 0.1 }
|
|
1584
|
+
]
|
|
1585
|
+
end
|
|
1586
|
+
end
|
|
1587
|
+
end
|
|
1588
|
+
|
|
1589
|
+
# Low load: 100% sampling
|
|
1590
|
+
expect(E11y.current_sample_rate).to eq(1.0)
|
|
1591
|
+
|
|
1592
|
+
# Simulate high load (1500 events/sec)
|
|
1593
|
+
E11y::LoadMonitor.report_load(1500)
|
|
1594
|
+
|
|
1595
|
+
# Should decrease to 10%
|
|
1596
|
+
expect(E11y.current_sample_rate).to eq(0.1)
|
|
1597
|
+
end
|
|
1598
|
+
end
|
|
1599
|
+
|
|
1600
|
+
describe 'value-based sampling' do
|
|
1601
|
+
it 'always samples high-value events' do
|
|
1602
|
+
E11y.configure do |config|
|
|
1603
|
+
config.adaptive_sampling do
|
|
1604
|
+
base_sample_rate 0.1
|
|
1605
|
+
value_based_sampling do
|
|
1606
|
+
sample_by field: :amount, threshold: 1000, sample_rate: 1.0
|
|
1607
|
+
end
|
|
1608
|
+
end
|
|
1609
|
+
end
|
|
1610
|
+
|
|
1611
|
+
# High-value: always sampled
|
|
1612
|
+
100.times do
|
|
1613
|
+
result = Events::OrderPaid.track(amount: 5000)
|
|
1614
|
+
expect(result).to be_sampled
|
|
1615
|
+
end
|
|
1616
|
+
|
|
1617
|
+
# Low-value: 10% chance
|
|
1618
|
+
results = 1000.times.map { Events::OrderPaid.track(amount: 50) }
|
|
1619
|
+
sampled_count = results.count(&:sampled?)
|
|
1620
|
+
expect(sampled_count).to be_within(50).of(100) # ~10%
|
|
1621
|
+
end
|
|
1622
|
+
end
|
|
1623
|
+
end
|
|
1624
|
+
```
|
|
1625
|
+
|
|
1626
|
+
---
|
|
1627
|
+
|
|
1628
|
+
## 💡 Best Practices
|
|
1629
|
+
|
|
1630
|
+
### ✅ DO
|
|
1631
|
+
|
|
1632
|
+
**1. Always sample critical events**
|
|
1633
|
+
```ruby
|
|
1634
|
+
# ✅ GOOD
|
|
1635
|
+
always_sample severities: [:error, :fatal],
|
|
1636
|
+
event_patterns: ['payment.*', 'security.*']
|
|
1637
|
+
```
|
|
1638
|
+
|
|
1639
|
+
**2. Use multiple strategies together**
|
|
1640
|
+
```ruby
|
|
1641
|
+
# ✅ GOOD: Layered approach
|
|
1642
|
+
config.adaptive_sampling do
|
|
1643
|
+
error_spike_detection { ... } # Layer 1
|
|
1644
|
+
load_based_sampling { ... } # Layer 2
|
|
1645
|
+
value_based_sampling { ... } # Layer 3
|
|
1646
|
+
end
|
|
1647
|
+
```
|
|
1648
|
+
|
|
1649
|
+
**3. Monitor sampling effectiveness**
|
|
1650
|
+
```ruby
|
|
1651
|
+
# ✅ GOOD: Track metrics
|
|
1652
|
+
# - adaptive_sampling_resource_savings_pct
|
|
1653
|
+
# - adaptive_sampling_events_dropped_total
|
|
1654
|
+
```
|
|
1655
|
+
|
|
1656
|
+
**4. Test in staging first**
|
|
1657
|
+
```ruby
|
|
1658
|
+
# ✅ GOOD: Validate before production
|
|
1659
|
+
# Test error spike scenarios
|
|
1660
|
+
# Test high load scenarios
|
|
1661
|
+
# Verify critical events never dropped
|
|
1662
|
+
```
|
|
1663
|
+
|
|
1664
|
+
---
|
|
1665
|
+
|
|
1666
|
+
### ❌ DON'T
|
|
1667
|
+
|
|
1668
|
+
**1. Don't sample audit events**
|
|
1669
|
+
```ruby
|
|
1670
|
+
# ❌ BAD: Sampling audit events — they must always be recorded at 100%
|
|
1671
|
+
# ✅ GOOD: Use .audit() not .track() for compliance events
|
|
1672
|
+
Events::UserDeleted.audit(...) # Always 100%
|
|
1673
|
+
```
|
|
1674
|
+
|
|
1675
|
+
**2. Don't use aggressive sampling without testing**
|
|
1676
|
+
```ruby
|
|
1677
|
+
# ❌ BAD: 1% sampling too aggressive
|
|
1678
|
+
base_sample_rate 0.01 # Might miss important events!
|
|
1679
|
+
|
|
1680
|
+
# ✅ GOOD: Start conservative
|
|
1681
|
+
base_sample_rate 0.1 # 10%, adjust based on data
|
|
1682
|
+
```
|
|
1683
|
+
|
|
1684
|
+
**3. Don't ignore dropped event metrics**
|
|
1685
|
+
```ruby
|
|
1686
|
+
# ❌ BAD: Not monitoring drops
|
|
1687
|
+
# → You might be dropping critical events!
|
|
1688
|
+
|
|
1689
|
+
# ✅ GOOD: Alert on unexpected drops
|
|
1690
|
+
# Alert: adaptive_sampling_events_dropped_total{reason="critical"} > 0
|
|
1691
|
+
```
|
|
1692
|
+
|
|
1693
|
+
---
|
|
1694
|
+
|
|
1695
|
+
## 🔒 Validations (NEW - v1.1)
|
|
1696
|
+
|
|
1697
|
+
> **🎯 Pattern:** Validate sampling configuration at class load time.
|
|
1698
|
+
|
|
1699
|
+
### Sample Rate Validation
|
|
1700
|
+
|
|
1701
|
+
**Problem:** Invalid sample rates → silent failures.
|
|
1702
|
+
|
|
1703
|
+
**Solution:** Validate that `sample_rate` is a number between 0.0 and 1.0 (inclusive):
|
|
1704
|
+
|
|
1705
|
+
```ruby
|
|
1706
|
+
# Gem implementation (automatic):
|
|
1707
|
+
def self.sample_rate(rate)
|
|
1708
|
+
unless rate.is_a?(Numeric) && rate >= 0.0 && rate <= 1.0
|
|
1709
|
+
raise ArgumentError, "sample_rate must be 0.0..1.0, got: #{rate.inspect}"
|
|
1710
|
+
end
|
|
1711
|
+
self._sample_rate = rate
|
|
1712
|
+
end
|
|
1713
|
+
|
|
1714
|
+
# Result:
|
|
1715
|
+
class Events::ApiRequest < E11y::Event::Base
|
|
1716
|
+
sample_rate 1.5 # ← ERROR: "sample_rate must be 0.0..1.0, got: 1.5"
|
|
1717
|
+
end
|
|
1718
|
+
```
|
|
1719
|
+
|
|
1720
|
+
### Adaptive Sampling Validation
|
|
1721
|
+
|
|
1722
|
+
**Problem:** Invalid adaptive_sampling config → runtime errors.
|
|
1723
|
+
|
|
1724
|
+
**Solution:** Validate that the `enabled` parameter is strictly `true` or `false`:
|
|
1725
|
+
|
|
1726
|
+
```ruby
|
|
1727
|
+
# Gem implementation (automatic):
|
|
1728
|
+
def self.adaptive_sampling(enabled:)
|
|
1729
|
+
unless [true, false].include?(enabled)
|
|
1730
|
+
raise ArgumentError, "adaptive_sampling enabled: must be true or false, got: #{enabled.inspect}"
|
|
1731
|
+
end
|
|
1732
|
+
self._adaptive_sampling = enabled
|
|
1733
|
+
end
|
|
1734
|
+
|
|
1735
|
+
# Result:
|
|
1736
|
+
class Events::ApiRequest < E11y::Event::Base
|
|
1737
|
+
adaptive_sampling enabled: "yes" # ← ERROR: "adaptive_sampling enabled: must be true or false, got: \"yes\""
|
|
1738
|
+
end
|
|
1739
|
+
```
|
|
1740
|
+
|
|
1741
|
+
### Audit Event Sampling Validation (LOCKED)
|
|
1742
|
+
|
|
1743
|
+
**Problem:** Attempting to sample audit events → compliance violations.
|
|
1744
|
+
|
|
1745
|
+
**Solution:** Lock sampling for audit events:
|
|
1746
|
+
|
|
1747
|
+
```ruby
|
|
1748
|
+
# Gem implementation (automatic):
|
|
1749
|
+
def self.sampling(enabled)
|
|
1750
|
+
if self._audit_event && enabled
|
|
1751
|
+
raise ArgumentError, "Cannot enable sampling for audit events! Audit events must never be sampled."
|
|
1752
|
+
end
|
|
1753
|
+
self._sampling = enabled
|
|
1754
|
+
end
|
|
1755
|
+
|
|
1756
|
+
# Result:
|
|
1757
|
+
class Events::UserDeleted < E11y::Event::Base
|
|
1758
|
+
audit_event true
|
|
1759
|
+
sampling true  # ← ERROR: "Cannot enable sampling for audit events! Audit events must never be sampled."
|
|
1760
|
+
end
|
|
1761
|
+
```
|
|
1762
|
+
|
|
1763
|
+
---
|
|
1764
|
+
|
|
1765
|
+
## 🌍 Environment-Specific Sampling (NEW - v1.1)
|
|
1766
|
+
|
|
1767
|
+
> **🎯 Pattern:** Different sampling rates per environment.
|
|
1768
|
+
|
|
1769
|
+
### Example 1: Lower Sample Rate in Production
|
|
1770
|
+
|
|
1771
|
+
```ruby
|
|
1772
|
+
class Events::DebugQuery < E11y::Event::Base
|
|
1773
|
+
schema do
|
|
1774
|
+
required(:query).filled(:string)
|
|
1775
|
+
required(:duration_ms).filled(:integer)
|
|
1776
|
+
end
|
|
1777
|
+
|
|
1778
|
+
# Environment-specific sampling
|
|
1779
|
+
sample_rate Rails.env.production? ? 0.01 : 1.0 # 1% prod, 100% dev
|
|
1780
|
+
adaptive_sampling enabled: Rails.env.production? # Only in prod
|
|
1781
|
+
end
|
|
1782
|
+
```
|
|
1783
|
+
|
|
1784
|
+
### Example 2: Feature Flag for Adaptive Sampling
|
|
1785
|
+
|
|
1786
|
+
```ruby
|
|
1787
|
+
class Events::ApiRequest < E11y::Event::Base
|
|
1788
|
+
schema do
|
|
1789
|
+
required(:endpoint).filled(:string)
|
|
1790
|
+
required(:status).filled(:integer)
|
|
1791
|
+
end
|
|
1792
|
+
|
|
1793
|
+
# Enable adaptive sampling only when flag is on
|
|
1794
|
+
if ENV['ENABLE_ADAPTIVE_SAMPLING'] == 'true'
|
|
1795
|
+
adaptive_sampling enabled: true
|
|
1796
|
+
sample_rate 0.1 # Base rate: 10%
|
|
1797
|
+
else
|
|
1798
|
+
sample_rate 1.0 # Fixed: 100%
|
|
1799
|
+
end
|
|
1800
|
+
end
|
|
1801
|
+
```
|
|
1802
|
+
|
|
1803
|
+
### Example 3: Cost-Based Sampling
|
|
1804
|
+
|
|
1805
|
+
```ruby
|
|
1806
|
+
class Events::UserAction < E11y::Event::Base
|
|
1807
|
+
schema do
|
|
1808
|
+
required(:action_type).filled(:string)
|
|
1809
|
+
end
|
|
1810
|
+
|
|
1811
|
+
# Sample more aggressively (lower rate) on tighter observability budgets
|
|
1812
|
+
sample_rate case ENV['OBSERVABILITY_TIER']
|
|
1813
|
+
when 'premium' then 1.0 # 100% (unlimited budget)
|
|
1814
|
+
when 'standard' then 0.1 # 10% (moderate budget)
|
|
1815
|
+
when 'basic' then 0.01 # 1% (tight budget)
|
|
1816
|
+
else 0.1
|
|
1817
|
+
end
|
|
1818
|
+
end
|
|
1819
|
+
```
|
|
1820
|
+
|
|
1821
|
+
---
|
|
1822
|
+
|
|
1823
|
+
## 📊 Precedence Rules for Sampling (NEW - v1.1)
|
|
1824
|
+
|
|
1825
|
+
> **🎯 Pattern:** Sampling configuration precedence (most specific wins).
|
|
1826
|
+
|
|
1827
|
+
### Precedence Order (Highest to Lowest)
|
|
1828
|
+
|
|
1829
|
+
```
|
|
1830
|
+
1. Event-level explicit config (highest priority)
|
|
1831
|
+
↓
|
|
1832
|
+
2. Preset module config
|
|
1833
|
+
↓
|
|
1834
|
+
3. Base class config (inheritance)
|
|
1835
|
+
↓
|
|
1836
|
+
4. Convention-based defaults (severity → sample rate)
|
|
1837
|
+
↓
|
|
1838
|
+
5. Global config (lowest priority)
|
|
1839
|
+
```
|
|
1840
|
+
|
|
1841
|
+
### Example: Mixing Inheritance + Presets for Sampling
|
|
1842
|
+
|
|
1843
|
+
```ruby
|
|
1844
|
+
# Global config (lowest priority)
|
|
1845
|
+
E11y.configure do |config|
|
|
1846
|
+
config.sampling do
|
|
1847
|
+
base_sample_rate 0.1 # Default: 10%
|
|
1848
|
+
adaptive_sampling enabled: false
|
|
1849
|
+
end
|
|
1850
|
+
end
|
|
1851
|
+
|
|
1852
|
+
# Base class (medium priority)
|
|
1853
|
+
class Events::BasePaymentEvent < E11y::Event::Base
|
|
1854
|
+
severity :success
|
|
1855
|
+
sample_rate 1.0  # Override global (track 100% of payments — never drop them!)
|
|
1856
|
+
adaptive_sampling enabled: false # Disable adaptive
|
|
1857
|
+
end
|
|
1858
|
+
|
|
1859
|
+
# Preset module (higher priority)
|
|
1860
|
+
module E11y::Presets::DebugEvent
|
|
1861
|
+
extend ActiveSupport::Concern
|
|
1862
|
+
included do
|
|
1863
|
+
sample_rate 0.01 # Override base (1% for debug)
|
|
1864
|
+
adaptive_sampling enabled: true # Enable adaptive
|
|
1865
|
+
end
|
|
1866
|
+
end
|
|
1867
|
+
|
|
1868
|
+
# Event (highest priority)
|
|
1869
|
+
class Events::CriticalDebug < Events::BasePaymentEvent
|
|
1870
|
+
include E11y::Presets::DebugEvent
|
|
1871
|
+
|
|
1872
|
+
sample_rate 0.1 # Override preset (10% for critical debug)
|
|
1873
|
+
|
|
1874
|
+
# Final config:
|
|
1875
|
+
# - severity: :success (from base)
|
|
1876
|
+
# - sample_rate: 0.1 (event-level override)
|
|
1877
|
+
# - adaptive_sampling: enabled: true (from preset)
|
|
1878
|
+
end
|
|
1879
|
+
```
|
|
1880
|
+
|
|
1881
|
+
### Precedence Rules Table
|
|
1882
|
+
|
|
1883
|
+
| Config | Global | Convention | Base Class | Preset | Event-Level | Winner |
|
|
1884
|
+
|--------|--------|------------|------------|--------|-------------|--------|
|
|
1885
|
+
| `sample_rate` | `0.1` | `0.5` (`:warn`) | `1.0` | `0.01` | `0.1` | **`0.1`** (event) |
|
|
1886
|
+
| `adaptive_sampling` | `false` | - | `false` | `true` | - | **`true`** (preset) |
|
|
1887
|
+
|
|
1888
|
+
### Convention-Based Defaults
|
|
1889
|
+
|
|
1890
|
+
**Convention:** Severity → sample_rate (if not specified):
|
|
1891
|
+
|
|
1892
|
+
```ruby
|
|
1893
|
+
# :error/:fatal → 1.0 (100%)
|
|
1894
|
+
class Events::PaymentFailed < E11y::Event::Base
|
|
1895
|
+
severity :error
|
|
1896
|
+
# ← Auto: sample_rate = 1.0 (convention!)
|
|
1897
|
+
end
|
|
1898
|
+
|
|
1899
|
+
# :warn → 0.5 (50%)
|
|
1900
|
+
class Events::SlowQuery < E11y::Event::Base
|
|
1901
|
+
severity :warn
|
|
1902
|
+
# ← Auto: sample_rate = 0.5 (convention!)
|
|
1903
|
+
end
|
|
1904
|
+
|
|
1905
|
+
# :debug → 0.01 (1%)
|
|
1906
|
+
class Events::DebugLog < E11y::Event::Base
|
|
1907
|
+
severity :debug
|
|
1908
|
+
# ← Auto: sample_rate = 0.01 (convention!)
|
|
1909
|
+
end
|
|
1910
|
+
```
|
|
1911
|
+
|
|
1912
|
+
---
|
|
1913
|
+
|
|
1914
|
+
## 📚 Related Use Cases
|
|
1915
|
+
|
|
1916
|
+
- **[UC-006: Trace Context Management](./UC-006-trace-context-management.md)** - Trace-consistent sampling requires trace propagation
|
|
1917
|
+
- **[UC-001: Request-Scoped Debug Buffering](./UC-001-request-scoped-debug-buffering.md)** - `sample_on_error` works with buffering
|
|
1918
|
+
- **[UC-011: Rate Limiting](./UC-011-rate-limiting.md)** - Complementary with sampling
|
|
1919
|
+
- **[UC-015: Cost Optimization](./UC-015-cost-optimization.md)** - Sampling reduces costs
|
|
1920
|
+
|
|
1921
|
+
---
|
|
1922
|
+
|
|
1923
|
+
## 🎯 Summary
|
|
1924
|
+
|
|
1925
|
+
### Cost Savings
|
|
1926
|
+
|
|
1927
|
+
| Scenario | Fixed Sampling | Adaptive Sampling | Savings |
|
|
1928
|
+
|----------|----------------|-------------------|---------|
|
|
1929
|
+
| Normal load | 10% (100 ev/sec) | 10% (100 ev/sec) | 0% |
|
|
1930
|
+
| Error spike | 10% (10k ev/sec) | 100% (100k ev/sec) | Better data! |
|
|
1931
|
+
| High load | 10% (10k ev/sec) | 1% (1k ev/sec) | **90%** |
|
|
1932
|
+
| Black Friday | 10% (10k ev/sec) | 6.5% (6.5k ev/sec) | **35%** |
|
|
1933
|
+
|
|
1934
|
+
**Result:** Same or better data quality, 35-90% cost reduction during peaks!
|
|
1935
|
+
|
|
1936
|
+
---
|
|
1937
|
+
|
|
1938
|
+
**Document Version:** 1.1 (Unified DSL)
|
|
1939
|
+
**Last Updated:** January 16, 2026
|
|
1940
|
+
**Status:** ✅ Complete - Consistent with DSL-SPECIFICATION.md v1.1.0
|