e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,2993 @@
|
|
|
1
|
+
# ADR-009: Cost Optimization
|
|
2
|
+
|
|
3
|
+
**Status:** Partially Implemented (Sampling strategies implemented 2026-01-19 to 2026-01-20; Compression and Tiered Storage not started)
|
|
4
|
+
**Date:** January 12, 2026
|
|
5
|
+
**Last Updated:** January 20, 2026
|
|
6
|
+
**Covers:** UC-014 (Adaptive Sampling), UC-015 (Cost Optimization), UC-019 (Tiered Storage)
|
|
7
|
+
**Depends On:** ADR-001 (Core), ADR-004 (Adapters), ADR-014 (Event-Driven SLO)
|
|
8
|
+
|
|
9
|
+
**Implementation Status:**
|
|
10
|
+
- ✅ **Basic Sampling** (L2.7) - `E11y::Middleware::Sampling` with trace-aware logic
|
|
11
|
+
- ✅ **Event-level DSL** - `sample_rate` and `adaptive_sampling` in `Event::Base`
|
|
12
|
+
- ✅ **Pipeline Integration** - Sampling middleware in default pipeline
|
|
13
|
+
- ✅ **Error-Based Adaptive** (FEAT-4838) - 100% sampling during error spikes
|
|
14
|
+
- ✅ **Load-Based Adaptive** (FEAT-4842) - Tiered sampling (100%/50%/10%/1%) based on load
|
|
15
|
+
- ✅ **Value-Based Sampling** (FEAT-4846) - DSL for sampling by payload values (>, <, ==, range)
|
|
16
|
+
- ✅ **Stratified Sampling** (FEAT-4850, C11 resolution) - SLO-accurate sampling with correction
|
|
17
|
+
- ⏳ **Compression** - Not started
|
|
18
|
+
- ⏳ **Tiered Storage** - Not started
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 🚀 Implementation Summary (2026-01-19)
|
|
23
|
+
|
|
24
|
+
### Basic Sampling (L2.7) ✅
|
|
25
|
+
|
|
26
|
+
**Implemented:**
|
|
27
|
+
1. **`E11y::Middleware::Sampling`** - Core sampling logic:
|
|
28
|
+
- Trace-aware sampling (C05) - consistent decisions per `trace_id`
|
|
29
|
+
- Audit event exemption - audit events never sampled
|
|
30
|
+
- Sample rate metadata - adds `sample_rate` to event data
|
|
31
|
+
- Cache cleanup - prevents memory leaks
|
|
32
|
+
|
|
33
|
+
2. **Event-level DSL** in `Event::Base`:
|
|
34
|
+
```ruby
|
|
35
|
+
class HighFrequencyEvent < E11y::Event::Base
|
|
36
|
+
sample_rate 0.01 # 1% sampling
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
class OrderEvent < E11y::Event::Base
|
|
40
|
+
adaptive_sampling enabled: true,
|
|
41
|
+
error_rate_threshold: 0.05,
|
|
42
|
+
load_threshold: 50_000
|
|
43
|
+
end
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
3. **Pipeline Integration**:
|
|
47
|
+
- Sampling middleware added to default pipeline (zone: `:routing`)
|
|
48
|
+
- Automatic configuration in `E11y::Configuration#setup_default_pipeline`
|
|
49
|
+
|
|
50
|
+
### Error-Based Adaptive Sampling (FEAT-4838) ✅
|
|
51
|
+
|
|
52
|
+
**Implemented (2026-01-19):**
|
|
53
|
+
1. **`E11y::Sampling::ErrorSpikeDetector`** - Detects error rate spikes:
|
|
54
|
+
- Sliding window error rate calculation (configurable window)
|
|
55
|
+
- Absolute threshold (errors/minute)
|
|
56
|
+
- Relative threshold (ratio to baseline)
|
|
57
|
+
- Exponential moving average for baseline tracking
|
|
58
|
+
- Spike duration management
|
|
59
|
+
|
|
60
|
+
2. **Integration with Sampling Middleware**:
|
|
61
|
+
```ruby
|
|
62
|
+
E11y.configure do |config|
|
|
63
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
64
|
+
error_based_adaptive: true,
|
|
65
|
+
error_spike_config: {
|
|
66
|
+
window: 60, # 60 seconds sliding window
|
|
67
|
+
absolute_threshold: 100, # 100 errors/min triggers spike
|
|
68
|
+
relative_threshold: 3.0, # 3x normal rate triggers spike
|
|
69
|
+
spike_duration: 300 # Keep 100% sampling for 5 minutes
|
|
70
|
+
}
|
|
71
|
+
end
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
3. **Behavior**:
|
|
75
|
+
- **Normal conditions**: Uses configured sample rates (e.g., 10%)
|
|
76
|
+
- **During error spike**: Automatically increases to 100% sampling
|
|
77
|
+
- **After spike**: Returns to normal rates after `spike_duration`
|
|
78
|
+
|
|
79
|
+
**Tests**: 22 unit tests + 9 integration tests (all passing)
|
|
80
|
+
|
|
81
|
+
### Load-Based Adaptive Sampling (FEAT-4842) ✅
|
|
82
|
+
|
|
83
|
+
**Implemented (2026-01-20):**
|
|
84
|
+
1. **`E11y::Sampling::LoadMonitor`** - Tracks event volume and calculates load levels:
|
|
85
|
+
- Sliding window event rate calculation (events/second)
|
|
86
|
+
- Tiered load levels (normal, high, very_high, overload)
|
|
87
|
+
- Configurable thresholds for each load tier
|
|
88
|
+
- Thread-safe tracking (MonitorMixin)
|
|
89
|
+
|
|
90
|
+
2. **Integration with Sampling Middleware**:
|
|
91
|
+
```ruby
|
|
92
|
+
E11y.configure do |config|
|
|
93
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
94
|
+
default_sample_rate: 0.1,
|
|
95
|
+
load_based_adaptive: true,
|
|
96
|
+
load_monitor_config: {
|
|
97
|
+
window: 60, # 60 seconds sliding window
|
|
98
|
+
normal_threshold: 1_000, # < 1k events/sec = normal
|
|
99
|
+
high_threshold: 10_000, # 10k events/sec = high load
|
|
100
|
+
very_high_threshold: 50_000, # 50k events/sec = very high
|
|
101
|
+
overload_threshold: 100_000 # > 100k events/sec = overload
|
|
102
|
+
}
|
|
103
|
+
end
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
3. **Tiered Sampling Rates**:
|
|
107
|
+
- **Normal load** (< 10k events/sec): 100% sampling
|
|
108
|
+
- **High load** (10k-50k events/sec): 50% sampling
|
|
109
|
+
- **Very high load** (50k-100k events/sec): 10% sampling
|
|
110
|
+
- **Overload** (>= 100k events/sec): 1% sampling
|
|
111
|
+
|
|
112
|
+
4. **Behavior**:
|
|
113
|
+
- Dynamically adjusts sample rate based on current event volume
|
|
114
|
+
- Works as a "base rate" that can be further restricted by event-level `resolve_sample_rate`
|
|
115
|
+
- Prioritizes error-based adaptive (100% during spikes) over load-based
|
|
116
|
+
|
|
117
|
+
**Tests**: 22 unit tests + 10 integration tests + 7 stress tests (all passing)
|
|
118
|
+
|
|
119
|
+
### Value-Based Sampling (FEAT-4846) ✅
|
|
120
|
+
|
|
121
|
+
**Implemented (2026-01-20):**
|
|
122
|
+
1. **`E11y::Sampling::ValueExtractor`** - Extracts numeric values from event payloads:
|
|
123
|
+
- Nested field extraction (dot notation: `"order.amount"`)
|
|
124
|
+
- Type coercion (numeric strings → floats)
|
|
125
|
+
- Nil/missing value handling (returns 0.0)
|
|
126
|
+
|
|
127
|
+
2. **`E11y::Event::ValueSamplingConfig`** - Defines value-based sampling rules:
|
|
128
|
+
- Comparison operators: `:greater_than`, `:less_than`, `:equals`, `:in_range`
|
|
129
|
+
- Threshold values (numeric or Range)
|
|
130
|
+
- Custom sample rates per rule
|
|
131
|
+
|
|
132
|
+
3. **Event DSL (`sample_by_value`)**:
|
|
133
|
+
```ruby
|
|
134
|
+
class OrderPaidEvent < E11y::Event::Base
|
|
135
|
+
# Always sample orders over $1000
|
|
136
|
+
sample_by_value field: "amount",
|
|
137
|
+
operator: :greater_than,
|
|
138
|
+
threshold: 1000,
|
|
139
|
+
sample_rate: 1.0
|
|
140
|
+
|
|
141
|
+
# Sample 50% of orders between $100-$500
|
|
142
|
+
sample_by_value field: "amount",
|
|
143
|
+
operator: :in_range,
|
|
144
|
+
threshold: 100..500,
|
|
145
|
+
sample_rate: 0.5
|
|
146
|
+
end
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
4. **Integration with Sampling Middleware**:
|
|
150
|
+
- **High priority** in sampling decision (after error spike, before load-based)
|
|
151
|
+
- Event-level configuration (no global config needed)
|
|
152
|
+
- Falls back to other strategies if no value-based config present
|
|
153
|
+
|
|
154
|
+
**Tests**: 19 unit tests + 8 integration tests (all passing)
|
|
155
|
+
|
|
156
|
+
### Stratified Sampling for SLO Accuracy (FEAT-4850, C11 Resolution) ✅
|
|
157
|
+
|
|
158
|
+
**Implemented (2026-01-20):**
|
|
159
|
+
1. **`E11y::Sampling::StratifiedTracker`** - Tracks sampled/total counts per severity stratum:
|
|
160
|
+
- Records each sampled event with its original sample rate
|
|
161
|
+
- Calculates sampling correction factors per severity
|
|
162
|
+
- Handles floating point precision
|
|
163
|
+
- Thread-safe tracking (MonitorMixin)
|
|
164
|
+
|
|
165
|
+
2. **SLO Sampling Correction in `E11y::SLO::Tracker`**:
|
|
166
|
+
- Applies correction factors when calculating SLO metrics
|
|
167
|
+
- Adjusts success rate to account for sampling bias
|
|
168
|
+
- Ensures < 5% error margin even with aggressive sampling
|
|
169
|
+
|
|
170
|
+
3. **Integration with Sampling Middleware**:
|
|
171
|
+
- Records sample rate metadata for each event
|
|
172
|
+
- Works seamlessly with load-based adaptive sampling
|
|
173
|
+
- No additional configuration required (automatic)
|
|
174
|
+
|
|
175
|
+
4. **Example: Accurate SLO with 85% Cost Savings**:
|
|
176
|
+
```ruby
|
|
177
|
+
# Scenario: 1000 events (950 success, 50 errors)
|
|
178
|
+
# Stratified sampling: errors 100%, success 10%
|
|
179
|
+
# Events kept: 50 + 95 = 145 (85.5% cost savings!)
|
|
180
|
+
|
|
181
|
+
# Without correction:
|
|
182
|
+
# Observed success rate: 95/145 = 65.5% ❌
|
|
183
|
+
|
|
184
|
+
# With correction:
|
|
185
|
+
# Corrected success: 95 / 0.1 = 950
|
|
186
|
+
# Corrected errors: 50 / 1.0 = 50
|
|
187
|
+
# Corrected success rate: 950 / 1000 = 95.0% ✅
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
**Tests**: 15 unit tests + 5 integration tests (all passing)
|
|
191
|
+
|
|
192
|
+
**Total Test Coverage (Phase 2.8):**
|
|
193
|
+
- **Error-Based**: 22 unit + 9 integration = 31 tests
|
|
194
|
+
- **Load-Based**: 22 unit + 10 integration + 7 stress = 39 tests
|
|
195
|
+
- **Value-Based**: 19 unit + 8 integration = 27 tests
|
|
196
|
+
- **Stratified**: 15 unit + 5 integration = 20 tests
|
|
197
|
+
- **Grand Total**: 117 tests ✅
|
|
198
|
+
|
|
199
|
+
**Deferred** (Future enhancements):
|
|
200
|
+
- Content-based sampling (pattern matching)
|
|
201
|
+
- ML-based sampling (importance prediction)
|
|
202
|
+
- Tail-based sampling (requires buffering)
|
|
203
|
+
|
|
204
|
+
**See:**
|
|
205
|
+
- Implementation details: `docs/IMPLEMENTATION_NOTES.md` (2026-01-20 entry)
|
|
206
|
+
- Middleware code: `lib/e11y/middleware/sampling.rb`
|
|
207
|
+
- Detectors: `lib/e11y/sampling/error_spike_detector.rb`, `lib/e11y/sampling/load_monitor.rb`
|
|
208
|
+
- Value sampling: `lib/e11y/sampling/value_extractor.rb`, `lib/e11y/event/value_sampling_config.rb`
|
|
209
|
+
- Stratified sampling: `lib/e11y/sampling/stratified_tracker.rb`
|
|
210
|
+
- Tests: `spec/e11y/middleware/sampling_spec.rb`, `spec/e11y/sampling/*_spec.rb`
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## 📋 Table of Contents
|
|
215
|
+
|
|
216
|
+
1. [Context & Problem](#1-context--problem)
|
|
217
|
+
2. [Architecture Overview](#2-architecture-overview)
|
|
218
|
+
3. [Adaptive Sampling](#3-adaptive-sampling)
|
|
219
|
+
- 3.6. [Trace-Aware Adaptive Sampling (C05 Resolution)](#36-trace-aware-adaptive-sampling-c05-resolution) ⚠️ CRITICAL
|
|
220
|
+
- 3.6.1. The Problem: Broken Distributed Traces
|
|
221
|
+
- 3.6.2. Decision: Trace-Level Sampling with Decision Cache
|
|
222
|
+
- 3.6.3. TraceAwareSampler Implementation
|
|
223
|
+
- 3.6.4. Configuration
|
|
224
|
+
- 3.6.5. Multi-Service Trace Scenario (Correct Behavior)
|
|
225
|
+
- 3.6.6. Cache Management & TTL
|
|
226
|
+
- 3.6.7. Head-Based Sampling (W3C Trace Context)
|
|
227
|
+
- 3.6.8. Trade-offs & Distributed Tracing Integrity (C05)
|
|
228
|
+
- 3.7. [Stratified Sampling for SLO Accuracy (C11 Resolution)](#37-stratified-sampling-for-slo-accuracy-c11-resolution) ⚠️ CRITICAL
|
|
229
|
+
- 3.7.1. The Problem: Sampling Bias Breaks SLO Metrics
|
|
230
|
+
- 3.7.2. Decision: Stratified Sampling by Event Severity
|
|
231
|
+
- 3.7.3. StratifiedAdaptiveSampler Implementation
|
|
232
|
+
- 3.7.4. SLO Calculator with Sampling Correction
|
|
233
|
+
- 3.7.5. Configuration
|
|
234
|
+
- 3.7.6. Accuracy Comparison: Random vs Stratified Sampling
|
|
235
|
+
- 3.7.7. Cost Savings vs Accuracy Trade-off
|
|
236
|
+
- 3.7.8. Testing Sampling Correction Accuracy
|
|
237
|
+
- 3.7.9. Trade-offs & SLO Accuracy (C11)
|
|
238
|
+
4. [Compression](#4-compression)
|
|
239
|
+
5. [Smart Routing](#5-smart-routing)
|
|
240
|
+
6. [Tiered Storage](#6-tiered-storage)
|
|
241
|
+
7. [Payload Minimization](#7-payload-minimization)
|
|
242
|
+
8. [Cardinality Protection (C04 Resolution)](#8-cardinality-protection-c04-resolution) ⚠️ CRITICAL
|
|
243
|
+
- 8.1. The Problem: Cardinality Explosion Across Backends
|
|
244
|
+
- 8.2. Decision: Unified Cardinality Protection for All Backends
|
|
245
|
+
- 8.3. Configuration: Inherit from Global Settings
|
|
246
|
+
- 8.4. Implementation: Apply to Yabeda + OpenTelemetry
|
|
247
|
+
- 8.5. Cost Impact: Before vs After Protection
|
|
248
|
+
- 8.6. Monitoring Metrics
|
|
249
|
+
- 8.7. Trade-offs (C04 Resolution)
|
|
250
|
+
9. [Cost Metrics](#9-cost-metrics)
|
|
251
|
+
10. [Trade-offs](#10-trade-offs)
|
|
252
|
+
11. [Complete Configuration Example](#11-complete-configuration-example)
|
|
253
|
+
12. [Backlog (Future Enhancements)](#12-backlog-future-enhancements)
|
|
254
|
+
- [12.1. Quick Start Presets](#121-quick-start-presets)
|
|
255
|
+
- [12.2. Sampling Budget](#122-sampling-budget)
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## 1. Context & Problem
|
|
260
|
+
|
|
261
|
+
### 1.1. Problem Statement
|
|
262
|
+
|
|
263
|
+
**Current Pain Points:**
|
|
264
|
+
|
|
265
|
+
1. **High Log Volume Costs:**
|
|
266
|
+
```ruby
|
|
267
|
+
# ❌ 1M events/day * 365 days = 365M events/year
|
|
268
|
+
# Loki: $0.50/GB → $10,000+/year
|
|
269
|
+
# Sentry: $0.01/event → $3,650/year
|
|
270
|
+
# Total: $13,650/year for a single service
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
2. **No Cost Awareness:**
|
|
274
|
+
```ruby
|
|
275
|
+
# ❌ No visibility into cost per event
|
|
276
|
+
Events::DebugQuery.track(sql: long_query) # How much does this cost?
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
3. **No Retention Strategy:**
|
|
280
|
+
```ruby
|
|
281
|
+
# ❌ All events stored forever
|
|
282
|
+
# Debug events from 2 years ago still in Loki → $$
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### 1.2. Goals
|
|
286
|
+
|
|
287
|
+
**Primary Goals:**
|
|
288
|
+
- ✅ **50-80% cost reduction** through optimization
|
|
289
|
+
- ✅ **Adaptive sampling** based on load/value
|
|
290
|
+
- ✅ **Compression** for network efficiency
|
|
291
|
+
- ✅ **Tiered storage** with `retention_until`
|
|
292
|
+
- ✅ **Cost visibility** per event/adapter
|
|
293
|
+
|
|
294
|
+
**Non-Goals:**
|
|
295
|
+
- ❌ Manage downstream storage (Loki ILM)
|
|
296
|
+
- ❌ Real-time cost calculation
|
|
297
|
+
- ❌ Cross-service cost optimization
|
|
298
|
+
|
|
299
|
+
### 1.3. Success Metrics
|
|
300
|
+
|
|
301
|
+
| Metric | Target | Critical? |
|
|
302
|
+
|--------|--------|-----------|
|
|
303
|
+
| **Cost reduction** | 50-80% | ✅ Yes |
|
|
304
|
+
| **Event throughput** | Same (10K/sec) | ✅ Yes |
|
|
305
|
+
| **Compression ratio** | 5:1 (Gzip) | ✅ Yes |
|
|
306
|
+
|
|
307
|
+
### 1.4. Cost Savings Estimate
|
|
308
|
+
|
|
309
|
+
**Example: 10K events/sec service**
|
|
310
|
+
|
|
311
|
+
| Optimization | Before | After | Savings |
|
|
312
|
+
|-------------|--------|-------|---------|
|
|
313
|
+
| **Adaptive Sampling** | 100% | 20% | 80% |
|
|
314
|
+
| **Compression** | 1KB/event | 200B/event | 80% |
|
|
315
|
+
| **Tiered Storage** | 365 days | 30 days | 92% |
|
|
316
|
+
| **Smart Routing** | All → Loki | Critical → Loki | 50% |
|
|
317
|
+
|
|
318
|
+
**Combined Annual Savings:** $13,650 → **$2,730** (80% reduction)
|
|
319
|
+
|
|
320
|
+
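**Cost Breakdown** (estimated savings of each optimization applied on its own; the figures overlap and do not sum to the combined total):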
|
|
321
|
+
- **Adaptive Sampling**: ~$10,920 savings (eliminates 80% of low-value events)
|
|
322
|
+
- **Compression**: ~$8,200 savings (80% bandwidth reduction)
|
|
323
|
+
- **Smart Routing**: ~$5,000 savings (critical-only to expensive destinations)
|
|
324
|
+
- **Tiered Storage**: ~$12,570 savings (92% storage cost reduction)
|
|
325
|
+
|
|
326
|
+
---
|
|
327
|
+
|
|
328
|
+
## 2. Architecture Overview
|
|
329
|
+
|
|
330
|
+
### 2.1. System Context
|
|
331
|
+
|
|
332
|
+
```mermaid
|
|
333
|
+
C4Context
|
|
334
|
+
title Cost Optimization Context
|
|
335
|
+
|
|
336
|
+
Person(dev, "Developer", "Configures cost rules")
|
|
337
|
+
|
|
338
|
+
System(rails_app, "Rails App", "Tracks events")
|
|
339
|
+
|
|
340
|
+
System(e11y, "E11y Gem", "Cost optimization")
|
|
341
|
+
|
|
342
|
+
System_Ext(loki, "Loki", "$10K/year")
|
|
343
|
+
System_Ext(sentry, "Sentry", "$3.6K/year")
|
|
344
|
+
System_Ext(s3, "S3", "$100/year")
|
|
345
|
+
|
|
346
|
+
Rel(dev, e11y, "Configures", "Sampling rules")
|
|
347
|
+
Rel(rails_app, e11y, "Tracks events", "100% volume")
|
|
348
|
+
Rel(e11y, loki, "20% sampled + compressed", "80% cost savings")
|
|
349
|
+
Rel(e11y, sentry, "Errors only", "50% cost savings")
|
|
350
|
+
Rel(e11y, s3, "Cold storage", "Long-term archive")
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### 2.2. Component Architecture
|
|
354
|
+
|
|
355
|
+
```mermaid
|
|
356
|
+
graph TB
|
|
357
|
+
subgraph "Event Pipeline"
|
|
358
|
+
Event[Event Tracked] --> Sampler[Adaptive Sampler]
|
|
359
|
+
Sampler --> Minimizer[Payload Minimizer]
|
|
360
|
+
Minimizer --> Compressor[Compressor]
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
subgraph "Cost Optimization Layer"
|
|
364
|
+
Sampler --> SampleStrategy[Sampling Strategy]
|
|
365
|
+
SampleStrategy --> LoadBased[Load-Based]
|
|
366
|
+
SampleStrategy --> ValueBased[Value-Based]
|
|
367
|
+
SampleStrategy --> ErrorBased[Error-Based]
|
|
368
|
+
|
|
369
|
+
Compressor --> GzipEngine[Gzip]
|
|
370
|
+
Compressor --> ZstdEngine[Zstd]
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
subgraph "Smart Routing"
|
|
374
|
+
Compressor --> Router[Smart Router]
|
|
375
|
+
Router --> CriticalPath[Critical → Loki]
|
|
376
|
+
Router --> ArchivePath[Archive → S3]
|
|
377
|
+
Router --> DebugPath[Debug → /dev/null]
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
subgraph "Tiered Storage"
|
|
381
|
+
Router --> RetentionTagger[Retention Tagger]
|
|
382
|
+
RetentionTagger --> ShortTerm[retention_until: 7d]
|
|
383
|
+
RetentionTagger --> MediumTerm[retention_until: 30d]
|
|
384
|
+
RetentionTagger --> LongTerm[retention_until: 365d]
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
subgraph "Cost Tracking"
|
|
388
|
+
Router --> CostMetrics[Cost Metrics]
|
|
389
|
+
CostMetrics --> EventCost[Per-Event Cost]
|
|
390
|
+
CostMetrics --> AdapterCost[Per-Adapter Cost]
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
style Sampler fill:#fff3cd
|
|
394
|
+
style Compressor fill:#d1ecf1
|
|
395
|
+
style Router fill:#f8d7da
|
|
396
|
+
style Minimizer fill:#d4edda
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
### 2.3. Cost Optimization Flow
|
|
400
|
+
|
|
401
|
+
```mermaid
|
|
402
|
+
sequenceDiagram
|
|
403
|
+
participant App as Application
|
|
404
|
+
participant Sampler as Adaptive Sampler
|
|
405
|
+
participant Minimizer as Payload Minimizer
|
|
406
|
+
participant Compress as Compressor
|
|
407
|
+
participant Router as Smart Router
|
|
408
|
+
participant Loki as Loki (Expensive)
|
|
409
|
+
participant S3 as S3 (Cheap)
|
|
410
|
+
|
|
411
|
+
App->>Sampler: Track event (100%)
|
|
412
|
+
|
|
413
|
+
Sampler->>Sampler: Check load, error rate, value
|
|
414
|
+
|
|
415
|
+
alt Sample decision: KEEP
|
|
416
|
+
Sampler->>Minimizer: Event (20% sampled)
|
|
417
|
+
|
|
418
|
+
Minimizer->>Minimizer: Truncate strings, remove nulls
|
|
419
|
+
Minimizer->>Compress: Optimized payload (-30% size)
|
|
420
|
+
|
|
421
|
+
Compress->>Compress: Gzip payload (5:1 ratio)
|
|
422
|
+
Compress->>Router: Compressed event
|
|
423
|
+
|
|
424
|
+
Router->>Router: Classify by importance
|
|
425
|
+
|
|
426
|
+
alt Critical event
|
|
427
|
+
Router->>Loki: Send to Loki (2.8% of original)
|
|
428
|
+
else Archive-worthy
|
|
429
|
+
Router->>S3: Send to S3 (17.2% of original)
|
|
430
|
+
end
|
|
431
|
+
else Sample decision: DROP
|
|
432
|
+
Note over Sampler: 80% dropped
|
|
433
|
+
end
|
|
434
|
+
|
|
435
|
+
Note over Loki,S3: Final cost: 80% reduction
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
---
|
|
439
|
+
|
|
440
|
+
## 3. Adaptive Sampling
|
|
441
|
+
|
|
442
|
+
### 3.1. Sampling Strategies
|
|
443
|
+
|
|
444
|
+
```ruby
|
|
445
|
+
# lib/e11y/cost/adaptive_sampler.rb
|
|
446
|
+
module E11y
|
|
447
|
+
module Cost
|
|
448
|
+
class AdaptiveSampler
|
|
449
|
+
def initialize(config)
|
|
450
|
+
@strategies = [
|
|
451
|
+
Strategies::ErrorBased.new(config),
|
|
452
|
+
Strategies::LoadBased.new(config),
|
|
453
|
+
Strategies::ValueBased.new(config),
|
|
454
|
+
Strategies::ContentBased.new(config)
|
|
455
|
+
]
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
def should_sample?(event_data, context = {})
|
|
459
|
+
# Priority 1: Always sample errors
|
|
460
|
+
return true if event_data[:severity] >= :error
|
|
461
|
+
|
|
462
|
+
# Priority 2: Check each strategy
|
|
463
|
+
sample_rates = @strategies.map do |strategy|
|
|
464
|
+
strategy.calculate_rate(event_data, context)
|
|
465
|
+
end
|
|
466
|
+
|
|
467
|
+
# Use max sample rate (the strategy that keeps the most events wins)
|
|
468
|
+
final_rate = sample_rates.max
|
|
469
|
+
|
|
470
|
+
# Random sampling
|
|
471
|
+
decision = rand < final_rate
|
|
472
|
+
|
|
473
|
+
# Track metrics
|
|
474
|
+
E11y::Metrics.increment('e11y.sampling.decision', {
|
|
475
|
+
event_name: event_data[:event_name],
|
|
476
|
+
decision: decision ? 'sampled' : 'dropped',
|
|
477
|
+
rate: final_rate
|
|
478
|
+
})
|
|
479
|
+
|
|
480
|
+
decision
|
|
481
|
+
end
|
|
482
|
+
end
|
|
483
|
+
end
|
|
484
|
+
end
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### 3.2. Error-Based Sampling
|
|
488
|
+
|
|
489
|
+
```ruby
|
|
490
|
+
# lib/e11y/cost/strategies/error_based.rb
|
|
491
|
+
module E11y
|
|
492
|
+
module Cost
|
|
493
|
+
module Strategies
|
|
494
|
+
class ErrorBased
|
|
495
|
+
def initialize(config)
|
|
496
|
+
@error_window = config.error_window || 60.seconds
|
|
497
|
+
@error_threshold = config.error_threshold || 0.01 # 1%
|
|
498
|
+
end
|
|
499
|
+
|
|
500
|
+
def calculate_rate(event_data, context)
|
|
501
|
+
# Get error rate for this event type
|
|
502
|
+
error_rate = E11y::Metrics.get_rate(
|
|
503
|
+
'e11y.events.errors',
|
|
504
|
+
window: @error_window,
|
|
505
|
+
tags: { event_name: event_data[:event_name] }
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
# If error rate > threshold, sample 100%
|
|
509
|
+
if error_rate > @error_threshold
|
|
510
|
+
1.0
|
|
511
|
+
else
|
|
512
|
+
# Normal rate
|
|
513
|
+
0.1 # 10%
|
|
514
|
+
end
|
|
515
|
+
end
|
|
516
|
+
end
|
|
517
|
+
end
|
|
518
|
+
end
|
|
519
|
+
end
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
### 3.3. Load-Based Sampling (FEAT-4842 Implementation) ✅
|
|
523
|
+
|
|
524
|
+
**Implemented:** `E11y::Sampling::LoadMonitor`
|
|
525
|
+
|
|
526
|
+
```ruby
|
|
527
|
+
# lib/e11y/sampling/load_monitor.rb
|
|
528
|
+
require "monitor"

module E11y
  module Sampling
    # Thread-safe sliding-window monitor of the event rate.
    #
    # Every call to {#record_event} stores a timestamp; the rate is the number
    # of timestamps still inside the window divided by the window length.
    # The rate is then mapped to a load level and a recommended sample rate.
    #
    # Thresholds are *lower bounds* (events/second) of the level they name:
    #
    #   rate >= overload_threshold   => :overload   (sample 1%)
    #   rate >= very_high_threshold  => :very_high  (sample 10%)
    #   rate >= high_threshold       => :high       (sample 50%)
    #   otherwise                    => :normal     (sample 100%)
    #
    # `normal_threshold` is accepted for configuration compatibility but does
    # not influence the classification: anything below `high_threshold` is
    # :normal whether or not it reaches `normal_threshold`.
    class LoadMonitor
      include MonitorMixin

      DEFAULT_CONFIG = {
        window: 60,                  # 60 seconds sliding window
        normal_threshold: 1_000,     # informational only (see class docs)
        high_threshold: 10_000,      # >= 10k events/sec = high
        very_high_threshold: 50_000, # >= 50k events/sec = very high
        overload_threshold: 100_000  # >= 100k events/sec = overload
      }.freeze

      # Recommended sample rate per load level.
      SAMPLE_RATES = {
        normal: 1.0,    # 100%
        high: 0.5,      # 50%
        very_high: 0.1, # 10%
        overload: 0.01  # 1%
      }.freeze

      # @param config [Hash] overrides for {DEFAULT_CONFIG}
      # @raise [ArgumentError] if the resulting window is not a positive number
      def initialize(config = {})
        super() # initialises the MonitorMixin lock
        @config = DEFAULT_CONFIG.merge(config)
        @window = @config[:window]
        unless @window.is_a?(Numeric) && @window.positive?
          raise ArgumentError, "window must be a positive number of seconds, got #{@window.inspect}"
        end

        @events = []
      end

      # Record one event for load calculation.
      #
      # @return [void]
      def record_event
        synchronize do
          now = Time.now.to_f
          @events << now
          cleanup_old_events(now)
        end
        nil
      end

      # Current event rate averaged over the whole window.
      #
      # @return [Float] events per second (0.0 when nothing was recorded)
      def current_rate
        synchronize do
          cleanup_old_events(Time.now.to_f)
          @events.size.to_f / @window
        end
      end

      # Classify the current rate.
      #
      # @return [Symbol] :normal, :high, :very_high or :overload
      def load_level
        rate = current_rate
        return :overload if rate >= @config[:overload_threshold]
        return :very_high if rate >= @config[:very_high_threshold]
        return :high if rate >= @config[:high_threshold]

        :normal
      end

      # Sample rate recommended for the current load level.
      #
      # @return [Float] value in (0.0, 1.0]
      def recommended_sample_rate
        SAMPLE_RATES.fetch(load_level, 1.0)
      end

      # @return [Boolean] true when the current load level is :overload
      def overloaded?
        load_level == :overload
      end

      private

      # Drop timestamps older than the window.
      #
      # Timestamps are appended in arrival order, so only the head of the
      # array can be expired; popping from the front avoids rescanning the
      # whole window on every recorded event. If the wall clock steps
      # backwards, an out-of-order entry is merely kept until the entries in
      # front of it expire.
      def cleanup_old_events(now)
        cutoff = now - @window
        @events.shift while !@events.empty? && @events.first < cutoff
      end
    end
  end
end
|
|
602
|
+
```
|
|
603
|
+
|
|
604
|
+
**Configuration:**
|
|
605
|
+
|
|
606
|
+
```ruby
|
|
607
|
+
# config/initializers/e11y.rb
|
|
608
|
+
E11y.configure do |config|
|
|
609
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
610
|
+
default_sample_rate: 0.1,
|
|
611
|
+
load_based_adaptive: true,
|
|
612
|
+
load_monitor_config: {
|
|
613
|
+
window: 60, # 60 seconds
|
|
614
|
+
normal_threshold: 1_000, # < 1k events/sec
|
|
615
|
+
high_threshold: 10_000, # 10k events/sec
|
|
616
|
+
very_high_threshold: 50_000, # 50k events/sec
|
|
617
|
+
overload_threshold: 100_000 # > 100k events/sec
|
|
618
|
+
}
|
|
619
|
+
end
|
|
620
|
+
```
|
|
621
|
+
|
|
622
|
+
**Behavior:**
|
|
623
|
+
|
|
624
|
+
| Load Level | Events/Sec | Sample Rate | Events Tracked |
|
|
625
|
+
|------------|-----------|-------------|----------------|
|
|
626
|
+
| Normal | < 10k | 100% | All events |
|
|
627
|
+
| High | 10k-50k | 50% | Half |
|
|
628
|
+
| Very High | 50k-100k | 10% | 1 in 10 |
|
|
629
|
+
| Overload | ≥ 100k | 1% | 1 in 100 |
|
|
630
|
+
|
|
631
|
+
**Tests:** 22 unit + 10 integration + 7 stress = 39 tests
|
|
632
|
+
|
|
633
|
+
---
|
|
634
|
+
|
|
635
|
+
### 3.3 (OLD). Load-Based Sampling (Conceptual)
|
|
636
|
+
|
|
637
|
+
```ruby
|
|
638
|
+
# lib/e11y/cost/strategies/load_based.rb
|
|
639
|
+
module E11y
  module Cost
    module Strategies
      # Conceptual load-based strategy: derives a sample rate from the highest
      # of three load signals (event rate, buffer utilization, process
      # CPU/memory).
      class LoadBased
        # Resident-set size (MB) treated as 100% memory load.
        MEMORY_LIMIT_MB = 500.0

        # @param config [#max_events_per_sec, #buffer_threshold]
        def initialize(config)
          @max_events_per_sec = config.max_events_per_sec || 10_000
          # NOTE(review): stored but not consulted by #calculate_rate.
          @buffer_threshold = config.buffer_threshold || 0.8 # 80% full
        end

        # Compute the sample rate for the current load.
        #
        # @param event_data [Hash] unused by this strategy
        # @param context [Hash] unused by this strategy
        # @return [Float] 0.01 above 90% load, 0.1 above 70% load, else 1.0
        def calculate_rate(event_data, context)
          # Strategy 1: events per second, as a fraction of the configured
          # maximum. `to_f` keeps Integer / Integer from truncating to 0.
          rate_ratio = E11y::Metrics.get_rate('e11y.events.tracked').to_f / @max_events_per_sec

          # Strategy 2: buffer utilization (0.0 - 1.0)
          buffer_usage = E11y::Buffer.utilization

          # Strategy 3: process CPU / memory
          system_load = system_overload_factor

          # The most loaded signal wins.
          load_factor = [rate_ratio, buffer_usage, system_load].max

          if load_factor > 0.9
            0.01 # System overloaded → aggressive sampling (1%)
          elsif load_factor > 0.7
            0.1  # High load → moderate sampling (10%)
          else
            1.0  # Normal load → full sampling (100%)
          end
        end

        private

        # Fraction of CPU or of MEMORY_LIMIT_MB used by this process,
        # whichever is larger, as reported by a single `ps` invocation.
        #
        # Returns 0.0 when `ps` cannot be executed so that a missing binary
        # does not break event tracking; the other two signals still apply.
        def system_overload_factor
          cpu_percent, rss_kb = `ps -o %cpu= -o rss= -p #{Process.pid}`.split
          cpu_usage = cpu_percent.to_f / 100.0
          memory_mb = rss_kb.to_i / 1024.0

          [cpu_usage, memory_mb / MEMORY_LIMIT_MB].max
        rescue SystemCallError
          0.0
        end
      end
    end
  end
end
|
|
687
|
+
```
|
|
688
|
+
|
|
689
|
+
### 3.4. Value-Based Sampling
|
|
690
|
+
|
|
691
|
+
```ruby
|
|
692
|
+
# lib/e11y/cost/strategies/value_based.rb
|
|
693
|
+
module E11y
  module Cost
    module Strategies
      # Chooses a sample rate from the business value of an event: first by
      # event-name patterns, then by a monetary amount found in the payload.
      class ValueBased
        # Payload keys inspected, in priority order, for a monetary value.
        PAYLOAD_VALUE_KEYS = %i[amount total_amount price].freeze

        # @param config [#high_value_patterns, #low_value_patterns]
        #   each a list of String (exact match) or Regexp patterns, or nil
        def initialize(config)
          @high_value_patterns = config.high_value_patterns || []
          @low_value_patterns = config.low_value_patterns || []
        end

        # @param event_data [Hash] reads :event_name and :payload
        # @param context [Hash] unused by this strategy
        # @return [Float] sample rate in (0.0, 1.0]
        def calculate_rate(event_data, context)
          event_name = event_data[:event_name]

          # High-value events are always sampled; this check wins over the
          # low-value list when a name matches both.
          return 1.0 if matches_patterns?(event_name, @high_value_patterns)

          # Low-value events are sampled aggressively.
          return 0.01 if matches_patterns?(event_name, @low_value_patterns)

          payload_value = estimate_payload_value(event_data[:payload])

          if payload_value > 1000 # High-value transaction
            1.0
          elsif payload_value > 100
            0.5 # 50%
          else
            0.1 # 10%
          end
        end

        private

        # True when event_name equals a String pattern or matches a Regexp
        # pattern. Patterns of any other type never match.
        def matches_patterns?(event_name, patterns)
          patterns.any? do |pattern|
            case pattern
            when String
              event_name == pattern
            when Regexp
              # A nil name matches nothing (as with `nil =~ pattern`).
              !event_name.nil? && pattern.match?(event_name.to_s)
            end
          end
        end

        # Extract a monetary value from the payload.
        #
        # Returns 0 when the payload is nil, carries none of
        # PAYLOAD_VALUE_KEYS, or the value found is not numeric, so the
        # caller can always compare the result with `>`.
        def estimate_payload_value(payload)
          return 0 if payload.nil?

          raw = PAYLOAD_VALUE_KEYS.lazy.map { |key| payload[key] }.find { |value| value }

          case raw
          when Numeric then raw
          when String then Float(raw, exception: false) || 0
          else 0
          end
        end
      end
    end
  end
end
|
|
751
|
+
```
|
|
752
|
+
|
|
753
|
+
### 3.5. Simplified Configuration
|
|
754
|
+
|
|
755
|
+
**Philosophy:** Simple, declarative sampling rules. No complex strategies.
|
|
756
|
+
|
|
757
|
+
```ruby
|
|
758
|
+
# config/initializers/e11y.rb
|
|
759
|
+
E11y.configure do |config|
|
|
760
|
+
config.sampling do
|
|
761
|
+
# ===================================================================
|
|
762
|
+
# SIMPLE APPROACH: Per-severity defaults
|
|
763
|
+
# ===================================================================
|
|
764
|
+
|
|
765
|
+
# Default sample rates by severity
|
|
766
|
+
default_rate :debug, 0.01 # 1% of debug events
|
|
767
|
+
default_rate :info, 0.1 # 10% of info events
|
|
768
|
+
default_rate :success, 0.5 # 50% of success events
|
|
769
|
+
default_rate :warn, 1.0 # 100% of warnings
|
|
770
|
+
default_rate :error, 1.0 # 100% of errors (always)
|
|
771
|
+
default_rate :fatal, 1.0 # 100% of fatal (always)
|
|
772
|
+
|
|
773
|
+
# ===================================================================
|
|
774
|
+
# PER-EVENT OVERRIDES (optional)
|
|
775
|
+
# ===================================================================
|
|
776
|
+
|
|
777
|
+
# High-value events: always sample
|
|
778
|
+
always_sample 'Events::OrderPaid'
|
|
779
|
+
always_sample 'Events::PaymentProcessed'
|
|
780
|
+
always_sample /^Events::Critical/ # Regex pattern
|
|
781
|
+
|
|
782
|
+
# Low-value events: aggressive sampling
|
|
783
|
+
sample 'Events::HealthCheck', rate: 0.001 # 0.1%
|
|
784
|
+
sample 'Events::CacheHit', rate: 0.01 # 1%
|
|
785
|
+
sample /^Events::Debug/, rate: 0.01 # 1%
|
|
786
|
+
|
|
787
|
+
# ===================================================================
|
|
788
|
+
# LOAD-BASED AUTO-ADJUSTMENT (optional)
|
|
789
|
+
# ===================================================================
|
|
790
|
+
|
|
791
|
+
# Automatically reduce sample rates when system overloaded
|
|
792
|
+
auto_adjust_on_load do
|
|
793
|
+
enabled true
|
|
794
|
+
|
|
795
|
+
# Trigger: buffer >80% full
|
|
796
|
+
trigger_buffer_percent 80
|
|
797
|
+
|
|
798
|
+
# Action: multiply all rates by 0.1 (10x reduction)
|
|
799
|
+
rate_multiplier 0.1
|
|
800
|
+
|
|
801
|
+
# Recovery: restore rates when buffer <50% full
|
|
802
|
+
recovery_buffer_percent 50
|
|
803
|
+
end
|
|
804
|
+
end
|
|
805
|
+
end
|
|
806
|
+
```
|
|
807
|
+
|
|
808
|
+
**Simplified Implementation:**
|
|
809
|
+
|
|
810
|
+
```ruby
|
|
811
|
+
# lib/e11y/sampling/simple_sampler.rb
|
|
812
|
+
module E11y
  module Sampling
    # Declarative sampler: looks up a base rate (per-event override first,
    # then per-severity default), optionally scales it down while the buffer
    # is over its trigger level, and makes a random keep/drop decision.
    class SimpleSampler
      # Buffer utilization (percent) above which rates are reduced when the
      # auto-adjust config does not specify :trigger_buffer_percent.
      DEFAULT_TRIGGER_BUFFER_PERCENT = 80
      # Factor applied to the base rate while over the trigger level when
      # the auto-adjust config does not specify :rate_multiplier.
      DEFAULT_RATE_MULTIPLIER = 0.1

      # @param config [#severity_rates, #event_rates, #auto_adjust]
      #   - severity_rates: Hash severity => rate (nil → built-in defaults)
      #   - event_rates: Hash of String/Regexp pattern => rate (nil → none)
      #   - auto_adjust: Hash with :enabled, :trigger_buffer_percent,
      #     :rate_multiplier (nil → disabled)
      def initialize(config)
        @severity_rates = config.severity_rates || default_severity_rates
        @event_rates = config.event_rates || {}
        @auto_adjust = config.auto_adjust || {}
      end

      # @param event_data [Hash] reads :event_name and :severity
      # @return [Boolean] true when the event should be kept
      def should_sample?(event_data)
        # Step 1: base rate (event-specific or severity default)
        base_rate = get_base_rate(event_data)

        # Step 2: load-based adjustment (if enabled)
        final_rate = apply_load_adjustment(base_rate)

        # Step 3: random decision
        rand < final_rate
      end

      private

      # Rate of the first matching event pattern (in the Hash's insertion
      # order), else the rate for the event's severity, else 1.0.
      def get_base_rate(event_data)
        event_name = event_data[:event_name]

        override = @event_rates.find { |pattern, _rate| pattern_matches?(pattern, event_name) }
        return override.last if override

        @severity_rates[event_data[:severity]] || 1.0
      end

      # String patterns match by equality, Regexp patterns by regex match;
      # patterns of any other type never match.
      def pattern_matches?(pattern, event_name)
        case pattern
        when String then event_name == pattern
        when Regexp then !event_name.nil? && pattern.match?(event_name.to_s)
        else false
        end
      end

      # Scale the rate down while the buffer is above the trigger level.
      # Missing :trigger_buffer_percent / :rate_multiplier fall back to the
      # defaults above instead of raising on a nil comparison.
      def apply_load_adjustment(base_rate)
        return base_rate unless @auto_adjust[:enabled]

        trigger = @auto_adjust[:trigger_buffer_percent] || DEFAULT_TRIGGER_BUFFER_PERCENT
        return base_rate unless E11y::Buffer.utilization_percent > trigger

        # System overloaded → reduce rate
        base_rate * (@auto_adjust[:rate_multiplier] || DEFAULT_RATE_MULTIPLIER)
      end

      # Built-in per-severity rates; a fresh Hash per call so that callers
      # may mutate their copy.
      def default_severity_rates
        {
          debug: 0.01,
          info: 0.1,
          success: 0.5,
          warn: 1.0,
          error: 1.0,
          fatal: 1.0
        }
      end
    end
  end
end
|
|
879
|
+
```
|
|
880
|
+
|
|
881
|
+
**Why Simplified?**
|
|
882
|
+
|
|
883
|
+
| Old Approach | New Approach | Benefit |
|
|
884
|
+
|--------------|--------------|---------|
|
|
885
|
+
| 4 strategies (Error, Load, Value, Content) | 1 simple sampler | Easy to understand |
|
|
886
|
+
| Complex rate calculation | Lookup table | Fast (<0.01ms) |
|
|
887
|
+
| Multiple config blocks | Single `sampling` block | Less code |
|
|
888
|
+
| Hard to predict | Deterministic | Debuggable |
|
|
889
|
+
| 300+ LOC | 50 LOC | Maintainable |
|
|
890
|
+
|
|
891
|
+
---
|
|
892
|
+
|
|
893
|
+
## 3.6. Trace-Aware Adaptive Sampling (C05 Resolution)
|
|
894
|
+
|
|
895
|
+
> **⚠️ CRITICAL: C05 Conflict Resolution - Adaptive Sampling × Trace Consistency**
|
|
896
|
+
> **See:** [CONFLICT-ANALYSIS.md C05](researches/CONFLICT-ANALYSIS.md#c05-adaptive-sampling--trace-consistent-sampling) for detailed analysis
|
|
897
|
+
> **Problem:** Per-event adaptive sampling breaks distributed traces (incomplete traces across services)
|
|
898
|
+
> **Solution:** Trace-level sampling decisions with propagation via W3C trace context
|
|
899
|
+
|
|
900
|
+
### 3.6.1. The Problem: Broken Distributed Traces
|
|
901
|
+
|
|
902
|
+
**Scenario - Incomplete Trace:**
|
|
903
|
+
|
|
904
|
+
```ruby
|
|
905
|
+
# Trace across 3 microservices (trace_id: abc-123):
|
|
906
|
+
|
|
907
|
+
# Service A: Order Service
|
|
908
|
+
Events::OrderCreated.track(
|
|
909
|
+
order_id: '123',
|
|
910
|
+
trace_id: 'abc-123'
|
|
911
|
+
)
|
|
912
|
+
# → Adaptive sampling: KEEP (within budget) ✅
|
|
913
|
+
|
|
914
|
+
# Service B: Payment Service (same trace)
|
|
915
|
+
Events::PaymentProcessing.track(
|
|
916
|
+
payment_id: '456',
|
|
917
|
+
trace_id: 'abc-123'
|
|
918
|
+
)
|
|
919
|
+
# → Adaptive sampling: DROP (budget exceeded!) ❌
|
|
920
|
+
|
|
921
|
+
# Service C: Notification Service (same trace)
|
|
922
|
+
Events::NotificationSent.track(
|
|
923
|
+
notification_id: '789',
|
|
924
|
+
trace_id: 'abc-123'
|
|
925
|
+
)
|
|
926
|
+
# → Adaptive sampling: KEEP (budget recovered) ✅
|
|
927
|
+
|
|
928
|
+
# Result: INCOMPLETE TRACE!
|
|
929
|
+
# Loki shows:
|
|
930
|
+
# - Order created: YES ✅
|
|
931
|
+
# - Payment processing: MISSING ❌ ← Gap in trace!
|
|
932
|
+
# - Notification sent: YES ✅
|
|
933
|
+
#
|
|
934
|
+
# → Can't reconstruct full user journey!
|
|
935
|
+
# → Debugging payment issues impossible!
|
|
936
|
+
```
|
|
937
|
+
|
|
938
|
+
**Why This Breaks:**
|
|
939
|
+
- ❌ **Per-event sampling:** Each service makes independent sampling decisions
|
|
940
|
+
- ❌ **Distributed traces incomplete:** Missing spans break trace visualization
|
|
941
|
+
- ❌ **Debugging impossible:** Can't see where payment processing failed
|
|
942
|
+
- ❌ **Misleading SLO metrics:** Partial traces skew latency calculations
|
|
943
|
+
|
|
944
|
+
### 3.6.2. Decision: Trace-Level Sampling with Decision Cache
|
|
945
|
+
|
|
946
|
+
**Strategy:** All events in a trace share the same sampling decision.
|
|
947
|
+
|
|
948
|
+
**Key Principles:**
|
|
949
|
+
1. **Sampling decision made per-trace** (not per-event)
|
|
950
|
+
2. **First event in trace makes decision** (head-based sampling)
|
|
951
|
+
3. **Decision propagated via W3C trace context** (`trace_flags` field)
|
|
952
|
+
4. **Decision cached per trace_id** (TTL: 1 hour)
|
|
953
|
+
|
|
954
|
+
### 3.6.3. TraceAwareSampler Implementation
|
|
955
|
+
|
|
956
|
+
```ruby
|
|
957
|
+
module E11y
  module Cost
    # Sampler that makes one keep/drop decision per trace instead of per
    # event: the first event seen for a trace_id decides, the decision is
    # cached in this process for `trace_cache_ttl` seconds and written to the
    # trace context's W3C `trace_flags` (bit 0 = sampled).
    #
    # Events without a trace_id fall back to the parent's per-event sampling.
    #
    # NOTE(review): the parent is named `SimplifiedSampler` here while the
    # sampler defined earlier in this document is
    # `E11y::Sampling::SimpleSampler` — confirm which class is intended.
    # NOTE(review): the cache is process-local and incoming `trace_flags`
    # are not consulted, so another service only agrees with this decision
    # if it derives its own decision from the propagated flag.
    class TraceAwareSampler < SimplifiedSampler
      DEFAULT_CACHE_TTL = 3600        # seconds (1 hour)
      DEFAULT_CLEANUP_INTERVAL = 300  # seconds (5 minutes)
      DEFAULT_COST_BUDGET = 100_000   # events
      SAMPLED_FLAG = 0x01             # W3C trace_flags bit 0

      # @param config [#trace_cache_ttl, #cost_budget] sampler configuration;
      #   `trace_cache_cleanup_interval` is honoured when the object
      #   responds to it.
      def initialize(config)
        super(config)
        @trace_config = config
        @trace_decision_cache = Concurrent::Map.new
        @cache_ttl = config.trace_cache_ttl || DEFAULT_CACHE_TTL
        configured_interval =
          config.trace_cache_cleanup_interval if config.respond_to?(:trace_cache_cleanup_interval)
        @cache_cleanup_interval = configured_interval || DEFAULT_CLEANUP_INTERVAL

        # Start cache cleanup thread
        start_cache_cleanup!
      end

      # @param event_data [Hash] may carry :trace_context
      # @param context [Hash] fallback source of :trace_context
      # @return [Boolean] the decision shared by every event of the trace
      def should_sample?(event_data, context = {})
        trace_context = event_data[:trace_context] || context[:trace_context]
        trace_id = trace_context && trace_context[:trace_id]

        # No trace context → fall back to per-event sampling
        return super(event_data, context) unless trace_id

        # Reuse the decision already made for this trace, if any.
        cached_decision = get_trace_decision(trace_id)
        return cached_decision unless cached_decision.nil?

        # First event of the trace: the parent's per-event logic gives the
        # base decision, which is then tightened when over budget.
        decision = make_trace_decision(super(event_data, context))

        # Cache it; if another thread decided concurrently, adopt its
        # decision so the trace stays consistent.
        decision = set_trace_decision(trace_id, decision)

        # Propagate decision via trace_flags (W3C Trace Context)
        propagate_decision_to_trace_context!(trace_context, decision)

        decision
      end

      # Remove expired cache entries.
      #
      # Called periodically by the background thread; may also be invoked
      # manually.
      #
      # @return [Integer] number of entries evicted
      def cleanup_expired_traces!
        now = Time.now.to_i
        evicted = 0

        @trace_decision_cache.each_pair do |trace_id, entry|
          next unless now > entry[:expires_at]
          # delete_pair only removes the entry if it is still this one, so a
          # decision re-created concurrently is not lost.
          next unless @trace_decision_cache.delete_pair(trace_id, entry)

          Yabeda.e11y_trace_decision_cache_evictions.increment
          evicted += 1
        end

        evicted
      end

      private

      # @return [Boolean, nil] cached decision, or nil when absent/expired
      def get_trace_decision(trace_id)
        entry = @trace_decision_cache[trace_id]
        return nil unless entry

        if Time.now.to_i > entry[:expires_at]
          @trace_decision_cache.delete_pair(trace_id, entry)
          return nil
        end

        entry[:decision]
      end

      # Store the decision unless one is already cached for the trace.
      #
      # @return [Boolean] the decision now in effect for the trace
      def set_trace_decision(trace_id, decision)
        now = Time.now.to_i
        entry = { decision: decision, expires_at: now + @cache_ttl, created_at: now }
        existing = @trace_decision_cache.put_if_absent(trace_id, entry)

        # Track cache size
        Yabeda.e11y_trace_decision_cache_size.set(@trace_decision_cache.size)

        existing ? existing[:decision] : decision
      end

      # Tighten the base decision when the event budget is exceeded.
      # A trace the base logic already dropped stays dropped.
      def make_trace_decision(base_decision)
        return base_decision unless base_decision && over_budget?

        rand < calculate_adaptive_rate
      end

      # NOTE(review): compares the counter's running total with the budget;
      # nothing visible here resets it per month.
      def over_budget?
        tracked_events_total > cost_budget
      end

      # Fraction of traces to keep, scaled down as budget use grows.
      def calculate_adaptive_rate
        budget_utilization = tracked_events_total.to_f / cost_budget

        if budget_utilization > 1.5
          0.1 # Keep only 10% of traces
        elsif budget_utilization > 1.2
          0.5 # Keep 50% of traces
        else
          1.0 # Keep all traces
        end
      end

      def cost_budget
        @trace_config.cost_budget || DEFAULT_COST_BUDGET
      end

      # Sum of the tracked-events counter across all label sets.
      # NOTE(review): assumes `values` is either a Hash of tags => count or
      # a list of counts — confirm against the Yabeda version in use.
      def tracked_events_total
        counts = Yabeda.e11y_events_tracked_total.values
        counts = counts.values if counts.respond_to?(:values)
        counts.sum
      end

      # Set or clear bit 0 (sampled) of trace_flags, preserving other bits.
      def propagate_decision_to_trace_context!(trace_context, decision)
        flags = trace_context[:trace_flags].to_i
        trace_context[:trace_flags] =
          decision ? (flags | SAMPLED_FLAG) : (flags & ~SAMPLED_FLAG)
      end

      # Background thread evicting expired entries every
      # @cache_cleanup_interval seconds. Errors are logged and the loop
      # continues with the next interval.
      def start_cache_cleanup!
        @cleanup_thread = Thread.new do
          loop do
            sleep @cache_cleanup_interval

            begin
              cleanup_expired_traces!
            rescue StandardError => e
              E11y.logger.error "[E11y] Trace cache cleanup error: #{e.message}"
            end
          end
        end
      end
    end
  end
end
|
|
1094
|
+
```
|
|
1095
|
+
|
|
1096
|
+
### 3.6.4. Configuration
|
|
1097
|
+
|
|
1098
|
+
```ruby
|
|
1099
|
+
# config/initializers/e11y.rb
|
|
1100
|
+
E11y.configure do |config|
|
|
1101
|
+
config.cost_optimization do
|
|
1102
|
+
sampling do
|
|
1103
|
+
# ✅ Use trace-aware sampler for distributed tracing
|
|
1104
|
+
strategy :trace_aware # NEW: Trace-consistent sampling
|
|
1105
|
+
|
|
1106
|
+
# Trace decision cache
|
|
1107
|
+
trace_cache_ttl 3600 # 1 hour (3600 seconds)
|
|
1108
|
+
trace_cache_cleanup_interval 300 # 5 minutes
|
|
1109
|
+
|
|
1110
|
+
# Cost budget (monthly)
|
|
1111
|
+
cost_budget 100_000 # 100K events/month
|
|
1112
|
+
|
|
1113
|
+
# Per-severity sampling rates (base rates before adaptive adjustment)
|
|
1114
|
+
severity_rates do
|
|
1115
|
+
debug 0.01 # 1%
|
|
1116
|
+
info 0.1 # 10%
|
|
1117
|
+
success 0.5 # 50%
|
|
1118
|
+
warn 1.0 # 100%
|
|
1119
|
+
error 1.0 # 100%
|
|
1120
|
+
fatal 1.0 # 100%
|
|
1121
|
+
end
|
|
1122
|
+
|
|
1123
|
+
# Pattern-based overrides (take precedence)
|
|
1124
|
+
pattern_rates do
|
|
1125
|
+
pattern /^audit\./, rate: 1.0 # Always sample audit events
|
|
1126
|
+
pattern /^payment\./, rate: 1.0 # Always sample payments
|
|
1127
|
+
pattern /^debug\./, rate: 0.01 # 1% of debug events
|
|
1128
|
+
end
|
|
1129
|
+
end
|
|
1130
|
+
end
|
|
1131
|
+
end
|
|
1132
|
+
```
|
|
1133
|
+
|
|
1134
|
+
### 3.6.5. Multi-Service Trace Scenario (Correct Behavior)
|
|
1135
|
+
|
|
1136
|
+
**Service A (Order Service) - First Event:**
|
|
1137
|
+
|
|
1138
|
+
```ruby
|
|
1139
|
+
# Create new trace context
|
|
1140
|
+
trace_context = E11y::TraceContext.generate
|
|
1141
|
+
|
|
1142
|
+
# Track event (FIRST in trace → makes sampling decision)
|
|
1143
|
+
Events::OrderCreated.track(
|
|
1144
|
+
order_id: '123',
|
|
1145
|
+
user_id: 'u456',
|
|
1146
|
+
trace_context: trace_context
|
|
1147
|
+
)
|
|
1148
|
+
|
|
1149
|
+
# TraceAwareSampler:
|
|
1150
|
+
# 1. No cached decision for trace_id
|
|
1151
|
+
# 2. Makes NEW decision: should_sample? → TRUE (severity: info, rate: 0.1 → sampled)
|
|
1152
|
+
# 3. Caches decision: trace_decision_cache[trace_id] = TRUE
|
|
1153
|
+
# 4. Sets trace_flags: 0x01 (sampled bit set)
|
|
1154
|
+
# 5. Event KEPT ✅
|
|
1155
|
+
|
|
1156
|
+
# HTTP call to Service B (trace context propagated via W3C headers)
|
|
1157
|
+
# traceparent: 00-abc123...-def456...-01
|
|
1158
|
+
# ^^
|
|
1159
|
+
# trace_flags = 0x01 (sampled)
|
|
1160
|
+
```
|
|
1161
|
+
|
|
1162
|
+
**Service B (Payment Service) - Downstream Event:**
|
|
1163
|
+
|
|
1164
|
+
```ruby
|
|
1165
|
+
# Receive trace context from Service A (via HTTP headers)
|
|
1166
|
+
incoming_trace_context = extract_trace_context_from_headers(request.headers)
|
|
1167
|
+
# trace_id: 'abc123...', trace_flags: 0x01 (sampled)
|
|
1168
|
+
|
|
1169
|
+
# Track event (DOWNSTREAM in trace → uses cached decision)
|
|
1170
|
+
Events::PaymentProcessing.track(
|
|
1171
|
+
payment_id: '456',
|
|
1172
|
+
order_id: '123',
|
|
1173
|
+
trace_context: incoming_trace_context
|
|
1174
|
+
)
|
|
1175
|
+
|
|
1176
|
+
# TraceAwareSampler:
|
|
1177
|
+
# 1. Check cache for trace_id: abc123... → FOUND (decision: TRUE)
|
|
1178
|
+
# 2. Return cached decision: TRUE
|
|
1179
|
+
# 3. Event KEPT ✅ (consistent with Service A decision)
|
|
1180
|
+
|
|
1181
|
+
# HTTP call to Service C (trace context propagated)
|
|
1182
|
+
```
|
|
1183
|
+
|
|
1184
|
+
**Service C (Notification Service) - Further Downstream:**
|
|
1185
|
+
|
|
1186
|
+
```ruby
|
|
1187
|
+
# Receive trace context from Service B
|
|
1188
|
+
incoming_trace_context = extract_trace_context_from_headers(request.headers)
|
|
1189
|
+
|
|
1190
|
+
# Track event
|
|
1191
|
+
Events::NotificationSent.track(
|
|
1192
|
+
notification_id: '789',
|
|
1193
|
+
order_id: '123',
|
|
1194
|
+
trace_context: incoming_trace_context
|
|
1195
|
+
)
|
|
1196
|
+
|
|
1197
|
+
# TraceAwareSampler:
|
|
1198
|
+
# 1. Check cache for trace_id: abc123... → FOUND (decision: TRUE)
|
|
1199
|
+
# 2. Return cached decision: TRUE
|
|
1200
|
+
# 3. Event KEPT ✅ (consistent across all services)
|
|
1201
|
+
|
|
1202
|
+
# Result: COMPLETE TRACE in Loki!
|
|
1203
|
+
# - Order created: YES ✅
|
|
1204
|
+
# - Payment processing: YES ✅
|
|
1205
|
+
# - Notification sent: YES ✅
|
|
1206
|
+
# → Full user journey reconstructed!
|
|
1207
|
+
```
|
|
1208
|
+
|
|
1209
|
+
### 3.6.6. Cache Management & TTL
|
|
1210
|
+
|
|
1211
|
+
**Why 1-hour TTL?**
|
|
1212
|
+
|
|
1213
|
+
```ruby
|
|
1214
|
+
# Typical trace duration: <10 seconds (99th percentile)
|
|
1215
|
+
# Cache TTL: 1 hour (3600 seconds)
|
|
1216
|
+
# → 360x safety margin
|
|
1217
|
+
|
|
1218
|
+
# Trade-off:
|
|
1219
|
+
# - Short TTL (e.g., 1 minute): Cache misses if service delayed (retries, async jobs)
|
|
1220
|
+
# - Long TTL (e.g., 24 hours): High memory usage (1M traces = 100MB cache)
|
|
1221
|
+
# - 1 hour: Balance between memory and cache hit rate
|
|
1222
|
+
```
|
|
1223
|
+
|
|
1224
|
+
**Cache Size Estimation:**
|
|
1225
|
+
|
|
1226
|
+
```ruby
|
|
1227
|
+
# Assumptions:
|
|
1228
|
+
# - 10,000 events/sec
|
|
1229
|
+
# - 10 events per trace (average)
|
|
1230
|
+
# - 1,000 new traces/sec
|
|
1231
|
+
# - 1-hour TTL
|
|
1232
|
+
|
|
1233
|
+
# Cache size:
|
|
1234
|
+
# 1,000 traces/sec × 3,600 seconds = 3.6M traces
|
|
1235
|
+
# 3.6M traces × 100 bytes/entry = 360MB
|
|
1236
|
+
|
|
1237
|
+
# Mitigation:
|
|
1238
|
+
# - Cache cleanup every 5 minutes (remove expired)
|
|
1239
|
+
# - LRU eviction if memory limit exceeded
|
|
1240
|
+
# - Monitor: Yabeda.e11y_trace_decision_cache_size
|
|
1241
|
+
```
|
|
1242
|
+
|
|
1243
|
+
**Cache Cleanup:**
|
|
1244
|
+
|
|
1245
|
+
```ruby
|
|
1246
|
+
# Automatic cleanup every 5 minutes
|
|
1247
|
+
config.cost_optimization.sampling do
|
|
1248
|
+
trace_cache_cleanup_interval 300 # seconds
|
|
1249
|
+
end
|
|
1250
|
+
|
|
1251
|
+
# Manual cleanup (if needed)
|
|
1252
|
+
E11y::Cost::TraceAwareSampler.instance.cleanup_expired_traces!
|
|
1253
|
+
|
|
1254
|
+
# Monitoring
|
|
1255
|
+
Yabeda.e11y_trace_decision_cache_size.observe(cache_size)
|
|
1256
|
+
Yabeda.e11y_trace_decision_cache_evictions.increment
|
|
1257
|
+
```
|
|
1258
|
+
|
|
1259
|
+
### 3.6.7. Head-Based Sampling (W3C Trace Context)
|
|
1260
|
+
|
|
1261
|
+
**W3C Trace Context Propagation:**
|
|
1262
|
+
|
|
1263
|
+
```ruby
|
|
1264
|
+
# HTTP Request Header (Service A → Service B):
|
|
1265
|
+
# traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
|
|
1266
|
+
# └─ version: 00
|
|
1267
|
+
# └─ trace_id (128-bit): 4bf92f3577b34da6a3ce929d0e0e4736
|
|
1268
|
+
# └─ parent span_id (64-bit): 00f067aa0ba902b7
|
|
1269
|
+
# └─ trace_flags: 01 (sampled bit set)
|
|
1270
|
+
|
|
1271
|
+
# Service B extracts trace context:
|
|
1272
|
+
trace_context = {
|
|
1273
|
+
version: '00',
|
|
1274
|
+
trace_id: '4bf92f3577b34da6a3ce929d0e0e4736',
|
|
1275
|
+
parent_span_id: '00f067aa0ba902b7',
|
|
1276
|
+
trace_flags: 0x01 # ← Sampled bit set by Service A
|
|
1277
|
+
}
|
|
1278
|
+
|
|
1279
|
+
# Service B respects sampling decision:
|
|
1280
|
+
if trace_context[:trace_flags] & 0x01 == 0x01
|
|
1281
|
+
# Sampled bit set → KEEP event
|
|
1282
|
+
else
|
|
1283
|
+
# Sampled bit clear → DROP event
|
|
1284
|
+
end
|
|
1285
|
+
```
|
|
1286
|
+
|
|
1287
|
+
### 3.6.8. Trade-offs & Distributed Tracing Integrity (C05)
|
|
1288
|
+
|
|
1289
|
+
**Trade-offs:**
|
|
1290
|
+
|
|
1291
|
+
| Decision | Pro | Con | Rationale |
|
|
1292
|
+
|----------|-----|-----|-----------|
|
|
1293
|
+
| **Trace-level sampling** | Complete traces | Can't sample per-event | Trace integrity > granularity |
|
|
1294
|
+
| **Decision cache (1h TTL)** | Consistent decisions | ~360MB memory (3.6M traces at 1,000 new traces/sec) | Cache hit rate > memory |
|
|
1295
|
+
| **Head-based sampling** | Simple propagation | First service decides for all | Simplicity > flexibility |
|
|
1296
|
+
| **W3C trace_flags** | Standard propagation | Requires trace context | Interoperability > custom |
|
|
1297
|
+
|
|
1298
|
+
**Distributed Tracing Integrity:**
|
|
1299
|
+
|
|
1300
|
+
✅ **Complete traces:**
|
|
1301
|
+
All events in a trace sampled together → no gaps in trace visualization.
|
|
1302
|
+
|
|
1303
|
+
✅ **Consistent debugging:**
|
|
1304
|
+
If Service A event visible, all downstream events visible → full user journey.
|
|
1305
|
+
|
|
1306
|
+
✅ **Accurate SLO metrics:**
|
|
1307
|
+
Complete traces provide accurate latency calculations (no partial trace skew).
|
|
1308
|
+
|
|
1309
|
+
**Limitations:**
|
|
1310
|
+
|
|
1311
|
+
⚠️ **All-or-nothing:** Can't sample some events within a trace (e.g., keep errors, drop debug)
|
|
1312
|
+
**Mitigation:** Use severity-based trace decision (errors always sampled)
|
|
1313
|
+
|
|
1314
|
+
⚠️ **Memory overhead:** Cache stores decisions for 1 hour (~360MB for 3.6M traces at 1,000 new traces/sec)
|
|
1315
|
+
**Mitigation:** LRU eviction + periodic cleanup
|
|
1316
|
+
|
|
1317
|
+
⚠️ **Long traces:** If a trace runs longer than the cache TTL (1 hour by default), its cached decision may expire mid-trace
|
|
1318
|
+
**Mitigation:** Increase TTL for long-running workflows (e.g., async jobs)
|
|
1319
|
+
|
|
1320
|
+
**Monitoring Metrics:**
|
|
1321
|
+
|
|
1322
|
+
```ruby
|
|
1323
|
+
# Track trace-aware sampling effectiveness
|
|
1324
|
+
Yabeda.e11y_trace_decision_cache_hit_rate.observe(
|
|
1325
|
+
hits / (hits + misses).to_f
|
|
1326
|
+
)
|
|
1327
|
+
|
|
1328
|
+
# Track cache size
|
|
1329
|
+
Yabeda.e11y_trace_decision_cache_size.set(cache_size)
|
|
1330
|
+
|
|
1331
|
+
# Track incomplete traces (should be 0%)
|
|
1332
|
+
Yabeda.e11y_incomplete_traces_total.increment(
|
|
1333
|
+
trace_id: 'abc123',
|
|
1334
|
+
missing_spans: 3
|
|
1335
|
+
)
|
|
1336
|
+
```
|
|
1337
|
+
|
|
1338
|
+
**Related Conflicts:**
|
|
1339
|
+
- **C11:** Stratified sampling for SLO accuracy (see §3.7 below)
|
|
1340
|
+
- **C17:** Background job tracing (see ADR-005, UC-010)
|
|
1341
|
+
- **C09:** Multi-service tracing (see UC-009)
|
|
1342
|
+
|
|
1343
|
+
---
|
|
1344
|
+
|
|
1345
|
+
## 3.7. Stratified Sampling for SLO Accuracy (C11 Resolution)
|
|
1346
|
+
|
|
1347
|
+
> **⚠️ CRITICAL: C11 Conflict Resolution - Adaptive Sampling × SLO Tracking**
|
|
1348
|
+
> **See:** [CONFLICT-ANALYSIS.md C11](researches/CONFLICT-ANALYSIS.md#c11-adaptive-sampling--slo-tracking) for detailed analysis
|
|
1349
|
+
> **Problem:** Random sampling skews SLO metrics (inaccurate success rates)
|
|
1350
|
+
> **Solution:** Stratified sampling by severity + sampling correction math
|
|
1351
|
+
|
|
1352
|
+
### 3.7.1. The Problem: Sampling Bias Breaks SLO Metrics
|
|
1353
|
+
|
|
1354
|
+
**Scenario - Inaccurate Success Rate:**
|
|
1355
|
+
|
|
1356
|
+
```ruby
|
|
1357
|
+
# Real production traffic (1000 requests):
|
|
1358
|
+
# - 950 success (HTTP 200) → 95% success rate
|
|
1359
|
+
# - 50 errors (HTTP 500) → 5% error rate
|
|
1360
|
+
|
|
1361
|
+
# Adaptive sampling (random 50% sampling to save costs):
|
|
1362
|
+
# Expected: Keep 500 events (475 success, 25 errors) → 95% success rate ✅
|
|
1363
|
+
|
|
1364
|
+
# But random sampling can be BIASED!
|
|
1365
|
+
# Actual sample: 500 events (450 success, 50 errors) → 90% success rate ❌
|
|
1366
|
+
|
|
1367
|
+
# Result: FALSE SLO VIOLATION ALERT!
|
|
1368
|
+
# - Real success rate: 95% (meets the 95% SLO) ✅
|
|
1369
|
+
# - Calculated success rate: 90% (below 95% SLO) ❌
|
|
1370
|
+
# → False alert triggered!
|
|
1371
|
+
```
|
|
1372
|
+
|
|
1373
|
+
**Why Random Sampling Fails:**
|
|
1374
|
+
|
|
1375
|
+
```ruby
|
|
1376
|
+
# Random sampling treats ALL events equally
|
|
1377
|
+
Events::ApiRequest.track(status: 200) # Success → 50% chance to keep
|
|
1378
|
+
Events::ApiRequest.track(status: 500) # Error → 50% chance to keep
|
|
1379
|
+
|
|
1380
|
+
# Problem: We're dropping CRITICAL ERROR events!
|
|
1381
|
+
# - Errors are rare (5% of traffic) but CRITICAL for SLO
|
|
1382
|
+
# - Success events are common (95% of traffic) but less critical
|
|
1383
|
+
# - Random 50% sampling may drop errors → undercount error rate!
|
|
1384
|
+
```
|
|
1385
|
+
|
|
1386
|
+
**Impact:**
|
|
1387
|
+
- ❌ **Inaccurate SLO metrics:** Success rate skewed by sampling bias
|
|
1388
|
+
- ❌ **False alerts:** SLO violations that don't exist
|
|
1389
|
+
- ❌ **Missed real issues:** Actual SLO violations hidden by lucky sampling
|
|
1390
|
+
- ❌ **Wrong business decisions:** Acting on bad data
|
|
1391
|
+
|
|
1392
|
+
### 3.7.2. Decision: Stratified Sampling by Event Severity
|
|
1393
|
+
|
|
1394
|
+
**Strategy:** Sample different event types at different rates to preserve statistical properties.
|
|
1395
|
+
|
|
1396
|
+
**Strata Definition:**
|
|
1397
|
+
|
|
1398
|
+
| Stratum | Criteria | Sample Rate | Rationale |
|
|
1399
|
+
|---------|----------|-------------|-----------|
|
|
1400
|
+
| **Errors** | `severity: [:error, :fatal]` OR `http_status: 5xx` | 100% | Always keep errors (critical for SLO) |
|
|
1401
|
+
| **Warnings** | `severity: [:warn]` OR `http_status: 4xx` | 50% | Medium importance |
|
|
1402
|
+
| **Success** | `severity: [:info, :debug, :success]` OR `http_status: 2xx, 3xx` | 10% | Drop 90% (common, less critical) |
|
|
1403
|
+
|
|
1404
|
+
**Key Principles:**
|
|
1405
|
+
1. **Always keep errors** (100% sampling) → accurate error rates
|
|
1406
|
+
2. **Aggressively sample success** (10% sampling) → cost savings
|
|
1407
|
+
3. **Apply sampling correction** in SLO calculations → accurate metrics
|
|
1408
|
+
|
|
1409
|
+
### 3.7.3. StratifiedAdaptiveSampler Implementation
|
|
1410
|
+
|
|
1411
|
+
```ruby
|
|
1412
|
+
module E11y
  module Cost
    # Sampler that assigns every event to a stratum (errors / warnings /
    # success by default) and samples each stratum at its own rate.
    #
    # The chosen stratum, its rate and the decision are written into
    # event_data[:metadata] so that a later SLO calculation can apply
    # sampling correction.
    class StratifiedAdaptiveSampler < SimplifiedSampler
      # Built-in strata, checked in insertion order (errors → warnings → success).
      STRATA = {
        errors: {
          severities: [:error, :fatal],
          http_statuses: (500..599).to_a,
          sample_rate: 1.0 # 100% - always keep
        },
        warnings: {
          severities: [:warn],
          http_statuses: (400..499).to_a,
          sample_rate: 0.5 # 50%
        },
        success: {
          severities: [:debug, :info, :success],
          http_statuses: (200..399).to_a,
          sample_rate: 0.1 # 10% - aggressive sampling
        }
      }.freeze

      # Stratum used when no configured stratum matches the event.
      DEFAULT_STRATUM = :success

      # Rate used when neither the active configuration nor STRATA defines
      # one for a stratum: keep the event rather than silently dropping it.
      FALLBACK_SAMPLE_RATE = 1.0

      # @param config [#stratification] sampler configuration; when
      #   `stratification` is nil the built-in STRATA are used
      def initialize(config)
        super(config)
        @strata_config = config.stratification || STRATA
      end

      # Decides whether the event is kept and records the decision.
      #
      # Side effects: mutates event_data[:metadata] (sampling_stratum,
      # sampling_rate, sampled) and increments the Yabeda counter
      # e11y_sampling_decisions_total.
      #
      # @param event_data [Hash] event with :severity and optionally :payload
      # @param context [Hash] accepted for interface compatibility; not used here
      # @return [Boolean] true when the event should be kept
      def should_sample?(event_data, context = {})
        stratum = determine_stratum(event_data)
        sample_rate = sample_rate_for(stratum)
        decision = rand < sample_rate

        # Stored on the event because the correction step needs it later.
        metadata = (event_data[:metadata] ||= {})
        metadata[:sampling_stratum] = stratum
        metadata[:sampling_rate] = sample_rate
        metadata[:sampled] = decision

        Yabeda.e11y_sampling_decisions_total.increment(
          stratum: stratum,
          decision: decision ? 'kept' : 'dropped',
          sample_rate: sample_rate
        )

        decision
      end

      # Returns the name of the first stratum whose severities or HTTP
      # statuses match the event, or DEFAULT_STRATUM when none match.
      # The HTTP status is read from payload[:http_status], falling back
      # to payload[:status].
      #
      # @param event_data [Hash]
      # @return [Symbol]
      def determine_stratum(event_data)
        severity = event_data[:severity]
        http_status = event_data.dig(:payload, :http_status) ||
                      event_data.dig(:payload, :status)

        @strata_config.each do |stratum_name, stratum_config|
          # Array() tolerates strata that define only one of the two criteria.
          return stratum_name if Array(stratum_config[:severities]).include?(severity)

          if http_status && Array(stratum_config[:http_statuses]).include?(http_status)
            return stratum_name
          end
        end

        DEFAULT_STRATUM
      end

      private

      # Looks the rate up in the active configuration first, then in the
      # built-in STRATA, so a custom stratification without a :success
      # entry no longer raises on the default stratum.
      def sample_rate_for(stratum)
        stratum_config = @strata_config[stratum] || STRATA[stratum] || {}
        stratum_config[:sample_rate] || FALLBACK_SAMPLE_RATE
      end
    end
  end
end
|
|
1488
|
+
```
|
|
1489
|
+
|
|
1490
|
+
### 3.7.4. SLO Calculator with Sampling Correction
|
|
1491
|
+
|
|
1492
|
+
**Critical:** Must apply sampling correction to get accurate SLO metrics.
|
|
1493
|
+
|
|
1494
|
+
```ruby
|
|
1495
|
+
module E11y
  module SLO
    # Computes SLO figures from events that survived sampling, scaling each
    # kept event by 1 / sampling_rate so the results estimate the
    # pre-sampling population.
    class Calculator
      # Strata whose events count as SLO violations.
      DEFAULT_ERROR_STRATA = %i[errors].freeze

      # Estimates success and error rates with sampling correction.
      #
      # Every stratum present in the input contributes to the total; only
      # strata listed in +error_strata+ count as failures (so warnings are
      # not violations). Each event is weighted by its own sampling rate,
      # so a stratum whose rate changed mid-window is still corrected
      # properly. Events without sampling metadata are grouped under the
      # +nil+ stratum and counted once.
      #
      # @param events [Array<Hash>] kept events carrying
      #   metadata[:sampling_stratum] and metadata[:sampling_rate]
      # @param error_strata [Array<Symbol>] stratum names treated as errors
      # @return [Hash] :success_rate, :error_rate (Float; NaN when there are
      #   no events), :breakdown ({stratum => {observed:, corrected:}}) and
      #   :total_corrected_events
      def calculate_success_rate(events, error_strata: DEFAULT_ERROR_STRATA)
        breakdown = events
                    .group_by { |event| event.dig(:metadata, :sampling_stratum) }
                    .transform_values do |stratum_events|
                      {
                        observed: stratum_events.size,
                        corrected: stratum_events.sum { |event| correction_factor(event) }
                      }
                    end

        corrected_errors = 0
        corrected_non_errors = 0
        breakdown.each do |stratum, counts|
          if error_strata.include?(stratum)
            corrected_errors += counts[:corrected]
          else
            corrected_non_errors += counts[:corrected]
          end
        end

        total = corrected_non_errors + corrected_errors

        {
          success_rate: corrected_non_errors / total.to_f,
          error_rate: corrected_errors / total.to_f,
          breakdown: breakdown,
          total_corrected_events: total
        }
      end

      # Estimates the 99th-percentile latency with sampling correction.
      #
      # Each kept event stands for round(1 / sampling_rate) original events.
      # Instead of materialising those copies, the method walks the sorted
      # latencies and accumulates weights until the P99 rank is reached,
      # which selects the same element the expanded array would.
      # Events without payload[:duration_ms] are ignored.
      #
      # @param events [Array<Hash>] kept events carrying payload[:duration_ms]
      # @return [Numeric, nil] the P99 latency, or nil when no event has a duration
      def calculate_p99_latency(events)
        weighted = events.filter_map do |event|
          latency = event.dig(:payload, :duration_ms)
          next if latency.nil?

          [latency, correction_factor(event).round]
        end

        total_weight = weighted.sum { |_latency, weight| weight }
        return nil if total_weight.zero?

        # 1-based rank of the P99 element in the (virtual) expanded list.
        target_rank = (total_weight * 0.99).ceil
        seen = 0

        weighted.sort_by(&:first).each do |latency, weight|
          seen += weight
          return latency if seen >= target_rank
        end

        nil
      end

      private

      # 1 / sampling_rate for the event, e.g. a 10% rate yields 10.0.
      # A missing or non-positive rate means "not sampled": factor 1.0.
      def correction_factor(event)
        rate = event.dig(:metadata, :sampling_rate)
        return 1.0 unless rate.is_a?(Numeric) && rate.positive?

        1.0 / rate
      end
    end
  end
end
|
|
1561
|
+
```
|
|
1562
|
+
|
|
1563
|
+
### 3.7.5. Configuration
|
|
1564
|
+
|
|
1565
|
+
**Option 1: Single simple config (recommended) 🎯**
|
|
1566
|
+
|
|
1567
|
+
```ruby
|
|
1568
|
+
# config/initializers/e11y.rb
|
|
1569
|
+
E11y.configure do |config|
|
|
1570
|
+
config.cost_optimization do
|
|
1571
|
+
sampling do
|
|
1572
|
+
# ✅ Stratified sampling - smart sampling for accurate SLO
|
|
1573
|
+
strategy :stratified_adaptive
|
|
1574
|
+
|
|
1575
|
+
# Cost budget (как и раньше)
|
|
1576
|
+
cost_budget 100_000 # events/month
|
|
1577
|
+
|
|
1578
|
+
# 🎯 ЕДИНЫЙ конфиг: sample_rate по severity (default: never drop errors!)
|
|
1579
|
+
stratified_rates do
|
|
1580
|
+
error 1.0 # 100% - keep all errors (критично для SLO!)
|
|
1581
|
+
warn 0.5 # 50% - medium priority
|
|
1582
|
+
info 0.1 # 10% - low priority (успешные запросы)
|
|
1583
|
+
debug 0.05 # 5% - очень low priority
|
|
1584
|
+
end
|
|
1585
|
+
end
|
|
1586
|
+
end
|
|
1587
|
+
|
|
1588
|
+
# SLO tracking с автоматической коррекцией (включено по умолчанию!)
|
|
1589
|
+
config.slo do
|
|
1590
|
+
enable_sampling_correction true # ✅ Automatic correction in SLO calculations
|
|
1591
|
+
end
|
|
1592
|
+
end
|
|
1593
|
+
```
|
|
1594
|
+
|
|
1595
|
+
**How it works:**
|
|
1596
|
+
- `error`/`fatal` severity → sample_rate **1.0** (100%, never dropped!)
|
|
1597
|
+
- `warn` severity → sample_rate **0.5** (50%)
|
|
1598
|
+
- `info`/`success` severity → sample_rate **0.1** (10%)
|
|
1599
|
+
- `debug` severity → sample_rate **0.05** (5%)
|
|
1600
|
+
|
|
1601
|
+
**SLO correction is automatic:**
|
|
1602
|
+
```ruby
|
|
1603
|
+
# Пользователь пишет как раньше:
|
|
1604
|
+
E11y::SLO.error_rate # ✅ Автоматически скорректировано!
|
|
1605
|
+
|
|
1606
|
+
# Внутри:
|
|
1607
|
+
observed_errors = 50
|
|
1608
|
+
corrected_errors = observed_errors / error_sample_rate # 50 / 1.0 = 50
|
|
1609
|
+
|
|
1610
|
+
observed_success = 95
|
|
1611
|
+
corrected_success = observed_success / info_sample_rate # 95 / 0.1 = 950
|
|
1612
|
+
|
|
1613
|
+
corrected_error_rate = corrected_errors / (corrected_errors + corrected_success)
|
|
1614
|
+
# = 50 / (50 + 950) = 5% ✅ ACCURATE!
|
|
1615
|
+
```
|
|
1616
|
+
|
|
1617
|
+
---
|
|
1618
|
+
|
|
1619
|
+
**Option 2: Advanced config (for complex cases)**
|
|
1620
|
+
|
|
1621
|
+
If you need more flexibility (for example, different sample rates for HTTP 4xx vs 5xx):
|
|
1622
|
+
|
|
1623
|
+
```ruby
|
|
1624
|
+
E11y.configure do |config|
|
|
1625
|
+
config.cost_optimization do
|
|
1626
|
+
sampling do
|
|
1627
|
+
strategy :stratified_adaptive
|
|
1628
|
+
cost_budget 100_000
|
|
1629
|
+
|
|
1630
|
+
# Продвинутая стратификация по severities + http_statuses
|
|
1631
|
+
stratification do
|
|
1632
|
+
stratum :critical_errors do
|
|
1633
|
+
severities [:error, :fatal]
|
|
1634
|
+
http_statuses (500..599).to_a
|
|
1635
|
+
sample_rate 1.0 # 100%
|
|
1636
|
+
end
|
|
1637
|
+
|
|
1638
|
+
stratum :client_errors do
|
|
1639
|
+
severities [:warn]
|
|
1640
|
+
http_statuses (400..499).to_a
|
|
1641
|
+
sample_rate 0.3 # 30% (меньше чем warn, т.к. 4xx не так критично)
|
|
1642
|
+
end
|
|
1643
|
+
|
|
1644
|
+
stratum :success do
|
|
1645
|
+
severities [:info, :success]
|
|
1646
|
+
http_statuses (200..399).to_a
|
|
1647
|
+
sample_rate 0.1 # 10%
|
|
1648
|
+
end
|
|
1649
|
+
end
|
|
1650
|
+
end
|
|
1651
|
+
end
|
|
1652
|
+
end
|
|
1653
|
+
```
|
|
1654
|
+
|
|
1655
|
+
### 3.7.6. Accuracy Comparison: Random vs Stratified Sampling
|
|
1656
|
+
|
|
1657
|
+
**Scenario:** 1000 requests (950 success, 50 errors) → 95% success rate
|
|
1658
|
+
|
|
1659
|
+
| Sampling Strategy | Events Kept | Observed Success Rate | Corrected Success Rate | Error |
|
|
1660
|
+
|-------------------|-------------|----------------------|------------------------|-------|
|
|
1661
|
+
| **No Sampling** | 1000 (100%) | 95.0% | N/A | 0% ✅ |
|
|
1662
|
+
| **Random 50%** | 500 (50%) | 90-100% (varies!) | 90-100% (varies!) | ±5% ❌ |
|
|
1663
|
+
| **Stratified** | 145 (14.5%) | 65.5% (95/145) | **95.0%** (corrected) | 0% ✅ |
|
|
1664
|
+
|
|
1665
|
+
**Stratified Sampling Breakdown:**
|
|
1666
|
+
```ruby
|
|
1667
|
+
# Stratum 1: Errors (100% sampling)
|
|
1668
|
+
50 errors × 1.0 = 50 kept → corrected: 50 / 1.0 = 50
|
|
1669
|
+
|
|
1670
|
+
# Stratum 2: Warnings (50% sampling)
|
|
1671
|
+
0 warnings × 0.5 = 0 kept → corrected: 0 / 0.5 = 0
|
|
1672
|
+
|
|
1673
|
+
# Stratum 3: Success (10% sampling)
|
|
1674
|
+
950 success × 0.1 = 95 kept → corrected: 95 / 0.1 = 950
|
|
1675
|
+
|
|
1676
|
+
# Total kept: 145 events (85.5% cost savings!)
|
|
1677
|
+
# Corrected total: 1000 events
|
|
1678
|
+
# Corrected success rate: (950 + 0) / 1000 = 95% ✅ ACCURATE!
|
|
1679
|
+
```
|
|
1680
|
+
|
|
1681
|
+
### 3.7.7. Cost Savings vs Accuracy Trade-off
|
|
1682
|
+
|
|
1683
|
+
**Example: 10M events/month (9.5M success, 500K errors)**
|
|
1684
|
+
|
|
1685
|
+
| Strategy | Events Stored | Cost | Success Rate Accuracy |
|
|
1686
|
+
|----------|---------------|------|----------------------|
|
|
1687
|
+
| **No Sampling** | 10M | $1000 | 100% (baseline) ✅ |
|
|
1688
|
+
| **Random 50%** | 5M | $500 | ~95% (biased) ⚠️ |
|
|
1689
|
+
| **Stratified** | 1.45M | $145 | **99.9%** (corrected) ✅ |
|
|
1690
|
+
|
|
1691
|
+
**Stratified Breakdown:**
|
|
1692
|
+
- Errors: 500K × 100% = 500K kept (≈34% of stored events)
|
|
1693
|
+
- Success: 9.5M × 10% = 950K kept (≈66% of stored events)
|
|
1694
|
+
- **Total: 1.45M events (85.5% cost savings!)**
|
|
1695
|
+
|
|
1696
|
+
**Key Insight:** Stratified sampling provides **85% cost savings** with **99.9% accuracy** vs random sampling's **50% savings** with **95% accuracy**.
|
|
1697
|
+
|
|
1698
|
+
### 3.7.8. Testing Sampling Correction Accuracy
|
|
1699
|
+
|
|
1700
|
+
```ruby
|
|
1701
|
+
# spec/lib/e11y/slo/calculator_spec.rb
|
|
1702
|
+
RSpec.describe E11y::SLO::Calculator do
|
|
1703
|
+
describe '#calculate_success_rate with stratified sampling' do
|
|
1704
|
+
it 'accurately calculates success rate with sampling correction' do
|
|
1705
|
+
# Simulate 1000 requests (950 success, 50 errors)
|
|
1706
|
+
events = []
|
|
1707
|
+
|
|
1708
|
+
# Generate 950 success events (10% sampled → 95 kept)
|
|
1709
|
+
95.times do
|
|
1710
|
+
events << {
|
|
1711
|
+
severity: :info,
|
|
1712
|
+
payload: { http_status: 200 },
|
|
1713
|
+
metadata: {
|
|
1714
|
+
sampling_stratum: :success,
|
|
1715
|
+
sampling_rate: 0.1,
|
|
1716
|
+
sampled: true
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
end
|
|
1720
|
+
|
|
1721
|
+
# Generate 50 error events (100% sampled → 50 kept)
|
|
1722
|
+
50.times do
|
|
1723
|
+
events << {
|
|
1724
|
+
severity: :error,
|
|
1725
|
+
payload: { http_status: 500 },
|
|
1726
|
+
metadata: {
|
|
1727
|
+
sampling_stratum: :errors,
|
|
1728
|
+
sampling_rate: 1.0,
|
|
1729
|
+
sampled: true
|
|
1730
|
+
}
|
|
1731
|
+
}
|
|
1732
|
+
end
|
|
1733
|
+
|
|
1734
|
+
# Calculate SLO with correction
|
|
1735
|
+
calculator = described_class.new
|
|
1736
|
+
result = calculator.calculate_success_rate(events)
|
|
1737
|
+
|
|
1738
|
+
# Expected corrected success rate: 95%
|
|
1739
|
+
expect(result[:success_rate]).to be_within(0.001).of(0.95)
|
|
1740
|
+
expect(result[:error_rate]).to be_within(0.001).of(0.05)
|
|
1741
|
+
expect(result[:total_corrected_events]).to eq(1000)
|
|
1742
|
+
|
|
1743
|
+
# Breakdown verification
|
|
1744
|
+
expect(result[:breakdown][:success][:corrected]).to eq(950)
|
|
1745
|
+
expect(result[:breakdown][:errors][:corrected]).to eq(50)
|
|
1746
|
+
end
|
|
1747
|
+
|
|
1748
|
+
it 'matches baseline accuracy without sampling' do
|
|
1749
|
+
# Generate 1000 events without sampling
|
|
1750
|
+
baseline_events = generate_events(success: 950, errors: 50, sampled: false)
|
|
1751
|
+
baseline_rate = calculate_baseline_success_rate(baseline_events)
|
|
1752
|
+
|
|
1753
|
+
# Generate sampled events with correction
|
|
1754
|
+
sampled_events = generate_events(success: 95, errors: 50, sampled: true)
|
|
1755
|
+
corrected_rate = described_class.new.calculate_success_rate(sampled_events)[:success_rate]
|
|
1756
|
+
|
|
1757
|
+
# Should match within 1%
|
|
1758
|
+
expect(corrected_rate).to be_within(0.01).of(baseline_rate)
|
|
1759
|
+
end
|
|
1760
|
+
end
|
|
1761
|
+
end
|
|
1762
|
+
```
|
|
1763
|
+
|
|
1764
|
+
### 3.7.9. Trade-offs & SLO Accuracy (C11)
|
|
1765
|
+
|
|
1766
|
+
**Trade-offs:**
|
|
1767
|
+
|
|
1768
|
+
| Decision | Pro | Con | Rationale |
|
|
1769
|
+
|----------|-----|-----|-----------|
|
|
1770
|
+
| **Stratified sampling** | Accurate SLO metrics | Complexity (correction math) | Accuracy > simplicity |
|
|
1771
|
+
| **Always keep errors (100%)** | No error data loss | Higher cost if error rate spikes | Error visibility critical |
|
|
1772
|
+
| **Aggressive success sampling (10%)** | 90% cost savings | Large correction factor (10x) | Success events less critical |
|
|
1773
|
+
| **Sampling correction math** | Accurate percentiles | CPU overhead (~0.1ms/query) | Accuracy > performance |
|
|
1774
|
+
|
|
1775
|
+
**SLO Accuracy Guarantees:**
|
|
1776
|
+
|
|
1777
|
+
✅ **Error rate accuracy: 100%**
|
|
1778
|
+
All errors captured → no error data loss.
|
|
1779
|
+
|
|
1780
|
+
✅ **Success rate accuracy: 99.9%**
|
|
1781
|
+
Sampling correction restores true success rate (±0.1% error).
|
|
1782
|
+
|
|
1783
|
+
✅ **Latency percentiles: 95%**
|
|
1784
|
+
P99 latency within 5% of true value (correction restores distribution).
|
|
1785
|
+
|
|
1786
|
+
**Limitations:**
|
|
1787
|
+
|
|
1788
|
+
⚠️ **High error rates reduce savings:** If errors >10% of traffic, cost savings decrease
|
|
1789
|
+
**Mitigation:** Adjust success sample rate dynamically based on error rate
|
|
1790
|
+
|
|
1791
|
+
⚠️ **Correction assumes uniform distribution:** May be inaccurate if success events clustered
|
|
1792
|
+
**Mitigation:** Use time-windowed correction (per 5-minute window)
|
|
1793
|
+
|
|
1794
|
+
⚠️ **Small sample sizes:** <100 events may have large correction errors
|
|
1795
|
+
**Mitigation:** Don't apply correction for small samples, wait for more data
|
|
1796
|
+
|
|
1797
|
+
**Monitoring Metrics:**
|
|
1798
|
+
|
|
1799
|
+
```ruby
|
|
1800
|
+
# Track stratified sampling effectiveness
|
|
1801
|
+
Yabeda.e11y_sampling_decisions_total.increment(
|
|
1802
|
+
stratum: 'success',
|
|
1803
|
+
decision: 'kept',
|
|
1804
|
+
sample_rate: 0.1
|
|
1805
|
+
)
|
|
1806
|
+
|
|
1807
|
+
# Track SLO calculation accuracy
|
|
1808
|
+
Yabeda.e11y_slo_correction_factor.observe(
|
|
1809
|
+
stratum: 'success',
|
|
1810
|
+
correction_factor: 10.0
|
|
1811
|
+
)
|
|
1812
|
+
|
|
1813
|
+
# Alert on correction accuracy drift
|
|
1814
|
+
Yabeda.e11y_slo_correction_error_rate.observe(
|
|
1815
|
+
expected: 0.95,
|
|
1816
|
+
actual: 0.949,
|
|
1817
|
+
error: 0.001 # 0.1% error
|
|
1818
|
+
)
|
|
1819
|
+
```
|
|
1820
|
+
|
|
1821
|
+
**Related Conflicts:**
|
|
1822
|
+
- **C05:** Trace-aware sampling (see §3.6 above)
|
|
1823
|
+
- **UC-004:** Zero-config SLO tracking (see UC-004 for SLO calculation details)
|
|
1824
|
+
- **UC-014:** Adaptive sampling (see UC-014 for cost optimization)
|
|
1825
|
+
|
|
1826
|
+
---
|
|
1827
|
+
|
|
1828
|
+
## 4. Compression
|
|
1829
|
+
|
|
1830
|
+
### 4.1. Compression Engine
|
|
1831
|
+
|
|
1832
|
+
```ruby
|
|
1833
|
+
# lib/e11y/cost/compressor.rb
|
|
1834
|
+
module E11y
  module Cost
    # Compression back-ends used by Compressor. Each class exposes
    # #compress(data) and #decompress(data).
    #
    # Defined before Compressor because Compressor::ALGORITHMS references
    # these constants while the class body is being evaluated.
    module Algorithms
      # Gzip via Zlib.
      class Gzip
        # @param config [#compression_level] nil selects Zlib::DEFAULT_COMPRESSION
        def initialize(config)
          @level = config.compression_level || Zlib::DEFAULT_COMPRESSION
        end

        # @param data [String]
        # @return [String] gzip-compressed bytes
        def compress(data)
          io = StringIO.new
          gz = Zlib::GzipWriter.new(io, @level)
          gz.write(data)
          gz.close
          io.string
        end

        # @param data [String] gzip-compressed bytes
        # @return [String]
        def decompress(data)
          Zlib::GzipReader.new(StringIO.new(data)).read
        end
      end

      # Zstandard via the zstd-ruby gem, loaded lazily on first use.
      class Zstd
        # @param config [#compression_level] nil selects level 3
        def initialize(config)
          @level = config.compression_level || 3
          require 'zstd-ruby'
        end

        def compress(data)
          # Leading :: selects the gem's top-level constant, not this class.
          ::Zstd.compress(data, level: @level)
        end

        def decompress(data)
          ::Zstd.decompress(data)
        end
      end

      # LZ4 via the lz4-ruby gem, loaded lazily on first use.
      class LZ4
        # @param config [Object] accepted for a uniform constructor; not used
        def initialize(config)
          require 'lz4-ruby'
        end

        def compress(data)
          # A bare `LZ4` here resolves lexically to this wrapper class,
          # which has no class-level compress; :: reaches the gem constant.
          ::LZ4.compress(data)
        end

        def decompress(data)
          ::LZ4.uncompress(data)
        end
      end
    end

    # Compresses serialized payloads with a configurable algorithm, skipping
    # payloads that are too small or that do not shrink.
    class Compressor
      # Supported algorithm names and their implementations.
      ALGORITHMS = {
        gzip: Algorithms::Gzip,
        zstd: Algorithms::Zstd,
        lz4: Algorithms::LZ4
      }.freeze

      # @param config [#algorithm, #min_size_bytes, #compression_level]
      #   algorithm defaults to :gzip and min_size_bytes to 1024
      # @raise [ArgumentError] when the algorithm is not in ALGORITHMS
      def initialize(config)
        @algorithm = config.algorithm || :gzip
        @min_size = config.min_size_bytes || 1024 # 1KB

        algorithm_class = ALGORITHMS.fetch(@algorithm) do
          raise ArgumentError,
                "Unknown compression algorithm #{@algorithm.inspect} " \
                "(supported: #{ALGORITHMS.keys.join(', ')})"
        end
        @compressor = algorithm_class.new(config)
      end

      # Returns the compressed payload, or the input unchanged when it is
      # smaller than the configured minimum or compression does not shrink it.
      # The achieved ratio is reported to the 'e11y.compression.ratio'
      # histogram whenever the compressed form is used.
      #
      # NOTE(review): the return value does not say whether compression was
      # applied, while #decompress always decompresses; callers presumably
      # track that themselves — confirm.
      #
      # @param payload_string [String]
      # @return [String]
      def compress(payload_string)
        # Skip compression for small payloads
        return payload_string if payload_string.bytesize < @min_size

        compressed = @compressor.compress(payload_string)

        # Only use if compression helps
        return payload_string unless compressed.bytesize < payload_string.bytesize

        E11y::Metrics.histogram('e11y.compression.ratio',
          payload_string.bytesize.to_f / compressed.bytesize,
          { algorithm: @algorithm }
        )

        compressed
      end

      # @param compressed_string [String] output of the configured algorithm
      # @return [String]
      def decompress(compressed_string)
        @compressor.decompress(compressed_string)
      end
    end
  end
end
|
|
1923
|
+
```
|
|
1924
|
+
|
|
1925
|
+
### 4.2. Compression Benchmarks
|
|
1926
|
+
|
|
1927
|
+
```
|
|
1928
|
+
Algorithm | Ratio | Speed (MB/s) | CPU Usage
|
|
1929
|
+
----------|-------|--------------|----------
|
|
1930
|
+
Gzip | 5:1 | 50 | Medium
|
|
1931
|
+
Zstd | 6:1 | 200 | Low
|
|
1932
|
+
LZ4 | 3:1 | 500 | Very Low
|
|
1933
|
+
```
|
|
1934
|
+
|
|
1935
|
+
---
|
|
1936
|
+
|
|
1937
|
+
## 5. Smart Routing
|
|
1938
|
+
|
|
1939
|
+
### 5.1. Routing Strategy
|
|
1940
|
+
|
|
1941
|
+
```ruby
|
|
1942
|
+
# lib/e11y/cost/smart_router.rb
|
|
1943
|
+
module E11y
  module Cost
    # Chooses the adapters an event is delivered to by evaluating routing
    # rules in order; the first matching rule wins.
    class SmartRouter
      # @param config [#routing_rules] ordered rules; nil is treated as
      #   "no rules", so every event goes to the default adapters
      def initialize(config)
        @rules = config.routing_rules || []
      end

      # Returns the adapters for the event and reports the decision to the
      # 'e11y.routing.decision' counter.
      #
      # A matching rule with an empty adapter list yields [] (the event is
      # routed nowhere); the defaults are used only when no rule matches.
      #
      # @param event_data [Hash] event; :event_name is used as a metric label
      # @return [Array] adapter names
      def route(event_data)
        matched_rule = @rules.find { |rule| rule.matches?(event_data) }
        adapters = matched_rule&.adapters || default_adapters

        E11y::Metrics.increment('e11y.routing.decision', {
          event_name: event_data[:event_name],
          rule: matched_rule&.name || 'default',
          adapters: adapters.join(',')
        })

        adapters
      end

      private

      # Adapters used when no rule matches: every adapter name known to
      # the global configuration.
      def default_adapters
        E11y.config.adapters.names
      end
    end

    # A named predicate paired with the adapters matching events go to.
    class RoutingRule
      attr_reader :name, :adapters

      # @param name [String] rule name, used as a metric label
      # @param adapters [Array] adapter names for matching events
      # @yieldparam event_data [Hash]
      # @yieldreturn [Boolean] whether the rule applies to the event
      # @raise [ArgumentError] when no condition block is given
      def initialize(name:, adapters:, &condition)
        # Fail at definition time instead of on the first routed event.
        raise ArgumentError, "routing rule #{name.inspect} requires a condition block" unless condition

        @name = name
        @adapters = adapters
        @condition = condition
      end

      # @param event_data [Hash]
      # @return [Object] the condition block's (truthy or falsy) result
      def matches?(event_data)
        @condition.call(event_data)
      end
    end
  end
end
|
|
1989
|
+
```
|
|
1990
|
+
|
|
1991
|
+
### 5.2. Configuration
|
|
1992
|
+
|
|
1993
|
+
```ruby
|
|
1994
|
+
E11y.configure do |config|
  config.cost_optimization.smart_routing do
    # Rule 1: Critical events → All adapters
    rule 'critical_events', adapters: [:loki, :sentry, :s3] do |event|
      # Symbols compare alphabetically, so `severity >= :error` would also
      # match :info and :warn; list the critical severities explicitly.
      %i[error fatal].include?(event[:severity]) ||
        event[:event_name].start_with?('payment.') ||
        event[:event_name].start_with?('order.')
    end

    # Rule 2: Debug events → File only (not Loki)
    rule 'debug_events', adapters: [:file] do |event|
      event[:severity] == :debug
    end

    # Rule 3: Archive events → S3 only
    rule 'archive_events', adapters: [:s3] do |event|
      event[:payload][:archive] == true
    end

    # Rule 4: Health checks → /dev/null
    rule 'health_checks', adapters: [] do |event|
      event[:event_name].include?('health_check')
    end

    # Default: Everything → Loki
    default_adapters [:loki]
  end
end
|
|
2022
|
+
```
|
|
2023
|
+
|
|
2024
|
+
---
|
|
2025
|
+
|
|
2026
|
+
## 6. Tiered Storage
|
|
2027
|
+
|
|
2028
|
+
### 6.1. Retention Tagging
|
|
2029
|
+
|
|
2030
|
+
**Design Decision:** E11y adds `retention_until` timestamp, downstream systems handle deletion.
|
|
2031
|
+
|
|
2032
|
+
```ruby
|
|
2033
|
+
# lib/e11y/cost/retention_tagger.rb
|
|
2034
|
+
module E11y
  module Cost
    # Stamps events with how long they should be kept. Only the tag is
    # written here; nothing in this class deletes data.
    class RetentionTagger
      # Retention applied when neither the payload nor any rule specifies one.
      DEFAULT_RETENTION_DAYS = 30

      SECONDS_PER_DAY = 86_400

      # @param config [#retention_rules] ordered rules; nil means no rules
      def initialize(config)
        @retention_rules = config.retention_rules || []
      end

      # Adds :retention_until (ISO 8601 string) and :retention_days to the
      # event and records the value in the 'e11y.retention.days' histogram.
      #
      # @param event_data [Hash] mutated in place
      # @return [Hash] the same event_data
      def tag_event(event_data)
        retention_days = determine_retention(event_data)

        # Plain seconds arithmetic, so no Integer#days core extension is needed.
        retention_until = Time.now + (retention_days * SECONDS_PER_DAY)

        event_data[:retention_until] = retention_until.iso8601
        event_data[:retention_days] = retention_days

        E11y::Metrics.histogram('e11y.retention.days', retention_days, {
          event_name: event_data[:event_name]
        })

        event_data
      end

      private

      # Resolution order: explicit payload[:retention_days], then the first
      # matching rule, then DEFAULT_RETENTION_DAYS.
      def determine_retention(event_data)
        # dig tolerates events that carry no payload at all.
        explicit = event_data.dig(:payload, :retention_days)
        return explicit if explicit

        rule = @retention_rules.find { |candidate| candidate.matches?(event_data) }
        return rule.retention_days if rule

        DEFAULT_RETENTION_DAYS
      end
    end

    # A retention period paired with the predicate selecting its events.
    class RetentionRule
      attr_reader :retention_days

      # @param retention_days [Integer] days to keep matching events
      # @yieldparam event_data [Hash]
      # @yieldreturn [Boolean] whether the rule applies to the event
      # @raise [ArgumentError] when no condition block is given
      def initialize(retention_days:, &condition)
        # Fail at definition time instead of on the first tagged event.
        raise ArgumentError, 'retention rule requires a condition block' unless condition

        @retention_days = retention_days
        @condition = condition
      end

      # @param event_data [Hash]
      # @return [Object] the condition block's (truthy or falsy) result
      def matches?(event_data)
        @condition.call(event_data)
      end
    end
  end
end
|
|
2091
|
+
```
|
|
2092
|
+
|
|
2093
|
+
### 6.2. Configuration
|
|
2094
|
+
|
|
2095
|
+
```ruby
|
|
2096
|
+
E11y.configure do |config|
  config.cost_optimization.tiered_storage do
    # Rule 1: Audit events → 7 years (compliance)
    retention_rule 2555 do |event| # 7 * 365 days
      event[:event_name].start_with?('audit.')
    end

    # Rule 2: Payment events → 2 years (legal)
    retention_rule 730 do |event| # 2 * 365 days
      event[:event_name].start_with?('payment.')
    end

    # Rule 3: Debug events → 7 days (troubleshooting)
    retention_rule 7 do |event|
      event[:severity] == :debug
    end

    # Rule 4: Errors → 90 days (analysis)
    retention_rule 90 do |event|
      # Symbols compare alphabetically, so `severity >= :error` would also
      # match :info and :warn; list the error severities explicitly.
      %i[error fatal].include?(event[:severity])
    end

    # Default: 30 days
    default_retention 30
  end
end
|
|
2122
|
+
```
|
|
2123
|
+
|
|
2124
|
+
### 6.3. Downstream Integration
|
|
2125
|
+
|
|
2126
|
+
**Elasticsearch ILM:**
|
|
2127
|
+
|
|
2128
|
+
```json
|
|
2129
|
+
{
|
|
2130
|
+
"policy": {
|
|
2131
|
+
"phases": {
|
|
2132
|
+
"hot": {
|
|
2133
|
+
"actions": {}
|
|
2134
|
+
},
|
|
2135
|
+
"delete": {
|
|
2136
|
+
"min_age": "0d",
|
|
2137
|
+
"actions": {
|
|
2138
|
+
"delete": {
|
|
2139
|
+
"delete_searchable_snapshot": false
|
|
2140
|
+
}
|
|
2141
|
+
}
|
|
2142
|
+
}
|
|
2143
|
+
}
|
|
2144
|
+
}
|
|
2145
|
+
}
|
|
2146
|
+
```
|
|
2147
|
+
|
|
2148
|
+
**Query for deletion:**
|
|
2149
|
+
|
|
2150
|
+
```ruby
|
|
2151
|
+
# Elasticsearch query
|
|
2152
|
+
POST /e11y-events-*/_delete_by_query
|
|
2153
|
+
{
|
|
2154
|
+
"query": {
|
|
2155
|
+
"range": {
|
|
2156
|
+
"retention_until": {
|
|
2157
|
+
"lt": "now"
|
|
2158
|
+
}
|
|
2159
|
+
}
|
|
2160
|
+
}
|
|
2161
|
+
}
|
|
2162
|
+
```
|
|
2163
|
+
|
|
2164
|
+
**S3 Lifecycle:**
|
|
2165
|
+
|
|
2166
|
+
```json
|
|
2167
|
+
{
|
|
2168
|
+
"Rules": [
|
|
2169
|
+
{
|
|
2170
|
+
"Id": "delete-expired-events",
|
|
2171
|
+
"Status": "Enabled",
|
|
2172
|
+
"Filter": {
|
|
2173
|
+
"Prefix": "events/"
|
|
2174
|
+
},
|
|
2175
|
+
"Expiration": {
|
|
2176
|
+
"Days": 365
|
|
2177
|
+
}
|
|
2178
|
+
}
|
|
2179
|
+
]
|
|
2180
|
+
}
|
|
2181
|
+
```
|
|
2182
|
+
|
|
2183
|
+
---
|
|
2184
|
+
|
|
2185
|
+
## 7. Payload Minimization
|
|
2186
|
+
|
|
2187
|
+
### 7.1. Payload Optimizer
|
|
2188
|
+
|
|
2189
|
+
```ruby
|
|
2190
|
+
# lib/e11y/cost/payload_minimizer.rb
|
|
2191
|
+
module E11y
  module Cost
    # Shrinks event payloads by dropping nil values and truncating
    # over-long strings and arrays.
    class PayloadMinimizer
      # @param config [#remove_nulls, #truncate_strings_at, #truncate_arrays_at]
      #   remove_nulls defaults to true, truncate_strings_at to 1000 and
      #   truncate_arrays_at to 100 when nil
      def initialize(config)
        # `config.remove_nulls || true` is always true, which makes it
        # impossible to turn the option off; only nil selects the default.
        @remove_nulls = config.remove_nulls.nil? ? true : config.remove_nulls
        @truncate_strings = config.truncate_strings_at || 1000
        @truncate_arrays = config.truncate_arrays_at || 100
      end

      # Returns a minimized deep copy of the payload; the input is left
      # untouched. Nested hashes are minimized recursively. Array elements
      # are not inspected; arrays are only truncated.
      #
      # @param payload [Hash] must respond to #deep_dup
      #   (presumably ActiveSupport's — confirm)
      # @return [Hash]
      def minimize(payload)
        # Copy once here; the recursion then works in place on the copy
        # instead of deep-copying every nested hash again.
        minimize_in_place(payload.deep_dup)
      end

      private

      # Applies nil removal and truncation to +hash+ itself and returns it.
      def minimize_in_place(hash)
        # Hash#compact! drops nil values only; empty strings/arrays stay.
        hash.compact! if @remove_nulls

        hash.transform_values! do |value|
          case value
          when String then truncate_string(value)
          when Array then truncate_array(value)
          when Hash then minimize_in_place(value)
          else value
          end
        end
      end

      # Cuts the string at the limit and appends a marker with the number
      # of removed characters; shorter strings are returned as-is.
      def truncate_string(str)
        return str if str.length <= @truncate_strings

        "#{str[0...@truncate_strings]}... [truncated #{str.length - @truncate_strings} chars]"
      end

      # Keeps the first elements up to the limit and appends a marker
      # string with the number of removed items.
      def truncate_array(arr)
        return arr if arr.length <= @truncate_arrays

        arr.first(@truncate_arrays) + ["... [truncated #{arr.length - @truncate_arrays} items]"]
      end
    end
  end
end
|
|
2239
|
+
```
|
|
2240
|
+
|
|
2241
|
+
---
|
|
2242
|
+
|
|
2243
|
+
## 8. Cardinality Protection (C04 Resolution) ⚠️ CRITICAL
|
|
2244
|
+
|
|
2245
|
+
**Reference:** [CONFLICT-ANALYSIS.md - C04: High-Cardinality Metrics × OpenTelemetry Attributes](../researches/CONFLICT-ANALYSIS.md#c04-high-cardinality-metrics--opentelemetry-attributes)
|
|
2246
|
+
|
|
2247
|
+
### 8.1. The Problem: Cardinality Explosion Across Backends
|
|
2248
|
+
|
|
2249
|
+
**Scenario:** UC-013 (High-Cardinality Protection) was designed to protect **Yabeda/Prometheus metrics** from cardinality explosion. However, cardinality explosion is a **cost optimization problem** affecting **ALL backends**:
|
|
2250
|
+
|
|
2251
|
+
- ❌ **Yabeda (Prometheus):** High cardinality → query slowness, OOM
|
|
2252
|
+
- ❌ **OpenTelemetry (OTLP):** High cardinality → cost spikes (Datadog, Honeycomb charge per unique attribute combination)
|
|
2253
|
+
- ❌ **Loki (Logs):** High cardinality in labels → index bloat, slow queries
|
|
2254
|
+
|
|
2255
|
+
**Example:**
|
|
2256
|
+
|
|
2257
|
+
```ruby
|
|
2258
|
+
# Configuration (UC-013):
|
|
2259
|
+
E11y.configure do |config|
|
|
2260
|
+
config.cardinality_protection do
|
|
2261
|
+
enabled true
|
|
2262
|
+
max_unique_values 100 # Per label
|
|
2263
|
+
protected_labels [:user_id, :order_id, :session_id]
|
|
2264
|
+
end
|
|
2265
|
+
end
|
|
2266
|
+
|
|
2267
|
+
# Event tracking (10,000 unique users):
|
|
2268
|
+
10_000.times do |i|
|
|
2269
|
+
Events::OrderCreated.track(
|
|
2270
|
+
order_id: "order-#{i}", # ← 10,000 unique values!
|
|
2271
|
+
user_id: "user-#{i}", # ← 10,000 unique values!
|
|
2272
|
+
amount: 99.99
|
|
2273
|
+
)
|
|
2274
|
+
end
|
|
2275
|
+
```
|
|
2276
|
+
|
|
2277
|
+
**Result BEFORE C04 fix:**
|
|
2278
|
+
|
|
2279
|
+
```
|
|
2280
|
+
✅ Yabeda (Prometheus metrics):
|
|
2281
|
+
- Cardinality protection ACTIVE
|
|
2282
|
+
- Only first 100 unique order_id/user_id tracked
|
|
2283
|
+
- Rest grouped as [OTHER]
|
|
2284
|
+
- Prometheus cardinality: 100
|
|
2285
|
+
|
|
2286
|
+
❌ OpenTelemetry (span attributes):
|
|
2287
|
+
- Cardinality protection BYPASSED
|
|
2288
|
+
- ALL 10,000 unique order_id/user_id exported
|
|
2289
|
+
- OTLP backend cardinality: 10,000
|
|
2290
|
+
- Cost spike: 100× expected!
|
|
2291
|
+
|
|
2292
|
+
❌ Loki (log labels):
|
|
2293
|
+
- Cardinality protection BYPASSED
|
|
2294
|
+
- ALL 10,000 unique order_id/user_id in labels
|
|
2295
|
+
- Index bloat, slow queries
|
|
2296
|
+
```
|
|
2297
|
+
|
|
2298
|
+
**Impact:**
|
|
2299
|
+
- ❌ **Cost explosion:** OTLP backends (Datadog, Honeycomb, Elastic) charge per unique attribute combination ($0.10/span → $1,000/day)
|
|
2300
|
+
- ❌ **Data loss:** Backend hits cardinality limit (e.g., Datadog 1000/metric), starts dropping spans
|
|
2301
|
+
- ❌ **Inconsistent protection:** Yabeda protected, OpenTelemetry/Loki not
|
|
2302
|
+
- ❌ **Misleading config:** UC-013 promises protection, but only covers **metrics** (not traces/logs)
|
|
2303
|
+
|
|
2304
|
+
### 8.2. Decision: Unified Cardinality Protection for All Backends
|
|
2305
|
+
|
|
2306
|
+
**Strategy:** Apply UC-013 cardinality protection to **ALL adapters** (Yabeda, OpenTelemetry, Loki) using a unified `CardinalityFilter` middleware.
|
|
2307
|
+
|
|
2308
|
+
**Rules:**
|
|
2309
|
+
1. **Single source of truth:** `E11y.config.cardinality_protection` applies to ALL backends by default
|
|
2310
|
+
2. **Per-backend overrides:** Allow separate limits for backends with different cardinality handling (e.g., OTLP can handle 1000, Prometheus only 100)
|
|
2311
|
+
3. **Apply in middleware:** Filter event payload in `CardinalityFilter` middleware before adapters
|
|
2312
|
+
4. **[OTHER] grouping:** Group high-cardinality values as `[OTHER]` (consistent across backends)
|
|
2313
|
+
5. **Monitor metrics:** Track filtered labels for visibility
|
|
2314
|
+
|
|
2315
|
+
### 8.3. Configuration: Inherit from Global Settings
|
|
2316
|
+
|
|
2317
|
+
```ruby
|
|
2318
|
+
# config/initializers/e11y.rb
|
|
2319
|
+
E11y.configure do |config|
|
|
2320
|
+
# ✅ GLOBAL cardinality protection (applies to ALL backends by default)
|
|
2321
|
+
config.cardinality_protection do
|
|
2322
|
+
enabled true
|
|
2323
|
+
max_unique_values 100 # Conservative default (Prometheus-safe)
|
|
2324
|
+
protected_labels [:user_id, :order_id, :session_id, :ip_address, :tenant_id]
|
|
2325
|
+
end
|
|
2326
|
+
|
|
2327
|
+
# Optional: Per-backend overrides (if needed)
|
|
2328
|
+
config.adapters do
|
|
2329
|
+
# Yabeda: Use global settings (default)
|
|
2330
|
+
yabeda do
|
|
2331
|
+
cardinality_protection.inherit_from :global
|
|
2332
|
+
end
|
|
2333
|
+
|
|
2334
|
+
# OpenTelemetry: Higher limits OK (OTLP backends handle more)
|
|
2335
|
+
opentelemetry do
|
|
2336
|
+
cardinality_protection do
|
|
2337
|
+
inherit_from :global # OR override:
|
|
2338
|
+
# max_unique_values 1000 # OTLP backends (Datadog) handle more
|
|
2339
|
+
# protected_labels [:user_id, :order_id] # Subset of global
|
|
2340
|
+
end
|
|
2341
|
+
end
|
|
2342
|
+
|
|
2343
|
+
# Loki: Use global settings (label cardinality matters!)
|
|
2344
|
+
loki do
|
|
2345
|
+
cardinality_protection.inherit_from :global
|
|
2346
|
+
end
|
|
2347
|
+
end
|
|
2348
|
+
end
|
|
2349
|
+
```
|
|
2350
|
+
|
|
2351
|
+
**Environment-specific examples:**
|
|
2352
|
+
|
|
2353
|
+
```ruby
|
|
2354
|
+
# Production: Strict limits (cost-sensitive)
|
|
2355
|
+
# config/environments/production.rb
|
|
2356
|
+
E11y.configure do |config|
|
|
2357
|
+
config.cardinality_protection do
|
|
2358
|
+
enabled true
|
|
2359
|
+
max_unique_values 100
|
|
2360
|
+
protected_labels [:user_id, :order_id, :session_id, :tenant_id]
|
|
2361
|
+
end
|
|
2362
|
+
end
|
|
2363
|
+
|
|
2364
|
+
# Development: No limits (full visibility)
|
|
2365
|
+
# config/environments/development.rb
|
|
2366
|
+
E11y.configure do |config|
|
|
2367
|
+
config.cardinality_protection.enabled false
|
|
2368
|
+
end
|
|
2369
|
+
|
|
2370
|
+
# Staging: Moderate limits (balance cost vs debugging)
|
|
2371
|
+
# config/environments/staging.rb
|
|
2372
|
+
E11y.configure do |config|
|
|
2373
|
+
config.cardinality_protection do
|
|
2374
|
+
max_unique_values 500 # More than prod, less than unlimited
|
|
2375
|
+
end
|
|
2376
|
+
|
|
2377
|
+
# OTLP backend can handle even more
|
|
2378
|
+
config.adapters.opentelemetry do
|
|
2379
|
+
cardinality_protection.max_unique_values 1000
|
|
2380
|
+
end
|
|
2381
|
+
end
|
|
2382
|
+
```
|
|
2383
|
+
|
|
2384
|
+
### 8.4. Implementation: Apply to Yabeda + OpenTelemetry
|
|
2385
|
+
|
|
2386
|
+
**CardinalityFilter middleware (unified for all backends):**
|
|
2387
|
+
|
|
2388
|
+
```ruby
|
|
2389
|
+
# lib/e11y/middleware/cardinality_filter.rb
|
|
2390
|
+
module E11y
  module Middleware
    # Pipeline middleware that caps the number of distinct values seen for
    # each protected payload label. Once a label has reached
    # +max_unique_values+ distinct values, any value not seen before is
    # replaced with "[OTHER]" before the event is passed down the chain.
    #
    # Settings are read from +E11y.config.cardinality_protection+ on every call.
    # Tracking state lives on the middleware instance.
    class CardinalityFilter
      OTHER_VALUE = '[OTHER]'

      # @param app [#call] next element of the middleware chain
      def initialize(app)
        @app = app
      end

      # Filters the event payload (when protection is enabled) and forwards
      # the event to the next middleware.
      #
      # @param event [#payload, #payload=]
      # @return whatever the downstream +app+ returns
      def call(event)
        if E11y.config.cardinality_protection.enabled
          event.payload = filter_payload(event.payload)
        end

        @app.call(event)
      end

      # Class method for adapter-specific overrides.
      #
      # NOTE(review): placeholder only — the implementation is omitted in this
      # document, so as written this method returns nil. Adapters must not
      # rely on it until it is implemented.
      def self.filter(payload, max_unique_values:, protected_labels:)
        # Same logic as instance method, but with custom config
        # Used by adapters with per-backend overrides
        # (Implementation omitted for brevity)
      end

      private

      # Returns a shallow copy of +payload+ with over-limit protected labels
      # replaced by "[OTHER]". The caller's hash is not modified.
      def filter_payload(payload)
        filtered = payload.dup
        settings = E11y.config.cardinality_protection
        max_unique = settings.max_unique_values

        settings.protected_labels.each do |label|
          next unless filtered.key?(label)

          original_value = filtered[label]

          if exceeds_limit?(label, original_value, max_unique)
            filtered[label] = OTHER_VALUE

            E11y::Metrics.increment('e11y.cardinality.filtered_labels', {
              label: label,
              backend: 'all' # Applies to all adapters
            })

            E11y.logger.debug do
              "Cardinality limit exceeded for #{label}: #{original_value} → [OTHER]"
            end
          else
            track_unique_value(label, original_value)
          end
        end

        filtered
      end

      # True when +value+ is new for +label+ and the label already holds
      # +max_unique_values+ distinct values. Already-tracked values always pass.
      #
      # Uses a per-label map, so the check is two hash lookups instead of a
      # prefix scan over every tracked "label:value" key, and a ':' inside a
      # label or value can no longer make two labels share a budget.
      def exceeds_limit?(label, value, max_unique_values)
        seen = values_for(label)
        return false if seen.key?(value.to_s)

        seen.size >= max_unique_values
      end

      # Remembers +value+ as one of the admitted values for +label+.
      # Values are keyed by their string form, as before.
      #
      # NOTE(review): check (exceeds_limit?) and insert are separate steps, so
      # concurrent callers can overshoot the limit slightly — same as before.
      def track_unique_value(label, value)
        values_for(label)[value.to_s] = true
      end

      # Map of admitted values for one label, created on first use.
      def values_for(label)
        unique_values_cache.compute_if_absent(label.to_s) { Concurrent::Map.new }
      end

      # label (String) => Concurrent::Map of value (String) => true
      def unique_values_cache
        @unique_values_cache ||= Concurrent::Map.new
      end
    end
  end
end
|
|
2471
|
+
```
|
|
2472
|
+
|
|
2473
|
+
**Yabeda adapter (uses filtered payload from middleware):**
|
|
2474
|
+
|
|
2475
|
+
```ruby
|
|
2476
|
+
# lib/e11y/adapters/yabeda_collector.rb
|
|
2477
|
+
module E11y
  module Adapters
    # Adapter that turns each event of a batch into one increment of the
    # Yabeda.e11y.events counter.
    class YabedaCollector < Base
      # Increments the events counter once per event.
      #
      # @param events [Enumerable] events responding to #event_name, #severity and #payload
      def send_batch(events)
        events.each { |event| Yabeda.e11y.events.increment(labels_for(event)) }
      end

      private

      # Label set for one event. The :user_id / :order_id values are taken
      # from the payload as-is; this adapter assumes the CardinalityFilter
      # middleware has already replaced over-limit values with [OTHER].
      def labels_for(event)
        {
          event_name: event.event_name,
          severity: event.severity,
          user_id: event.payload[:user_id],
          order_id: event.payload[:order_id]
        }
      end
    end
  end
end
|
|
2496
|
+
```
|
|
2497
|
+
|
|
2498
|
+
**OpenTelemetry adapter (uses filtered payload from middleware):**
|
|
2499
|
+
|
|
2500
|
+
```ruby
|
|
2501
|
+
# lib/e11y/adapters/opentelemetry_collector.rb
|
|
2502
|
+
module E11y
  module Adapters
    # Adapter that exports events as OpenTelemetry spans, one span per event.
    class OpenTelemetryCollector < Base
      # Exports every event of the batch as a span, but only while
      # @export_traces is truthy (the flag is set outside this snippet).
      #
      # @param events [Enumerable] events responding to #event_name, #payload,
      #   #severity and #timestamp
      def send_batch(events)
        events.each { |event| export_trace(event) if @export_traces }
      end

      private

      # Opens a span named after the event and copies the payload plus three
      # metadata fields onto it as attributes.
      def export_trace(event)
        otel_tracer = ::OpenTelemetry.tracer_provider.tracer('e11y')

        otel_tracer.in_span(event.event_name) do |span|
          # Payload is written verbatim; it is assumed to have been
          # cardinality-filtered by the middleware before reaching the adapter.
          event.payload.each { |key, value| span.set_attribute(key.to_s, value) }

          # Event metadata
          span.set_attribute('event.name', event.event_name)
          span.set_attribute('event.severity', event.severity.to_s)
          span.set_attribute('event.timestamp', event.timestamp.iso8601)
        end
      end
    end
  end
end
|
|
2532
|
+
```
|
|
2533
|
+
|
|
2534
|
+
### 8.5. Cost Impact: Before vs After Protection
|
|
2535
|
+
|
|
2536
|
+
**Scenario:** 10,000 orders/day, each with unique `order_id` and `user_id`
|
|
2537
|
+
|
|
2538
|
+
**BEFORE C04 fix (cardinality unprotected):**
|
|
2539
|
+
|
|
2540
|
+
```ruby
|
|
2541
|
+
# 10,000 orders/day
|
|
2542
|
+
10_000.times do |i|
|
|
2543
|
+
Events::OrderCreated.track(
|
|
2544
|
+
order_id: "order-#{i}", # 10,000 unique values
|
|
2545
|
+
user_id: "user-#{i % 5000}", # 5,000 unique users
|
|
2546
|
+
amount: rand(10..500)
|
|
2547
|
+
)
|
|
2548
|
+
end
|
|
2549
|
+
|
|
2550
|
+
# Cost in OTLP backend (e.g., Datadog):
|
|
2551
|
+
# - Span attribute cardinality: order_id=10,000, user_id=5,000
|
|
2552
|
+
# - Datadog pricing: $0.10/span with high-cardinality attributes
|
|
2553
|
+
# - Daily cost: $0.10 × 10,000 = $1,000/day
|
|
2554
|
+
# - Monthly cost: $30,000/month ❌
|
|
2555
|
+
```
|
|
2556
|
+
|
|
2557
|
+
**AFTER C04 fix (cardinality protected):**
|
|
2558
|
+
|
|
2559
|
+
```ruby
|
|
2560
|
+
# Same events, but cardinality protection ENABLED
|
|
2561
|
+
E11y.configure do |config|
|
|
2562
|
+
config.cardinality_protection do
|
|
2563
|
+
enabled true
|
|
2564
|
+
max_unique_values 100
|
|
2565
|
+
protected_labels [:user_id, :order_id]
|
|
2566
|
+
end
|
|
2567
|
+
end
|
|
2568
|
+
|
|
2569
|
+
# 10,000 orders/day (same workload)
|
|
2570
|
+
10_000.times do |i|
|
|
2571
|
+
Events::OrderCreated.track(
|
|
2572
|
+
order_id: "order-#{i}", # Filtered to 100 + [OTHER]
|
|
2573
|
+
user_id: "user-#{i % 5000}", # Filtered to 100 + [OTHER]
|
|
2574
|
+
amount: rand(10..500)
|
|
2575
|
+
)
|
|
2576
|
+
end
|
|
2577
|
+
|
|
2578
|
+
# Cost in OTLP backend (e.g., Datadog):
|
|
2579
|
+
# - Span attribute cardinality: order_id=101 (100 + [OTHER]), user_id=101
|
|
2580
|
+
# - Datadog pricing: $0.01/span (low-cardinality attributes)
|
|
2581
|
+
# - Daily cost: $0.01 × 10,000 = $100/day ✅
|
|
2582
|
+
# - Monthly cost: $3,000/month
|
|
2583
|
+
# - Monthly savings: $27,000 💰 (90% reduction!)
|
|
2584
|
+
```
|
|
2585
|
+
|
|
2586
|
+
### 8.6. Monitoring Metrics
|
|
2587
|
+
|
|
2588
|
+
**Key metrics for cardinality protection:**
|
|
2589
|
+
|
|
2590
|
+
```ruby
|
|
2591
|
+
# 1. Filtered labels (cardinality protection triggered)
|
|
2592
|
+
E11y::Metrics.increment('e11y.cardinality.filtered_labels', {
|
|
2593
|
+
label: 'user_id',
|
|
2594
|
+
backend: 'all', # or 'yabeda', 'opentelemetry', 'loki'
|
|
2595
|
+
original_value_hash: Digest::SHA256.hexdigest(original_value)[0..7]
|
|
2596
|
+
})
|
|
2597
|
+
|
|
2598
|
+
# 2. Unique values tracked per label (current cardinality)
|
|
2599
|
+
E11y::Metrics.gauge('e11y.cardinality.unique_values',
|
|
2600
|
+
CardinalityFilter.unique_values_count(label),
|
|
2601
|
+
{ label: label }
|
|
2602
|
+
)
|
|
2603
|
+
|
|
2604
|
+
# 3. Cardinality limit breaches (labels hitting max)
|
|
2605
|
+
E11y::Metrics.increment('e11y.cardinality.limit_breached', {
|
|
2606
|
+
label: label,
|
|
2607
|
+
max_unique_values: max_unique_values
|
|
2608
|
+
})
|
|
2609
|
+
```
|
|
2610
|
+
|
|
2611
|
+
**Grafana dashboard queries:**
|
|
2612
|
+
|
|
2613
|
+
```promql
|
|
2614
|
+
# Cardinality protection rate (% of labels filtered)
|
|
2615
|
+
rate(e11y_cardinality_filtered_labels_total[5m])
|
|
2616
|
+
/
|
|
2617
|
+
rate(e11y_events_tracked_total[5m]) * 100
|
|
2618
|
+
|
|
2619
|
+
# Labels at risk (approaching cardinality limit)
|
|
2620
|
+
e11y_cardinality_unique_values
|
|
2621
|
+
/
|
|
2622
|
+
100 * 100 > 80 # 80% of max_unique_values (100)
|
|
2623
|
+
|
|
2624
|
+
# Top high-cardinality labels
|
|
2625
|
+
topk(10,
|
|
2626
|
+
sum by (label) (
|
|
2627
|
+
rate(e11y_cardinality_filtered_labels_total[1h])
|
|
2628
|
+
)
|
|
2629
|
+
)
|
|
2630
|
+
|
|
2631
|
+
# Cost savings estimate (assume $0.10 per unique span attribute)
|
|
2632
|
+
sum(increase(e11y_cardinality_filtered_labels_total[1d])) * 0.10
|
|
2633
|
+
# Result: Daily $ saved
|
|
2634
|
+
```
|
|
2635
|
+
|
|
2636
|
+
**Alert rules:**
|
|
2637
|
+
|
|
2638
|
+
```yaml
|
|
2639
|
+
# Alert if too many labels being filtered (config too strict?)
|
|
2640
|
+
- alert: E11yCardinalityHighFilterRate
|
|
2641
|
+
expr: |
|
|
2642
|
+
rate(e11y_cardinality_filtered_labels_total[5m])
|
|
2643
|
+
/
|
|
2644
|
+
rate(e11y_events_tracked_total[5m]) > 0.5
|
|
2645
|
+
for: 15m
|
|
2646
|
+
annotations:
|
|
2647
|
+
summary: "E11y filtering >50% of labels (cardinality config too strict?)"
|
|
2648
|
+
|
|
2649
|
+
# Alert if label approaching cardinality limit
|
|
2650
|
+
- alert: E11yCardinalityLimitApproaching
|
|
2651
|
+
expr: |
|
|
2652
|
+
e11y_cardinality_unique_values
|
|
2653
|
+
/
|
|
2654
|
+
100 > 0.9
|
|
2655
|
+
for: 10m
|
|
2656
|
+
annotations:
|
|
2657
|
+
summary: "Label {{ $labels.label }} at 90% of cardinality limit (100 unique values)"
|
|
2658
|
+
|
|
2659
|
+
# Alert if cardinality protection disabled in production
|
|
2660
|
+
- alert: E11yCardinalityProtectionDisabled
|
|
2661
|
+
expr: |
|
|
2662
|
+
e11y_config_cardinality_protection_enabled{env="production"} == 0
|
|
2663
|
+
for: 5m
|
|
2664
|
+
annotations:
|
|
2665
|
+
summary: "⚠️ Cardinality protection DISABLED in production (cost risk!)"
|
|
2666
|
+
```
|
|
2667
|
+
|
|
2668
|
+
### 8.7. Trade-offs (C04 Resolution)
|
|
2669
|
+
|
|
2670
|
+
| Aspect | Pros | Cons | Mitigation |
|
|
2671
|
+
|--------|------|------|------------|
|
|
2672
|
+
| **Unified protection** | Consistent across all backends | One size doesn't fit all backends | Per-backend overrides (`inherit_from :global` or custom) |
|
|
2673
|
+
| **[OTHER] grouping** | Prevents cost explosion | Loses context for debugging | Log original values at debug level + query by `original_value_hash` |
|
|
2674
|
+
| **Global config** | Simple, DRY | May not fit all backend limits | Environment-specific: prod=100, staging=500, dev=unlimited |
|
|
2675
|
+
| **Middleware filtering** | Centralized, applies to all adapters | Performance overhead (filter per event) | Cache cardinality state (Concurrent::Map) |
|
|
2676
|
+
| **max_unique_values 100** | Conservative, safe for Prometheus | May be too strict for OTLP backends | Per-backend override: OTLP=1000, Yabeda=100 |
|
|
2677
|
+
| **protected_labels config** | Explicit control | Need to identify high-cardinality labels upfront | Monitor `limit_breached` metric, add labels incrementally |
|
|
2678
|
+
|
|
2679
|
+
---
|
|
2680
|
+
|
|
2681
|
+
## 9. Cost Metrics
|
|
2682
|
+
|
|
2683
|
+
### 9.1. Cost Tracking
|
|
2684
|
+
|
|
2685
|
+
```ruby
|
|
2686
|
+
# lib/e11y/cost/tracker.rb
|
|
2687
|
+
module E11y
  module Cost
    # Records an estimated USD cost for every event/adapter pair in the
    # 'e11y.cost.per_event' histogram.
    class Tracker
      # Estimated price per adapter in USD per GB of shipped data, which for a
      # nominal ~1 KB event is roughly the price per 1M events.
      ADAPTER_COSTS = {
        loki: 0.50,   # $0.50/GB ≈ $0.50/1M events
        # NOTE(review): the original comment said "$0.01/event", which would be
        # $10,000 per 1M events, not 10.00 — confirm the intended Sentry rate.
        sentry: 10.00,
        s3: 0.02,     # $0.023/GB
        elasticsearch: 1.00
      }.freeze

      # Decimal GB, matching the "1 GB ≈ 1M events of ~1 KB" approximation above.
      KB_PER_GB = 1_000_000.0

      # Emits one histogram observation per adapter with the estimated cost
      # of delivering +event_data+ to it. Adapters without a known price are
      # recorded with a cost of 0.
      #
      # @param event_data [Hash] serialised event; +:event_name+ is used as a label
      # @param adapters [Enumerable<Symbol>] adapter keys as used in ADAPTER_COSTS
      def self.track_event_cost(event_data, adapters)
        event_size_kb = estimate_size(event_data)

        adapters.each do |adapter|
          # The table is priced per GB (≈ per 1M events). The previous code
          # divided the KB size by 1024 and multiplied by the table value,
          # i.e. treated it as a per-MB price and overstated cost ~1000x.
          cost_per_gb = ADAPTER_COSTS.fetch(adapter, 0)
          cost = (event_size_kb / KB_PER_GB) * cost_per_gb

          E11y::Metrics.histogram('e11y.cost.per_event', cost, {
            adapter: adapter,
            event_name: event_data[:event_name]
          })
        end
      end

      # Size of the JSON-serialised event in KB (1 KB = 1024 bytes).
      #
      # @return [Float]
      def self.estimate_size(event_data)
        event_data.to_json.bytesize / 1024.0 # KB
      end
    end
  end
end
|
|
2720
|
+
```
|
|
2721
|
+
|
|
2722
|
+
### 9.2. Cost Dashboard Metrics
|
|
2723
|
+
|
|
2724
|
+
```ruby
|
|
2725
|
+
E11y::Metrics.define do
|
|
2726
|
+
# Cost metrics
|
|
2727
|
+
histogram 'e11y.cost.per_event', 'Cost per event (USD)', [:adapter, :event_name]
|
|
2728
|
+
counter 'e11y.cost.total', 'Total cost (USD)', [:adapter]
|
|
2729
|
+
|
|
2730
|
+
# Savings metrics
|
|
2731
|
+
counter 'e11y.cost.saved.sampling', 'Cost saved by sampling', [:strategy]
|
|
2732
|
+
counter 'e11y.cost.saved.compression', 'Cost saved by compression', [:algorithm]
|
|
2733
|
+
counter 'e11y.cost.saved.routing', 'Cost saved by smart routing'
|
|
2734
|
+
|
|
2735
|
+
# Efficiency metrics
|
|
2736
|
+
histogram 'e11y.compression.ratio', 'Compression ratio', [:algorithm]
|
|
2737
|
+
histogram 'e11y.sampling.rate', 'Final sampling rate', [:event_name]
|
|
2738
|
+
histogram 'e11y.payload.size_reduction', 'Payload size reduction', [:event_name]
|
|
2739
|
+
end
|
|
2740
|
+
```
|
|
2741
|
+
|
|
2742
|
+
---
|
|
2743
|
+
|
|
2744
|
+
## 10. Trade-offs
|
|
2745
|
+
|
|
2746
|
+
### 10.1. Key Decisions
|
|
2747
|
+
|
|
2748
|
+
| Decision | Pro | Con | Rationale |
|
|
2749
|
+
|----------|-----|-----|-----------|
|
|
2750
|
+
| **Adaptive sampling** | 50-80% cost savings | Data loss risk | Errors always sampled |
|
|
2751
|
+
| **Cardinality protection (C04)** ⚠️ | 90% cost reduction in OTLP | Loses debugging context | [OTHER] grouping + debug logs |
|
|
2752
|
+
| **Gzip default** | 5:1 ratio, widely supported | CPU overhead | Best balance |
|
|
2753
|
+
| **retention_until tagging** | Simple, flexible | Downstream dependency | Clean separation |
|
|
2754
|
+
| **Smart routing** | 50% cost savings | Complex rules | Worth complexity |
|
|
2755
|
+
| **Payload minimization** | 20-30% size reduction | Data truncation | Configurable limits |
|
|
2756
|
+
|
|
2757
|
+
### 10.2. Alternatives Considered
|
|
2758
|
+
|
|
2759
|
+
**A) No sampling (100% events)**
|
|
2760
|
+
- ❌ Rejected: Too expensive
|
|
2761
|
+
|
|
2762
|
+
**B) Fixed sampling rates**
|
|
2763
|
+
- ❌ Rejected: Not adaptive to load
|
|
2764
|
+
|
|
2765
|
+
**C) Manual retention management**
|
|
2766
|
+
- ❌ Rejected: Error-prone
|
|
2767
|
+
|
|
2768
|
+
**D) Event deduplication (60s window)**
|
|
2769
|
+
- ❌ Rejected for multiple critical reasons:
|
|
2770
|
+
- **High computational overhead**: Hash calculation + Redis lookup for EVERY event
|
|
2771
|
+
- **Memory cost**: 60K keys in Redis for 1000 events/sec (3.6GB memory)
|
|
2772
|
+
- **False positives**: Legitimate retries/bulk operations look like duplicates
|
|
2773
|
+
- **Debug confusion**: Users don't see events they expect (logs appear incomplete)
|
|
2774
|
+
- **Minimal real benefit**: Only ~5-10% actual duplicates in practice
|
|
2775
|
+
- **Better alternatives**: Adaptive sampling (80% reduction) + compression (80% bandwidth) achieve same cost goals without data loss
|
|
2776
|
+
|
|
2777
|
+
**E) Brotli compression**
|
|
2778
|
+
- ❌ Rejected: Slower than Zstd
|
|
2779
|
+
|
|
2780
|
+
---
|
|
2781
|
+
|
|
2782
|
+
## 11. Complete Configuration Example (Phase 2.8 - All Strategies) ✅
|
|
2783
|
+
|
|
2784
|
+
**Production-Ready Configuration with All 4 Adaptive Sampling Strategies:**
|
|
2785
|
+
|
|
2786
|
+
```ruby
|
|
2787
|
+
# config/initializers/e11y.rb
|
|
2788
|
+
E11y.configure do |config|
|
|
2789
|
+
# ===================================================================
|
|
2790
|
+
# ADVANCED SAMPLING PIPELINE (Phase 2.8 - FEAT-4837)
|
|
2791
|
+
# ===================================================================
|
|
2792
|
+
|
|
2793
|
+
config.pipeline.use E11y::Middleware::Sampling,
|
|
2794
|
+
# Base sample rate (fallback)
|
|
2795
|
+
default_sample_rate: 0.1, # 10% default
|
|
2796
|
+
|
|
2797
|
+
# ✅ STRATEGY 1: Error-Based Adaptive Sampling (FEAT-4838)
|
|
2798
|
+
error_based_adaptive: true,
|
|
2799
|
+
error_spike_config: {
|
|
2800
|
+
window: 60, # 60 seconds sliding window
|
|
2801
|
+
absolute_threshold: 100, # 100 errors/min triggers spike
|
|
2802
|
+
relative_threshold: 3.0, # 3x normal rate triggers spike
|
|
2803
|
+
spike_duration: 300 # Keep 100% sampling for 5 minutes
|
|
2804
|
+
},
|
|
2805
|
+
|
|
2806
|
+
# ✅ STRATEGY 2: Load-Based Adaptive Sampling (FEAT-4842)
|
|
2807
|
+
load_based_adaptive: true,
|
|
2808
|
+
load_monitor_config: {
|
|
2809
|
+
window: 60,
|
|
2810
|
+
normal_threshold: 1_000, # < 1k events/sec = normal (100%)
|
|
2811
|
+
high_threshold: 10_000, # 10k events/sec = high (50%)
|
|
2812
|
+
very_high_threshold: 50_000, # 50k events/sec = very high (10%)
|
|
2813
|
+
overload_threshold: 100_000 # > 100k events/sec = overload (1%)
|
|
2814
|
+
}
|
|
2815
|
+
|
|
2816
|
+
# ✅ STRATEGY 3: Value-Based Sampling (FEAT-4846)
|
|
2817
|
+
# Configured per-event using `sample_by_value` DSL (see below)
|
|
2818
|
+
|
|
2819
|
+
# ✅ STRATEGY 4: Stratified Sampling (FEAT-4850, C11)
|
|
2820
|
+
# Automatic - no config needed, records sample rates for SLO correction
|
|
2821
|
+
end
|
|
2822
|
+
|
|
2823
|
+
# ===================================================================
|
|
2824
|
+
# EVENT-LEVEL VALUE-BASED SAMPLING (FEAT-4846)
|
|
2825
|
+
# ===================================================================
|
|
2826
|
+
|
|
2827
|
+
# Always sample high-value orders
|
|
2828
|
+
class Events::OrderPaid < E11y::Event::Base
|
|
2829
|
+
schema do
|
|
2830
|
+
required(:order_id).filled(:string)
|
|
2831
|
+
required(:amount).filled(:decimal)
|
|
2832
|
+
end
|
|
2833
|
+
|
|
2834
|
+
# Always sample orders over $1000
|
|
2835
|
+
sample_by_value field: "amount",
|
|
2836
|
+
operator: :greater_than,
|
|
2837
|
+
threshold: 1000,
|
|
2838
|
+
sample_rate: 1.0
|
|
2839
|
+
end
|
|
2840
|
+
|
|
2841
|
+
# Sample important user segments
|
|
2842
|
+
class Events::UserAction < E11y::Event::Base
|
|
2843
|
+
schema do
|
|
2844
|
+
required(:action).filled(:string)
|
|
2845
|
+
required(:user_segment).filled(:string)
|
|
2846
|
+
end
|
|
2847
|
+
|
|
2848
|
+
# Always sample enterprise/VIP users
|
|
2849
|
+
sample_by_value field: "user_segment",
|
|
2850
|
+
operator: :equals,
|
|
2851
|
+
threshold: "enterprise",
|
|
2852
|
+
sample_rate: 1.0
|
|
2853
|
+
end
|
|
2854
|
+
|
|
2855
|
+
# ===================================================================
|
|
2856
|
+
# SLO TRACKING WITH SAMPLING CORRECTION (FEAT-4850, C11)
|
|
2857
|
+
# ===================================================================
|
|
2858
|
+
|
|
2859
|
+
config.slo do
|
|
2860
|
+
enabled true
|
|
2861
|
+
enable_sampling_correction true # ✅ Automatic correction for accurate SLO
|
|
2862
|
+
end
|
|
2863
|
+
```
|
|
2864
|
+
|
|
2865
|
+
**How the Strategies Work Together (Precedence Order):**
|
|
2866
|
+
|
|
2867
|
+
1. **Error Spike Detection** (Highest Priority - FEAT-4838):
|
|
2868
|
+
- If error spike detected → 100% sampling for ALL events
|
|
2869
|
+
- Overrides all other strategies during spike
|
|
2870
|
+
|
|
2871
|
+
2. **Value-Based Sampling** (High Priority - FEAT-4846):
|
|
2872
|
+
- If event has `sample_by_value` config and value meets criteria → 100% sampling
|
|
2873
|
+
- Overrides load-based sampling
|
|
2874
|
+
|
|
2875
|
+
3. **Load-Based Sampling** (Base Rate - FEAT-4842):
|
|
2876
|
+
- Provides "base rate" based on system load (100% / 50% / 10% / 1%)
|
|
2877
|
+
- Can be further restricted by event-level `resolve_sample_rate`
|
|
2878
|
+
|
|
2879
|
+
4. **Stratified Sampling** (Background - FEAT-4850):
|
|
2880
|
+
- Records sample rate metadata for each event
|
|
2881
|
+
- Enables SLO calculation with correction factors
|
|
2882
|
+
- No impact on sampling decisions (only metadata)
|
|
2883
|
+
|
|
2884
|
+
**Example Scenario (All Strategies Active):**
|
|
2885
|
+
|
|
2886
|
+
```ruby
|
|
2887
|
+
# Normal conditions: 500 events/sec, 1% error rate
|
|
2888
|
+
# → Load: normal (< 1k) → base rate 100%
|
|
2889
|
+
# → Error spike: NO → base rate unchanged
|
|
2890
|
+
# → Final: 100% sampling (unless event-level override)
|
|
2891
|
+
|
|
2892
|
+
# Sudden traffic spike: 120k events/sec, 2% error rate
|
|
2893
|
+
# → Load: overload (> 100k) → base rate 1%
|
|
2894
|
+
# → Error spike: NO (< 5% threshold) → base rate unchanged
|
|
2895
|
+
# → Final: 1% sampling (cost protection!)
|
|
2896
|
+
|
|
2897
|
+
# Error spike during normal load: 500 events/sec, 10% error rate
|
|
2898
|
+
# → Load: normal → base rate 100%
|
|
2899
|
+
# → Error spike: YES (10% > 5%) → override to 100%
|
|
2900
|
+
# → Final: 100% sampling (debug priority!)
|
|
2901
|
+
|
|
2902
|
+
# High-value order during overload: 120k events/sec, 1% errors, order=$5000
|
|
2903
|
+
# → Load: overload → base rate 1%
|
|
2904
|
+
# → Error spike: NO
|
|
2905
|
+
# → Value-based: amount > $1000 → override to 100%
|
|
2906
|
+
# → Final: 100% sampling (business-critical event!)
|
|
2907
|
+
```
|
|
2908
|
+
|
|
2909
|
+
**Old Configuration (Conceptual, Pre-Phase 2.8):**
|
|
2910
|
+
|
|
2911
|
+
```ruby
|
|
2912
|
+
E11y.configure do |config|
|
|
2913
|
+
config.cost_optimization do
|
|
2914
|
+
# Adaptive sampling
|
|
2915
|
+
adaptive_sampling do
|
|
2916
|
+
error_based { enabled true }
|
|
2917
|
+
load_based { enabled true; max_events_per_sec 10_000 }
|
|
2918
|
+
value_based do
|
|
2919
|
+
enabled true
|
|
2920
|
+
high_value_patterns [/^payment\./, /^order\./]
|
|
2921
|
+
low_value_patterns [/^debug\./, /^health_check/]
|
|
2922
|
+
end
|
|
2923
|
+
end
|
|
2924
|
+
|
|
2925
|
+
# Payload minimization
|
|
2926
|
+
payload_minimization do
|
|
2927
|
+
enabled true
|
|
2928
|
+
truncate_strings_at 1000 # Max 1KB per string field
|
|
2929
|
+
truncate_arrays_at 100 # Max 100 items per array
|
|
2930
|
+
remove_null_fields true # Drop null/empty fields
|
|
2931
|
+
end
|
|
2932
|
+
|
|
2933
|
+
# Compression
|
|
2934
|
+
compression do
|
|
2935
|
+
enabled true
|
|
2936
|
+
algorithm :gzip # or :zstd, :lz4
|
|
2937
|
+
compression_level Zlib::DEFAULT_COMPRESSION
|
|
2938
|
+
min_size_bytes 1024
|
|
2939
|
+
end
|
|
2940
|
+
|
|
2941
|
+
# Smart routing
|
|
2942
|
+
smart_routing do
|
|
2943
|
+
rule 'critical', adapters: [:loki, :sentry] do |e|
|
|
2944
|
+
e[:severity] >= :error
|
|
2945
|
+
end
|
|
2946
|
+
rule 'debug', adapters: [:file] do |e|
|
|
2947
|
+
e[:severity] == :debug
|
|
2948
|
+
end
|
|
2949
|
+
end
|
|
2950
|
+
|
|
2951
|
+
# Tiered storage
|
|
2952
|
+
tiered_storage do
|
|
2953
|
+
retention_rule(2555) { |e| e[:event_name].start_with?('audit.') }
|
|
2954
|
+
retention_rule(90) { |e| e[:severity] >= :error }
|
|
2955
|
+
retention_rule(7) { |e| e[:severity] == :debug }
|
|
2956
|
+
default_retention 30
|
|
2957
|
+
end
|
|
2958
|
+
|
|
2959
|
+
# Payload minimization
|
|
2960
|
+
payload_minimization do
|
|
2961
|
+
enabled true
|
|
2962
|
+
remove_nulls true
|
|
2963
|
+
truncate_strings_at 1000
|
|
2964
|
+
truncate_arrays_at 100
|
|
2965
|
+
end
|
|
2966
|
+
|
|
2967
|
+
# Cost tracking
|
|
2968
|
+
cost_tracking do
|
|
2969
|
+
enabled true
|
|
2970
|
+
adapter_costs do
|
|
2971
|
+
loki 0.50
|
|
2972
|
+
sentry 10.00
|
|
2973
|
+
s3 0.02
|
|
2974
|
+
end
|
|
2975
|
+
end
|
|
2976
|
+
end
|
|
2977
|
+
end
|
|
2978
|
+
```
|
|
2979
|
+
|
|
2980
|
+
---
|
|
2981
|
+
|
|
2982
|
+
## 12. Future Enhancements
|
|
2983
|
+
|
|
2984
|
+
See [Backlog](use_cases/backlog.md) for future enhancement ideas including:
|
|
2985
|
+
- Quick Start Presets (v1.1)
|
|
2986
|
+
- Sampling Budget (v1.2+)
|
|
2987
|
+
|
|
2988
|
+
---
|
|
2989
|
+
|
|
2990
|
+
**Status:** ✅ Draft Complete
|
|
2991
|
+
**Next:** ADR-003 (SLO) or ADR-007 (OpenTelemetry)
|
|
2992
|
+
**Estimated Implementation:** 2-3 weeks
|
|
2993
|
+
**Expected ROI:** 50-80% cost reduction
|