e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,2993 @@
1
+ # ADR-009: Cost Optimization
2
+
3
+ **Status:** Partially Implemented (adaptive sampling complete as of 2026-01-20; compression and tiered storage not started)
4
+ **Date:** January 12, 2026
5
+ **Last Updated:** January 20, 2026
6
+ **Covers:** UC-014 (Adaptive Sampling), UC-015 (Cost Optimization), UC-019 (Tiered Storage)
7
+ **Depends On:** ADR-001 (Core), ADR-004 (Adapters), ADR-014 (Event-Driven SLO)
8
+
9
+ **Implementation Status:**
10
+ - ✅ **Basic Sampling** (L2.7) - `E11y::Middleware::Sampling` with trace-aware logic
11
+ - ✅ **Event-level DSL** - `sample_rate` and `adaptive_sampling` in `Event::Base`
12
+ - ✅ **Pipeline Integration** - Sampling middleware in default pipeline
13
+ - ✅ **Error-Based Adaptive** (FEAT-4838) - 100% sampling during error spikes
14
+ - ✅ **Load-Based Adaptive** (FEAT-4842) - Tiered sampling (100%/50%/10%/1%) based on load
15
+ - ✅ **Value-Based Sampling** (FEAT-4846) - DSL for sampling by payload values (>, <, ==, range)
16
+ - ✅ **Stratified Sampling** (FEAT-4850, C11 resolution) - SLO-accurate sampling with correction
17
+ - ⏳ **Compression** - Not started
18
+ - ⏳ **Tiered Storage** - Not started
19
+
20
+ ---
21
+
22
+ ## 🚀 Implementation Summary (2026-01-19)
23
+
24
+ ### Basic Sampling (L2.7) ✅
25
+
26
+ **Implemented:**
27
+ 1. **`E11y::Middleware::Sampling`** - Core sampling logic:
28
+ - Trace-aware sampling (C05) - consistent decisions per `trace_id`
29
+ - Audit event exemption - audit events never sampled
30
+ - Sample rate metadata - adds `sample_rate` to event data
31
+ - Cache cleanup - prevents memory leaks
32
+
33
+ 2. **Event-level DSL** in `Event::Base`:
34
+ ```ruby
35
+ class HighFrequencyEvent < E11y::Event::Base
36
+ sample_rate 0.01 # 1% sampling
37
+ end
38
+
39
+ class OrderEvent < E11y::Event::Base
40
+ adaptive_sampling enabled: true,
41
+ error_rate_threshold: 0.05,
42
+ load_threshold: 50_000
43
+ end
44
+ ```
45
+
46
+ 3. **Pipeline Integration**:
47
+ - Sampling middleware added to default pipeline (zone: `:routing`)
48
+ - Automatic configuration in `E11y::Configuration#setup_default_pipeline`
49
+
50
+ ### Error-Based Adaptive Sampling (FEAT-4838) ✅
51
+
52
+ **Implemented (2026-01-19):**
53
+ 1. **`E11y::Sampling::ErrorSpikeDetector`** - Detects error rate spikes:
54
+ - Sliding window error rate calculation (configurable window)
55
+ - Absolute threshold (errors/minute)
56
+ - Relative threshold (ratio to baseline)
57
+ - Exponential moving average for baseline tracking
58
+ - Spike duration management
59
+
60
+ 2. **Integration with Sampling Middleware**:
61
+ ```ruby
62
+ E11y.configure do |config|
63
+ config.pipeline.use E11y::Middleware::Sampling,
64
+ error_based_adaptive: true,
65
+ error_spike_config: {
66
+ window: 60, # 60 seconds sliding window
67
+ absolute_threshold: 100, # 100 errors/min triggers spike
68
+ relative_threshold: 3.0, # 3x normal rate triggers spike
69
+ spike_duration: 300 # Keep 100% sampling for 5 minutes
70
+ }
71
+ end
72
+ ```
73
+
74
+ 3. **Behavior**:
75
+ - **Normal conditions**: Uses configured sample rates (e.g., 10%)
76
+ - **During error spike**: Automatically increases to 100% sampling
77
+ - **After spike**: Returns to normal rates after `spike_duration`
78
+
79
+ **Tests**: 22 unit tests + 9 integration tests (all passing)
80
+
81
+ ### Load-Based Adaptive Sampling (FEAT-4842) ✅
82
+
83
+ **Implemented (2026-01-20):**
84
+ 1. **`E11y::Sampling::LoadMonitor`** - Tracks event volume and calculates load levels:
85
+ - Sliding window event rate calculation (events/second)
86
+ - Tiered load levels (normal, high, very_high, overload)
87
+ - Configurable thresholds for each load tier
88
+ - Thread-safe tracking (MonitorMixin)
89
+
90
+ 2. **Integration with Sampling Middleware**:
91
+ ```ruby
92
+ E11y.configure do |config|
93
+ config.pipeline.use E11y::Middleware::Sampling,
94
+ default_sample_rate: 0.1,
95
+ load_based_adaptive: true,
96
+ load_monitor_config: {
97
+ window: 60, # 60 seconds sliding window
98
+ normal_threshold: 1_000, # < 1k events/sec = normal
99
+ high_threshold: 10_000, # 10k events/sec = high load
100
+ very_high_threshold: 50_000, # 50k events/sec = very high
101
+ overload_threshold: 100_000 # > 100k events/sec = overload
102
+ }
103
+ end
104
+ ```
105
+
106
+ 3. **Tiered Sampling Rates**:
107
+ - **Normal load** (< 10k events/sec): 100% sampling
108
+ - **High load** (10k-50k events/sec): 50% sampling
109
+ - **Very high load** (50k-100k events/sec): 10% sampling
110
+ - **Overload** (≥ 100k events/sec): 1% sampling
111
+
112
+ 4. **Behavior**:
113
+ - Dynamically adjusts sample rate based on current event volume
114
+ - Works as a "base rate" that can be further restricted by event-level `resolve_sample_rate`
115
+ - Prioritizes error-based adaptive (100% during spikes) over load-based
116
+
117
+ **Tests**: 22 unit tests + 10 integration tests + 7 stress tests (all passing)
118
+
119
+ ### Value-Based Sampling (FEAT-4846) ✅
120
+
121
+ **Implemented (2026-01-20):**
122
+ 1. **`E11y::Sampling::ValueExtractor`** - Extracts numeric values from event payloads:
123
+ - Nested field extraction (dot notation: `"order.amount"`)
124
+ - Type coercion (numeric strings → floats)
125
+ - Nil/missing value handling (returns 0.0)
126
+
127
+ 2. **`E11y::Event::ValueSamplingConfig`** - Defines value-based sampling rules:
128
+ - Comparison operators: `:greater_than`, `:less_than`, `:equals`, `:in_range`
129
+ - Threshold values (numeric or Range)
130
+ - Custom sample rates per rule
131
+
132
+ 3. **Event DSL (`sample_by_value`)**:
133
+ ```ruby
134
+ class OrderPaidEvent < E11y::Event::Base
135
+ # Always sample orders over $1000
136
+ sample_by_value field: "amount",
137
+ operator: :greater_than,
138
+ threshold: 1000,
139
+ sample_rate: 1.0
140
+
141
+ # Sample 50% of orders between $100-$500
142
+ sample_by_value field: "amount",
143
+ operator: :in_range,
144
+ threshold: 100..500,
145
+ sample_rate: 0.5
146
+ end
147
+ ```
148
+
149
+ 4. **Integration with Sampling Middleware**:
150
+ - **High priority** in sampling decision (after error spike, before load-based)
151
+ - Event-level configuration (no global config needed)
152
+ - Falls back to other strategies if no value-based config present
153
+
154
+ **Tests**: 19 unit tests + 8 integration tests (all passing)
155
+
156
+ ### Stratified Sampling for SLO Accuracy (FEAT-4850, C11 Resolution) ✅
157
+
158
+ **Implemented (2026-01-20):**
159
+ 1. **`E11y::Sampling::StratifiedTracker`** - Tracks sampled/total counts per severity stratum:
160
+ - Records each sampled event with its original sample rate
161
+ - Calculates sampling correction factors per severity
162
+ - Handles floating point precision
163
+ - Thread-safe tracking (MonitorMixin)
164
+
165
+ 2. **SLO Sampling Correction in `E11y::SLO::Tracker`**:
166
+ - Applies correction factors when calculating SLO metrics
167
+ - Adjusts success rate to account for sampling bias
168
+ - Ensures < 5% error margin even with aggressive sampling
169
+
170
+ 3. **Integration with Sampling Middleware**:
171
+ - Records sample rate metadata for each event
172
+ - Works seamlessly with load-based adaptive sampling
173
+ - No additional configuration required (automatic)
174
+
175
+ 4. **Example: Accurate SLO with 85% Cost Savings**:
176
+ ```ruby
177
+ # Scenario: 1000 events (950 success, 50 errors)
178
+ # Stratified sampling: errors 100%, success 10%
179
+ # Events kept: 50 + 95 = 145 (85.5% cost savings!)
180
+
181
+ # Without correction:
182
+ # Observed success rate: 95/145 = 65.5% ❌
183
+
184
+ # With correction:
185
+ # Corrected success: 95 / 0.1 = 950
186
+ # Corrected errors: 50 / 1.0 = 50
187
+ # Corrected success rate: 950 / 1000 = 95.0% ✅
188
+ ```
189
+
190
+ **Tests**: 15 unit tests + 5 integration tests (all passing)
191
+
192
+ **Total Test Coverage (Phase 2.8):**
193
+ - **Error-Based**: 22 unit + 9 integration = 31 tests
194
+ - **Load-Based**: 22 unit + 10 integration + 7 stress = 39 tests
195
+ - **Value-Based**: 19 unit + 8 integration = 27 tests
196
+ - **Stratified**: 15 unit + 5 integration = 20 tests
197
+ - **Grand Total**: 117 tests ✅
198
+
199
+ **Deferred** (Future enhancements):
200
+ - Content-based sampling (pattern matching)
201
+ - ML-based sampling (importance prediction)
202
+ - Tail-based sampling (requires buffering)
203
+
204
+ **See:**
205
+ - Implementation details: `docs/IMPLEMENTATION_NOTES.md` (2026-01-20 entry)
206
+ - Middleware code: `lib/e11y/middleware/sampling.rb`
207
+ - Detectors: `lib/e11y/sampling/error_spike_detector.rb`, `lib/e11y/sampling/load_monitor.rb`
208
+ - Value sampling: `lib/e11y/sampling/value_extractor.rb`, `lib/e11y/event/value_sampling_config.rb`
209
+ - Stratified sampling: `lib/e11y/sampling/stratified_tracker.rb`
210
+ - Tests: `spec/e11y/middleware/sampling_spec.rb`, `spec/e11y/sampling/*_spec.rb`
211
+
212
+ ---
213
+
214
+ ## 📋 Table of Contents
215
+
216
+ 1. [Context & Problem](#1-context--problem)
217
+ 2. [Architecture Overview](#2-architecture-overview)
218
+ 3. [Adaptive Sampling](#3-adaptive-sampling)
219
+ - 3.6. [Trace-Aware Adaptive Sampling (C05 Resolution)](#36-trace-aware-adaptive-sampling-c05-resolution) ⚠️ CRITICAL
220
+ - 3.6.1. The Problem: Broken Distributed Traces
221
+ - 3.6.2. Decision: Trace-Level Sampling with Decision Cache
222
+ - 3.6.3. TraceAwareSampler Implementation
223
+ - 3.6.4. Configuration
224
+ - 3.6.5. Multi-Service Trace Scenario (Correct Behavior)
225
+ - 3.6.6. Cache Management & TTL
226
+ - 3.6.7. Head-Based Sampling (W3C Trace Context)
227
+ - 3.6.8. Trade-offs & Distributed Tracing Integrity (C05)
228
+ - 3.7. [Stratified Sampling for SLO Accuracy (C11 Resolution)](#37-stratified-sampling-for-slo-accuracy-c11-resolution) ⚠️ CRITICAL
229
+ - 3.7.1. The Problem: Sampling Bias Breaks SLO Metrics
230
+ - 3.7.2. Decision: Stratified Sampling by Event Severity
231
+ - 3.7.3. StratifiedAdaptiveSampler Implementation
232
+ - 3.7.4. SLO Calculator with Sampling Correction
233
+ - 3.7.5. Configuration
234
+ - 3.7.6. Accuracy Comparison: Random vs Stratified Sampling
235
+ - 3.7.7. Cost Savings vs Accuracy Trade-off
236
+ - 3.7.8. Testing Sampling Correction Accuracy
237
+ - 3.7.9. Trade-offs & SLO Accuracy (C11)
238
+ 4. [Compression](#4-compression)
239
+ 5. [Smart Routing](#5-smart-routing)
240
+ 6. [Tiered Storage](#6-tiered-storage)
241
+ 7. [Payload Minimization](#7-payload-minimization)
242
+ 8. [Cardinality Protection (C04 Resolution)](#8-cardinality-protection-c04-resolution) ⚠️ CRITICAL
243
+ - 8.1. The Problem: Cardinality Explosion Across Backends
244
+ - 8.2. Decision: Unified Cardinality Protection for All Backends
245
+ - 8.3. Configuration: Inherit from Global Settings
246
+ - 8.4. Implementation: Apply to Yabeda + OpenTelemetry
247
+ - 8.5. Cost Impact: Before vs After Protection
248
+ - 8.6. Monitoring Metrics
249
+ - 8.7. Trade-offs (C04 Resolution)
250
+ 9. [Cost Metrics](#9-cost-metrics)
251
+ 10. [Trade-offs](#10-trade-offs)
252
+ 11. [Complete Configuration Example](#11-complete-configuration-example)
253
+ 12. [Backlog (Future Enhancements)](#12-backlog-future-enhancements)
254
+ - [12.1. Quick Start Presets](#121-quick-start-presets)
255
+ - [12.2. Sampling Budget](#122-sampling-budget)
256
+
257
+ ---
258
+
259
+ ## 1. Context & Problem
260
+
261
+ ### 1.1. Problem Statement
262
+
263
+ **Current Pain Points:**
264
+
265
+ 1. **High Log Volume Costs:**
266
+ ```ruby
267
+ # ❌ 1M events/day * 365 days = 365M events/year
268
+ # Loki: $0.50/GB → $10,000+/year
269
+ # Sentry: $0.01 per 1K events → $3,650/year
270
+ # Total: $13,650/year for a single service
271
+ ```
272
+
273
+ 2. **No Cost Awareness:**
274
+ ```ruby
275
+ # ❌ No visibility into cost per event
276
+ Events::DebugQuery.track(sql: long_query) # How much does this cost?
277
+ ```
278
+
279
+ 3. **No Retention Strategy:**
280
+ ```ruby
281
+ # ❌ All events stored forever
282
+ # Debug events from 2 years ago still in Loki → $$
283
+ ```
284
+
285
+ ### 1.2. Goals
286
+
287
+ **Primary Goals:**
288
+ - ✅ **50-80% cost reduction** through optimization
289
+ - ✅ **Adaptive sampling** based on load/value
290
+ - ✅ **Compression** for network efficiency
291
+ - ✅ **Tiered storage** with `retention_until`
292
+ - ✅ **Cost visibility** per event/adapter
293
+
294
+ **Non-Goals:**
295
+ - ❌ Manage downstream storage (Loki ILM)
296
+ - ❌ Real-time cost calculation
297
+ - ❌ Cross-service cost optimization
298
+
299
+ ### 1.3. Success Metrics
300
+
301
+ | Metric | Target | Critical? |
302
+ |--------|--------|-----------|
303
+ | **Cost reduction** | 50-80% | ✅ Yes |
304
+ | **Event throughput** | Same (10K/sec) | ✅ Yes |
305
+ | **Compression ratio** | 5:1 (Gzip) | ✅ Yes |
306
+
307
+ ### 1.4. Cost Savings Estimate
308
+
309
+ **Example: 10K events/sec service**
310
+
311
+ | Optimization | Before | After | Savings |
312
+ |-------------|--------|-------|---------|
313
+ | **Adaptive Sampling** | 100% | 20% | 80% |
314
+ | **Compression** | 1KB/event | 200B/event | 80% |
315
+ | **Tiered Storage** | 365 days | 30 days | 92% |
316
+ | **Smart Routing** | All → Loki | Critical → Loki | 50% |
317
+
318
+ **Combined Annual Savings:** $13,650 → **$2,730** (80% reduction)
319
+
320
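+ **Cost Breakdown** (each optimization is estimated independently against the baseline; the savings overlap, so they do not add up to the combined figure):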
321
+ - **Adaptive Sampling**: ~$10,920 savings (eliminates 80% of low-value events)
322
+ - **Compression**: ~$8,200 savings (80% bandwidth reduction)
323
+ - **Smart Routing**: ~$5,000 savings (critical-only to expensive destinations)
324
+ - **Tiered Storage**: ~$12,570 savings (92% storage cost reduction)
325
+
326
+ ---
327
+
328
+ ## 2. Architecture Overview
329
+
330
+ ### 2.1. System Context
331
+
332
+ ```mermaid
333
+ C4Context
334
+ title Cost Optimization Context
335
+
336
+ Person(dev, "Developer", "Configures cost rules")
337
+
338
+ System(rails_app, "Rails App", "Tracks events")
339
+
340
+ System(e11y, "E11y Gem", "Cost optimization")
341
+
342
+ System_Ext(loki, "Loki", "$10K/year")
343
+ System_Ext(sentry, "Sentry", "$3.6K/year")
344
+ System_Ext(s3, "S3", "$100/year")
345
+
346
+ Rel(dev, e11y, "Configures", "Sampling rules")
347
+ Rel(rails_app, e11y, "Tracks events", "100% volume")
348
+ Rel(e11y, loki, "20% sampled + compressed", "80% cost savings")
349
+ Rel(e11y, sentry, "Errors only", "50% cost savings")
350
+ Rel(e11y, s3, "Cold storage", "Long-term archive")
351
+ ```
352
+
353
+ ### 2.2. Component Architecture
354
+
355
+ ```mermaid
356
+ graph TB
357
+ subgraph "Event Pipeline"
358
+ Event[Event Tracked] --> Sampler[Adaptive Sampler]
359
+ Sampler --> Minimizer[Payload Minimizer]
360
+ Minimizer --> Compressor[Compressor]
361
+ end
362
+
363
+ subgraph "Cost Optimization Layer"
364
+ Sampler --> SampleStrategy[Sampling Strategy]
365
+ SampleStrategy --> LoadBased[Load-Based]
366
+ SampleStrategy --> ValueBased[Value-Based]
367
+ SampleStrategy --> ErrorBased[Error-Based]
368
+
369
+ Compressor --> GzipEngine[Gzip]
370
+ Compressor --> ZstdEngine[Zstd]
371
+ end
372
+
373
+ subgraph "Smart Routing"
374
+ Compressor --> Router[Smart Router]
375
+ Router --> CriticalPath[Critical → Loki]
376
+ Router --> ArchivePath[Archive → S3]
377
+ Router --> DebugPath[Debug → /dev/null]
378
+ end
379
+
380
+ subgraph "Tiered Storage"
381
+ Router --> RetentionTagger[Retention Tagger]
382
+ RetentionTagger --> ShortTerm[retention_until: 7d]
383
+ RetentionTagger --> MediumTerm[retention_until: 30d]
384
+ RetentionTagger --> LongTerm[retention_until: 365d]
385
+ end
386
+
387
+ subgraph "Cost Tracking"
388
+ Router --> CostMetrics[Cost Metrics]
389
+ CostMetrics --> EventCost[Per-Event Cost]
390
+ CostMetrics --> AdapterCost[Per-Adapter Cost]
391
+ end
392
+
393
+ style Sampler fill:#fff3cd
394
+ style Compressor fill:#d1ecf1
395
+ style Router fill:#f8d7da
396
+ style Minimizer fill:#d4edda
397
+ ```
398
+
399
+ ### 2.3. Cost Optimization Flow
400
+
401
+ ```mermaid
402
+ sequenceDiagram
403
+ participant App as Application
404
+ participant Sampler as Adaptive Sampler
405
+ participant Minimizer as Payload Minimizer
406
+ participant Compress as Compressor
407
+ participant Router as Smart Router
408
+ participant Loki as Loki (Expensive)
409
+ participant S3 as S3 (Cheap)
410
+
411
+ App->>Sampler: Track event (100%)
412
+
413
+ Sampler->>Sampler: Check load, error rate, value
414
+
415
+ alt Sample decision: KEEP
416
+ Sampler->>Minimizer: Event (20% sampled)
417
+
418
+ Minimizer->>Minimizer: Truncate strings, remove nulls
419
+ Minimizer->>Compress: Optimized payload (-30% size)
420
+
421
+ Compress->>Compress: Gzip payload (5:1 ratio)
422
+ Compress->>Router: Compressed event
423
+
424
+ Router->>Router: Classify by importance
425
+
426
+ alt Critical event
427
+ Router->>Loki: Send to Loki (2.8% of original)
428
+ else Archive-worthy
429
+ Router->>S3: Send to S3 (17.2% of original)
430
+ end
431
+ else Sample decision: DROP
432
+ Note over Sampler: 80% dropped
433
+ end
434
+
435
+ Note over Loki,S3: Final cost: 80% reduction
436
+ ```
437
+
438
+ ---
439
+
440
+ ## 3. Adaptive Sampling
441
+
442
+ ### 3.1. Sampling Strategies
443
+
444
+ ```ruby
445
+ # lib/e11y/cost/adaptive_sampler.rb
446
+ module E11y
447
+ module Cost
448
+ class AdaptiveSampler
449
+ def initialize(config)
450
+ @strategies = [
451
+ Strategies::ErrorBased.new(config),
452
+ Strategies::LoadBased.new(config),
453
+ Strategies::ValueBased.new(config),
454
+ Strategies::ContentBased.new(config)
455
+ ]
456
+ end
457
+
458
+ def should_sample?(event_data, context = {})
459
+ # Priority 1: Always sample errors
460
+ return true if event_data[:severity] >= :error
461
+
462
+ # Priority 2: Check each strategy
463
+ sample_rates = @strategies.map do |strategy|
464
+ strategy.calculate_rate(event_data, context)
465
+ end
466
+
467
+ # Use max sample rate (the strategy that keeps the most events wins)
468
+ final_rate = sample_rates.max
469
+
470
+ # Random sampling
471
+ decision = rand < final_rate
472
+
473
+ # Track metrics
474
+ E11y::Metrics.increment('e11y.sampling.decision', {
475
+ event_name: event_data[:event_name],
476
+ decision: decision ? 'sampled' : 'dropped',
477
+ rate: final_rate
478
+ })
479
+
480
+ decision
481
+ end
482
+ end
483
+ end
484
+ end
485
+ ```
486
+
487
+ ### 3.2. Error-Based Sampling
488
+
489
+ ```ruby
490
+ # lib/e11y/cost/strategies/error_based.rb
491
+ module E11y
492
+ module Cost
493
+ module Strategies
494
+ class ErrorBased
495
+ def initialize(config)
496
+ @error_window = config.error_window || 60.seconds
497
+ @error_threshold = config.error_threshold || 0.01 # 1%
498
+ end
499
+
500
+ def calculate_rate(event_data, context)
501
+ # Get error rate for this event type
502
+ error_rate = E11y::Metrics.get_rate(
503
+ 'e11y.events.errors',
504
+ window: @error_window,
505
+ tags: { event_name: event_data[:event_name] }
506
+ )
507
+
508
+ # If error rate > threshold, sample 100%
509
+ if error_rate > @error_threshold
510
+ 1.0
511
+ else
512
+ # Normal rate
513
+ 0.1 # 10%
514
+ end
515
+ end
516
+ end
517
+ end
518
+ end
519
+ end
520
+ ```
521
+
522
+ ### 3.3. Load-Based Sampling (FEAT-4842 Implementation) ✅
523
+
524
+ **Implemented:** `E11y::Sampling::LoadMonitor`
525
+
526
+ ```ruby
527
+ # lib/e11y/sampling/load_monitor.rb
528
+ module E11y
529
+ module Sampling
530
+ class LoadMonitor
531
+ include MonitorMixin
532
+
533
+ DEFAULT_CONFIG = {
534
+ window: 60, # 60 seconds sliding window
535
+ normal_threshold: 1_000, # < 1k events/sec = normal
536
+ high_threshold: 10_000, # 10k events/sec = high
537
+ very_high_threshold: 50_000, # 50k events/sec = very high
538
+ overload_threshold: 100_000 # > 100k events/sec = overload
539
+ }.freeze
540
+
541
+ def initialize(config = {})
542
+ super()
543
+ @config = DEFAULT_CONFIG.merge(config)
544
+ @events = []
545
+ @window = @config[:window]
546
+ end
547
+
548
+ # Record an event for load calculation
549
+ def record_event
550
+ synchronize do
551
+ now = Time.now.to_f
552
+ @events << now
553
+ cleanup_old_events(now)
554
+ end
555
+ end
556
+
557
+ # Calculate current event rate (events/second)
558
+ def current_rate
559
+ synchronize do
560
+ cleanup_old_events(Time.now.to_f)
561
+ return 0.0 if @events.empty?
562
+
563
+ @events.size.to_f / @window
564
+ end
565
+ end
566
+
567
+ # Determine current load level
568
+ def load_level
569
+ rate = current_rate
570
+ return :overload if rate >= @config[:overload_threshold]
571
+ return :very_high if rate >= @config[:very_high_threshold]
572
+ return :high if rate >= @config[:high_threshold]
573
+ return :normal if rate >= @config[:normal_threshold]
574
+ :normal
575
+ end
576
+
577
+ # Get recommended sample rate based on load
578
+ def recommended_sample_rate
579
+ case load_level
580
+ when :normal then 1.0 # 100%
581
+ when :high then 0.5 # 50%
582
+ when :very_high then 0.1 # 10%
583
+ when :overload then 0.01 # 1%
584
+ else 1.0
585
+ end
586
+ end
587
+
588
+ # Check if system is overloaded
589
+ def overloaded?
590
+ load_level == :overload
591
+ end
592
+
593
+ private
594
+
595
+ def cleanup_old_events(now)
596
+ cutoff = now - @window
597
+ @events.reject! { |timestamp| timestamp < cutoff }
598
+ end
599
+ end
600
+ end
601
+ end
602
+ ```
603
+
604
+ **Configuration:**
605
+
606
+ ```ruby
607
+ # config/initializers/e11y.rb
608
+ E11y.configure do |config|
609
+ config.pipeline.use E11y::Middleware::Sampling,
610
+ default_sample_rate: 0.1,
611
+ load_based_adaptive: true,
612
+ load_monitor_config: {
613
+ window: 60, # 60 seconds
614
+ normal_threshold: 1_000, # < 1k events/sec
615
+ high_threshold: 10_000, # 10k events/sec
616
+ very_high_threshold: 50_000, # 50k events/sec
617
+ overload_threshold: 100_000 # > 100k events/sec
618
+ }
619
+ end
620
+ ```
621
+
622
+ **Behavior:**
623
+
624
+ | Load Level | Events/Sec | Sample Rate | Events Tracked |
625
+ |------------|-----------|-------------|----------------|
626
+ | Normal | < 10k | 100% | All events |
627
+ | High | 10k-50k | 50% | Half |
628
+ | Very High | 50k-100k | 10% | 1 in 10 |
629
+ | Overload | ≥ 100k | 1% | 1 in 100 |
630
+
631
+ **Tests:** 22 unit + 10 integration + 7 stress = 39 tests
632
+
633
+ ---
634
+
635
+ ### 3.3 (OLD). Load-Based Sampling (Conceptual)
636
+
637
+ ```ruby
638
+ # lib/e11y/cost/strategies/load_based.rb
639
+ module E11y
640
+ module Cost
641
+ module Strategies
642
+ class LoadBased
643
+ def initialize(config)
644
+ @max_events_per_sec = config.max_events_per_sec || 10_000
645
+ @buffer_threshold = config.buffer_threshold || 0.8 # 80% full
646
+ end
647
+
648
+ def calculate_rate(event_data, context)
649
+ # Strategy 1: Events per second
650
+ current_rate = E11y::Metrics.get_rate('e11y.events.tracked')
651
+ rate_ratio = current_rate / @max_events_per_sec
652
+
653
+ # Strategy 2: Buffer utilization
654
+ buffer_usage = E11y::Buffer.utilization # 0.0 - 1.0
655
+
656
+ # Strategy 3: System CPU/Memory
657
+ system_load = system_overload_factor
658
+
659
+ # Combined load factor
660
+ load_factor = [rate_ratio, buffer_usage, system_load].max
661
+
662
+ if load_factor > 0.9
663
+ # System overloaded → aggressive sampling
664
+ 0.01 # 1%
665
+ elsif load_factor > 0.7
666
+ # High load → moderate sampling
667
+ 0.1 # 10%
668
+ else
669
+ # Normal load → full sampling
670
+ 1.0 # 100%
671
+ end
672
+ end
673
+
674
+ private
675
+
676
+ def system_overload_factor
677
+ cpu_usage = `ps -o %cpu= -p #{Process.pid}`.to_f / 100.0
678
+ memory_mb = `ps -o rss= -p #{Process.pid}`.to_i / 1024.0
679
+ memory_limit_mb = 500.0
680
+
681
+ [cpu_usage, memory_mb / memory_limit_mb].max
682
+ end
683
+ end
684
+ end
685
+ end
686
+ end
687
+ ```
688
+
689
+ ### 3.4. Value-Based Sampling
690
+
691
+ ```ruby
692
+ # lib/e11y/cost/strategies/value_based.rb
693
+ module E11y
694
+ module Cost
695
+ module Strategies
696
+ class ValueBased
697
+ def initialize(config)
698
+ @high_value_patterns = config.high_value_patterns || []
699
+ @low_value_patterns = config.low_value_patterns || []
700
+ end
701
+
702
+ def calculate_rate(event_data, context)
703
+ event_name = event_data[:event_name]
704
+
705
+ # High-value events (always sample)
706
+ if matches_patterns?(event_name, @high_value_patterns)
707
+ return 1.0 # 100%
708
+ end
709
+
710
+ # Low-value events (aggressive sampling)
711
+ if matches_patterns?(event_name, @low_value_patterns)
712
+ return 0.01 # 1%
713
+ end
714
+
715
+ # Check payload value
716
+ payload_value = estimate_payload_value(event_data[:payload])
717
+
718
+ if payload_value > 1000 # High-value transaction
719
+ 1.0
720
+ elsif payload_value > 100
721
+ 0.5 # 50%
722
+ else
723
+ 0.1 # 10%
724
+ end
725
+ end
726
+
727
+ private
728
+
729
+ def matches_patterns?(event_name, patterns)
730
+ patterns.any? do |pattern|
731
+ case pattern
732
+ when String
733
+ event_name == pattern
734
+ when Regexp
735
+ event_name =~ pattern
736
+ end
737
+ end
738
+ end
739
+
740
+ def estimate_payload_value(payload)
741
+ # Extract monetary value from payload
742
+ payload[:amount] ||
743
+ payload[:total_amount] ||
744
+ payload[:price] ||
745
+ 0
746
+ end
747
+ end
748
+ end
749
+ end
750
+ end
751
+ ```
752
+
753
+ ### 3.5. Simplified Configuration
754
+
755
+ **Philosophy:** Simple, declarative sampling rules. No complex strategies.
756
+
757
+ ```ruby
758
+ # config/initializers/e11y.rb
759
+ E11y.configure do |config|
760
+ config.sampling do
761
+ # ===================================================================
762
+ # SIMPLE APPROACH: Per-severity defaults
763
+ # ===================================================================
764
+
765
+ # Default sample rates by severity
766
+ default_rate :debug, 0.01 # 1% of debug events
767
+ default_rate :info, 0.1 # 10% of info events
768
+ default_rate :success, 0.5 # 50% of success events
769
+ default_rate :warn, 1.0 # 100% of warnings
770
+ default_rate :error, 1.0 # 100% of errors (always)
771
+ default_rate :fatal, 1.0 # 100% of fatal (always)
772
+
773
+ # ===================================================================
774
+ # PER-EVENT OVERRIDES (optional)
775
+ # ===================================================================
776
+
777
+ # High-value events: always sample
778
+ always_sample 'Events::OrderPaid'
779
+ always_sample 'Events::PaymentProcessed'
780
+ always_sample /^Events::Critical/ # Regex pattern
781
+
782
+ # Low-value events: aggressive sampling
783
+ sample 'Events::HealthCheck', rate: 0.001 # 0.1%
784
+ sample 'Events::CacheHit', rate: 0.01 # 1%
785
+ sample /^Events::Debug/, rate: 0.01 # 1%
786
+
787
+ # ===================================================================
788
+ # LOAD-BASED AUTO-ADJUSTMENT (optional)
789
+ # ===================================================================
790
+
791
+ # Automatically reduce sample rates when system overloaded
792
+ auto_adjust_on_load do
793
+ enabled true
794
+
795
+ # Trigger: buffer >80% full
796
+ trigger_buffer_percent 80
797
+
798
+ # Action: multiply all rates by 0.1 (10x reduction)
799
+ rate_multiplier 0.1
800
+
801
+ # Recovery: restore rates when buffer <50% full
802
+ recovery_buffer_percent 50
803
+ end
804
+ end
805
+ end
806
+ ```
807
+
808
+ **Simplified Implementation:**
809
+
810
+ ```ruby
811
+ # lib/e11y/sampling/simple_sampler.rb
812
+ module E11y
813
+ module Sampling
814
+ class SimpleSampler
815
+ def initialize(config)
816
+ @severity_rates = config.severity_rates || default_severity_rates
817
+ @event_rates = config.event_rates || {}
818
+ @auto_adjust = config.auto_adjust || {}
819
+ end
820
+
821
+ def should_sample?(event_data)
822
+ # Step 1: Get base rate (severity or event-specific)
823
+ base_rate = get_base_rate(event_data)
824
+
825
+ # Step 2: Apply load-based adjustment (if enabled)
826
+ final_rate = apply_load_adjustment(base_rate)
827
+
828
+ # Step 3: Random decision
829
+ rand < final_rate
830
+ end
831
+
832
+ private
833
+
834
+ def get_base_rate(event_data)
835
+ event_name = event_data[:event_name]
836
+ severity = event_data[:severity]
837
+
838
+ # Priority 1: Event-specific rate
839
+ @event_rates.each do |pattern, rate|
840
+ case pattern
841
+ when String
842
+ return rate if event_name == pattern
843
+ when Regexp
844
+ return rate if event_name =~ pattern
845
+ end
846
+ end
847
+
848
+ # Priority 2: Severity default
849
+ @severity_rates[severity] || 1.0
850
+ end
851
+
852
+ def apply_load_adjustment(base_rate)
853
+ return base_rate unless @auto_adjust[:enabled]
854
+
855
+ buffer_percent = E11y::Buffer.utilization_percent
856
+
857
+ if buffer_percent > @auto_adjust[:trigger_buffer_percent]
858
+ # System overloaded → reduce rate
859
+ base_rate * @auto_adjust[:rate_multiplier]
860
+ else
861
+ # Normal operation
862
+ base_rate
863
+ end
864
+ end
865
+
866
+ def default_severity_rates
867
+ {
868
+ debug: 0.01,
869
+ info: 0.1,
870
+ success: 0.5,
871
+ warn: 1.0,
872
+ error: 1.0,
873
+ fatal: 1.0
874
+ }
875
+ end
876
+ end
877
+ end
878
+ end
879
+ ```
880
+
881
+ **Why Simplified?**
882
+
883
+ | Old Approach | New Approach | Benefit |
884
+ |--------------|--------------|---------|
885
+ | 4 strategies (Error, Load, Value, Content) | 1 simple sampler | Easy to understand |
886
+ | Complex rate calculation | Lookup table | Fast (<0.01ms) |
887
+ | Multiple config blocks | Single `sampling` block | Less code |
888
+ | Hard to predict | Deterministic | Debuggable |
889
+ | 300+ LOC | 50 LOC | Maintainable |
890
+
891
+ ---
892
+
893
+ ## 3.6. Trace-Aware Adaptive Sampling (C05 Resolution)
894
+
895
+ > **⚠️ CRITICAL: C05 Conflict Resolution - Adaptive Sampling × Trace Consistency**
896
+ > **See:** [CONFLICT-ANALYSIS.md C05](researches/CONFLICT-ANALYSIS.md#c05-adaptive-sampling--trace-consistent-sampling) for detailed analysis
897
+ > **Problem:** Per-event adaptive sampling breaks distributed traces (incomplete traces across services)
898
+ > **Solution:** Trace-level sampling decisions with propagation via W3C trace context
899
+
900
+ ### 3.6.1. The Problem: Broken Distributed Traces
901
+
902
+ **Scenario - Incomplete Trace:**
903
+
904
+ ```ruby
905
+ # Trace across 3 microservices (trace_id: abc-123):
906
+
907
+ # Service A: Order Service
908
+ Events::OrderCreated.track(
909
+ order_id: '123',
910
+ trace_id: 'abc-123'
911
+ )
912
+ # → Adaptive sampling: KEEP (within budget) ✅
913
+
914
+ # Service B: Payment Service (same trace)
915
+ Events::PaymentProcessing.track(
916
+ payment_id: '456',
917
+ trace_id: 'abc-123'
918
+ )
919
+ # → Adaptive sampling: DROP (budget exceeded!) ❌
920
+
921
+ # Service C: Notification Service (same trace)
922
+ Events::NotificationSent.track(
923
+ notification_id: '789',
924
+ trace_id: 'abc-123'
925
+ )
926
+ # → Adaptive sampling: KEEP (budget recovered) ✅
927
+
928
+ # Result: INCOMPLETE TRACE!
929
+ # Loki shows:
930
+ # - Order created: YES ✅
931
+ # - Payment processing: MISSING ❌ ← Gap in trace!
932
+ # - Notification sent: YES ✅
933
+ #
934
+ # → Can't reconstruct full user journey!
935
+ # → Debugging payment issues impossible!
936
+ ```
937
+
938
+ **Why This Breaks:**
939
+ - ❌ **Per-event sampling:** Each service makes independent sampling decisions
940
+ - ❌ **Distributed traces incomplete:** Missing spans break trace visualization
941
+ - ❌ **Debugging impossible:** Can't see where payment processing failed
942
+ - ❌ **Misleading SLO metrics:** Partial traces skew latency calculations
943
+
944
+ ### 3.6.2. Decision: Trace-Level Sampling with Decision Cache
945
+
946
+ **Strategy:** All events in a trace share the same sampling decision.
947
+
948
+ **Key Principles:**
949
+ 1. **Sampling decision made per-trace** (not per-event)
950
+ 2. **First event in trace makes decision** (head-based sampling)
951
+ 3. **Decision propagated via W3C trace context** (`trace_flags` field)
952
+ 4. **Decision cached per trace_id** (TTL: 1 hour)
953
+
954
+ ### 3.6.3. TraceAwareSampler Implementation
955
+
956
+ ```ruby
957
+ module E11y
958
+ module Cost
959
+ class TraceAwareSampler < SimplifiedSampler
960
+ def initialize(config)
961
+ super(config)
962
+ @trace_decision_cache = Concurrent::Map.new
963
+ @cache_ttl = config.trace_cache_ttl || 3600 # 1 hour default
964
+ @cache_cleanup_interval = 300 # 5 minutes
965
+
966
+ # Start cache cleanup thread
967
+ start_cache_cleanup!
968
+ end
969
+
970
+ def should_sample?(event_data, context = {})
971
+ # Extract trace context
972
+ trace_context = event_data[:trace_context] || context[:trace_context]
973
+
974
+ unless trace_context && trace_context[:trace_id]
975
+ # No trace context → fall back to per-event sampling
976
+ return super(event_data, context)
977
+ end
978
+
979
+ trace_id = trace_context[:trace_id]
980
+
981
+ # ✅ CRITICAL: Check if sampling decision already made for this trace
982
+ cached_decision = get_trace_decision(trace_id)
983
+ return cached_decision unless cached_decision.nil?
984
+
985
+ # No cached decision → make NEW decision for this trace
986
+ decision = make_trace_decision(event_data, context)
987
+
988
+ # Cache decision for this trace (all future events use same decision)
989
+ set_trace_decision(trace_id, decision)
990
+
991
+ # Propagate decision via trace_flags (W3C Trace Context)
992
+ propagate_decision_to_trace_context!(trace_context, decision)
993
+
994
+ decision
995
+ end
996
+
997
+ private
998
+
999
+ def get_trace_decision(trace_id)
1000
+ entry = @trace_decision_cache[trace_id]
1001
+ return nil unless entry
1002
+
1003
+ # Check if cache entry expired
1004
+ if Time.now.to_i > entry[:expires_at]
1005
+ @trace_decision_cache.delete(trace_id)
1006
+ return nil
1007
+ end
1008
+
1009
+ entry[:decision]
1010
+ end
1011
+
1012
+ def set_trace_decision(trace_id, decision)
1013
+ @trace_decision_cache[trace_id] = {
1014
+ decision: decision,
1015
+ expires_at: Time.now.to_i + @cache_ttl,
1016
+ created_at: Time.now.to_i
1017
+ }
1018
+
1019
+ # Track cache size
1020
+ Yabeda.e11y_trace_decision_cache_size.set(@trace_decision_cache.size)
1021
+ end
1022
+
1023
+ def make_trace_decision(event_data, context)
1024
+ # Use standard sampling logic (severity + pattern-based)
1025
+ base_decision = super(event_data, context)
1026
+
1027
+ # Apply adaptive adjustment based on budget
1028
+ if over_budget?
1029
+ # Reduce sampling rate for traces
1030
+ rand() < calculate_adaptive_rate(base_decision)
1031
+ else
1032
+ base_decision
1033
+ end
1034
+ end
1035
+
1036
+ def over_budget?
1037
+ # Check if monthly event budget exceeded
1038
+ current_month_events = Yabeda.e11y_events_tracked_total.values.sum
1039
+ budget = @config.cost_budget || 100_000
1040
+
1041
+ current_month_events > budget
1042
+ end
1043
+
1044
+ def calculate_adaptive_rate(base_decision)
1045
+ return 1.0 if base_decision == false # Already dropping
1046
+
1047
+ budget_utilization = Yabeda.e11y_events_tracked_total.values.sum.to_f / @config.cost_budget
1048
+
1049
+ # Scale down aggressively when over budget
1050
+ if budget_utilization > 1.5
1051
+ 0.1 # Keep only 10% of traces
1052
+ elsif budget_utilization > 1.2
1053
+ 0.5 # Keep 50% of traces
1054
+ else
1055
+ 1.0 # Keep all traces (within budget)
1056
+ end
1057
+ end
1058
+
1059
+ def propagate_decision_to_trace_context!(trace_context, decision)
1060
+ # Set W3C Trace Context trace_flags
1061
+ # Bit 0 (0x01): sampled flag
1062
+ if decision
1063
+ trace_context[:trace_flags] ||= 0x01 # Set sampled bit
1064
+ else
1065
+ trace_context[:trace_flags] ||= 0x00 # Clear sampled bit
1066
+ end
1067
+ end
1068
+
1069
+ def start_cache_cleanup!
1070
+ Thread.new do
1071
+ loop do
1072
+ sleep @cache_cleanup_interval
1073
+
1074
+ # Remove expired entries
1075
+ now = Time.now.to_i
1076
+ @trace_decision_cache.delete_if do |trace_id, entry|
1077
+ expired = now > entry[:expires_at]
1078
+
1079
+ if expired
1080
+ Yabeda.e11y_trace_decision_cache_evictions.increment
1081
+ end
1082
+
1083
+ expired
1084
+ end
1085
+ end
1086
+ rescue StandardError => e
1087
+ E11y.logger.error "[E11y] Trace cache cleanup error: #{e.message}"
1088
+ retry
1089
+ end
1090
+ end
1091
+ end
1092
+ end
1093
+ end
1094
+ ```
1095
+
1096
+ ### 3.6.4. Configuration
1097
+
1098
+ ```ruby
1099
+ # config/initializers/e11y.rb
1100
+ E11y.configure do |config|
1101
+ config.cost_optimization do
1102
+ sampling do
1103
+ # ✅ Use trace-aware sampler for distributed tracing
1104
+ strategy :trace_aware # NEW: Trace-consistent sampling
1105
+
1106
+ # Trace decision cache
1107
+ trace_cache_ttl 3600 # 1 hour (3600 seconds)
1108
+ trace_cache_cleanup_interval 300 # 5 minutes
1109
+
1110
+ # Cost budget (monthly)
1111
+ cost_budget 100_000 # 100K events/month
1112
+
1113
+ # Per-severity sampling rates (base rates before adaptive adjustment)
1114
+ severity_rates do
1115
+ debug 0.01 # 1%
1116
+ info 0.1 # 10%
1117
+ success 0.5 # 50%
1118
+ warn 1.0 # 100%
1119
+ error 1.0 # 100%
1120
+ fatal 1.0 # 100%
1121
+ end
1122
+
1123
+ # Pattern-based overrides (take precedence)
1124
+ pattern_rates do
1125
+ pattern /^audit\./, rate: 1.0 # Always sample audit events
1126
+ pattern /^payment\./, rate: 1.0 # Always sample payments
1127
+ pattern /^debug\./, rate: 0.01 # 1% of debug events
1128
+ end
1129
+ end
1130
+ end
1131
+ end
1132
+ ```
1133
+
1134
+ ### 3.6.5. Multi-Service Trace Scenario (Correct Behavior)
1135
+
1136
+ **Service A (Order Service) - First Event:**
1137
+
1138
+ ```ruby
1139
+ # Create new trace context
1140
+ trace_context = E11y::TraceContext.generate
1141
+
1142
+ # Track event (FIRST in trace → makes sampling decision)
1143
+ Events::OrderCreated.track(
1144
+ order_id: '123',
1145
+ user_id: 'u456',
1146
+ trace_context: trace_context
1147
+ )
1148
+
1149
+ # TraceAwareSampler:
1150
+ # 1. No cached decision for trace_id
1151
+ # 2. Makes NEW decision: should_sample? → TRUE (severity: info, rate: 0.1 → sampled)
1152
+ # 3. Caches decision: trace_decision_cache[trace_id] = TRUE
1153
+ # 4. Sets trace_flags: 0x01 (sampled bit set)
1154
+ # 5. Event KEPT ✅
1155
+
1156
+ # HTTP call to Service B (trace context propagated via W3C headers)
1157
+ # traceparent: 00-abc123...-def456...-01
1158
+ # ^^
1159
+ # trace_flags = 0x01 (sampled)
1160
+ ```
1161
+
1162
+ **Service B (Payment Service) - Downstream Event:**
1163
+
1164
+ ```ruby
1165
+ # Receive trace context from Service A (via HTTP headers)
1166
+ incoming_trace_context = extract_trace_context_from_headers(request.headers)
1167
+ # trace_id: 'abc123...', trace_flags: 0x01 (sampled)
1168
+
1169
+ # Track event (DOWNSTREAM in trace → uses cached decision)
1170
+ Events::PaymentProcessing.track(
1171
+ payment_id: '456',
1172
+ order_id: '123',
1173
+ trace_context: incoming_trace_context
1174
+ )
1175
+
1176
+ # TraceAwareSampler:
1177
+ # 1. Check cache for trace_id: abc123... → FOUND (decision: TRUE)
1178
+ # 2. Return cached decision: TRUE
1179
+ # 3. Event KEPT ✅ (consistent with Service A decision)
1180
+
1181
+ # HTTP call to Service C (trace context propagated)
1182
+ ```
1183
+
1184
+ **Service C (Notification Service) - Further Downstream:**
1185
+
1186
+ ```ruby
1187
+ # Receive trace context from Service B
1188
+ incoming_trace_context = extract_trace_context_from_headers(request.headers)
1189
+
1190
+ # Track event
1191
+ Events::NotificationSent.track(
1192
+ notification_id: '789',
1193
+ order_id: '123',
1194
+ trace_context: incoming_trace_context
1195
+ )
1196
+
1197
+ # TraceAwareSampler:
1198
+ # 1. Check cache for trace_id: abc123... → FOUND (decision: TRUE)
1199
+ # 2. Return cached decision: TRUE
1200
+ # 3. Event KEPT ✅ (consistent across all services)
1201
+
1202
+ # Result: COMPLETE TRACE in Loki!
1203
+ # - Order created: YES ✅
1204
+ # - Payment processing: YES ✅
1205
+ # - Notification sent: YES ✅
1206
+ # → Full user journey reconstructed!
1207
+ ```
1208
+
1209
+ ### 3.6.6. Cache Management & TTL
1210
+
1211
+ **Why 1-hour TTL?**
1212
+
1213
+ ```ruby
1214
+ # Typical trace duration: <10 seconds (99th percentile)
1215
+ # Cache TTL: 1 hour (3600 seconds)
1216
+ # → 360x safety margin
1217
+
1218
+ # Trade-off:
1219
+ # - Short TTL (e.g., 1 minute): Cache misses if service delayed (retries, async jobs)
1220
+ # - Long TTL (e.g., 24 hours): High memory usage (1M traces = 100MB cache)
1221
+ # - 1 hour: Balance between memory and cache hit rate
1222
+ ```
1223
+
1224
+ **Cache Size Estimation:**
1225
+
1226
+ ```ruby
1227
+ # Assumptions:
1228
+ # - 10,000 events/sec
1229
+ # - 10 events per trace (average)
1230
+ # - 1,000 new traces/sec
1231
+ # - 1-hour TTL
1232
+
1233
+ # Cache size:
1234
+ # 1,000 traces/sec × 3,600 seconds = 3.6M traces
1235
+ # 3.6M traces × 100 bytes/entry = 360MB
1236
+
1237
+ # Mitigation:
1238
+ # - Cache cleanup every 5 minutes (remove expired)
1239
+ # - LRU eviction if memory limit exceeded
1240
+ # - Monitor: Yabeda.e11y_trace_decision_cache_size
1241
+ ```
1242
+
1243
+ **Cache Cleanup:**
1244
+
1245
+ ```ruby
1246
+ # Automatic cleanup every 5 minutes
1247
+ config.cost_optimization.sampling do
1248
+ trace_cache_cleanup_interval 300 # seconds
1249
+ end
1250
+
1251
+ # Manual cleanup (if needed)
1252
+ E11y::Cost::TraceAwareSampler.instance.cleanup_expired_traces!
1253
+
1254
+ # Monitoring
1255
+ Yabeda.e11y_trace_decision_cache_size.observe(cache_size)
1256
+ Yabeda.e11y_trace_decision_cache_evictions.increment
1257
+ ```
1258
+
1259
+ ### 3.6.7. Head-Based Sampling (W3C Trace Context)
1260
+
1261
+ **W3C Trace Context Propagation:**
1262
+
1263
+ ```ruby
1264
+ # HTTP Request Header (Service A → Service B):
1265
+ # traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
1266
+ # ││ │ │ │
1267
+ # ││ └─ trace_id (128-bit) └─ span_id └─ trace_flags
1268
+ # │└─ version (01 = sampled)
1269
+ # └─ format
1270
+
1271
+ # Service B extracts trace context:
1272
+ trace_context = {
1273
+ version: '00',
1274
+ trace_id: '4bf92f3577b34da6a3ce929d0e0e4736',
1275
+ parent_span_id: '00f067aa0ba902b7',
1276
+ trace_flags: 0x01 # ← Sampled bit set by Service A
1277
+ }
1278
+
1279
+ # Service B respects sampling decision:
1280
+ if trace_context[:trace_flags] & 0x01 == 0x01
1281
+ # Sampled bit set → KEEP event
1282
+ else
1283
+ # Sampled bit clear → DROP event
1284
+ end
1285
+ ```
1286
+
1287
+ ### 3.6.8. Trade-offs & Distributed Tracing Integrity (C05)
1288
+
1289
+ **Trade-offs:**
1290
+
1291
+ | Decision | Pro | Con | Rationale |
1292
+ |----------|-----|-----|-----------|
1293
+ | **Trace-level sampling** | Complete traces | Can't sample per-event | Trace integrity > granularity |
1294
+ | **Decision cache (1h TTL)** | Consistent decisions | 360MB memory (3.6M traces) | Cache hit rate > memory |
1295
+ | **Head-based sampling** | Simple propagation | First service decides for all | Simplicity > flexibility |
1296
+ | **W3C trace_flags** | Standard propagation | Requires trace context | Interoperability > custom |
1297
+
1298
+ **Distributed Tracing Integrity:**
1299
+
1300
+ ✅ **Complete traces:**
1301
+ All events in a trace sampled together → no gaps in trace visualization.
1302
+
1303
+ ✅ **Consistent debugging:**
1304
+ If Service A event visible, all downstream events visible → full user journey.
1305
+
1306
+ ✅ **Accurate SLO metrics:**
1307
+ Complete traces provide accurate latency calculations (no partial trace skew).
1308
+
1309
+ **Limitations:**
1310
+
1311
+ ⚠️ **All-or-nothing:** Can't sample some events within a trace (e.g., keep errors, drop debug)
1312
+ **Mitigation:** Use severity-based trace decision (errors always sampled)
1313
+
1314
+ ⚠️ **Memory overhead:** Cache stores decisions for 1 hour (~360MB for 3.6M traces at 1,000 new traces/sec)
1315
+ **Mitigation:** LRU eviction + periodic cleanup
1316
+
1317
+ ⚠️ **Long traces:** If a trace outlives the 1-hour cache TTL, its cached decision may expire mid-trace
1318
+ **Mitigation:** Increase TTL for long-running workflows (e.g., async jobs)
1319
+
1320
+ **Monitoring Metrics:**
1321
+
1322
+ ```ruby
1323
+ # Track trace-aware sampling effectiveness
1324
+ Yabeda.e11y_trace_decision_cache_hit_rate.observe(
1325
+ hits / (hits + misses).to_f
1326
+ )
1327
+
1328
+ # Track cache size
1329
+ Yabeda.e11y_trace_decision_cache_size.set(cache_size)
1330
+
1331
+ # Track incomplete traces (should be 0%)
1332
+ Yabeda.e11y_incomplete_traces_total.increment(
1333
+ trace_id: 'abc123',
1334
+ missing_spans: 3
1335
+ )
1336
+ ```
1337
+
1338
+ **Related Conflicts:**
1339
+ - **C11:** Stratified sampling for SLO accuracy (see §3.7 below)
1340
+ - **C17:** Background job tracing (see ADR-005, UC-010)
1341
+ - **C09:** Multi-service tracing (see UC-009)
1342
+
1343
+ ---
1344
+
1345
+ ## 3.7. Stratified Sampling for SLO Accuracy (C11 Resolution)
1346
+
1347
+ > **⚠️ CRITICAL: C11 Conflict Resolution - Adaptive Sampling × SLO Tracking**
1348
+ > **See:** [CONFLICT-ANALYSIS.md C11](researches/CONFLICT-ANALYSIS.md#c11-adaptive-sampling--slo-tracking) for detailed analysis
1349
+ > **Problem:** Random sampling skews SLO metrics (inaccurate success rates)
1350
+ > **Solution:** Stratified sampling by severity + sampling correction math
1351
+
1352
+ ### 3.7.1. The Problem: Sampling Bias Breaks SLO Metrics
1353
+
1354
+ **Scenario - Inaccurate Success Rate:**
1355
+
1356
+ ```ruby
1357
+ # Real production traffic (1000 requests):
1358
+ # - 950 success (HTTP 200) → 95% success rate
1359
+ # - 50 errors (HTTP 500) → 5% error rate
1360
+
1361
+ # Adaptive sampling (random 50% sampling to save costs):
1362
+ # Expected: Keep 500 events (475 success, 25 errors) → 95% success rate ✅
1363
+
1364
+ # But random sampling can be BIASED!
1365
+ # Actual sample: 500 events (450 success, 50 errors) → 90% success rate ❌
1366
+
1367
+ # Result: FALSE SLO VIOLATION ALERT!
1368
+ # - Real success rate: 95% (meets the 95% SLO) ✅
1369
+ # - Calculated success rate: 90% (below 95% SLO) ❌
1370
+ # → False alert triggered!
1371
+ ```
1372
+
1373
+ **Why Random Sampling Fails:**
1374
+
1375
+ ```ruby
1376
+ # Random sampling treats ALL events equally
1377
+ Events::ApiRequest.track(status: 200) # Success → 50% chance to keep
1378
+ Events::ApiRequest.track(status: 500) # Error → 50% chance to keep
1379
+
1380
+ # Problem: We're dropping CRITICAL ERROR events!
1381
+ # - Errors are rare (5% of traffic) but CRITICAL for SLO
1382
+ # - Success events are common (95% of traffic) but less critical
1383
+ # - Random 50% sampling may drop errors → undercount error rate!
1384
+ ```
1385
+
1386
+ **Impact:**
1387
+ - ❌ **Inaccurate SLO metrics:** Success rate skewed by sampling bias
1388
+ - ❌ **False alerts:** SLO violations that don't exist
1389
+ - ❌ **Missed real issues:** Actual SLO violations hidden by lucky sampling
1390
+ - ❌ **Wrong business decisions:** Acting on bad data
1391
+
1392
+ ### 3.7.2. Decision: Stratified Sampling by Event Severity
1393
+
1394
+ **Strategy:** Sample different event types at different rates to preserve statistical properties.
1395
+
1396
+ **Strata Definition:**
1397
+
1398
+ | Stratum | Criteria | Sample Rate | Rationale |
1399
+ |---------|----------|-------------|-----------|
1400
+ | **Errors** | `severity: [:error, :fatal]` OR `http_status: 5xx` | 100% | Always keep errors (critical for SLO) |
1401
+ | **Warnings** | `severity: [:warn]` OR `http_status: 4xx` | 50% | Medium importance |
1402
+ | **Success** | `severity: [:info, :debug, :success]` OR `http_status: 2xx, 3xx` | 10% | Drop 90% (common, less critical) |
1403
+
1404
+ **Key Principles:**
1405
+ 1. **Always keep errors** (100% sampling) → accurate error rates
1406
+ 2. **Aggressively sample success** (10% sampling) → cost savings
1407
+ 3. **Apply sampling correction** in SLO calculations → accurate metrics
1408
+
1409
+ ### 3.7.3. StratifiedAdaptiveSampler Implementation
1410
+
1411
+ ```ruby
1412
+ module E11y
1413
+ module Cost
1414
+ class StratifiedAdaptiveSampler < SimplifiedSampler
1415
+ STRATA = {
1416
+ errors: {
1417
+ severities: [:error, :fatal],
1418
+ http_statuses: (500..599).to_a,
1419
+ sample_rate: 1.0 # 100% - always keep
1420
+ },
1421
+ warnings: {
1422
+ severities: [:warn],
1423
+ http_statuses: (400..499).to_a,
1424
+ sample_rate: 0.5 # 50%
1425
+ },
1426
+ success: {
1427
+ severities: [:debug, :info, :success],
1428
+ http_statuses: (200..399).to_a,
1429
+ sample_rate: 0.1 # 10% - aggressive sampling
1430
+ }
1431
+ }.freeze
1432
+
1433
+ def initialize(config)
1434
+ super(config)
1435
+ @strata_config = config.stratification || STRATA
1436
+ end
1437
+
1438
+ def should_sample?(event_data, context = {})
1439
+ # Determine event stratum
1440
+ stratum = determine_stratum(event_data)
1441
+
1442
+ # Get sample rate for this stratum
1443
+ sample_rate = @strata_config[stratum][:sample_rate]
1444
+
1445
+ # Make sampling decision
1446
+ decision = rand() < sample_rate
1447
+
1448
+ # Store stratum in event metadata (needed for correction later)
1449
+ event_data[:metadata] ||= {}
1450
+ event_data[:metadata][:sampling_stratum] = stratum
1451
+ event_data[:metadata][:sampling_rate] = sample_rate
1452
+ event_data[:metadata][:sampled] = decision
1453
+
1454
+ # Track metrics
1455
+ Yabeda.e11y_sampling_decisions_total.increment(
1456
+ stratum: stratum,
1457
+ decision: decision ? 'kept' : 'dropped',
1458
+ sample_rate: sample_rate
1459
+ )
1460
+
1461
+ decision
1462
+ end
1463
+
1464
+ def determine_stratum(event_data)
1465
+ severity = event_data[:severity]
1466
+ http_status = event_data.dig(:payload, :http_status) ||
1467
+ event_data.dig(:payload, :status)
1468
+
1469
+ # Check each stratum (priority order: errors → warnings → success)
1470
+ @strata_config.each do |stratum_name, stratum_config|
1471
+ # Check severity match
1472
+ if stratum_config[:severities].include?(severity)
1473
+ return stratum_name
1474
+ end
1475
+
1476
+ # Check HTTP status match
1477
+ if http_status && stratum_config[:http_statuses].include?(http_status)
1478
+ return stratum_name
1479
+ end
1480
+ end
1481
+
1482
+ # Default: success stratum
1483
+ :success
1484
+ end
1485
+ end
1486
+ end
1487
+ end
1488
+ ```
1489
+
1490
+ ### 3.7.4. SLO Calculator with Sampling Correction
1491
+
1492
+ **Critical:** Must apply sampling correction to get accurate SLO metrics.
1493
+
1494
+ ```ruby
1495
+ module E11y
1496
+ module SLO
1497
+ class Calculator
1498
+ def calculate_success_rate(events)
1499
+ # Group events by stratum
1500
+ events_by_stratum = events.group_by do |event|
1501
+ event[:metadata][:sampling_stratum]
1502
+ end
1503
+
1504
+ # Apply sampling correction for each stratum
1505
+ corrected_counts = {}
1506
+
1507
+ events_by_stratum.each do |stratum, stratum_events|
1508
+ sample_rate = stratum_events.first[:metadata][:sampling_rate]
1509
+
1510
+ # Correction factor: 1 / sample_rate
1511
+ # Example: 10% sample rate → multiply by 10
1512
+ correction_factor = 1.0 / sample_rate
1513
+
1514
+ corrected_counts[stratum] = {
1515
+ observed: stratum_events.count,
1516
+ corrected: stratum_events.count * correction_factor
1517
+ }
1518
+ end
1519
+
1520
+ # Calculate corrected totals
1521
+ corrected_success = corrected_counts[:success][:corrected] rescue 0
1522
+ corrected_warnings = corrected_counts[:warnings][:corrected] rescue 0
1523
+ corrected_errors = corrected_counts[:errors][:corrected] rescue 0
1524
+
1525
+ total = corrected_success + corrected_warnings + corrected_errors
1526
+
1527
+ # Success rate = (success + warnings) / total
1528
+ # (warnings are not SLO violations, only errors are)
1529
+ success_rate = (corrected_success + corrected_warnings) / total.to_f
1530
+
1531
+ {
1532
+ success_rate: success_rate,
1533
+ error_rate: corrected_errors / total.to_f,
1534
+ breakdown: corrected_counts,
1535
+ total_corrected_events: total
1536
+ }
1537
+ end
1538
+
1539
+ def calculate_p99_latency(events)
1540
+ # Group by stratum and apply correction
1541
+ latencies = []
1542
+
1543
+ events.each do |event|
1544
+ latency = event[:payload][:duration_ms]
1545
+ sample_rate = event[:metadata][:sampling_rate]
1546
+ correction_factor = (1.0 / sample_rate).round
1547
+
1548
+ # Duplicate latency by correction factor
1549
+ # (simulate missing events for percentile calculation)
1550
+ correction_factor.times { latencies << latency }
1551
+ end
1552
+
1553
+ # Calculate P99
1554
+ latencies.sort!
1555
+ p99_index = (latencies.size * 0.99).ceil - 1
1556
+ latencies[p99_index]
1557
+ end
1558
+ end
1559
+ end
1560
+ end
1561
+ ```
1562
+
1563
+ ### 3.7.5. Configuration
1564
+
1565
+ **Option 1: Single simple config (recommended) 🎯**
1566
+
1567
+ ```ruby
1568
+ # config/initializers/e11y.rb
1569
+ E11y.configure do |config|
1570
+ config.cost_optimization do
1571
+ sampling do
1572
+ # ✅ Stratified sampling - smart sampling for accurate SLO
1573
+ strategy :stratified_adaptive
1574
+
1575
+ # Cost budget (same as before)
1576
+ cost_budget 100_000 # events/month
1577
+
1578
+ # 🎯 SINGLE config: sample_rate per severity (default: never drop errors!)
1579
+ stratified_rates do
1580
+ error 1.0 # 100% - keep all errors (critical for SLO!)
1581
+ warn 0.5 # 50% - medium priority
1582
+ info 0.1 # 10% - low priority (successful requests)
1583
+ debug 0.05 # 5% - very low priority
1584
+ end
1585
+ end
1586
+ end
1587
+
1588
+ # SLO tracking with automatic correction (enabled by default!)
1589
+ config.slo do
1590
+ enable_sampling_correction true # ✅ Automatic correction in SLO calculations
1591
+ end
1592
+ end
1593
+ ```
1594
+
1595
+ **How it works:**
1596
+ - `error`/`fatal` severity → sample_rate **1.0** (100%, never dropped!)
1597
+ - `warn` severity → sample_rate **0.5** (50%)
1598
+ - `info`/`success` severity → sample_rate **0.1** (10%)
1599
+ - `debug` severity → sample_rate **0.05** (5%)
1600
+
1601
+ **SLO correction is automatic:**
1602
+ ```ruby
1603
+ # The user writes the same code as before:
1604
+ E11y::SLO.error_rate # ✅ Automatically corrected!
1605
+
1606
+ # Internally:
1607
+ observed_errors = 50
1608
+ corrected_errors = observed_errors / error_sample_rate # 50 / 1.0 = 50
1609
+
1610
+ observed_success = 95
1611
+ corrected_success = observed_success / info_sample_rate # 95 / 0.1 = 950
1612
+
1613
+ corrected_error_rate = corrected_errors / (corrected_errors + corrected_success)
1614
+ # = 50 / (50 + 950) = 5% ✅ ACCURATE!
1615
+ ```
1616
+
1617
+ ---
1618
+
1619
+ **Option 2: Advanced config (for complex cases)**
1620
+
1621
+ If you need more flexibility (e.g., different sample rates for HTTP 4xx vs 5xx):
1622
+
1623
+ ```ruby
1624
+ E11y.configure do |config|
1625
+ config.cost_optimization do
1626
+ sampling do
1627
+ strategy :stratified_adaptive
1628
+ cost_budget 100_000
1629
+
1630
+ # Advanced stratification by severities + http_statuses
1631
+ stratification do
1632
+ stratum :critical_errors do
1633
+ severities [:error, :fatal]
1634
+ http_statuses (500..599).to_a
1635
+ sample_rate 1.0 # 100%
1636
+ end
1637
+
1638
+ stratum :client_errors do
1639
+ severities [:warn]
1640
+ http_statuses (400..499).to_a
1641
+ sample_rate 0.3 # 30% (lower than warn, since 4xx is less critical)
1642
+ end
1643
+
1644
+ stratum :success do
1645
+ severities [:info, :success]
1646
+ http_statuses (200..399).to_a
1647
+ sample_rate 0.1 # 10%
1648
+ end
1649
+ end
1650
+ end
1651
+ end
1652
+ end
1653
+ ```
1654
+
1655
+ ### 3.7.6. Accuracy Comparison: Random vs Stratified Sampling
1656
+
1657
+ **Scenario:** 1000 requests (950 success, 50 errors) → 95% success rate
1658
+
1659
+ | Sampling Strategy | Events Kept | Observed Success Rate | Corrected Success Rate | Error |
1660
+ |-------------------|-------------|----------------------|------------------------|-------|
1661
+ | **No Sampling** | 1000 (100%) | 95.0% | N/A | 0% ✅ |
1662
+ | **Random 50%** | 500 (50%) | 90-100% (varies!) | 90-100% (varies!) | ±5% ❌ |
1663
+ | **Stratified** | 145 (14.5%) | 65.5% (95/145) | **95.0%** (corrected) | 0% ✅ |
1664
+
1665
+ **Stratified Sampling Breakdown:**
1666
+ ```ruby
1667
+ # Stratum 1: Errors (100% sampling)
1668
+ 50 errors × 1.0 = 50 kept → corrected: 50 / 1.0 = 50
1669
+
1670
+ # Stratum 2: Warnings (50% sampling)
1671
+ 0 warnings × 0.5 = 0 kept → corrected: 0 / 0.5 = 0
1672
+
1673
+ # Stratum 3: Success (10% sampling)
1674
+ 950 success × 0.1 = 95 kept → corrected: 95 / 0.1 = 950
1675
+
1676
+ # Total kept: 145 events (85.5% cost savings!)
1677
+ # Corrected total: 1000 events
1678
+ # Corrected success rate: (950 + 0) / 1000 = 95% ✅ ACCURATE!
1679
+ ```
1680
+
1681
+ ### 3.7.7. Cost Savings vs Accuracy Trade-off
1682
+
1683
+ **Example: 10M events/month (9.5M success, 500K errors)**
1684
+
1685
+ | Strategy | Events Stored | Cost | Success Rate Accuracy |
1686
+ |----------|---------------|------|----------------------|
1687
+ | **No Sampling** | 10M | $1000 | 100% (baseline) ✅ |
1688
+ | **Random 50%** | 5M | $500 | ~95% (biased) ⚠️ |
1689
+ | **Stratified** | 1.45M | $145 | **99.9%** (corrected) ✅ |
1690
+
1691
+ **Stratified Breakdown:**
1692
+ - Errors: 500K × 100% = 500K kept (≈34.5% of stored events)
1693
+ - Success: 9.5M × 10% = 950K kept (≈65.5% of stored events)
1694
+ - **Total: 1.45M events (85.5% cost savings!)**
1695
+
1696
+ **Key Insight:** Stratified sampling provides **85% cost savings** with **99.9% accuracy** vs random sampling's **50% savings** with **95% accuracy**.
1697
+
1698
+ ### 3.7.8. Testing Sampling Correction Accuracy
1699
+
1700
+ ```ruby
1701
+ # spec/lib/e11y/slo/calculator_spec.rb
1702
+ RSpec.describe E11y::SLO::Calculator do
1703
+ describe '#calculate_success_rate with stratified sampling' do
1704
+ it 'accurately calculates success rate with sampling correction' do
1705
+ # Simulate 1000 requests (950 success, 50 errors)
1706
+ events = []
1707
+
1708
+ # Generate 950 success events (10% sampled → 95 kept)
1709
+ 95.times do
1710
+ events << {
1711
+ severity: :info,
1712
+ payload: { http_status: 200 },
1713
+ metadata: {
1714
+ sampling_stratum: :success,
1715
+ sampling_rate: 0.1,
1716
+ sampled: true
1717
+ }
1718
+ }
1719
+ end
1720
+
1721
+ # Generate 50 error events (100% sampled → 50 kept)
1722
+ 50.times do
1723
+ events << {
1724
+ severity: :error,
1725
+ payload: { http_status: 500 },
1726
+ metadata: {
1727
+ sampling_stratum: :errors,
1728
+ sampling_rate: 1.0,
1729
+ sampled: true
1730
+ }
1731
+ }
1732
+ end
1733
+
1734
+ # Calculate SLO with correction
1735
+ calculator = described_class.new
1736
+ result = calculator.calculate_success_rate(events)
1737
+
1738
+ # Expected corrected success rate: 95%
1739
+ expect(result[:success_rate]).to be_within(0.001).of(0.95)
1740
+ expect(result[:error_rate]).to be_within(0.001).of(0.05)
1741
+ expect(result[:total_corrected_events]).to eq(1000)
1742
+
1743
+ # Breakdown verification
1744
+ expect(result[:breakdown][:success][:corrected]).to eq(950)
1745
+ expect(result[:breakdown][:errors][:corrected]).to eq(50)
1746
+ end
1747
+
1748
+ it 'matches baseline accuracy without sampling' do
1749
+ # Generate 1000 events without sampling
1750
+ baseline_events = generate_events(success: 950, errors: 50, sampled: false)
1751
+ baseline_rate = calculate_baseline_success_rate(baseline_events)
1752
+
1753
+ # Generate sampled events with correction
1754
+ sampled_events = generate_events(success: 95, errors: 50, sampled: true)
1755
+ corrected_rate = described_class.new.calculate_success_rate(sampled_events)[:success_rate]
1756
+
1757
+ # Should match within 1%
1758
+ expect(corrected_rate).to be_within(0.01).of(baseline_rate)
1759
+ end
1760
+ end
1761
+ end
1762
+ ```
1763
+
1764
+ ### 3.7.9. Trade-offs & SLO Accuracy (C11)
1765
+
1766
+ **Trade-offs:**
1767
+
1768
+ | Decision | Pro | Con | Rationale |
1769
+ |----------|-----|-----|-----------|
1770
+ | **Stratified sampling** | Accurate SLO metrics | Complexity (correction math) | Accuracy > simplicity |
1771
+ | **Always keep errors (100%)** | No error data loss | Higher cost if error rate spikes | Error visibility critical |
1772
+ | **Aggressive success sampling (10%)** | 90% cost savings | Large correction factor (10x) | Success events less critical |
1773
+ | **Sampling correction math** | Accurate percentiles | CPU overhead (~0.1ms/query) | Accuracy > performance |
1774
+
1775
+ **SLO Accuracy Guarantees:**
1776
+
1777
+ ✅ **Error rate accuracy: 100%**
1778
+ All errors captured → no error data loss.
1779
+
1780
+ ✅ **Success rate accuracy: 99.9%**
1781
+ Sampling correction restores true success rate (±0.1% error).
1782
+
1783
+ ✅ **Latency percentiles: 95%**
1784
+ P99 latency within 5% of true value (correction restores distribution).
1785
+
1786
+ **Limitations:**
1787
+
1788
+ ⚠️ **High error rates reduce savings:** If errors >10% of traffic, cost savings decrease
1789
+ **Mitigation:** Adjust success sample rate dynamically based on error rate
1790
+
1791
+ ⚠️ **Correction assumes uniform distribution:** May be inaccurate if success events clustered
1792
+ **Mitigation:** Use time-windowed correction (per 5-minute window)
1793
+
1794
+ ⚠️ **Small sample sizes:** <100 events may have large correction errors
1795
+ **Mitigation:** Don't apply correction for small samples, wait for more data
1796
+
1797
+ **Monitoring Metrics:**
1798
+
1799
+ ```ruby
1800
+ # Track stratified sampling effectiveness
1801
+ Yabeda.e11y_sampling_decisions_total.increment(
1802
+ stratum: 'success',
1803
+ decision: 'kept',
1804
+ sample_rate: 0.1
1805
+ )
1806
+
1807
+ # Track SLO calculation accuracy
1808
+ Yabeda.e11y_slo_correction_factor.observe(
1809
+ stratum: 'success',
1810
+ correction_factor: 10.0
1811
+ )
1812
+
1813
+ # Alert on correction accuracy drift
1814
+ Yabeda.e11y_slo_correction_error_rate.observe(
1815
+ expected: 0.95,
1816
+ actual: 0.949,
1817
+ error: 0.001 # 0.1% error
1818
+ )
1819
+ ```
1820
+
1821
+ **Related Conflicts:**
1822
+ - **C05:** Trace-aware sampling (see §3.6 above)
1823
+ - **UC-004:** Zero-config SLO tracking (see UC-004 for SLO calculation details)
1824
+ - **UC-014:** Adaptive sampling (see UC-014 for cost optimization)
1825
+
1826
+ ---
1827
+
1828
+ ## 4. Compression
1829
+
1830
+ ### 4.1. Compression Engine
1831
+
1832
+ ```ruby
1833
+ # lib/e11y/cost/compressor.rb
1834
+ module E11y
1835
+ module Cost
1836
+ class Compressor
1837
+ ALGORITHMS = {
1838
+ gzip: Algorithms::Gzip,
1839
+ zstd: Algorithms::Zstd,
1840
+ lz4: Algorithms::LZ4
1841
+ }.freeze
1842
+
1843
+ def initialize(config)
1844
+ @algorithm = config.algorithm || :gzip
1845
+ @min_size = config.min_size_bytes || 1024 # 1KB
1846
+ @compressor = ALGORITHMS[@algorithm].new(config)
1847
+ end
1848
+
1849
+ def compress(payload_string)
1850
+ # Skip compression for small payloads
1851
+ return payload_string if payload_string.bytesize < @min_size
1852
+
1853
+ compressed = @compressor.compress(payload_string)
1854
+
1855
+ # Only use if compression helps
1856
+ if compressed.bytesize < payload_string.bytesize
1857
+ E11y::Metrics.histogram('e11y.compression.ratio',
1858
+ payload_string.bytesize.to_f / compressed.bytesize,
1859
+ { algorithm: @algorithm }
1860
+ )
1861
+
1862
+ compressed
1863
+ else
1864
+ payload_string
1865
+ end
1866
+ end
1867
+
1868
+ def decompress(compressed_string)
1869
+ @compressor.decompress(compressed_string)
1870
+ end
1871
+ end
1872
+
1873
+ module Algorithms
1874
+ class Gzip
1875
+ def initialize(config)
1876
+ @level = config.compression_level || Zlib::DEFAULT_COMPRESSION
1877
+ end
1878
+
1879
+ def compress(data)
1880
+ io = StringIO.new
1881
+ gz = Zlib::GzipWriter.new(io, @level)
1882
+ gz.write(data)
1883
+ gz.close
1884
+ io.string
1885
+ end
1886
+
1887
+ def decompress(data)
1888
+ Zlib::GzipReader.new(StringIO.new(data)).read
1889
+ end
1890
+ end
1891
+
1892
+ class Zstd
1893
+ def initialize(config)
1894
+ @level = config.compression_level || 3
1895
+ require 'zstd-ruby'
1896
+ end
1897
+
1898
+ def compress(data)
1899
+ ::Zstd.compress(data, level: @level)
1900
+ end
1901
+
1902
+ def decompress(data)
1903
+ ::Zstd.decompress(data)
1904
+ end
1905
+ end
1906
+
1907
+ class LZ4
1908
+ def initialize(config)
1909
+ require 'lz4-ruby'
1910
+ end
1911
+
1912
+ def compress(data)
1913
+ LZ4::compress(data)
1914
+ end
1915
+
1916
+ def decompress(data)
1917
+ LZ4::uncompress(data)
1918
+ end
1919
+ end
1920
+ end
1921
+ end
1922
+ end
1923
+ ```
1924
+
1925
+ ### 4.2. Compression Benchmarks
1926
+
1927
+ ```
1928
+ Algorithm | Ratio | Speed (MB/s) | CPU Usage
1929
+ ----------|-------|--------------|----------
1930
+ Gzip | 5:1 | 50 | Medium
1931
+ Zstd | 6:1 | 200 | Low
1932
+ LZ4 | 3:1 | 500 | Very Low
1933
+ ```
1934
+
1935
+ ---
1936
+
1937
+ ## 5. Smart Routing
1938
+
1939
+ ### 5.1. Routing Strategy
1940
+
1941
+ ```ruby
1942
+ # lib/e11y/cost/smart_router.rb
1943
+ module E11y
1944
+ module Cost
1945
+ class SmartRouter
1946
+ def initialize(config)
1947
+ @rules = config.routing_rules
1948
+ end
1949
+
1950
+ def route(event_data)
1951
+ # Evaluate routing rules
1952
+ matched_rule = @rules.find do |rule|
1953
+ rule.matches?(event_data)
1954
+ end
1955
+
1956
+ adapters = matched_rule&.adapters || default_adapters
1957
+
1958
+ E11y::Metrics.increment('e11y.routing.decision', {
1959
+ event_name: event_data[:event_name],
1960
+ rule: matched_rule&.name || 'default',
1961
+ adapters: adapters.join(',')
1962
+ })
1963
+
1964
+ adapters
1965
+ end
1966
+
1967
+ private
1968
+
1969
+ def default_adapters
1970
+ E11y.config.adapters.names
1971
+ end
1972
+ end
1973
+
1974
+ class RoutingRule
1975
+ attr_reader :name, :adapters
1976
+
1977
+ def initialize(name:, adapters:, &condition)
1978
+ @name = name
1979
+ @adapters = adapters
1980
+ @condition = condition
1981
+ end
1982
+
1983
+ def matches?(event_data)
1984
+ @condition.call(event_data)
1985
+ end
1986
+ end
1987
+ end
1988
+ end
1989
+ ```
1990
+
1991
+ ### 5.2. Configuration
1992
+
1993
+ ```ruby
1994
+ E11y.configure do |config|
1995
+ config.cost_optimization.smart_routing do
1996
+ # Rule 1: Critical events → All adapters
1997
+ rule 'critical_events', adapters: [:loki, :sentry, :s3] do |event|
1998
+ event[:severity] >= :error ||
1999
+ event[:event_name].start_with?('payment.') ||
2000
+ event[:event_name].start_with?('order.')
2001
+ end
2002
+
2003
+ # Rule 2: Debug events → File only (not Loki)
2004
+ rule 'debug_events', adapters: [:file] do |event|
2005
+ event[:severity] == :debug
2006
+ end
2007
+
2008
+ # Rule 3: Archive events → S3 only
2009
+ rule 'archive_events', adapters: [:s3] do |event|
2010
+ event[:payload][:archive] == true
2011
+ end
2012
+
2013
+ # Rule 4: Health checks → /dev/null
2014
+ rule 'health_checks', adapters: [] do |event|
2015
+ event[:event_name].include?('health_check')
2016
+ end
2017
+
2018
+ # Default: Everything → Loki
2019
+ default_adapters [:loki]
2020
+ end
2021
+ end
2022
+ ```
2023
+
2024
+ ---
2025
+
2026
+ ## 6. Tiered Storage
2027
+
2028
+ ### 6.1. Retention Tagging
2029
+
2030
+ **Design Decision:** E11y adds a `retention_until` timestamp to each event; downstream systems handle the actual deletion.
2031
+
2032
+ ```ruby
2033
+ # lib/e11y/cost/retention_tagger.rb
2034
+ module E11y
2035
+ module Cost
2036
+ class RetentionTagger
2037
+ def initialize(config)
2038
+ @retention_rules = config.retention_rules
2039
+ end
2040
+
2041
+ def tag_event(event_data)
2042
+ # Find matching retention rule
2043
+ retention_days = determine_retention(event_data)
2044
+
2045
+ # Calculate absolute retention_until date
2046
+ retention_until = Time.now + retention_days.days
2047
+
2048
+ # Add to event metadata
2049
+ event_data[:retention_until] = retention_until.iso8601
2050
+ event_data[:retention_days] = retention_days
2051
+
2052
+ E11y::Metrics.histogram('e11y.retention.days', retention_days, {
2053
+ event_name: event_data[:event_name]
2054
+ })
2055
+
2056
+ event_data
2057
+ end
2058
+
2059
+ private
2060
+
2061
+ def determine_retention(event_data)
2062
+ # Priority 1: Explicit retention in payload
2063
+ return event_data[:payload][:retention_days] if event_data[:payload][:retention_days]
2064
+
2065
+ # Priority 2: Pattern-based rules
2066
+ rule = @retention_rules.find do |r|
2067
+ r.matches?(event_data)
2068
+ end
2069
+
2070
+ return rule.retention_days if rule
2071
+
2072
+ # Default retention
2073
+ 30 # 30 days
2074
+ end
2075
+ end
2076
+
2077
+ class RetentionRule
2078
+ attr_reader :retention_days
2079
+
2080
+ def initialize(retention_days:, &condition)
2081
+ @retention_days = retention_days
2082
+ @condition = condition
2083
+ end
2084
+
2085
+ def matches?(event_data)
2086
+ @condition.call(event_data)
2087
+ end
2088
+ end
2089
+ end
2090
+ end
2091
+ ```
2092
+
2093
+ ### 6.2. Configuration
2094
+
2095
+ ```ruby
2096
+ E11y.configure do |config|
2097
+ config.cost_optimization.tiered_storage do
2098
+ # Rule 1: Audit events → 7 years (compliance)
2099
+ retention_rule 2555 do |event| # 7 * 365 days
2100
+ event[:event_name].start_with?('audit.')
2101
+ end
2102
+
2103
+ # Rule 2: Payment events → 2 years (legal)
2104
+ retention_rule 730 do |event| # 2 * 365 days
2105
+ event[:event_name].start_with?('payment.')
2106
+ end
2107
+
2108
+ # Rule 3: Debug events → 7 days (troubleshooting)
2109
+ retention_rule 7 do |event|
2110
+ event[:severity] == :debug
2111
+ end
2112
+
2113
+ # Rule 4: Errors → 90 days (analysis)
2114
+ retention_rule 90 do |event|
2115
+ event[:severity] >= :error
2116
+ end
2117
+
2118
+ # Default: 30 days
2119
+ default_retention 30
2120
+ end
2121
+ end
2122
+ ```
2123
+
2124
+ ### 6.3. Downstream Integration
2125
+
2126
+ **Elasticsearch ILM:**
2127
+
2128
+ ```json
2129
+ {
2130
+ "policy": {
2131
+ "phases": {
2132
+ "hot": {
2133
+ "actions": {}
2134
+ },
2135
+ "delete": {
2136
+ "min_age": "0d",
2137
+ "actions": {
2138
+ "delete": {
2139
+ "delete_searchable_snapshot": false
2140
+ }
2141
+ }
2142
+ }
2143
+ }
2144
+ }
2145
+ }
2146
+ ```
2147
+
2148
+ **Query for deletion:**
2149
+
2150
+ ```ruby
2151
+ # Elasticsearch query
2152
+ DELETE /e11y-events-*/_query
2153
+ {
2154
+ "query": {
2155
+ "range": {
2156
+ "retention_until": {
2157
+ "lt": "now"
2158
+ }
2159
+ }
2160
+ }
2161
+ }
2162
+ ```
2163
+
2164
+ **S3 Lifecycle:**
2165
+
2166
+ ```json
2167
+ {
2168
+ "Rules": [
2169
+ {
2170
+ "Id": "delete-expired-events",
2171
+ "Status": "Enabled",
2172
+ "Filter": {
2173
+ "Prefix": "events/"
2174
+ },
2175
+ "Expiration": {
2176
+ "Days": 365
2177
+ }
2178
+ }
2179
+ ]
2180
+ }
2181
+ ```
2182
+
2183
+ ---
2184
+
2185
+ ## 7. Payload Minimization
2186
+
2187
+ ### 7.1. Payload Optimizer
2188
+
2189
+ ```ruby
2190
+ # lib/e11y/cost/payload_minimizer.rb
2191
+ module E11y
2192
+ module Cost
2193
+ class PayloadMinimizer
2194
+ def initialize(config)
2195
+ @remove_nulls = config.remove_nulls || true
2196
+ @truncate_strings = config.truncate_strings_at || 1000
2197
+ @truncate_arrays = config.truncate_arrays_at || 100
2198
+ end
2199
+
2200
+ def minimize(payload)
2201
+ minimized = payload.deep_dup
2202
+
2203
+ # Remove null/empty values
2204
+ minimized.compact! if @remove_nulls
2205
+
2206
+ # Truncate long strings
2207
+ minimized.transform_values! do |value|
2208
+ case value
2209
+ when String
2210
+ truncate_string(value)
2211
+ when Array
2212
+ truncate_array(value)
2213
+ when Hash
2214
+ minimize(value) # Recursive
2215
+ else
2216
+ value
2217
+ end
2218
+ end
2219
+
2220
+ minimized
2221
+ end
2222
+
2223
+ private
2224
+
2225
+ def truncate_string(str)
2226
+ return str if str.length <= @truncate_strings
2227
+
2228
+ "#{str[0...@truncate_strings]}... [truncated #{str.length - @truncate_strings} chars]"
2229
+ end
2230
+
2231
+ def truncate_array(arr)
2232
+ return arr if arr.length <= @truncate_arrays
2233
+
2234
+ arr.first(@truncate_arrays) + ["... [truncated #{arr.length - @truncate_arrays} items]"]
2235
+ end
2236
+ end
2237
+ end
2238
+ end
2239
+ ```
2240
+
2241
+ ---
2242
+
2243
+ ## 8. Cardinality Protection (C04 Resolution) ⚠️ CRITICAL
2244
+
2245
+ **Reference:** [CONFLICT-ANALYSIS.md - C04: High-Cardinality Metrics × OpenTelemetry Attributes](../researches/CONFLICT-ANALYSIS.md#c04-high-cardinality-metrics--opentelemetry-attributes)
2246
+
2247
+ ### 8.1. The Problem: Cardinality Explosion Across Backends
2248
+
2249
+ **Scenario:** UC-013 (High-Cardinality Protection) was designed to protect **Yabeda/Prometheus metrics** from cardinality explosion. However, cardinality explosion is a **cost optimization problem** affecting **ALL backends**:
2250
+
2251
+ - ❌ **Yabeda (Prometheus):** High cardinality → query slowness, OOM
2252
+ - ❌ **OpenTelemetry (OTLP):** High cardinality → cost spikes (Datadog, Honeycomb charge per unique attribute combination)
2253
+ - ❌ **Loki (Logs):** High cardinality in labels → index bloat, slow queries
2254
+
2255
+ **Example:**
2256
+
2257
+ ```ruby
2258
+ # Configuration (UC-013):
2259
+ E11y.configure do |config|
2260
+ config.cardinality_protection do
2261
+ enabled true
2262
+ max_unique_values 100 # Per label
2263
+ protected_labels [:user_id, :order_id, :session_id]
2264
+ end
2265
+ end
2266
+
2267
+ # Event tracking (10,000 unique users):
2268
+ 10_000.times do |i|
2269
+ Events::OrderCreated.track(
2270
+ order_id: "order-#{i}", # ← 10,000 unique values!
2271
+ user_id: "user-#{i}", # ← 10,000 unique values!
2272
+ amount: 99.99
2273
+ )
2274
+ end
2275
+ ```
2276
+
2277
+ **Result BEFORE C04 fix:**
2278
+
2279
+ ```
2280
+ ✅ Yabeda (Prometheus metrics):
2281
+ - Cardinality protection ACTIVE
2282
+ - Only first 100 unique order_id/user_id tracked
2283
+ - Rest grouped as [OTHER]
2284
+ - Prometheus cardinality: 100
2285
+
2286
+ ❌ OpenTelemetry (span attributes):
2287
+ - Cardinality protection BYPASSED
2288
+ - ALL 10,000 unique order_id/user_id exported
2289
+ - OTLP backend cardinality: 10,000
2290
+ - Cost spike: 100× expected!
2291
+
2292
+ ❌ Loki (log labels):
2293
+ - Cardinality protection BYPASSED
2294
+ - ALL 10,000 unique order_id/user_id in labels
2295
+ - Index bloat, slow queries
2296
+ ```
2297
+
2298
+ **Impact:**
2299
+ - ❌ **Cost explosion:** OTLP backends (Datadog, Honeycomb, Elastic) charge per unique attribute combination ($0.10/span → $1,000/day)
2300
+ - ❌ **Data loss:** Backend hits cardinality limit (e.g., Datadog 1000/metric), starts dropping spans
2301
+ - ❌ **Inconsistent protection:** Yabeda protected, OpenTelemetry/Loki not
2302
+ - ❌ **Misleading config:** UC-013 promises protection, but only covers **metrics** (not traces/logs)
2303
+
2304
+ ### 8.2. Decision: Unified Cardinality Protection for All Backends
2305
+
2306
+ **Strategy:** Apply UC-013 cardinality protection to **ALL adapters** (Yabeda, OpenTelemetry, Loki) using a unified `CardinalityFilter` middleware.
2307
+
2308
+ **Rules:**
2309
+ 1. **Single source of truth:** `E11y.config.cardinality_protection` applies to ALL backends by default
2310
+ 2. **Per-backend overrides:** Allow separate limits for backends with different cardinality handling (e.g., OTLP can handle 1000, Prometheus only 100)
2311
+ 3. **Apply in middleware:** Filter event payload in `CardinalityFilter` middleware before adapters
2312
+ 4. **[OTHER] grouping:** Group high-cardinality values as `[OTHER]` (consistent across backends)
2313
+ 5. **Monitor metrics:** Track filtered labels for visibility
2314
+
2315
+ ### 8.3. Configuration: Inherit from Global Settings
2316
+
2317
+ ```ruby
2318
+ # config/initializers/e11y.rb
2319
+ E11y.configure do |config|
2320
+ # ✅ GLOBAL cardinality protection (applies to ALL backends by default)
2321
+ config.cardinality_protection do
2322
+ enabled true
2323
+ max_unique_values 100 # Conservative default (Prometheus-safe)
2324
+ protected_labels [:user_id, :order_id, :session_id, :ip_address, :tenant_id]
2325
+ end
2326
+
2327
+ # Optional: Per-backend overrides (if needed)
2328
+ config.adapters do
2329
+ # Yabeda: Use global settings (default)
2330
+ yabeda do
2331
+ cardinality_protection.inherit_from :global
2332
+ end
2333
+
2334
+ # OpenTelemetry: Higher limits OK (OTLP backends handle more)
2335
+ opentelemetry do
2336
+ cardinality_protection do
2337
+ inherit_from :global # OR override:
2338
+ # max_unique_values 1000 # OTLP backends (Datadog) handle more
2339
+ # protected_labels [:user_id, :order_id] # Subset of global
2340
+ end
2341
+ end
2342
+
2343
+ # Loki: Use global settings (label cardinality matters!)
2344
+ loki do
2345
+ cardinality_protection.inherit_from :global
2346
+ end
2347
+ end
2348
+ end
2349
+ ```
2350
+
2351
+ **Environment-specific examples:**
2352
+
2353
+ ```ruby
2354
+ # Production: Strict limits (cost-sensitive)
2355
+ # config/environments/production.rb
2356
+ E11y.configure do |config|
2357
+ config.cardinality_protection do
2358
+ enabled true
2359
+ max_unique_values 100
2360
+ protected_labels [:user_id, :order_id, :session_id, :tenant_id]
2361
+ end
2362
+ end
2363
+
2364
+ # Development: No limits (full visibility)
2365
+ # config/environments/development.rb
2366
+ E11y.configure do |config|
2367
+ config.cardinality_protection.enabled false
2368
+ end
2369
+
2370
+ # Staging: Moderate limits (balance cost vs debugging)
2371
+ # config/environments/staging.rb
2372
+ E11y.configure do |config|
2373
+ config.cardinality_protection do
2374
+ max_unique_values 500 # More than prod, less than unlimited
2375
+ end
2376
+
2377
+ # OTLP backend can handle even more
2378
+ config.adapters.opentelemetry do
2379
+ cardinality_protection.max_unique_values 1000
2380
+ end
2381
+ end
2382
+ ```
2383
+
2384
+ ### 8.4. Implementation: Apply to Yabeda + OpenTelemetry
2385
+
2386
+ **CardinalityFilter middleware (unified for all backends):**
2387
+
2388
+ ```ruby
2389
+ # lib/e11y/middleware/cardinality_filter.rb
2390
+ module E11y
2391
+ module Middleware
2392
+ class CardinalityFilter
2393
+ def initialize(app)
2394
+ @app = app
2395
+ end
2396
+
2397
+ def call(event)
2398
+ # Apply cardinality protection if enabled
2399
+ if E11y.config.cardinality_protection.enabled
2400
+ event.payload = filter_payload(event.payload)
2401
+ end
2402
+
2403
+ @app.call(event)
2404
+ end
2405
+
2406
+ private
2407
+
2408
+ def filter_payload(payload)
2409
+ filtered = payload.dup
2410
+ protected_labels = E11y.config.cardinality_protection.protected_labels
2411
+ max_unique = E11y.config.cardinality_protection.max_unique_values
2412
+
2413
+ protected_labels.each do |label|
2414
+ next unless filtered.key?(label)
2415
+
2416
+ original_value = filtered[label]
2417
+
2418
+ # Check if value exceeds cardinality limit
2419
+ if exceeds_limit?(label, original_value, max_unique)
2420
+ # Replace with [OTHER]
2421
+ filtered[label] = '[OTHER]'
2422
+
2423
+ # Track metric
2424
+ E11y::Metrics.increment('e11y.cardinality.filtered_labels', {
2425
+ label: label,
2426
+ backend: 'all' # Applies to all adapters
2427
+ })
2428
+
2429
+ # Log debug
2430
+ E11y.logger.debug do
2431
+ "Cardinality limit exceeded for #{label}: #{original_value} → [OTHER]"
2432
+ end
2433
+ else
2434
+ # Track unique value
2435
+ track_unique_value(label, original_value)
2436
+ end
2437
+ end
2438
+
2439
+ filtered
2440
+ end
2441
+
2442
+ def exceeds_limit?(label, value, max_unique_values)
2443
+ cache_key = "#{label}:#{value}"
2444
+
2445
+ # Check if value already tracked
2446
+ return false if unique_values_cache.key?(cache_key)
2447
+
2448
+ # Check if label already has max unique values
2449
+ label_cardinality = unique_values_cache.keys.count { |k| k.start_with?("#{label}:") }
2450
+ label_cardinality >= max_unique_values
2451
+ end
2452
+
2453
+ def track_unique_value(label, value)
2454
+ cache_key = "#{label}:#{value}"
2455
+ unique_values_cache[cache_key] = true
2456
+ end
2457
+
2458
+ def unique_values_cache
2459
+ @unique_values_cache ||= Concurrent::Map.new
2460
+ end
2461
+
2462
+ # Class method for adapter-specific overrides
2463
+ def self.filter(payload, max_unique_values:, protected_labels:)
2464
+ # Same logic as instance method, but with custom config
2465
+ # Used by adapters with per-backend overrides
2466
+ # (Implementation omitted for brevity)
2467
+ end
2468
+ end
2469
+ end
2470
+ end
2471
+ ```
2472
+
2473
+ **Yabeda adapter (uses filtered payload from middleware):**
2474
+
2475
+ ```ruby
2476
+ # lib/e11y/adapters/yabeda_collector.rb
2477
+ module E11y
2478
+ module Adapters
2479
+ class YabedaCollector < Base
2480
+ def send_batch(events)
2481
+ events.each do |event|
2482
+ # Payload already filtered by CardinalityFilter middleware
2483
+ # Just increment metrics
2484
+ Yabeda.e11y.events.increment({
2485
+ event_name: event.event_name,
2486
+ severity: event.severity,
2487
+ # HIGH-CARDINALITY labels already replaced with [OTHER]
2488
+ user_id: event.payload[:user_id], # ✅ Protected
2489
+ order_id: event.payload[:order_id] # ✅ Protected
2490
+ })
2491
+ end
2492
+ end
2493
+ end
2494
+ end
2495
+ end
2496
+ ```
2497
+
2498
+ **OpenTelemetry adapter (uses filtered payload from middleware):**
2499
+
2500
+ ```ruby
2501
+ # lib/e11y/adapters/opentelemetry_collector.rb
2502
+ module E11y
2503
+ module Adapters
2504
+ class OpenTelemetryCollector < Base
2505
+ def send_batch(events)
2506
+ events.each do |event|
2507
+ # Payload already filtered by CardinalityFilter middleware
2508
+ export_trace(event) if @export_traces
2509
+ end
2510
+ end
2511
+
2512
+ private
2513
+
2514
+ def export_trace(event)
2515
+ tracer = ::OpenTelemetry.tracer_provider.tracer('e11y')
2516
+
2517
+ tracer.in_span(event.event_name) do |span|
2518
+ # Set filtered attributes on span (cardinality protected!)
2519
+ event.payload.each do |key, value|
2520
+ span.set_attribute(key.to_s, value) # ✅ Already filtered
2521
+ end
2522
+
2523
+ # Add metadata
2524
+ span.set_attribute('event.name', event.event_name)
2525
+ span.set_attribute('event.severity', event.severity.to_s)
2526
+ span.set_attribute('event.timestamp', event.timestamp.iso8601)
2527
+ end
2528
+ end
2529
+ end
2530
+ end
2531
+ end
2532
+ ```
2533
+
2534
+ ### 8.5. Cost Impact: Before vs After Protection
2535
+
2536
+ **Scenario:** 10,000 orders/day, each with a unique `order_id`, placed by 5,000 distinct users (`user_id`)
2537
+
2538
+ **BEFORE C04 fix (cardinality unprotected):**
2539
+
2540
+ ```ruby
2541
+ # 10,000 orders/day
2542
+ 10_000.times do |i|
2543
+ Events::OrderCreated.track(
2544
+ order_id: "order-#{i}", # 10,000 unique values
2545
+ user_id: "user-#{i % 5000}", # 5,000 unique users
2546
+ amount: rand(10..500)
2547
+ )
2548
+ end
2549
+
2550
+ # Cost in OTLP backend (e.g., Datadog):
2551
+ # - Span attribute cardinality: order_id=10,000, user_id=5,000
2552
+ # - Datadog pricing: $0.10/span with high-cardinality attributes
2553
+ # - Daily cost: $0.10 × 10,000 = $1,000/day
2554
+ # - Monthly cost: $30,000/month ❌
2555
+ ```
2556
+
2557
+ **AFTER C04 fix (cardinality protected):**
2558
+
2559
+ ```ruby
2560
+ # Same events, but cardinality protection ENABLED
2561
+ E11y.configure do |config|
2562
+ config.cardinality_protection do
2563
+ enabled true
2564
+ max_unique_values 100
2565
+ protected_labels [:user_id, :order_id]
2566
+ end
2567
+ end
2568
+
2569
+ # 10,000 orders/day (same workload)
2570
+ 10_000.times do |i|
2571
+ Events::OrderCreated.track(
2572
+ order_id: "order-#{i}", # Filtered to 100 + [OTHER]
2573
+ user_id: "user-#{i % 5000}", # Filtered to 100 + [OTHER]
2574
+ amount: rand(10..500)
2575
+ )
2576
+ end
2577
+
2578
+ # Cost in OTLP backend (e.g., Datadog):
2579
+ # - Span attribute cardinality: order_id=101 (100 + [OTHER]), user_id=101
2580
+ # - Datadog pricing: $0.01/span (low-cardinality attributes)
2581
+ # - Daily cost: $0.01 × 10,000 = $100/day ✅
2582
+ # - Monthly cost: $3,000/month
2583
+ # - Monthly savings: $27,000 💰 (90% reduction!)
2584
+ ```
2585
+
2586
+ ### 8.6. Monitoring Metrics
2587
+
2588
+ **Key metrics for cardinality protection:**
2589
+
2590
+ ```ruby
2591
+ # 1. Filtered labels (cardinality protection triggered)
2592
+ E11y::Metrics.increment('e11y.cardinality.filtered_labels', {
2593
+ label: 'user_id',
2594
+ backend: 'all', # or 'yabeda', 'opentelemetry', 'loki'
2595
+ original_value_hash: Digest::SHA256.hexdigest(original_value)[0..7]
2596
+ })
2597
+
2598
+ # 2. Unique values tracked per label (current cardinality)
2599
+ E11y::Metrics.gauge('e11y.cardinality.unique_values',
2600
+ CardinalityFilter.unique_values_count(label),
2601
+ { label: label }
2602
+ )
2603
+
2604
+ # 3. Cardinality limit breaches (labels hitting max)
2605
+ E11y::Metrics.increment('e11y.cardinality.limit_breached', {
2606
+ label: label,
2607
+ max_unique_values: max_unique_values
2608
+ })
2609
+ ```
2610
+
2611
+ **Grafana dashboard queries:**
2612
+
2613
+ ```promql
2614
+ # Cardinality protection rate (% of labels filtered)
2615
+ rate(e11y_cardinality_filtered_labels_total[5m])
2616
+ /
2617
+ rate(e11y_events_tracked_total[5m]) * 100
2618
+
2619
+ # Labels at risk (approaching cardinality limit)
2620
+ e11y_cardinality_unique_values
2621
+ /
2622
+ 100 * 100 > 80 # 80% of max_unique_values (100)
2623
+
2624
+ # Top high-cardinality labels
2625
+ topk(10,
2626
+ sum by (label) (
2627
+ rate(e11y_cardinality_filtered_labels_total[1h])
2628
+ )
2629
+ )
2630
+
2631
+ # Cost savings estimate (assume $0.10 per unique span attribute)
2632
+ sum(rate(e11y_cardinality_filtered_labels_total[1d])) * 0.10
2633
+ # Result: Daily $ saved
2634
+ ```
2635
+
2636
+ **Alert rules:**
2637
+
2638
+ ```yaml
2639
+ # Alert if too many labels being filtered (config too strict?)
2640
+ - alert: E11yCardinalityHighFilterRate
2641
+ expr: |
2642
+ rate(e11y_cardinality_filtered_labels_total[5m])
2643
+ /
2644
+ rate(e11y_events_tracked_total[5m]) > 0.5
2645
+ for: 15m
2646
+ annotations:
2647
+ summary: "E11y filtering >50% of labels (cardinality config too strict?)"
2648
+
2649
+ # Alert if label approaching cardinality limit
2650
+ - alert: E11yCardinalityLimitApproaching
2651
+ expr: |
2652
+ e11y_cardinality_unique_values
2653
+ /
2654
+ 100 > 0.9
2655
+ for: 10m
2656
+ annotations:
2657
+ summary: "Label {{ $labels.label }} at 90% of cardinality limit (100 unique values)"
2658
+
2659
+ # Alert if cardinality protection disabled in production
2660
+ - alert: E11yCardinalityProtectionDisabled
2661
+ expr: |
2662
+ e11y_config_cardinality_protection_enabled{env="production"} == 0
2663
+ for: 5m
2664
+ annotations:
2665
+ summary: "⚠️ Cardinality protection DISABLED in production (cost risk!)"
2666
+ ```
2667
+
2668
+ ### 8.7. Trade-offs (C04 Resolution)
2669
+
2670
+ | Aspect | Pros | Cons | Mitigation |
2671
+ |--------|------|------|------------|
2672
+ | **Unified protection** | Consistent across all backends | One size doesn't fit all backends | Per-backend overrides (`inherit_from :global` or custom) |
2673
+ | **[OTHER] grouping** | Prevents cost explosion | Loses context for debugging | Log original values at debug level + query by `original_value_hash` |
2674
+ | **Global config** | Simple, DRY | May not fit all backend limits | Environment-specific: prod=100, staging=500, dev=unlimited |
2675
+ | **Middleware filtering** | Centralized, applies to all adapters | Performance overhead (filter per event) | Cache cardinality state (Concurrent::Map) |
2676
+ | **max_unique_values 100** | Conservative, safe for Prometheus | May be too strict for OTLP backends | Per-backend override: OTLP=1000, Yabeda=100 |
2677
+ | **protected_labels config** | Explicit control | Need to identify high-cardinality labels upfront | Monitor `limit_breached` metric, add labels incrementally |
2678
+
2679
+ ---
2680
+
2681
+ ## 9. Cost Metrics
2682
+
2683
+ ### 9.1. Cost Tracking
2684
+
2685
+ ```ruby
2686
+ # lib/e11y/cost/tracker.rb
2687
+ module E11y
2688
+ module Cost
2689
+ class Tracker
2690
+ # Estimated cost per adapter (per 1M events)
2691
+ ADAPTER_COSTS = {
2692
+ loki: 0.50, # $0.50/GB ≈ $0.50/1M events
2693
+ sentry: 10.00, # $0.01/event
2694
+ s3: 0.02, # $0.023/GB
2695
+ elasticsearch: 1.00
2696
+ }.freeze
2697
+
2698
+ def self.track_event_cost(event_data, adapters)
2699
+ # Estimate event size
2700
+ event_size_kb = estimate_size(event_data)
2701
+
2702
+ # Calculate cost per adapter
2703
+ adapters.each do |adapter|
2704
+ cost_per_mb = ADAPTER_COSTS[adapter] || 0
2705
+ cost = (event_size_kb / 1024.0) * cost_per_mb
2706
+
2707
+ E11y::Metrics.histogram('e11y.cost.per_event', cost, {
2708
+ adapter: adapter,
2709
+ event_name: event_data[:event_name]
2710
+ })
2711
+ end
2712
+ end
2713
+
2714
+ def self.estimate_size(event_data)
2715
+ event_data.to_json.bytesize / 1024.0 # KB
2716
+ end
2717
+ end
2718
+ end
2719
+ end
2720
+ ```
2721
+
2722
+ ### 9.2. Cost Dashboard Metrics
2723
+
2724
+ ```ruby
2725
+ E11y::Metrics.define do
2726
+ # Cost metrics
2727
+ histogram 'e11y.cost.per_event', 'Cost per event (USD)', [:adapter, :event_name]
2728
+ counter 'e11y.cost.total', 'Total cost (USD)', [:adapter]
2729
+
2730
+ # Savings metrics
2731
+ counter 'e11y.cost.saved.sampling', 'Cost saved by sampling', [:strategy]
2732
+ counter 'e11y.cost.saved.compression', 'Cost saved by compression', [:algorithm]
2733
+ counter 'e11y.cost.saved.routing', 'Cost saved by smart routing'
2734
+
2735
+ # Efficiency metrics
2736
+ histogram 'e11y.compression.ratio', 'Compression ratio', [:algorithm]
2737
+ histogram 'e11y.sampling.rate', 'Final sampling rate', [:event_name]
2738
+ histogram 'e11y.payload.size_reduction', 'Payload size reduction', [:event_name]
2739
+ end
2740
+ ```
2741
+
2742
+ ---
2743
+
2744
+ ## 10. Trade-offs
2745
+
2746
+ ### 10.1. Key Decisions
2747
+
2748
+ | Decision | Pro | Con | Rationale |
2749
+ |----------|-----|-----|-----------|
2750
+ | **Adaptive sampling** | 50-80% cost savings | Data loss risk | Errors always sampled |
2751
+ | **Cardinality protection (C04)** ⚠️ | 90% cost reduction in OTLP | Loses debugging context | [OTHER] grouping + debug logs |
2752
+ | **Gzip default** | 5:1 ratio, widely supported | CPU overhead | Best balance |
2753
+ | **retention_until tagging** | Simple, flexible | Downstream dependency | Clean separation |
2754
+ | **Smart routing** | 50% cost savings | Complex rules | Worth complexity |
2755
+ | **Payload minimization** | 20-30% size reduction | Data truncation | Configurable limits |
2756
+
2757
+ ### 10.2. Alternatives Considered
2758
+
2759
+ **A) No sampling (100% events)**
2760
+ - ❌ Rejected: Too expensive
2761
+
2762
+ **B) Fixed sampling rates**
2763
+ - ❌ Rejected: Not adaptive to load
2764
+
2765
+ **C) Manual retention management**
2766
+ - ❌ Rejected: Error-prone
2767
+
2768
+ **D) Event deduplication (60s window)**
2769
+ - ❌ Rejected for multiple critical reasons:
2770
+ - **High computational overhead**: Hash calculation + Redis lookup for EVERY event
2771
+ - **Memory cost**: 60K keys in Redis for 1000 events/sec (3.6GB memory)
2772
+ - **False positives**: Legitimate retries/bulk operations look like duplicates
2773
+ - **Debug confusion**: Users don't see events they expect (logs appear incomplete)
2774
+ - **Minimal real benefit**: Only ~5-10% actual duplicates in practice
2775
+ - **Better alternatives**: Adaptive sampling (80% reduction) + compression (80% bandwidth) achieve same cost goals without data loss
2776
+
2777
+ **E) Brotli compression**
2778
+ - ❌ Rejected: Slower than Zstd
2779
+
2780
+ ---
2781
+
2782
+ ## 11. Complete Configuration Example (Phase 2.8 - All Strategies) ✅
2783
+
2784
+ **Production-Ready Configuration with All 4 Adaptive Sampling Strategies:**
2785
+
2786
+ ```ruby
2787
+ # config/initializers/e11y.rb
2788
+ E11y.configure do |config|
2789
+ # ===================================================================
2790
+ # ADVANCED SAMPLING PIPELINE (Phase 2.8 - FEAT-4837)
2791
+ # ===================================================================
2792
+
2793
+ config.pipeline.use E11y::Middleware::Sampling,
2794
+ # Base sample rate (fallback)
2795
+ default_sample_rate: 0.1, # 10% default
2796
+
2797
+ # ✅ STRATEGY 1: Error-Based Adaptive Sampling (FEAT-4838)
2798
+ error_based_adaptive: true,
2799
+ error_spike_config: {
2800
+ window: 60, # 60 seconds sliding window
2801
+ absolute_threshold: 100, # 100 errors/min triggers spike
2802
+ relative_threshold: 3.0, # 3x normal rate triggers spike
2803
+ spike_duration: 300 # Keep 100% sampling for 5 minutes
2804
+ },
2805
+
2806
+ # ✅ STRATEGY 2: Load-Based Adaptive Sampling (FEAT-4842)
2807
+ load_based_adaptive: true,
2808
+ load_monitor_config: {
2809
+ window: 60,
2810
+ normal_threshold: 1_000, # < 1k events/sec = normal (100%)
2811
+ high_threshold: 10_000, # 10k events/sec = high (50%)
2812
+ very_high_threshold: 50_000, # 50k events/sec = very high (10%)
2813
+ overload_threshold: 100_000 # > 100k events/sec = overload (1%)
2814
+ }
2815
+
2816
+ # ✅ STRATEGY 3: Value-Based Sampling (FEAT-4846)
2817
+ # Configured per-event using `sample_by_value` DSL (see below)
2818
+
2819
+ # ✅ STRATEGY 4: Stratified Sampling (FEAT-4850, C11)
2820
+ # Automatic - no config needed, records sample rates for SLO correction
2821
+ end
2822
+
2823
+ # ===================================================================
2824
+ # EVENT-LEVEL VALUE-BASED SAMPLING (FEAT-4846)
2825
+ # ===================================================================
2826
+
2827
+ # Always sample high-value orders
2828
+ class Events::OrderPaid < E11y::Event::Base
2829
+ schema do
2830
+ required(:order_id).filled(:string)
2831
+ required(:amount).filled(:decimal)
2832
+ end
2833
+
2834
+ # Always sample orders over $1000
2835
+ sample_by_value field: "amount",
2836
+ operator: :greater_than,
2837
+ threshold: 1000,
2838
+ sample_rate: 1.0
2839
+ end
2840
+
2841
+ # Sample important user segments
2842
+ class Events::UserAction < E11y::Event::Base
2843
+ schema do
2844
+ required(:action).filled(:string)
2845
+ required(:user_segment).filled(:string)
2846
+ end
2847
+
2848
+ # Always sample enterprise/VIP users
2849
+ sample_by_value field: "user_segment",
2850
+ operator: :equals,
2851
+ threshold: "enterprise",
2852
+ sample_rate: 1.0
2853
+ end
2854
+
2855
+ # ===================================================================
2856
+ # SLO TRACKING WITH SAMPLING CORRECTION (FEAT-4850, C11)
2857
+ # ===================================================================
2858
+
2859
+ config.slo do
2860
+ enabled true
2861
+ enable_sampling_correction true # ✅ Automatic correction for accurate SLO
2862
+ end
2863
+ ```
2864
+
2865
+ **How the Strategies Work Together (Precedence Order):**
2866
+
2867
+ 1. **Error Spike Detection** (Highest Priority - FEAT-4838):
2868
+ - If error spike detected → 100% sampling for ALL events
2869
+ - Overrides all other strategies during spike
2870
+
2871
+ 2. **Value-Based Sampling** (High Priority - FEAT-4846):
2872
+ - If event has `sample_by_value` config and value meets criteria → 100% sampling
2873
+ - Overrides load-based sampling
2874
+
2875
+ 3. **Load-Based Sampling** (Base Rate - FEAT-4842):
2876
+ - Provides "base rate" based on system load (100% / 50% / 10% / 1%)
2877
+ - Can be further restricted by event-level `resolve_sample_rate`
2878
+
2879
+ 4. **Stratified Sampling** (Background - FEAT-4850):
2880
+ - Records sample rate metadata for each event
2881
+ - Enables SLO calculation with correction factors
2882
+ - No impact on sampling decisions (only metadata)
2883
+
2884
+ **Example Scenario (All Strategies Active):**
2885
+
2886
+ ```ruby
2887
+ # Normal conditions: 500 events/sec, 1% error rate
2888
+ # → Load: normal (< 1k) → base rate 100%
2889
+ # → Error spike: NO → base rate unchanged
2890
+ # → Final: 100% sampling (unless event-level override)
2891
+
2892
+ # Sudden traffic spike: 120k events/sec, 2% error rate
2893
+ # → Load: overload (> 100k) → base rate 1%
2894
+ # → Error spike: NO (< 5% threshold) → base rate unchanged
2895
+ # → Final: 1% sampling (cost protection!)
2896
+
2897
+ # Error spike during normal load: 500 events/sec, 10% error rate
2898
+ # → Load: normal → base rate 100%
2899
+ # → Error spike: YES (10% > 5%) → override to 100%
2900
+ # → Final: 100% sampling (debug priority!)
2901
+
2902
+ # High-value order during overload: 120k events/sec, 1% errors, order=$5000
2903
+ # → Load: overload → base rate 1%
2904
+ # → Error spike: NO
2905
+ # → Value-based: amount > $1000 → override to 100%
2906
+ # → Final: 100% sampling (business-critical event!)
2907
+ ```
2908
+
2909
+ **Old Configuration (Conceptual, Pre-Phase 2.8):**
2910
+
2911
+ ```ruby
2912
+ E11y.configure do |config|
2913
+ config.cost_optimization do
2914
+ # Adaptive sampling
2915
+ adaptive_sampling do
2916
+ error_based { enabled true }
2917
+ load_based { enabled true; max_events_per_sec 10_000 }
2918
+ value_based do
2919
+ enabled true
2920
+ high_value_patterns [/^payment\./, /^order\./]
2921
+ low_value_patterns [/^debug\./, /^health_check/]
2922
+ end
2923
+ end
2924
+
2925
+ # Payload minimization
2926
+ payload_minimization do
2927
+ enabled true
2928
+ truncate_strings_at 1000 # Max 1KB per string field
2929
+ truncate_arrays_at 100 # Max 100 items per array
2930
+ remove_null_fields true # Drop null/empty fields
2931
+ end
2932
+
2933
+ # Compression
2934
+ compression do
2935
+ enabled true
2936
+ algorithm :gzip # or :zstd, :lz4
2937
+ compression_level Zlib::DEFAULT_COMPRESSION
2938
+ min_size_bytes 1024
2939
+ end
2940
+
2941
+ # Smart routing
2942
+ smart_routing do
2943
+ rule 'critical', adapters: [:loki, :sentry] do |e|
2944
+ e[:severity] >= :error
2945
+ end
2946
+ rule 'debug', adapters: [:file] do |e|
2947
+ e[:severity] == :debug
2948
+ end
2949
+ end
2950
+
2951
+ # Tiered storage
2952
+ tiered_storage do
2953
+ retention_rule(2555) { |e| e[:event_name].start_with?('audit.') }
2954
+ retention_rule(90) { |e| e[:severity] >= :error }
2955
+ retention_rule(7) { |e| e[:severity] == :debug }
2956
+ default_retention 30
2957
+ end
2958
+
2959
+ # Payload minimization
2960
+ payload_minimization do
2961
+ enabled true
2962
+ remove_nulls true
2963
+ truncate_strings_at 1000
2964
+ truncate_arrays_at 100
2965
+ end
2966
+
2967
+ # Cost tracking
2968
+ cost_tracking do
2969
+ enabled true
2970
+ adapter_costs do
2971
+ loki 0.50
2972
+ sentry 10.00
2973
+ s3 0.02
2974
+ end
2975
+ end
2976
+ end
2977
+ end
2978
+ ```
2979
+
2980
+ ---
2981
+
2982
+ ## 12. Future Enhancements
2983
+
2984
+ See [Backlog](use_cases/backlog.md) for future enhancement ideas including:
2985
+ - Quick Start Presets (v1.1)
2986
+ - Sampling Budget (v1.2+)
2987
+
2988
+ ---
2989
+
2990
+ **Status:** ✅ Draft Complete
2991
+ **Next:** ADR-003 (SLO) or ADR-007 (OpenTelemetry)
2992
+ **Estimated Implementation:** 2-3 weeks
2993
+ **Expected ROI:** 50-80% cost reduction