e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,728 @@
1
+ # UC-004: Zero-Config SLO Tracking
2
+
3
+ **Status:** Core Feature (Phase 3)
4
+ **Complexity:** Intermediate
5
+ **Setup Time:** 5 minutes (one line of config!)
6
+ **Target Users:** DevOps, SRE, Engineering Managers
7
+
8
+ ---
9
+
10
+ ## 📋 Overview
11
+
12
+ ### Problem Statement
13
+
14
+ **Current SLO Tracking:**
15
+ - Manual instrumentation (middleware, metrics, alerts)
16
+ - Complex setup (Prometheus exporters, PromQL, Grafana dashboards)
17
+ - Time investment: 1-2 weeks for proper SLO monitoring
18
+ - Maintenance burden: keep dashboards/alerts updated
19
+
20
+ ### E11y Solution
21
+
22
+ **One line of config → full SLO monitoring:**
23
+ ```ruby
24
+ E11y.configure { |config| config.slo_tracking = true }
25
+ ```
26
+
27
+ **Result:**
28
+ - ✅ HTTP request metrics (availability, latency)
29
+ - ✅ Background job metrics (success rate, duration)
30
+ - ✅ Auto-generated Grafana dashboards
31
+ - ✅ Auto-generated Prometheus alerts
32
+
33
+ ---
34
+
35
+ ## 🎯 Configuration
36
+
37
+ > **Implementation:** See [ADR-003 Section 3: Multi-Level SLO Strategy](../ADR-003-slo-observability.md#3-multi-level-slo-strategy) and [Section 4: Per-Endpoint SLO Configuration](../ADR-003-slo-observability.md#4-per-endpoint-slo-configuration) for detailed architecture.
38
+
39
+ ### Minimal Setup (5 seconds)
40
+
41
+ ```ruby
42
+ # config/initializers/e11y.rb
43
+ E11y.configure do |config|
44
+ config.slo_tracking = true # That's it!
45
+ end
46
+ ```
47
+
48
+ **Auto-enabled:**
49
+ - Rack middleware (HTTP requests)
50
+ - Sidekiq middleware (background jobs)
51
+ - ActiveJob instrumentation
52
+ - Prometheus metrics export
53
+
54
+ ---
55
+
56
+ ### Production Setup (5 minutes)
57
+
58
+ ```ruby
59
+ E11y.configure do |config|
60
+ config.slo_tracking = true
61
+
62
+ config.slo do
63
+ # Ignore non-user-facing endpoints
64
+ controller 'HealthController' do
65
+ ignore true
66
+ end
67
+
68
+ controller 'MetricsController' do
69
+ ignore true
70
+ end
71
+
72
+ # Admin endpoints: different SLO
73
+ controller 'Admin::BaseController' do
74
+ ignore true # Or set lenient targets
75
+ end
76
+
77
+ # Critical endpoints: strict SLO
78
+ controller 'Api::OrdersController', action: 'create' do
79
+ latency_target_p95 200 # ms
80
+ end
81
+
82
+ # Long-running jobs: exclude from SLO
83
+ job 'ReportGenerationJob' do
84
+ ignore true
85
+ end
86
+ end
87
+ end
88
+ ```
89
+
90
+ ---
91
+
92
+ ## 📊 Auto-Generated Metrics
93
+
94
+ > **Implementation:** See [ADR-003 Section 3.1: Application-Wide SLO](../ADR-003-slo-observability.md#31-level-1-application-wide-slo-zero-config) for automatic metric generation architecture.
95
+
96
+ ### HTTP Metrics
97
+
98
+ ```promql
99
+ # Request count by status
100
+ yabeda_slo_http_requests_total{controller="OrdersController",action="create",status="200"}
101
+
102
+ # Latency histogram
103
+ yabeda_slo_http_request_duration_seconds{controller="OrdersController",action="create"}
104
+
105
+ # Availability (derived)
106
+ 100 * (
107
+ sum(rate(yabeda_slo_http_requests_total{status=~"2..|3.."}[30d])) /
108
+ sum(rate(yabeda_slo_http_requests_total[30d]))
109
+ )
110
+ ```
111
+
112
+ ### Background Job Metrics
113
+
114
+ ```promql
115
+ # Job success/failure
116
+ yabeda_slo_sidekiq_jobs_total{class="ProcessOrderJob",status="success"}
117
+ yabeda_slo_sidekiq_jobs_total{class="ProcessOrderJob",status="failed"}
118
+
119
+ # Job duration
120
+ yabeda_slo_sidekiq_job_duration_seconds{class="ProcessOrderJob"}
121
+ ```
122
+
123
+ ---
124
+
125
+ ## 📐 Sampling Correction for Accurate SLO (C11 Resolution) ⚠️ CRITICAL
126
+
127
+ **Reference:** [ADR-009 Section 3.7: Stratified Sampling for SLO Accuracy (C11 Resolution)](../ADR-009-cost-optimization.md#37-stratified-sampling-for-slo-accuracy-c11-resolution) and [CONFLICT-ANALYSIS.md C11](../researches/CONFLICT-ANALYSIS.md#c11-adaptive-sampling--slo-tracking)
128
+
129
+ ### Problem: Sampling Bias Breaks SLO Metrics
130
+
131
+ When E11y uses **adaptive sampling** to reduce costs (dropping 90% of events), **naive SLO calculations become inaccurate** because sampling is not uniform across success and error events.
132
+
133
+ **Example - Inaccurate Success Rate:**
134
+
135
+ ```ruby
136
+ # Real production traffic (1000 requests):
137
+ # - 950 success (HTTP 200) → 95% success rate ✅ TRUE
138
+ # - 50 errors (HTTP 500) → 5% error rate
139
+
140
+ # With random sampling (10% sample rate):
141
+ # - 95 success observed (10% of 950)
142
+ # - 5 errors observed (10% of 50)
143
+ # Total: 100 events observed
144
+
145
+ # Naive SLO calculation (without correction):
146
+ success_rate = 95 / (95 + 5) = 0.95 # 95% ✅ CORRECT (by luck!)
147
+
148
+ # But if sampling is biased (more success dropped than errors):
149
+ # - 85 success observed (9% of 950 - unlucky!)
150
+ # - 5 errors observed (10% of 50)
151
+ # Total: 90 events
152
+
153
+ # Naive calculation:
154
+ success_rate = 85 / (85 + 5) = 0.944 # 94.4% ❌ WRONG! (Should be 95%)
155
+ ```
156
+
157
+ **Impact:**
158
+ - ❌ **False SLO alerts:** Dashboard shows 94.4% (failing SLO) when true rate is 95% (passing)
159
+ - ❌ **Wrong business decisions:** Acting on inaccurate metrics
160
+ - ❌ **Lost trust:** Teams stop believing SLO dashboard
161
+
162
+ ### Solution: Stratified Sampling + Correction Math
163
+
164
+ E11y uses **stratified sampling** (keep 100% of errors, sample 10% of success) and **sampling correction** to restore accurate SLO metrics.
165
+
166
+ **Correction Formula:**
167
+
168
+ ```ruby
169
+ # For each severity stratum (errors, warnings, success):
170
+ corrected_count = observed_count × (1 / sample_rate)
171
+
172
+ # Example:
173
+ # - Errors: observed=50, sample_rate=1.0 → corrected=50 × 1 = 50 ✅
174
+ # - Success: observed=95, sample_rate=0.1 → corrected=95 × 10 = 950 ✅
175
+
176
+ # Corrected success rate:
177
+ corrected_success_rate = (corrected_success + corrected_warnings) /
178
+ (corrected_success + corrected_warnings + corrected_errors)
179
+ = (950 + 0) / (950 + 0 + 50)
180
+ = 950 / 1000
181
+ = 0.95 # 95% ✅ ACCURATE!
182
+ ```
183
+
184
+ ### SLO Calculator with Sampling Correction
185
+
186
+ **E11y automatically applies correction** when calculating SLO metrics:
187
+
188
+ ```ruby
189
+ # lib/e11y/slo/calculator.rb
190
module E11y
  module SLO
    # Derives SLO figures from events that survived sampling.
    #
    # Every event is expected to be a Hash shaped like
    #   { metadata: { sampling_stratum: Symbol, sampling_rate: Numeric },
    #     payload:  { duration_ms: Numeric } }
    # and is re-weighted by the inverse of the rate it was sampled at, so a
    # stratum kept at 10% counts ten times per stored event.
    class Calculator
      # Calculates the success rate with sampling correction.
      #
      # Only the :success, :warnings and :errors strata contribute to the
      # totals; warnings count as successes. Any other stratum still appears
      # in the breakdown but does not influence the rates.
      #
      # @param events [Array<Hash>] sampled events (see class comment)
      # @return [Hash] with keys :success_rate, :error_rate, :breakdown,
      #   :total_corrected_events and :sampling_correction_applied.
      #   :success_rate and :error_rate are nil when no event belongs to a
      #   counted stratum, because the ratio is undefined without traffic.
      # @raise [ArgumentError] if an event's sampling_rate is not in (0, 1]
      def calculate_success_rate(events)
        breakdown = events
                    .group_by { |event| event[:metadata][:sampling_stratum] }
                    .transform_values { |stratum_events| correct_stratum(stratum_events) }

        corrected_success = breakdown.dig(:success, :corrected) || 0
        corrected_warnings = breakdown.dig(:warnings, :corrected) || 0
        corrected_errors = breakdown.dig(:errors, :corrected) || 0
        total = corrected_success + corrected_warnings + corrected_errors

        # Dividing by a zero total would yield NaN, which silently poisons
        # comparisons and cannot be serialized to JSON; report nil instead.
        has_traffic = total.positive?

        {
          success_rate: has_traffic ? (corrected_success + corrected_warnings) / total.to_f : nil,
          error_rate: has_traffic ? corrected_errors / total.to_f : nil,
          breakdown: breakdown,
          total_corrected_events: total,
          sampling_correction_applied: true
        }
      end

      # Calculates the P99 latency with sampling correction.
      #
      # Each latency carries an integer weight of (1 / sampling_rate).round.
      # The result is the value found at index ceil(total_weight * 0.99) - 1
      # of the sorted list in which every latency is repeated `weight` times;
      # the list is never materialised, a cumulative-weight scan finds the
      # same element.
      #
      # @param events [Array<Hash>] sampled events (see class comment)
      # @return [Numeric, nil] the P99 of payload[:duration_ms], or nil when
      #   no event carries a duration
      # @raise [ArgumentError] if a timed event's sampling_rate is not in (0, 1]
      def calculate_p99_latency(events)
        weighted_latencies = events
                             .reject { |event| event[:payload][:duration_ms].nil? }
                             .map do |event|
          [event[:payload][:duration_ms], (1.0 / validated_sample_rate(event)).round]
        end

        total_weight = weighted_latencies.sum { |_latency, weight| weight }
        return nil if total_weight.zero?

        target_rank = (total_weight * 0.99).ceil
        cumulative_weight = 0

        weighted_latencies.sort_by(&:first).each do |latency, weight|
          cumulative_weight += weight
          return latency if cumulative_weight >= target_rank
        end

        nil
      end

      private

      # Builds the breakdown entry for one stratum. Events are grouped by
      # their own sampling rate so a stratum whose rate changed over time is
      # still corrected properly.
      def correct_stratum(stratum_events)
        events_by_rate = stratum_events.group_by { |event| validated_sample_rate(event) }
        weighted_count = events_by_rate.sum { |rate, members| members.size * (1.0 / rate) }
        observed = stratum_events.size

        {
          observed: observed,
          corrected: weighted_count.round,
          # One rate in the stratum: report it as is. Several rates: report
          # the effective rate (observed / estimated real count).
          sample_rate: events_by_rate.size == 1 ? events_by_rate.keys.first : observed / weighted_count
        }
      end

      # Returns the event's sampling rate, rejecting values that cannot be
      # inverted into a meaningful correction factor.
      def validated_sample_rate(event)
        rate = event[:metadata][:sampling_rate]
        return rate if rate.is_a?(Numeric) && rate.positive? && rate <= 1

        raise ArgumentError, "sampling_rate must be within (0, 1], got #{rate.inspect}"
      end
    end
  end
end
259
+ ```
260
+
261
+ **Usage:**
262
+
263
+ ```ruby
264
+ # SLO calculation automatically applies correction
265
+ calculator = E11y::SLO::Calculator.new
266
+ result = calculator.calculate_success_rate(events)
267
+
268
+ puts result[:success_rate] # => 0.95 (95% - accurate!)
269
+ puts result[:breakdown]
270
+ # => {
271
+ # errors: { observed: 50, corrected: 50, sample_rate: 1.0 },
272
+ # success: { observed: 95, corrected: 950, sample_rate: 0.1 }
273
+ # }
274
+ ```
275
+
276
+ ### Accuracy Comparison: With vs Without Correction
277
+
278
+ | Scenario | True Success Rate | Naive Calculation | With Correction | Error |
279
+ |----------|-------------------|-------------------|-----------------|-------|
280
+ | **Uniform sampling** | 95.0% | 95.0% | 95.0% | 0.0% ✅ |
281
+ | **Stratified (errors 100%, success 10%)** | 95.0% | 65.5% ❌ | 95.0% ✅ | -29.5% |
282
+ | **High error rate (10%)** | 90.0% | 47.4% ❌ | 90.0% ✅ | -42.6% |
283
+ | **Very high error rate (50%)** | 50.0% | 9.1% ❌ | 50.0% ✅ | -40.9% |
284
+
285
+ **Key Insight:**
286
+ Without sampling correction, **stratified sampling makes the naive SLO calculation severely inaccurate** (more than 40 percentage points off in the rows above, which keep 100% of errors and 10% of successes), because errors are over-represented among the stored events. With correction, accuracy is maintained regardless of error rate.
287
+
288
+ ### Configuration
289
+
290
+ ```ruby
291
+ # config/initializers/e11y.rb
292
+ E11y.configure do |config|
293
+ config.slo_tracking = true
294
+
295
+ # Stratified sampling for accurate SLO
296
+ config.cost_optimization do
297
+ sampling do
298
+ strategy :stratified_adaptive # ✅ Use stratified sampler
299
+
300
+ stratification do
301
+ # Stratum 1: Errors (always keep - 100% accuracy)
302
+ stratum :errors do
303
+ severities [:error, :fatal]
304
+ http_statuses (500..599).to_a
305
+ sample_rate 1.0 # 100% - never drop errors!
306
+ end
307
+
308
+ # Stratum 2: Warnings (medium sampling)
309
+ stratum :warnings do
310
+ severities [:warn]
311
+ http_statuses (400..499).to_a
312
+ sample_rate 0.5 # 50%
313
+ end
314
+
315
+ # Stratum 3: Success (aggressive sampling - 90% cost savings)
316
+ stratum :success do
317
+ severities [:info, :debug, :success]
318
+ http_statuses (200..399).to_a
319
+ sample_rate 0.1 # 10% - drop 90%!
320
+ end
321
+ end
322
+
323
+ # SLO calculation with automatic correction
324
+ slo_correction do
325
+ enabled true # ✅ Apply sampling correction
326
+
327
+ # Verify correction accuracy (alert if off by >1%)
328
+ verify_accuracy true
329
+ alert_threshold 0.01 # 1% error tolerance
330
+ end
331
+ end
332
+ end
333
+ end
334
+ ```
335
+
336
+ ### Monitoring Correction Accuracy
337
+
338
+ E11y exposes metrics to monitor sampling correction accuracy:
339
+
340
+ ```promql
341
+ # Grafana dashboard queries:
342
+
343
+ # 1. Correction factor by stratum
344
+ yabeda_e11y_slo_correction_factor{stratum="success"}
345
+ # => 10.0 (10% sample rate → 10x correction)
346
+
347
+ yabeda_e11y_slo_correction_factor{stratum="errors"}
348
+ # => 1.0 (100% sample rate → no correction)
349
+
350
+ # 2. Correction error rate (should be < 1%)
351
+ yabeda_e11y_slo_correction_error_rate
352
+ # => 0.001 (0.1% error - within tolerance ✅)
353
+
354
+ # 3. SLO accuracy drift alert
355
+ # Alert if correction error > 1%
356
+ ALERTS[yabeda_e11y_slo_correction_error_rate > 0.01]
357
+ ```
358
+
359
+ **Alert example:**
360
+
361
+ ```yaml
362
+ # prometheus/alerts/e11y_slo.yml
363
+ - alert: E11ySLOCorrectionInaccurate
364
+ expr: yabeda_e11y_slo_correction_error_rate > 0.01
365
+ for: 10m
366
+ annotations:
367
+ summary: "E11y SLO correction error > 1% (stratified sampling may be misconfigured)"
368
+ description: "Expected success rate: {{ $labels.expected }}, Actual: {{ $labels.actual }}, Error: {{ $value }}"
369
+ ```
370
+
371
+ ### Cost Savings vs Accuracy Trade-off
372
+
373
+ | Sampling Strategy | Success Sample Rate | Cost Savings | SLO Accuracy | Recommendation |
374
+ |-------------------|---------------------|--------------|--------------|----------------|
375
+ | **No sampling** | 100% | 0% | 100% | ❌ Expensive |
376
+ | **Random 50%** | 50% | 50% | ~95% | ⚠️ Inaccurate |
377
+ | **Stratified 50%** | 50% (errors 100%) | 50% | 99.9% ✅ | ✅ Balanced |
378
+ | **Stratified 10%** | 10% (errors 100%) | **90%** | 99.9% ✅ | ✅ **Best** |
379
+ | **Stratified 1%** | 1% (errors 100%) | 99% | 95% | ⚠️ Too aggressive |
380
+
381
+ **Recommendation:** Use **stratified sampling with 10% success sample rate** for optimal cost savings (90%) while maintaining SLO accuracy (99.9%).
382
+
383
+ ### Testing Sampling Correction
384
+
385
+ ```ruby
386
+ # spec/e11y/slo/calculator_spec.rb
387
# Checks that the calculator restores the true success rate from events that
# went through stratified sampling, and contrasts it with the uncorrected ratio.
RSpec.describe E11y::SLO::Calculator do
  describe '#calculate_success_rate' do
    context 'with stratified sampling (errors 100%, success 10%)' do
      it 'applies sampling correction for accurate SLO' do
        # Simulate observed events after sampling:
        # - 50 errors (100% sample rate)
        # - 95 success (10% sample rate)
        events = []

        # Errors (observed: 50, corrected: 50)
        50.times do
          events << build_event(
            severity: :error,
            metadata: { sampling_stratum: :errors, sampling_rate: 1.0 }
          )
        end

        # Success (observed: 95, corrected: 950)
        95.times do
          events << build_event(
            severity: :info,
            metadata: { sampling_stratum: :success, sampling_rate: 0.1 }
          )
        end

        # Calculate SLO with correction
        calculator = described_class.new
        result = calculator.calculate_success_rate(events)

        # Expected corrected success rate: 95%
        # (950 success / 1000 total = 0.95)
        expect(result[:success_rate]).to be_within(0.001).of(0.95)
        expect(result[:error_rate]).to be_within(0.001).of(0.05)
        expect(result[:total_corrected_events]).to eq(1000)

        # Verify breakdown
        expect(result[:breakdown][:success][:observed]).to eq(95)
        expect(result[:breakdown][:success][:corrected]).to eq(950)
        expect(result[:breakdown][:errors][:observed]).to eq(50)
        expect(result[:breakdown][:errors][:corrected]).to eq(50)
      end
    end

    context 'without sampling correction (naive calculation)' do
      it 'produces inaccurate SLO metrics' do
        # Same observed counts as above: 95 successes + 50 errors (145 events).
        # Naive calculation (no correction):
        naive_success_rate = 95 / (95 + 50).to_f
        # => 0.6552 (65.5%) ❌ WRONG! (True rate is 95%)

        # 95 / 145.0 is 0.65517..., so compare with a tolerance, never with eq.
        expect(naive_success_rate).to be_within(0.001).of(0.655)
        expect(naive_success_rate).not_to be_within(0.05).of(0.95)
        # ❌ 29.5 percentage points off! (Completely useless for SLO)
      end
    end
  end
end
446
+ ```
447
+
448
+ ### Summary: SLO Accuracy Guarantees
449
+
450
+ With stratified sampling + sampling correction, E11y provides:
451
+
452
+ ✅ **Error rate accuracy: 100%**
453
+ All errors captured (sample rate 1.0) → no error data loss.
454
+
455
+ ✅ **Success rate accuracy: 99.9%**
456
+ Sampling correction restores true success rate (±0.1% error).
457
+
458
+ ✅ **Latency percentiles accuracy: 95%**
459
+ Latency correction (duplicate by factor) preserves percentile distribution.
460
+
461
+ ✅ **Cost savings: 90%**
462
+ 10% success sample rate → 90% reduction in events stored.
463
+
464
+ **Trade-off:**
465
+ Sampling correction adds ~0.1ms CPU overhead per SLO query (negligible compared to 90% cost savings).
466
+
467
+ ---
468
+
469
+ ## 🎨 Auto-Generated Dashboards
470
+
471
+ > **Implementation:** See [ADR-003 Section 8.1: Per-Endpoint Grafana Dashboard](../ADR-003-slo-observability.md#81-per-endpoint-grafana-dashboard) for dashboard architecture and templates.
472
+
473
+ ### Generate Grafana Dashboard
474
+
475
+ ```bash
476
+ # One command generates full dashboard JSON
477
+ rails g e11y:grafana_dashboard
478
+
479
+ # Output: config/grafana/e11y_slo_dashboard.json
480
+ ```
481
+
482
+ **Dashboard includes:**
483
+ - HTTP availability (99.9% target)
484
+ - HTTP p95/p99 latency
485
+ - Error rate by endpoint
486
+ - Background job success rate
487
+ - SLO compliance score
488
+
489
+ **Import to Grafana:**
490
+ ```bash
491
+ # Option 1: Manual import (dashboard JSON)
492
+ # Grafana UI → Dashboards → Import → Upload JSON
493
+
494
+ # Option 2: Terraform (infrastructure as code)
495
+ resource "grafana_dashboard" "e11y_slo" {
496
+ config_json = file("config/grafana/e11y_slo_dashboard.json")
497
+ }
498
+ ```
499
+
500
+ ---
501
+
502
+ ## 🚨 Auto-Generated Alerts
503
+
504
+ > **Implementation:** See [ADR-003 Section 5: Multi-Window Multi-Burn Rate Alerts](../ADR-003-slo-observability.md#5-multi-window-multi-burn-rate-alerts) for Google SRE best practice alert architecture.
505
+
506
+ ### Generate Prometheus Alerts
507
+
508
+ ```bash
509
+ rails g e11y:prometheus_alerts
510
+
511
+ # Output: config/prometheus/e11y_slo_alerts.yml
512
+ ```
513
+
514
+ **Alerts include:**
515
+ - High error rate (>1%)
516
+ - Low availability (<99.9%)
517
+ - High latency (p95 >200ms)
518
+ - Job failure rate (>5%)
519
+
520
+ **Example alerts.yml:**
521
+ ```yaml
522
+ groups:
523
+ - name: e11y_slo
524
+ rules:
525
+ - alert: HighErrorRate
526
+ expr: |
527
+ (
528
+ sum(rate(yabeda_slo_http_requests_total{status=~"5.."}[5m])) /
529
+ sum(rate(yabeda_slo_http_requests_total[5m]))
530
+ ) > 0.01
531
+ for: 5m
532
+ annotations:
533
+ summary: "HTTP error rate >1%"
534
+
535
+ - alert: HighLatency
536
+ expr: histogram_quantile(0.95, rate(yabeda_slo_http_request_duration_seconds_bucket[5m])) > 0.2
537
+ for: 5m
538
+ annotations:
539
+ summary: "HTTP p95 latency >200ms"
540
+ ```
541
+
542
+ ---
543
+
544
+ ## 🎯 Error Budget Management
545
+
546
+ > **Implementation:** See [ADR-003 Section 7: Error Budget Management](../ADR-003-slo-observability.md#7-error-budget-management) for detailed architecture and deployment gates.
547
+
548
+ **Track your SLO error budget in real-time:**
549
+
550
+ ```ruby
551
+ # Query error budget for any endpoint
552
+ budget = E11y::SLO::ErrorBudget.new('OrdersController', 'create', slo_config)
553
+
554
+ budget.total # => 0.001 (0.1% for 99.9% target)
555
+ budget.consumed # => 0.0005 (50% of budget used)
556
+ budget.remaining # => 0.0005 (50% of budget left)
557
+ budget.percent_consumed # => 50.0
558
+ budget.exhausted? # => false
559
+ budget.time_until_exhaustion # => 14.5 days (at current burn rate)
560
+ ```
561
+
562
+ ### Deployment Gate (Optional)
563
+
564
+ **Prevent deployments when error budget is exhausted:**
565
+
566
+ ```ruby
567
+ # config/initializers/e11y.rb
568
+ E11y.configure do |config|
569
+ config.slo do
570
+ error_budget do
571
+ # Block deployments if <20% budget remaining
572
+ deployment_gate enabled: true, minimum_budget_percent: 20
573
+ end
574
+ end
575
+ end
576
+ ```
577
+
578
+ **CI/CD integration:**
579
+
580
+ ```bash
581
+ # Before deployment, check error budget
582
+ rails e11y:slo:check_budget
583
+
584
+ # Exit code 0: ✅ Budget available, deploy
585
+ # Exit code 1: ❌ Budget exhausted, block deploy
586
+ ```
587
+
588
+ **Example output:**
589
+
590
+ ```
591
+ Checking SLO Error Budget...
592
+
593
+ OrdersController#create:
594
+ ✅ Budget: 75% remaining (Target: 99.9%, Actual: 99.975%)
595
+
596
+ PaymentsController#process:
597
+ ❌ Budget: 5% remaining (Target: 99.95%, Actual: 99.9525%)
598
+ ⚠️ DEPLOYMENT BLOCKED: Error budget below 20% threshold
599
+
600
+ Overall: ❌ FAILED
601
+ Cannot deploy: 1 endpoint(s) below minimum error budget
602
+ ```
603
+
604
+ ---
605
+
606
+ ## 🔍 SLO Config Validation
607
+
608
+ > **Implementation:** See [ADR-003 Section 6: SLO Config Validation & Linting](../ADR-003-slo-observability.md#6-slo-config-validation--linting) for validator architecture and edge cases.
609
+
610
+ **Validate your SLO configuration before deployment:**
611
+
612
+ ```bash
613
+ # Validate slo.yml file
614
+ rails e11y:slo:validate
615
+
616
+ # Output:
617
+ # ✅ Version: 1 (valid)
618
+ # ✅ Schema structure: valid
619
+ # ✅ All endpoints exist in routes (12 endpoints checked)
620
+ # ✅ All jobs exist in Sidekiq (3 jobs checked)
621
+ # ✅ SLO targets: valid (99.9%, 200ms p95)
622
+ # ⚠️ Warning: OrdersController#show has no latency target (using default 200ms)
623
+ #
624
+ # Validation: PASSED (0 errors, 1 warning)
625
+ ```
626
+
627
+ ### CI/CD Integration
628
+
629
+ **Catch configuration errors before deploy:**
630
+
631
+ ```yaml
632
+ # .github/workflows/ci.yml
633
+ name: CI
634
+ on: [push]
635
+ jobs:
636
+ slo-validation:
637
+ runs-on: ubuntu-latest
638
+ steps:
639
+ - uses: actions/checkout@v3
640
+ - name: Validate SLO Config
641
+ run: bundle exec rails e11y:slo:validate --strict
642
+ # --strict flag: warnings become errors
643
+ ```
644
+
645
+ ### Common Validation Errors
646
+
647
+ ```ruby
648
+ # ❌ ERROR: Endpoint doesn't exist in routes
649
+ endpoint 'OrdersController', action: 'destroy' do
650
+ latency_target_p95 200
651
+ end
652
+ # Fix: Ensure route exists or remove from slo.yml
653
+
654
+ # ❌ ERROR: Invalid SLO target (must be 0.0-1.0)
655
+ availability_target 99.9 # ❌ Should be 0.999, not 99.9
656
+ availability_target 0.999 # ✅ Correct
657
+
658
+ # ❌ ERROR: Job class doesn't exist
659
+ job 'NonExistentJob' do
660
+ success_rate_target 0.99
661
+ end
662
+ # Fix: Ensure job class is loaded or remove from config
663
+
664
+ # ⚠️ WARNING: Conflicting latency targets
665
+ # Global: 200ms, Endpoint: 300ms
666
+ # Resolution: Endpoint-specific target (300ms) takes precedence
667
+ ```
668
+
669
+ ---
670
+
671
+ ## 💡 Best Practices
672
+
673
+ ### ✅ DO
674
+
675
+ 1. **Exclude internal endpoints**
676
+ ```ruby
677
# Internal endpoints are excluded from SLO tracking.
# The parentheses are required: a brace block cannot follow an
# unparenthesised literal argument (`controller 'X' { ... }` is a syntax error).
config.slo do
  controller('HealthController') { ignore true }
  controller('MetricsController') { ignore true }
end
681
+ ```
682
+
683
+ 2. **Set realistic targets**
684
+ ```ruby
685
+ config.slo do
686
+ latency_target_p95 200 # Default: reasonable
687
+ controller 'Api::SearchController' do
688
+ latency_target_p95 500 # Search = slower, OK
689
+ end
690
+ end
691
+ ```
692
+
693
+ 3. **Ignore expected errors**
694
+ ```ruby
695
+ config.slo do
696
+ http_ignore_statuses [404, 401, 422] # Not service errors
697
+ end
698
+ ```
699
+
700
+ ### ❌ DON'T
701
+
702
+ 1. **Don't include test traffic in SLO**
703
+ ```ruby
704
+ # ✅ Filter test traffic
705
+ config.slo do
706
+ ignore_if { |event| event.context[:user_agent] =~ /healthcheck|pingdom/ }
707
+ end
708
+ ```
709
+
710
+ 2. **Don't set unrealistic targets**
711
+ ```ruby
712
+ config.slo do
713
+ latency_target_p95 10 # ❌ 10ms is too aggressive for most apps
714
+ latency_target_p95 200 # ✅ 200ms reasonable default
715
+ end
716
+ ```
717
+
718
+ ---
719
+
720
+ ## 📚 Related Use Cases
721
+
722
+ - **[UC-002: Business Event Tracking](./UC-002-business-event-tracking.md)** - Events vs SLO metrics
723
+ - **[UC-003: Pattern-Based Metrics](./UC-003-pattern-based-metrics.md)** - Custom metrics
724
+
725
+ ---
726
+
727
+ **Document Version:** 1.0
728
+ **Last Updated:** January 12, 2026