e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "e11y/middleware/base"
|
|
4
|
+
|
|
5
|
+
module E11y
|
|
6
|
+
module Middleware
|
|
7
|
+
# Sampling Middleware
|
|
8
|
+
#
|
|
9
|
+
# Filters events based on sampling configuration to reduce volume and costs.
|
|
10
|
+
# Supports:
|
|
11
|
+
# - Per-event sample rates (from Event::Base)
|
|
12
|
+
# - Severity-based sampling (errors always sampled)
|
|
13
|
+
# - Pattern-based sampling (e.g., "debug.*" → 1%)
|
|
14
|
+
# - Trace-aware sampling (C05 - all events in trace sampled or none)
|
|
15
|
+
# - Error-based adaptive sampling (FEAT-4838 - 100% during error spikes)
|
|
16
|
+
# - Load-based adaptive sampling (FEAT-4842 - tiered sampling based on event volume)
|
|
17
|
+
#
|
|
18
|
+
# @example Basic usage
|
|
19
|
+
# E11y.configure do |config|
|
|
20
|
+
# config.middleware.use E11y::Middleware::Sampling, zone: :routing
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
# @example Event-level sampling
|
|
24
|
+
# class Events::DebugQuery < E11y::Event::Base
|
|
25
|
+
# sample_rate 0.01 # 1% sampling
|
|
26
|
+
# end
|
|
27
|
+
#
|
|
28
|
+
# @example Error-based adaptive sampling
|
|
29
|
+
# E11y.configure do |config|
|
|
30
|
+
# config.middleware.use E11y::Middleware::Sampling,
|
|
31
|
+
# error_based_adaptive: true,
|
|
32
|
+
# error_spike_config: {
|
|
33
|
+
# window: 60,
|
|
34
|
+
# absolute_threshold: 100,
|
|
35
|
+
# relative_threshold: 3.0,
|
|
36
|
+
# spike_duration: 300
|
|
37
|
+
# }
|
|
38
|
+
# end
|
|
39
|
+
#
|
|
40
|
+
# @example Load-based adaptive sampling
|
|
41
|
+
# E11y.configure do |config|
|
|
42
|
+
# config.middleware.use E11y::Middleware::Sampling,
|
|
43
|
+
# load_based_adaptive: true,
|
|
44
|
+
# load_monitor_config: {
|
|
45
|
+
# window: 60,
|
|
46
|
+
# thresholds: {
|
|
47
|
+
# normal: 1_000, # 0-1k events/sec → 100% sampling
|
|
48
|
+
# high: 10_000, # 1k-10k → 50%
|
|
49
|
+
# very_high: 50_000,# 10k-50k → 10%
|
|
50
|
+
# overload: 100_000 # >100k → 1%
|
|
51
|
+
# }
|
|
52
|
+
# }
|
|
53
|
+
# end
|
|
54
|
+
class Sampling < Base
|
|
55
|
+
middleware_zone :routing
|
|
56
|
+
|
|
57
|
+
# Build the sampling middleware from an options hash.
#
# All sampling state is assigned before delegating to the parent
# initializer, so the instance variables below exist by the time
# +super+ runs.
#
# @param config [Hash, nil] Configuration options (nil is treated as {})
# @option config [Float] :default_sample_rate (1.0) Rate for events without explicit config
# @option config [Boolean] :trace_aware (true) Share one decision per trace_id (C05)
# @option config [Hash] :severity_rates ({}) Sample-rate overrides keyed by severity
# @option config [Boolean] :error_based_adaptive (false) Enable error-spike detection (FEAT-4838)
# @option config [Hash] :error_spike_config ({}) Options passed to ErrorSpikeDetector.new
# @option config [Boolean] :load_based_adaptive (false) Enable load monitoring (FEAT-4842)
# @option config [Hash] :load_monitor_config ({}) Options passed to LoadMonitor.new
def initialize(config = {})
  config ||= {}

  # Trace-level decision cache and the lock that guards it.
  @trace_decisions_mutex = Mutex.new
  @trace_decisions = {}

  @default_sample_rate = config.fetch(:default_sample_rate, 1.0)
  @trace_aware = config.fetch(:trace_aware, true)
  @severity_rates = config.fetch(:severity_rates, {})

  # Error-based adaptive sampling (FEAT-4838); the detector is loaded lazily,
  # only when the feature is switched on.
  @error_based_adaptive = config.fetch(:error_based_adaptive, false)
  if @error_based_adaptive
    require "e11y/sampling/error_spike_detector"
    @error_spike_detector = E11y::Sampling::ErrorSpikeDetector.new(
      config.fetch(:error_spike_config, {})
    )
  end

  # Load-based adaptive sampling (FEAT-4842); same lazy-load approach.
  @load_based_adaptive = config.fetch(:load_based_adaptive, false)
  if @load_based_adaptive
    require "e11y/sampling/load_monitor"
    @load_monitor = E11y::Sampling::LoadMonitor.new(
      config.fetch(:load_monitor_config, {})
    )
  end

  # Zero-argument super forwards the (nil-normalized) config to the parent.
  super
end
|
|
95
|
+
|
|
96
|
+
# Run one event through the sampling filter.
#
# Every event is first reported to the error-spike detector and the load
# monitor (when those are configured), regardless of whether it is kept.
# Kept events are tagged with +:sampled+ and +:sample_rate+ and handed to
# the next middleware; dropped events stop here.
#
# The +:sample_rate+ tag is computed by a second call to
# +determine_sample_rate+, made after the keep/drop decision.
#
# @param event_data [Hash] The event payload (mutated in place when kept)
# @return [Object, nil] Result of the next middleware if sampled, nil if dropped
def call(event_data)
  klass = event_data[:event_class]

  # Feed the adaptive-sampling trackers (FEAT-4838 / FEAT-4842).
  if @error_based_adaptive && @error_spike_detector
    @error_spike_detector.record_event(event_data)
  end
  @load_monitor&.record_event

  # An `if` without `else` evaluates to nil, which is the "dropped" result.
  if should_sample?(event_data, klass)
    event_data[:sampled] = true
    event_data[:sample_rate] = determine_sample_rate(klass, event_data)
    @app.call(event_data)
  end
end
|
|
120
|
+
|
|
121
|
+
# Describe what this middleware does, as a flat feature map.
#
# +filters_events+ and +severity_aware+ are always true; the remaining
# entries mirror the options this instance was configured with.
#
# @return [Hash] Capabilities
def capabilities
  adaptive_flags = {
    error_based_adaptive: @error_based_adaptive, # FEAT-4838
    load_based_adaptive: @load_based_adaptive # FEAT-4842
  }

  { filters_events: true, trace_aware: @trace_aware, severity_aware: true }.merge(adaptive_flags)
end
|
|
133
|
+
|
|
134
|
+
private
|
|
135
|
+
|
|
136
|
+
# Decide whether a single event is kept.
#
# Checks, in order:
# 1. Audit events (classes answering true to +audit_event?+) are always kept.
# 2. With trace-aware sampling on and a +:trace_id+ present, the per-trace
#    decision is used (C05).
# 3. Otherwise a random draw is compared against the event's sample rate.
#
# @param event_data [Hash] The event payload
# @param event_class [Class] The event class
# @return [Boolean] true if event should be sampled
def should_sample?(event_data, event_class)
  audit = event_class.respond_to?(:audit_event?) && event_class.audit_event?
  return true if audit

  trace_id = @trace_aware ? event_data[:trace_id] : nil
  return trace_sampling_decision(trace_id, event_class, event_data) if trace_id

  rate = determine_sample_rate(event_class, event_data)
  rand < rate
end
|
|
157
|
+
|
|
158
|
+
# Work out the sample rate that applies to an event.
#
# Rules are evaluated top to bottom; the first one that applies wins:
#   0. Error spike in progress -> 1.0 (FEAT-4838)
#   1. Any value-sampling config of the event class matches -> 1.0 (FEAT-4849)
#   2. Severity override from middleware config (@severity_rates) -> that
#      rate, returned as-is (it is not capped by the load-based rate)
#   3. Event-level rate (event_class.resolve_sample_rate) -> the smaller of
#      that rate and the base rate
#   4. Base rate: the load monitor's recommended rate when load-based
#      sampling is enabled (FEAT-4842), otherwise @default_sample_rate
#
# @param event_class [Class] The event class
# @param event_data [Hash, nil] Event payload (needed for value-based sampling)
# @return [Float] Sample rate (0.0-1.0)
def determine_sample_rate(event_class, event_data = nil)
  # 0. Error spike: keep everything.
  return 1.0 if @error_based_adaptive && @error_spike_detector&.error_spike?

  # 1. High-value events: keep everything. The extractor is only loaded
  #    when the event class actually declares value-sampling configs.
  if event_data && event_class.respond_to?(:value_sampling_configs)
    value_configs = event_class.value_sampling_configs
    unless value_configs.empty?
      require "e11y/sampling/value_extractor"
      extractor = E11y::Sampling::ValueExtractor.new
      return 1.0 if value_configs.any? { |value_config| value_config.matches?(event_data, extractor) }
    end
  end

  # Base rate used by rules 3 and 4.
  base_rate = (@load_based_adaptive && @load_monitor) ? @load_monitor.recommended_sample_rate : @default_sample_rate

  # 2. Severity override from middleware config.
  if event_class.respond_to?(:severity)
    level = event_class.severity
    return @severity_rates[level] if @severity_rates.key?(level)
  end

  # 4. No event-level rate available: fall back to the base rate.
  return base_rate unless event_class.respond_to?(:resolve_sample_rate)

  # 3. Event-level rate, limited by the base rate (more restrictive wins).
  [event_class.resolve_sample_rate, base_rate].min
end
|
|
213
|
+
|
|
214
|
+
# Per-trace sampling decision (C05 Resolution).
#
# The first event seen for a trace_id decides whether the trace is sampled;
# the result is cached and reused for later events carrying the same
# trace_id. Cache access happens under @trace_decisions_mutex.
#
# @param trace_id [String] The trace ID
# @param event_class [Class] The event class
# @param event_data [Hash, nil] Event payload (for value-based sampling)
# @return [Boolean] true if trace should be sampled
def trace_sampling_decision(trace_id, event_class, event_data = nil)
  @trace_decisions_mutex.synchronize do
    if @trace_decisions.key?(trace_id)
      # Decision already made for this trace: reuse it.
      @trace_decisions[trace_id]
    else
      rate = determine_sample_rate(event_class, event_data)
      verdict = rand < rate
      @trace_decisions[trace_id] = verdict

      # Keep the cache bounded: prune once it holds more than 1000 entries.
      cleanup_trace_decisions if @trace_decisions.size > 1000

      verdict
    end
  end
end
|
|
241
|
+
|
|
242
|
+
# Prune the trace-decision cache to prevent unbounded memory growth.
#
# Removes the OLDEST half of the cached decisions. Ruby hashes keep
# insertion order, so the first entries are the ones decided longest ago.
# Evicting those (rather than a random half) keeps the decisions of the
# most recently started traces, which are the ones most likely to still
# receive events; dropping one of those would let the same trace be
# re-decided and end up only partially sampled.
#
# Must be called with @trace_decisions_mutex held.
#
# @return [void]
def cleanup_trace_decisions
  (@trace_decisions.size / 2).times { @trace_decisions.shift }
end
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
end
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "e11y/middleware/base"
|
|
4
|
+
require "e11y/slo/event_driven"
|
|
5
|
+
|
|
6
|
+
module E11y
|
|
7
|
+
module Middleware
|
|
8
|
+
# SLO Middleware for Event-Driven SLO tracking (ADR-014).
|
|
9
|
+
#
|
|
10
|
+
# Automatically processes events with SLO configuration enabled,
|
|
11
|
+
# computes `slo_status` from payload, and emits metrics.
|
|
12
|
+
#
|
|
13
|
+
# **Features:**
|
|
14
|
+
# - Auto-detects events with `slo { enabled true }`
|
|
15
|
+
# - Calls `slo_status_from` proc to compute 'success'/'failure'
|
|
16
|
+
# - Emits `slo_event_result_total{slo_status}` metric to Yabeda
|
|
17
|
+
# - Never fails event tracking (graceful error handling)
|
|
18
|
+
#
|
|
19
|
+
# **Middleware Zone:** `:post_processing` (after routing, before adapters)
|
|
20
|
+
#
|
|
21
|
+
# **ADR References:**
|
|
22
|
+
# - ADR-014 §3 (Event SLO DSL)
|
|
23
|
+
# - ADR-014 §4 (SLO Status Calculation)
|
|
24
|
+
# - ADR-014 §6 (Metrics Export)
|
|
25
|
+
# - ADR-015 §3 (Middleware Order)
|
|
26
|
+
#
|
|
27
|
+
# **Use Case:** UC-014 (Event-Driven SLO)
|
|
28
|
+
#
|
|
29
|
+
# @example Configuration
|
|
30
|
+
# E11y.configure do |config|
|
|
31
|
+
# # Enable SLO middleware (auto-enabled if any Events have slo { enabled true })
|
|
32
|
+
# config.pipeline.use E11y::Middleware::SLO, zone: :post_processing
|
|
33
|
+
# end
|
|
34
|
+
#
|
|
35
|
+
# @example Event with SLO
|
|
36
|
+
# module Events
|
|
37
|
+
# class PaymentProcessed < E11y::Event::Base
|
|
38
|
+
# schema do
|
|
39
|
+
# required(:payment_id).filled(:string)
|
|
40
|
+
# required(:status).filled(:string)
|
|
41
|
+
# end
|
|
42
|
+
#
|
|
43
|
+
# slo do
|
|
44
|
+
# enabled true
|
|
45
|
+
# slo_status_from do |payload|
|
|
46
|
+
# case payload[:status]
|
|
47
|
+
# when 'completed' then 'success'
|
|
48
|
+
# when 'failed' then 'failure'
|
|
49
|
+
# else nil # Not counted
|
|
50
|
+
# end
|
|
51
|
+
# end
|
|
52
|
+
# end
|
|
53
|
+
# end
|
|
54
|
+
# end
|
|
55
|
+
#
|
|
56
|
+
# # Tracking will automatically emit SLO metric:
|
|
57
|
+
# Events::PaymentProcessed.track(payment_id: 'p123', status: 'completed')
|
|
58
|
+
# # → Emits: slo_event_result_total{event_name="payment.processed", slo_status="success"} +1
|
|
59
|
+
#
|
|
60
|
+
# @see ADR-014 for complete Event-Driven SLO architecture
|
|
61
|
+
class SLO < Base
|
|
62
|
+
middleware_zone :post_processing
|
|
63
|
+
|
|
64
|
+
# Emit an SLO metric for the event when its class has SLO enabled.
#
# The event is returned untouched in every case: when the class cannot be
# resolved, has no enabled SLO config, yields no slo_status, or when SLO
# processing raises (the error is logged and swallowed so that tracking
# never fails because of SLO handling).
#
# NOTE(review): unlike the other middleware shown in this file, this method
# returns event_data directly and does not invoke @app.call - confirm that
# the pipeline does not rely on this middleware to continue the chain.
#
# @param event_data [Hash] Event payload
# @return [Hash] Unchanged event_data (passthrough)
def call(event_data)
  klass = resolve_event_class(event_data)

  # Skip events whose class is unknown or has no enabled SLO config.
  slo_enabled = klass&.respond_to?(:slo_config) && klass.slo_config&.enabled
  return event_data unless slo_enabled

  # A nil status means "do not count this event".
  status = compute_slo_status(klass, event_data[:payload])
  emit_slo_metric(klass, status, event_data[:payload]) if status

  event_data
rescue StandardError => e
  E11y.logger.error(
    "[E11y::Middleware::SLO] SLO processing failed for #{event_data[:event_name]}: #{e.message}"
  )
  event_data
end
|
|
89
|
+
|
|
90
|
+
private
|
|
91
|
+
|
|
92
|
+
# Resolve the Event class for an event.
#
# Resolution order:
# 1. +event_data[:event_class]+, when the pipeline already carries the
#    class object. This avoids reconstructing a class name from
#    +:event_name+, which only works for dotted lower-case names.
# 2. Otherwise +:event_name+ is converted to a constant under +Events::+
#    (e.g. "payment.processed" -> Events::PaymentProcessed) and looked up
#    with Object.const_get, so no ActiveSupport String extension is needed.
#
# @param event_data [Hash] Event payload
# @return [Class, nil] Event class or nil if not found
def resolve_event_class(event_data)
  explicit_class = event_data[:event_class]
  return explicit_class if explicit_class

  event_name = event_data[:event_name]
  return nil unless event_name

  class_name = event_name.to_s.split(".").map(&:capitalize).join
  Object.const_get("Events::#{class_name}")
rescue NameError
  # Event class not found (may be from an external source) or the derived
  # name is not a valid constant name.
  nil
end
|
|
108
|
+
|
|
109
|
+
# Derive the slo_status for a payload via the event class's status proc.
#
# Returns nil when the class has no status proc configured, when the proc
# itself returns nil, or when evaluating it raises (the error is logged).
#
# @param event_class [Class] Event class exposing +slo_config+
# @param payload [Hash] Event payload passed to the proc
# @return [String, nil] Whatever the proc returns (e.g. 'success'/'failure'), or nil
def compute_slo_status(event_class, payload)
  status_proc = event_class.slo_config.slo_status_proc
  status_proc ? status_proc.call(payload) : nil
rescue StandardError => e
  E11y.logger.error(
    "[E11y::Middleware::SLO] Failed to compute slo_status for #{event_class.name}: #{e.message}"
  )
  nil
end
|
|
124
|
+
|
|
125
|
+
# Increment the +slo_event_result_total+ counter for this event.
#
# Labels come from +build_slo_labels+. Any error raised while building the
# labels or incrementing the counter is logged and swallowed.
#
# @param event_class [Class] Event class
# @param slo_status [String] 'success' or 'failure'
# @param payload [Hash] Event payload
# @return [void]
def emit_slo_metric(event_class, slo_status, payload)
  E11y::Metrics.increment(
    :slo_event_result_total,
    build_slo_labels(event_class, slo_status, payload)
  )
rescue StandardError => e
  E11y.logger.error(
    "[E11y::Middleware::SLO] Failed to emit SLO metric for #{event_class.name}: #{e.message}"
  )
end
|
|
140
|
+
|
|
141
|
+
# Assemble the label set for the SLO counter.
#
# Always contains +:event_name+ and +:slo_status+. Adds +:slo_name+ when
# the SLO config declares +contributes_to+, and +:group_by+ (stringified)
# when a +group_by_field+ is configured and the payload has a truthy value
# for it.
#
# @param event_class [Class] Event class exposing +event_name+ and +slo_config+
# @param slo_status [String] 'success' or 'failure'
# @param payload [Hash] Event payload
# @return [Hash] Metric labels
def build_slo_labels(event_class, slo_status, payload)
  labels = { event_name: event_class.event_name, slo_status: slo_status }

  # Custom SLO name, if configured.
  slo_name = event_class.slo_config.contributes_to
  labels[:slo_name] = slo_name if slo_name

  # Grouping dimension, if configured and present in the payload.
  field = event_class.slo_config.group_by_field
  labels[:group_by] = payload[field].to_s if field && payload[field]

  labels
end
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"
|
|
4
|
+
require "time"
|
|
5
|
+
|
|
6
|
+
module E11y
|
|
7
|
+
module Middleware
|
|
8
|
+
# TraceContext middleware adds distributed tracing metadata to all events.
|
|
9
|
+
#
|
|
10
|
+
# This is the FIRST middleware in the pipeline (pre-processing zone),
|
|
11
|
+
# ensuring every event has tracing context for correlation across services.
|
|
12
|
+
#
|
|
13
|
+
# @see ADR-015 §3.1 Pipeline Flow
|
|
14
|
+
# @see ADR-005 Tracing Context Management
|
|
15
|
+
# @see UC-006 Trace Context Management
|
|
16
|
+
# @see UC-009 Multi-Service Tracing
|
|
17
|
+
#
|
|
18
|
+
# @example Automatic tracing metadata
|
|
19
|
+
# Events::OrderPaid.track(order_id: 123)
|
|
20
|
+
#
|
|
21
|
+
# # Event data after TraceContext middleware:
|
|
22
|
+
# {
|
|
23
|
+
# event_name: 'Events::OrderPaid',
|
|
24
|
+
# payload: { order_id: 123 },
|
|
25
|
+
#     trace_id: '4bf92f3577b34da6a3ce929d0e0e4736', # 32-char hex
|
|
26
|
+
#     span_id: '00f067aa0ba902b7', # 16-char hex
|
|
27
|
+
# timestamp: '2026-01-17T12:34:56.789Z' # ISO8601
|
|
28
|
+
# }
|
|
29
|
+
#
|
|
30
|
+
# @example Request-scoped tracing (propagation)
|
|
31
|
+
# # In Rails controller/middleware:
|
|
32
|
+
# Thread.current[:e11y_trace_id] = request.headers['X-Trace-ID']
|
|
33
|
+
#
|
|
34
|
+
# Events::OrderPaid.track(order_id: 123)
|
|
35
|
+
# # Uses propagated trace_id from thread-local storage
|
|
36
|
+
#
|
|
37
|
+
# @example Manual trace_id injection
|
|
38
|
+
# Events::OrderPaid.track(order_id: 123, trace_id: 'custom-trace-id')
|
|
39
|
+
# # Manual trace_id preserved (not overridden)
|
|
40
|
+
class TraceContext < Base
|
|
41
|
+
middleware_zone :pre_processing
|
|
42
|
+
|
|
43
|
+
# Enrich an event with tracing metadata, then pass it down the pipeline.
#
# **Hybrid Tracing (C17 Resolution)**. Values already present in
# +event_data+ are never overwritten:
# - trace_id: the current trace (see +current_trace_id+), else newly generated
# - span_id: newly generated unless the event already carries one
# - parent_trace_id: set only when a parent trace is known (background jobs)
# - timestamp: current UTC time in ISO8601 with milliseconds
#
# @param event_data [Hash] The event data to enrich (mutated in place)
# @option event_data [String] :trace_id Existing trace ID (optional)
# @option event_data [String] :span_id Existing span ID (optional)
# @option event_data [String] :parent_trace_id Parent trace ID (optional)
# @option event_data [Time,String] :timestamp Existing timestamp (optional)
# @return [Object] Whatever the next middleware returns
def call(event_data)
  unless event_data[:trace_id]
    event_data[:trace_id] = current_trace_id || generate_trace_id
  end

  event_data[:span_id] = generate_span_id unless event_data[:span_id]

  # Only touch :parent_trace_id when a parent trace actually exists.
  parent_trace = current_parent_trace_id
  event_data[:parent_trace_id] ||= parent_trace if parent_trace

  event_data[:timestamp] = format_timestamp(Time.now.utc) unless event_data[:timestamp]

  increment_metric("e11y.middleware.trace_context.processed")

  @app.call(event_data)
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
# Look up the ambient trace ID for the current execution context.
#
# E11y::Current.trace_id takes priority; Thread.current[:e11y_trace_id]
# is consulted only when that is not set.
#
# @return [String, nil] Current trace ID if set, nil otherwise
def current_trace_id
  from_current = E11y::Current.trace_id
  return from_current if from_current

  Thread.current[:e11y_trace_id]
end
|
|
85
|
+
|
|
86
|
+
# Read the parent trace ID from E11y::Current.
#
# Intended for background jobs that were started from a traced request;
# nothing is returned when no parent trace has been recorded.
#
# @return [String, nil] Parent trace ID if set, nil otherwise
def current_parent_trace_id
  E11y::Current.parent_trace_id
end
|
|
94
|
+
|
|
95
|
+
# Create a fresh random trace ID.
#
# 16 random bytes rendered as 32 lower-case hex characters, matching the
# OpenTelemetry trace_id size.
#
# @return [String] New trace ID
def generate_trace_id
  SecureRandom.random_bytes(16).unpack1("H*")
end
|
|
103
|
+
|
|
104
|
+
# Create a fresh random span ID.
#
# 8 random bytes rendered as 16 lower-case hex characters, matching the
# OpenTelemetry span_id size.
#
# @return [String] New span ID
def generate_span_id
  SecureRandom.random_bytes(8).unpack1("H*")
end
|
|
112
|
+
|
|
113
|
+
# Format a time as a UTC ISO8601 string with millisecond precision.
#
# Uses Time#getutc, which returns a new UTC Time, instead of Time#utc,
# which converts the receiver in place. The caller's object is therefore
# left in its original zone, and frozen Time values are accepted.
#
# @param time [Time] Time object to format (not modified)
# @return [String] ISO8601 formatted timestamp (e.g., "2026-01-17T12:34:56.789Z")
def format_timestamp(time)
  time.getutc.iso8601(3)
end
|
|
120
|
+
|
|
121
|
+
# Metrics hook - currently a no-op placeholder.
#
# The argument is accepted and ignored; nothing is recorded yet.
#
# @param _metric_name [String] Metric name (unused)
# @return [void]
def increment_metric(_metric_name)
  # TODO: Integrate with Yabeda/Prometheus in Phase 2
  # Yabeda.e11y.middleware_trace_context_processed.increment
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module E11y
|
|
4
|
+
module Middleware
|
|
5
|
+
# Validation middleware performs schema validation on event payloads.
|
|
6
|
+
#
|
|
7
|
+
# This middleware runs in the pre-processing zone, AFTER TraceContext and
|
|
8
|
+
# BEFORE PII filtering. It validates the event payload against the schema
|
|
9
|
+
# defined in the event class.
|
|
10
|
+
#
|
|
11
|
+
# **CRITICAL:** Validation MUST use the ORIGINAL class name (e.g., Events::OrderPaidV2),
|
|
12
|
+
# NOT the normalized name (Events::OrderPaid), because schemas are version-specific.
|
|
13
|
+
#
|
|
14
|
+
# @see ADR-015 §3.1 Pipeline Flow (line 96-97)
|
|
15
|
+
# @see ADR-015 §3.2 Why Each Middleware Needs Original Class Name (line 125)
|
|
16
|
+
# @see E11y::Event::Base#validate_payload! for validation logic
|
|
17
|
+
#
|
|
18
|
+
# @example Valid event passes through
|
|
19
|
+
# class Events::OrderPaid < E11y::Event::Base
|
|
20
|
+
# schema do
|
|
21
|
+
# required(:order_id).filled(:integer)
|
|
22
|
+
# end
|
|
23
|
+
# end
|
|
24
|
+
#
|
|
25
|
+
# event_data = {
|
|
26
|
+
# event_class: Events::OrderPaid,
|
|
27
|
+
# payload: { order_id: 123 }
|
|
28
|
+
# }
|
|
29
|
+
#
|
|
30
|
+
# # Validation passes ✅
|
|
31
|
+
# middleware.call(event_data) # → event_data (unchanged)
|
|
32
|
+
#
|
|
33
|
+
# @example Invalid event raises error
|
|
34
|
+
# event_data = {
|
|
35
|
+
# event_class: Events::OrderPaid,
|
|
36
|
+
# payload: { order_id: "invalid" } # ❌ Should be integer
|
|
37
|
+
# }
|
|
38
|
+
#
|
|
39
|
+
# middleware.call(event_data)
|
|
40
|
+
# # Raises E11y::ValidationError: "Validation failed for Events::OrderPaid: order_id: must be an integer"
|
|
41
|
+
#
|
|
42
|
+
# @example Schema-less events pass through
|
|
43
|
+
# class Events::SimpleEvent < E11y::Event::Base
|
|
44
|
+
# # No schema defined
|
|
45
|
+
# end
|
|
46
|
+
#
|
|
47
|
+
# # Validation skipped (no schema) ✅
|
|
48
|
+
# middleware.call(event_data) # → event_data (unchanged)
|
|
49
|
+
class Validation < Base
  middleware_zone :pre_processing

  # Validates the event payload against the schema compiled on its event class.
  #
  # The event is forwarded to +@app+ unchanged when validation passes or is
  # skipped. Validation is skipped when +:event_class+ or +:payload+ is
  # missing/nil, or when the event class has no compiled schema.
  #
  # @param event_data [Hash] The event data to validate
  # @option event_data [Class] :event_class The event class (required)
  # @option event_data [Hash] :payload The event payload (required)
  # @return [Hash, nil] Validated event data, or nil if dropped
  # @raise [E11y::ValidationError] if the payload violates the schema
  # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  def call(event_data)
    # Nothing to validate without both an event class and a payload
    return @app.call(event_data) unless event_data[:event_class] && event_data[:payload]

    event_class = event_data[:event_class]
    payload = event_data[:payload]

    # The schema is read from the event class carried in the event itself
    schema = event_class.compiled_schema

    # Schema-less events pass straight through
    unless schema
      increment_metric("e11y.middleware.validation.skipped")
      return @app.call(event_data)
    end

    result = schema.call(payload)

    if result.success?
      increment_metric("e11y.middleware.validation.passed")
      @app.call(event_data)
    else
      increment_metric("e11y.middleware.validation.failed")

      error_message = format_validation_errors(event_class, result.errors)
      raise E11y::ValidationError, error_message
    end
  end
  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

  private

  # Format validation errors into a human-readable message.
  #
  # Nested error structures (nested hashes / arrays of hashes in the schema)
  # are flattened first, so every entry is rendered as "path: msg, msg".
  # Flat schemas produce the same output as before, e.g.
  # "Validation failed for Events::OrderPaid: order_id: must be an integer".
  #
  # @param event_class [Class] The event class
  # @param errors [Dry::Schema::MessageSet] Validation errors (anything responding to #to_h)
  # @return [String] Formatted error message
  def format_validation_errors(event_class, errors)
    error_details = flatten_validation_errors(errors.to_h).map do |path, messages|
      "#{path}: #{messages.join(', ')}"
    end.join("; ")

    "Validation failed for #{event_class.name}: #{error_details}"
  end

  # Flatten a (possibly nested) error hash into [path, messages] pairs.
  #
  # Hash values are descended into instead of being treated as message
  # lists; calling #join on them would raise NoMethodError and mask the real
  # validation failure. Integer keys are rendered as "[index]", all other
  # keys as ".key".
  #
  # @param errors [Hash] Error hash, e.g. { user: { name: ["must be filled"] } }
  # @param prefix [String, nil] Path accumulated so far (nil at the top level)
  # @return [Array<Array(String, Array)>] Pairs such as ["user.name", ["must be filled"]]
  def flatten_validation_errors(errors, prefix = nil)
    errors.flat_map do |key, value|
      path =
        if prefix.nil?
          key.to_s
        elsif key.is_a?(Integer)
          "#{prefix}[#{key}]"
        else
          "#{prefix}.#{key}"
        end

      if value.is_a?(Hash)
        flatten_validation_errors(value, path)
      else
        [[path, Array(value)]]
      end
    end
  end

  # Placeholder for metrics instrumentation (currently a no-op).
  #
  # @param metric_name [String] Metric name
  # @return [void]
  def increment_metric(_metric_name)
    # TODO: Integrate with Yabeda/Prometheus in Phase 2
    # Yabeda.e11y.middleware_validation_passed.increment
  end
end
|
|
117
|
+
end
|
|
118
|
+
end
|