e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,254 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "e11y/middleware/base"
4
+
5
+ module E11y
6
+ module Middleware
7
+ # Sampling Middleware
8
+ #
9
+ # Filters events based on sampling configuration to reduce volume and costs.
10
+ # Supports:
11
+ # - Per-event sample rates (from Event::Base)
12
+ # - Severity-based sampling (errors always sampled)
13
+ # - Pattern-based sampling (e.g., "debug.*" → 1%)
14
+ # - Trace-aware sampling (C05 - all events in trace sampled or none)
15
+ # - Error-based adaptive sampling (FEAT-4838 - 100% during error spikes)
16
+ # - Load-based adaptive sampling (FEAT-4842 - tiered sampling based on event volume)
17
+ #
18
+ # @example Basic usage
19
+ # E11y.configure do |config|
20
+ # config.middleware.use E11y::Middleware::Sampling, zone: :routing
21
+ # end
22
+ #
23
+ # @example Event-level sampling
24
+ # class Events::DebugQuery < E11y::Event::Base
25
+ # sample_rate 0.01 # 1% sampling
26
+ # end
27
+ #
28
+ # @example Error-based adaptive sampling
29
+ # E11y.configure do |config|
30
+ # config.middleware.use E11y::Middleware::Sampling,
31
+ # error_based_adaptive: true,
32
+ # error_spike_config: {
33
+ # window: 60,
34
+ # absolute_threshold: 100,
35
+ # relative_threshold: 3.0,
36
+ # spike_duration: 300
37
+ # }
38
+ # end
39
+ #
40
+ # @example Load-based adaptive sampling
41
+ # E11y.configure do |config|
42
+ # config.middleware.use E11y::Middleware::Sampling,
43
+ # load_based_adaptive: true,
44
+ # load_monitor_config: {
45
+ # window: 60,
46
+ # thresholds: {
47
+ # normal: 1_000, # 0-1k events/sec → 100% sampling
48
+ # high: 10_000, # 1k-10k → 50%
49
+ # very_high: 50_000,# 10k-50k → 10%
50
+ # overload: 100_000 # >100k → 1%
51
+ # }
52
+ # }
53
+ # end
54
+ class Sampling < Base
55
+ middleware_zone :routing
56
+
57
# Build the sampling middleware from an options hash.
#
# @param config [Hash, nil] Configuration options (nil is treated as an empty Hash)
# @option config [Float] :default_sample_rate (1.0) Rate used when no other rule applies
# @option config [Boolean] :trace_aware (true) Share one decision per trace_id (C05)
# @option config [Hash] :severity_rates ({}) Sample-rate overrides keyed by severity
# @option config [Boolean] :error_based_adaptive (false) Enable error-spike tracking (FEAT-4838)
# @option config [Hash] :error_spike_config ({}) Options handed to ErrorSpikeDetector.new
# @option config [Boolean] :load_based_adaptive (false) Enable load tracking (FEAT-4842)
# @option config [Hash] :load_monitor_config ({}) Options handed to LoadMonitor.new
def initialize(config = {})
  # Normalise an explicit nil before reading options. The bare `super` at the
  # end forwards this same (reassigned) argument to the parent initializer.
  config ||= {}

  @trace_decisions_mutex = Mutex.new
  @trace_decisions = {} # trace_id => Boolean sampling decision
  @severity_rates = config.fetch(:severity_rates, {})
  @trace_aware = config.fetch(:trace_aware, true)
  @default_sample_rate = config.fetch(:default_sample_rate, 1.0)

  # FEAT-4838: the detector is only loaded and built when the feature is on.
  @error_based_adaptive = config.fetch(:error_based_adaptive, false)
  if @error_based_adaptive
    require "e11y/sampling/error_spike_detector"
    detector_options = config.fetch(:error_spike_config, {})
    @error_spike_detector = E11y::Sampling::ErrorSpikeDetector.new(detector_options)
  end

  # FEAT-4842: same lazy-loading approach for the load monitor.
  @load_based_adaptive = config.fetch(:load_based_adaptive, false)
  if @load_based_adaptive
    require "e11y/sampling/load_monitor"
    monitor_options = config.fetch(:load_monitor_config, {})
    @load_monitor = E11y::Sampling::LoadMonitor.new(monitor_options)
  end

  # NOTE(review): Base#initialize is not visible here; the original comment
  # says it sets @config and other base middleware state.
  super
end
95
+
96
# Run one event through the sampling filter.
#
# Adaptive trackers are fed for every event, including those that end up
# dropped, so they observe the full event stream.
#
# @param event_data [Hash] The event payload
# @return [Object, nil] Result of the next middleware when sampled, nil when dropped
def call(event_data)
  event_class = event_data[:event_class]

  # FEAT-4838: record the event for error-spike detection when enabled.
  spike_detector = @error_based_adaptive ? @error_spike_detector : nil
  spike_detector&.record_event(event_data)

  # FEAT-4842: count the event for load monitoring (no-op when no monitor exists).
  @load_monitor&.record_event

  if should_sample?(event_data, event_class)
    # Annotate the event so downstream middleware can see the decision.
    event_data[:sampled] = true
    event_data[:sample_rate] = determine_sample_rate(event_class, event_data)

    @app.call(event_data)
  end
end
120
+
121
# Describe what this middleware does, as a flag hash.
#
# @return [Hash] Static flags plus the trace-aware / adaptive settings
#   chosen at construction time
def capabilities
  adaptive_flags = {
    error_based_adaptive: @error_based_adaptive, # FEAT-4838
    load_based_adaptive: @load_based_adaptive    # FEAT-4842
  }

  { filters_events: true, trace_aware: @trace_aware, severity_aware: true }.merge(adaptive_flags)
end
133
+
134
+ private
135
+
136
# Decide whether a single event is kept.
#
# Order of checks:
# 1. Audit events (event_class.audit_event?) are always kept.
# 2. With trace-aware sampling on and a trace_id present, the per-trace
#    decision is used (C05).
# 3. Otherwise a random draw is compared against the event's sample rate.
#
# @param event_data [Hash] The event payload
# @param event_class [Class] The event class
# @return [Boolean] true if the event should be sampled
def should_sample?(event_data, event_class)
  audit = event_class.respond_to?(:audit_event?) && event_class.audit_event?
  return true if audit

  if @trace_aware && event_data[:trace_id]
    trace_sampling_decision(event_data[:trace_id], event_class, event_data)
  else
    rate = determine_sample_rate(event_class, event_data)
    rand < rate
  end
end
157
+
158
# Work out the sample rate that applies to an event.
#
# Effective precedence, as implemented below (first match wins):
# 0. Error spike in progress -> 1.0 (FEAT-4838)
# 1. A value-sampling config matches the payload -> 1.0 (FEAT-4849)
# 2. Severity override from middleware config (@severity_rates)
# 3. Event-level rate (event_class.resolve_sample_rate), capped by the
#    fallback rate from step 4
# 4. Fallback rate: the load monitor's recommendation when load-based
#    sampling is enabled (FEAT-4842), otherwise @default_sample_rate
#
# @param event_class [Class] The event class
# @param event_data [Hash, nil] Event payload (needed for value-based sampling)
# @return [Float] Sample rate (0.0-1.0)
def determine_sample_rate(event_class, event_data = nil)
  # 0. During an error spike everything is kept.
  spike_active = @error_based_adaptive && @error_spike_detector&.error_spike?
  return 1.0 if spike_active

  # 1. High-value events are always kept. The extractor is loaded lazily,
  #    only when the event class actually declares value-sampling configs.
  if event_data && event_class.respond_to?(:value_sampling_configs)
    value_rules = event_class.value_sampling_configs
    unless value_rules.empty?
      require "e11y/sampling/value_extractor"
      extractor = E11y::Sampling::ValueExtractor.new
      return 1.0 if value_rules.any? { |rule| rule.matches?(event_data, extractor) }
    end
  end

  # 4 (computed early). Fallback rate used when no override below applies.
  fallback_rate = (@load_based_adaptive && @load_monitor) ? @load_monitor.recommended_sample_rate : @default_sample_rate

  # 2. A configured severity rate replaces the fallback entirely.
  if event_class.respond_to?(:severity)
    severity = event_class.severity
    return @severity_rates[severity] if @severity_rates.key?(severity)
  end

  return fallback_rate unless event_class.respond_to?(:resolve_sample_rate)

  # 3. Event-level rate, never looser than the fallback (more restrictive wins).
  [event_class.resolve_sample_rate, fallback_rate].min
end
213
+
214
# Trace-aware sampling decision (C05 Resolution).
#
# The first event seen for a trace_id fixes the decision for that trace;
# later events with the same trace_id reuse it for as long as the entry
# stays in the cache. The whole lookup-or-decide step runs under the
# cache mutex.
#
# @param trace_id [String] The trace ID
# @param event_class [Class] The event class
# @param event_data [Hash, nil] Event payload (for value-based sampling)
# @return [Boolean] true if the trace should be sampled
def trace_sampling_decision(trace_id, event_class, event_data = nil)
  @trace_decisions_mutex.synchronize do
    # Cache hit: fetch returns the stored decision and skips the block.
    @trace_decisions.fetch(trace_id) do
      rate = determine_sample_rate(event_class, event_data)
      sampled = rand < rate
      @trace_decisions[trace_id] = sampled

      # Bound the cache: trim once it holds more than 1000 traces.
      cleanup_trace_decisions if @trace_decisions.size > 1000

      sampled
    end
  end
end
241
+
242
# Trim the trace-decision cache to prevent unbounded growth.
#
# Evicts the OLDEST half of the cached decisions. Ruby hashes keep
# insertion order, so the first entries are the traces whose decision was
# made longest ago - the ones most likely to be finished. Evicting a random
# half (the previous behaviour) could discard the decision of a trace that
# is still emitting events, giving that trace a second, possibly different,
# sampling decision.
#
# Called from trace_sampling_decision while the cache mutex is held, so no
# additional locking is done here.
#
# @return [void]
def cleanup_trace_decisions
  evict_count = @trace_decisions.size / 2
  # Hash#shift removes the first (oldest-inserted) entry.
  evict_count.times { @trace_decisions.shift }
end
252
+ end
253
+ end
254
+ end
@@ -0,0 +1,168 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "e11y/middleware/base"
4
+ require "e11y/slo/event_driven"
5
+
6
+ module E11y
7
+ module Middleware
8
+ # SLO Middleware for Event-Driven SLO tracking (ADR-014).
9
+ #
10
+ # Automatically processes events with SLO configuration enabled,
11
+ # computes `slo_status` from payload, and emits metrics.
12
+ #
13
+ # **Features:**
14
+ # - Auto-detects events with `slo { enabled true }`
15
+ # - Calls `slo_status_from` proc to compute 'success'/'failure'
16
+ # - Emits `slo_event_result_total{slo_status}` metric to Yabeda
17
+ # - Never fails event tracking (graceful error handling)
18
+ #
19
+ # **Middleware Zone:** `:post_processing` (after routing, before adapters)
20
+ #
21
+ # **ADR References:**
22
+ # - ADR-014 §3 (Event SLO DSL)
23
+ # - ADR-014 §4 (SLO Status Calculation)
24
+ # - ADR-014 §6 (Metrics Export)
25
+ # - ADR-015 §3 (Middleware Order)
26
+ #
27
+ # **Use Case:** UC-014 (Event-Driven SLO)
28
+ #
29
+ # @example Configuration
30
+ # E11y.configure do |config|
31
+ # # Enable SLO middleware (auto-enabled if any Events have slo { enabled true })
32
+ # config.pipeline.use E11y::Middleware::SLO, zone: :post_processing
33
+ # end
34
+ #
35
+ # @example Event with SLO
36
+ # module Events
37
+ # class PaymentProcessed < E11y::Event::Base
38
+ # schema do
39
+ # required(:payment_id).filled(:string)
40
+ # required(:status).filled(:string)
41
+ # end
42
+ #
43
+ # slo do
44
+ # enabled true
45
+ # slo_status_from do |payload|
46
+ # case payload[:status]
47
+ # when 'completed' then 'success'
48
+ # when 'failed' then 'failure'
49
+ # else nil # Not counted
50
+ # end
51
+ # end
52
+ # end
53
+ # end
54
+ # end
55
+ #
56
+ # # Tracking will automatically emit SLO metric:
57
+ # Events::PaymentProcessed.track(payment_id: 'p123', status: 'completed')
58
+ # # → Emits: slo_event_result_total{event_name="payment.processed", slo_status="success"} +1
59
+ #
60
+ # @see ADR-014 for complete Event-Driven SLO architecture
61
+ class SLO < Base
62
+ middleware_zone :post_processing
63
+
64
# Emit an SLO metric for the event when its class has SLO enabled.
#
# Any error raised while processing is logged and swallowed so that SLO
# handling never breaks event tracking.
#
# NOTE(review): unlike the other middleware in this file, this method
# returns event_data directly and never forwards to @app - confirm this
# matches the pipeline contract.
#
# @param event_data [Hash] Event payload
# @return [Hash] The same event_data object, unmodified
def call(event_data)
  event_class = resolve_event_class(event_data)
  slo_enabled = event_class&.respond_to?(:slo_config) && event_class.slo_config&.enabled

  if slo_enabled
    payload = event_data[:payload]
    slo_status = compute_slo_status(event_class, payload)
    # A nil status means "not counted": no metric is emitted.
    emit_slo_metric(event_class, slo_status, payload) if slo_status
  end

  event_data
rescue StandardError => error
  E11y.logger.error(
    "[E11y::Middleware::SLO] SLO processing failed for #{event_data[:event_name]}: #{error.message}"
  )
  event_data
end
89
+
90
+ private
91
+
92
# Resolve the event class for an event.
#
# Resolution order:
# 1. event_data[:event_class], when the pipeline already carries the class
#    (the same key the other middleware read).
# 2. A lookup derived from event_data[:event_name], e.g.
#    "payment.processed" -> Events::PaymentProcessed.
#
# The lookup uses Object.const_get, so it does not require ActiveSupport's
# String#constantize to be loaded.
#
# @param event_data [Hash] Event payload
# @return [Class, nil] Event class, or nil if it cannot be resolved
def resolve_event_class(event_data)
  explicit_class = event_data[:event_class]
  return explicit_class if explicit_class

  event_name = event_data[:event_name]
  return nil unless event_name

  # "payment.processed" -> "PaymentProcessed"
  class_name = event_name.to_s.split(".").map(&:capitalize).join
  Object.const_get("Events::#{class_name}")
rescue NameError
  # Unknown or malformed constant name (event may come from an external source).
  nil
end
108
+
109
# Run the event class's slo_status_from proc against the payload.
#
# Errors raised by the proc are logged and turned into nil, so a faulty
# proc results in "not counted" rather than a failure.
#
# @param event_class [Class] Event class (must expose slo_config)
# @param payload [Hash] Event payload
# @return [Object, nil] Whatever the proc returns ('success'/'failure' by
#   convention), or nil when no proc is configured or it raised
def compute_slo_status(event_class, payload)
  status_proc = event_class.slo_config.slo_status_proc
  status_proc&.call(payload)
rescue StandardError => error
  E11y.logger.error(
    "[E11y::Middleware::SLO] Failed to compute slo_status for #{event_class.name}: #{error.message}"
  )
  nil
end
124
+
125
# Increment the :slo_event_result_total counter for this event.
#
# Labels come from build_slo_labels. Failures while building labels or
# incrementing are logged and swallowed.
#
# @param event_class [Class] Event class
# @param slo_status [String] 'success' or 'failure'
# @param payload [Hash] Event payload
# @return [void]
def emit_slo_metric(event_class, slo_status, payload)
  build_slo_labels(event_class, slo_status, payload).then do |labels|
    E11y::Metrics.increment(:slo_event_result_total, labels)
  end
rescue StandardError => error
  E11y.logger.error(
    "[E11y::Middleware::SLO] Failed to emit SLO metric for #{event_class.name}: #{error.message}"
  )
end
140
+
141
# Build the label set for the SLO counter.
#
# Always includes :event_name and :slo_status. Adds :slo_name when the SLO
# config declares contributes_to, and :group_by when a group_by_field is
# configured and the payload holds a non-nil value for it.
#
# A `false` value is a legitimate group (it becomes "false") and is no
# longer dropped. A nil payload yields no :group_by label instead of
# raising, which previously made the caller lose the whole metric.
#
# @param event_class [Class] Event class (must expose event_name and slo_config)
# @param slo_status [String] 'success' or 'failure'
# @param payload [Hash, nil] Event payload
# @return [Hash] Metric labels
def build_slo_labels(event_class, slo_status, payload)
  slo_config = event_class.slo_config
  labels = {
    event_name: event_class.event_name,
    slo_status: slo_status
  }

  # Custom SLO name, if configured.
  slo_name = slo_config.contributes_to
  labels[:slo_name] = slo_name if slo_name

  # Grouping label, if configured and present in the payload.
  field = slo_config.group_by_field
  group_value = field && payload ? payload[field] : nil
  labels[:group_by] = group_value.to_s unless group_value.nil?

  labels
end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "securerandom"
4
+ require "time"
5
+
6
+ module E11y
7
+ module Middleware
8
+ # TraceContext middleware adds distributed tracing metadata to all events.
9
+ #
10
+ # This is the FIRST middleware in the pipeline (pre-processing zone),
11
+ # ensuring every event has tracing context for correlation across services.
12
+ #
13
+ # @see ADR-015 §3.1 Pipeline Flow
14
+ # @see ADR-005 Tracing Context Management
15
+ # @see UC-006 Trace Context Management
16
+ # @see UC-009 Multi-Service Tracing
17
+ #
18
+ # @example Automatic tracing metadata
19
+ # Events::OrderPaid.track(order_id: 123)
20
+ #
21
+ # # Event data after TraceContext middleware:
22
+ # {
23
+ # event_name: 'Events::OrderPaid',
24
+ # payload: { order_id: 123 },
25
+ # trace_id: '4bf92f3577b34da6a3ce929d0e0e4736', # 32-char hex
26
+ # span_id: '00f067aa0ba902b7', # 16-char hex
27
+ # timestamp: '2026-01-17T12:34:56.789Z' # ISO8601
28
+ # }
29
+ #
30
+ # @example Request-scoped tracing (propagation)
31
+ # # In Rails controller/middleware:
32
+ # Thread.current[:e11y_trace_id] = request.headers['X-Trace-ID']
33
+ #
34
+ # Events::OrderPaid.track(order_id: 123)
35
+ # # Uses propagated trace_id from thread-local storage
36
+ #
37
+ # @example Manual trace_id injection
38
+ # Events::OrderPaid.track(order_id: 123, trace_id: 'custom-trace-id')
39
+ # # Manual trace_id preserved (not overridden)
40
+ class TraceContext < Base
41
+ middleware_zone :pre_processing
42
+
43
# Enrich an event with tracing metadata, then pass it on.
#
# **Hybrid Tracing (C17 Resolution)** - every field is only filled in when
# the event does not already carry a truthy value for it:
# - trace_id: propagated from the current context, otherwise freshly generated
# - span_id: freshly generated
# - parent_trace_id: taken from the current context when one is set
#   (background jobs); never added otherwise
# - timestamp: current UTC time as an ISO8601 string
#
# @param event_data [Hash] The event data to enrich (mutated in place)
# @option event_data [String] :trace_id Existing trace ID (optional)
# @option event_data [String] :span_id Existing span ID (optional)
# @option event_data [String] :parent_trace_id Parent trace ID (optional)
# @option event_data [Time,String] :timestamp Existing timestamp (optional)
# @return [Object] Whatever the next middleware returns
def call(event_data)
  unless event_data[:trace_id]
    event_data[:trace_id] = current_trace_id || generate_trace_id
  end

  event_data[:span_id] = generate_span_id unless event_data[:span_id]

  # Only touch parent_trace_id when the context actually provides one.
  parent_trace_id = current_parent_trace_id
  event_data[:parent_trace_id] ||= parent_trace_id if parent_trace_id

  event_data[:timestamp] = format_timestamp(Time.now.utc) unless event_data[:timestamp]

  increment_metric("e11y.middleware.trace_context.processed")

  @app.call(event_data)
end
74
+
75
+ private
76
+
77
# Look up the trace ID of the current execution context.
#
# E11y::Current.trace_id takes precedence; the thread-local
# :e11y_trace_id is the fallback.
#
# @return [String, nil] Current trace ID if set, nil otherwise
def current_trace_id
  context_trace_id = E11y::Current.trace_id
  return context_trace_id if context_trace_id

  Thread.current[:e11y_trace_id]
end
85
+
86
# Read the parent trace ID from E11y::Current.
#
# Per the class documentation this is used to link a background job's
# events back to the trace that enqueued it.
#
# @return [String, nil] Parent trace ID if set, nil otherwise
def current_parent_trace_id
  E11y::Current.parent_trace_id
end
94
+
95
# Create a random trace ID: 16 random bytes rendered as 32 lowercase hex
# characters (the OpenTelemetry trace_id size).
#
# @return [String] New 32-character hexadecimal trace ID
def generate_trace_id
  SecureRandom.random_bytes(16).unpack1("H*")
end
103
+
104
# Create a random span ID: 8 random bytes rendered as 16 lowercase hex
# characters (the OpenTelemetry span_id size).
#
# @return [String] New 16-character hexadecimal span ID
def generate_span_id
  SecureRandom.random_bytes(8).unpack1("H*")
end
112
+
113
# Format a time as an ISO8601 UTC string with millisecond precision.
#
# Uses Time#getutc, which returns a new UTC Time. Time#utc would convert
# the caller's object in place (and raise on a frozen Time).
#
# @param time [Time] Time object to format (left unmodified)
# @return [String] ISO8601 formatted timestamp (e.g., "2026-01-17T12:34:56.789Z")
def format_timestamp(time)
  time.getutc.iso8601(3)
end
120
+
121
# Metrics hook - currently a no-op placeholder.
#
# @param _metric_name [String] Metric name (ignored for now)
# @return [nil]
def increment_metric(_metric_name)
  # TODO: Integrate with Yabeda/Prometheus in Phase 2
  # Yabeda.e11y.middleware_trace_context_processed.increment
  nil
end
129
+ end
130
+ end
131
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Middleware
5
+ # Validation middleware performs schema validation on event payloads.
6
+ #
7
+ # This middleware runs in the pre-processing zone, AFTER TraceContext and
8
+ # BEFORE PII filtering. It validates the event payload against the schema
9
+ # defined in the event class.
10
+ #
11
+ # **CRITICAL:** Validation MUST use the ORIGINAL class name (e.g., Events::OrderPaidV2),
12
+ # NOT the normalized name (Events::OrderPaid), because schemas are version-specific.
13
+ #
14
+ # @see ADR-015 §3.1 Pipeline Flow (line 96-97)
15
+ # @see ADR-015 §3.2 Why Each Middleware Needs Original Class Name (line 125)
16
+ # @see E11y::Event::Base#validate_payload! for validation logic
17
+ #
18
+ # @example Valid event passes through
19
+ # class Events::OrderPaid < E11y::Event::Base
20
+ # schema do
21
+ # required(:order_id).filled(:integer)
22
+ # end
23
+ # end
24
+ #
25
+ # event_data = {
26
+ # event_class: Events::OrderPaid,
27
+ # payload: { order_id: 123 }
28
+ # }
29
+ #
30
+ # # Validation passes ✅
31
+ # middleware.call(event_data) # → event_data (unchanged)
32
+ #
33
+ # @example Invalid event raises error
34
+ # event_data = {
35
+ # event_class: Events::OrderPaid,
36
+ # payload: { order_id: "invalid" } # ❌ Should be integer
37
+ # }
38
+ #
39
+ # middleware.call(event_data)
40
+ # # Raises E11y::ValidationError: "Validation failed for Events::OrderPaid: order_id must be an integer"
41
+ #
42
+ # @example Schema-less events pass through
43
+ # class Events::SimpleEvent < E11y::Event::Base
44
+ # # No schema defined
45
+ # end
46
+ #
47
+ # # Validation skipped (no schema) ✅
48
+ # middleware.call(event_data) # → event_data (unchanged)
49
+ class Validation < Base
50
+ middleware_zone :pre_processing
51
+
52
# Validate the event payload against the event class's compiled schema.
#
# Events without an :event_class or :payload, and events whose class has
# no compiled schema, are forwarded without validation.
#
# @param event_data [Hash] The event data to validate
# @option event_data [Class] :event_class The event class
# @option event_data [Hash] :payload The event payload
# @return [Object] Whatever the next middleware returns
# @raise [E11y::ValidationError] if the payload does not satisfy the schema
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def call(event_data)
  event_class = event_data[:event_class]
  payload = event_data[:payload]

  # Nothing to validate against.
  return @app.call(event_data) unless event_class && payload

  schema = event_class.compiled_schema

  # Schema-less events pass straight through.
  if !schema
    increment_metric("e11y.middleware.validation.skipped")
    return @app.call(event_data)
  end

  result = schema.call(payload)

  unless result.success?
    increment_metric("e11y.middleware.validation.failed")
    raise E11y::ValidationError, format_validation_errors(event_class, result.errors)
  end

  increment_metric("e11y.middleware.validation.passed")
  @app.call(event_data)
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
92
+
93
+ private
94
+
95
# Format validation errors into a human-readable message.
#
# Flat errors ({ field: ["msg", ...] }) render as "field: msg, msg".
# Nested errors (hashes under a key, e.g. from nested or array schemas)
# render with a dotted path such as "address.city: is missing". Previously
# a nested Hash value raised NoMethodError on `join`, hiding the real
# validation failure.
#
# @param event_class [Class] The event class
# @param errors [#to_h] Validation errors (e.g. Dry::Schema::MessageSet)
# @return [String] Formatted error message
def format_validation_errors(event_class, errors)
  error_details = flatten_validation_errors(errors.to_h).join("; ")

  "Validation failed for #{event_class.name}: #{error_details}"
end

# Recursively turn an error tree into "path: messages" strings.
#
# @param node [Hash, Array, Object] Current subtree of the error structure
# @param path [String, nil] Dotted path to this subtree (nil at the root)
# @param lines [Array<String>] Accumulator for formatted entries
# @return [Array<String>] The accumulator
def flatten_validation_errors(node, path = nil, lines = [])
  case node
  when Hash
    node.each do |key, child|
      flatten_validation_errors(child, path ? "#{path}.#{key}" : key.to_s, lines)
    end
  when Array
    # An array may mix plain messages with nested error structures.
    nested, messages = node.partition { |item| item.is_a?(Hash) || item.is_a?(Array) }
    lines << "#{path}: #{messages.join(', ')}" unless messages.empty?
    nested.each { |child| flatten_validation_errors(child, path, lines) }
  else
    lines << "#{path}: #{node}"
  end

  lines
end
107
+
108
# Metrics hook - currently a no-op placeholder.
#
# @param _metric_name [String] Metric name (ignored for now)
# @return [nil]
def increment_metric(_metric_name)
  # TODO: Integrate with Yabeda/Prometheus in Phase 2
  # Yabeda.e11y.middleware_validation_passed.increment
  nil
end
116
+ end
117
+ end
118
+ end