e11y 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +130 -10
  3. data/CHANGELOG.md +56 -1
  4. data/CLAUDE.md +168 -0
  5. data/CONTRIBUTING.md +640 -0
  6. data/README.md +134 -702
  7. data/RELEASE.md +18 -3
  8. data/Rakefile +108 -29
  9. data/config/README.md +1 -1
  10. data/config/loki-local-config.yaml +12 -0
  11. data/config/otel-collector-config.yaml +44 -0
  12. data/cucumber.yml +1 -0
  13. data/docker-compose.yml +18 -2
  14. data/docs/ADAPTERS.md +76 -0
  15. data/docs/ADAPTIVE_SAMPLING.md +59 -0
  16. data/docs/COMPARISON.md +104 -0
  17. data/docs/CONFIGURATION.md +52 -0
  18. data/docs/DISTRIBUTED_TRACING.md +44 -0
  19. data/docs/LIMITATIONS.md +13 -0
  20. data/docs/METRICS_DSL.md +84 -0
  21. data/docs/PERFORMANCE.md +60 -0
  22. data/docs/PII_FILTERING.md +40 -0
  23. data/docs/PRESETS.md +65 -0
  24. data/docs/QUICK-START.md +546 -587
  25. data/docs/RAILS_INTEGRATION.md +29 -0
  26. data/docs/SCHEMA_VALIDATION.md +63 -0
  27. data/docs/SLO-PROMQL-ALERTS.md +161 -0
  28. data/docs/TESTING.md +69 -0
  29. data/docs/{ADR-001-architecture.md → architecture/ADR-001-architecture.md} +35 -64
  30. data/docs/{ADR-002-metrics-yabeda.md → architecture/ADR-002-metrics-yabeda.md} +62 -236
  31. data/docs/{ADR-003-slo-observability.md → architecture/ADR-003-slo-observability.md} +27 -466
  32. data/docs/{ADR-004-adapter-architecture.md → architecture/ADR-004-adapter-architecture.md} +163 -146
  33. data/docs/{ADR-005-tracing-context.md → architecture/ADR-005-tracing-context.md} +10 -9
  34. data/docs/{ADR-006-security-compliance.md → architecture/ADR-006-security-compliance.md} +184 -191
  35. data/docs/{ADR-007-opentelemetry-integration.md → architecture/ADR-007-opentelemetry-integration.md} +3 -21
  36. data/docs/{ADR-008-rails-integration.md → architecture/ADR-008-rails-integration.md} +209 -339
  37. data/docs/{ADR-009-cost-optimization.md → architecture/ADR-009-cost-optimization.md} +45 -54
  38. data/docs/architecture/ADR-010-developer-experience.md +522 -0
  39. data/docs/{ADR-011-testing-strategy.md → architecture/ADR-011-testing-strategy.md} +41 -83
  40. data/docs/{ADR-013-reliability-error-handling.md → architecture/ADR-013-reliability-error-handling.md} +37 -12
  41. data/docs/{ADR-014-event-driven-slo.md → architecture/ADR-014-event-driven-slo.md} +12 -24
  42. data/docs/{ADR-015-middleware-order.md → architecture/ADR-015-middleware-order.md} +23 -41
  43. data/docs/{ADR-016-self-monitoring-slo.md → architecture/ADR-016-self-monitoring-slo.md} +52 -349
  44. data/docs/{ADR-017-multi-rails-compatibility.md → architecture/ADR-017-multi-rails-compatibility.md} +4 -11
  45. data/docs/architecture/ADR-018-memory-optimization.md +366 -0
  46. data/docs/{ADR-INDEX.md → architecture/ADR-INDEX.md} +11 -6
  47. data/docs/{00-ICP-AND-TIMELINE.md → prd/00-ICP-AND-TIMELINE.md} +6 -6
  48. data/docs/{01-SCALE-REQUIREMENTS.md → prd/01-SCALE-REQUIREMENTS.md} +6 -6
  49. data/docs/prd/01-overview-vision.md +19 -14
  50. data/docs/use_cases/README.md +22 -23
  51. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +50 -44
  52. data/docs/use_cases/UC-002-business-event-tracking.md +26 -95
  53. data/docs/use_cases/UC-003-event-metrics.md +66 -0
  54. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +42 -101
  55. data/docs/use_cases/UC-005-sentry-integration.md +13 -15
  56. data/docs/use_cases/UC-006-trace-context-management.md +30 -28
  57. data/docs/use_cases/UC-007-pii-filtering.md +35 -87
  58. data/docs/use_cases/UC-008-opentelemetry-integration.md +51 -89
  59. data/docs/use_cases/UC-009-multi-service-tracing.md +4 -4
  60. data/docs/use_cases/UC-010-background-job-tracking.md +5 -5
  61. data/docs/use_cases/UC-011-rate-limiting.md +95 -168
  62. data/docs/use_cases/UC-012-audit-trail.md +21 -46
  63. data/docs/use_cases/UC-013-high-cardinality-protection.md +29 -167
  64. data/docs/use_cases/UC-014-adaptive-sampling.md +2 -2
  65. data/docs/use_cases/UC-015-cost-optimization.md +46 -99
  66. data/docs/use_cases/UC-016-rails-logger-migration.md +39 -213
  67. data/docs/use_cases/UC-017-local-development.md +203 -777
  68. data/docs/use_cases/UC-018-testing-events.md +3 -3
  69. data/docs/use_cases/UC-019-retention-based-routing.md +53 -106
  70. data/docs/use_cases/UC-020-event-versioning.md +8 -9
  71. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +18 -22
  72. data/docs/use_cases/UC-022-event-registry.md +15 -21
  73. data/docs/use_cases/backlog.md +119 -87
  74. data/e11y.gemspec +2 -2
  75. data/gems/e11y-devtools/README.md +136 -0
  76. data/gems/e11y-devtools/config/routes.rb +8 -0
  77. data/gems/e11y-devtools/e11y-devtools.gemspec +25 -0
  78. data/gems/e11y-devtools/exe/e11y +34 -0
  79. data/gems/e11y-devtools/lib/e11y/devtools/mcp/server.rb +96 -0
  80. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tool_base.rb +25 -0
  81. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/clear.rb +31 -0
  82. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/errors.rb +35 -0
  83. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/event_detail.rb +33 -0
  84. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/events_by_trace.rb +33 -0
  85. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/interactions.rb +40 -0
  86. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/recent_events.rb +34 -0
  87. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/search.rb +34 -0
  88. data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/stats.rb +30 -0
  89. data/gems/e11y-devtools/lib/e11y/devtools/overlay/assets/overlay.js +115 -0
  90. data/gems/e11y-devtools/lib/e11y/devtools/overlay/controller.rb +54 -0
  91. data/gems/e11y-devtools/lib/e11y/devtools/overlay/engine.rb +26 -0
  92. data/gems/e11y-devtools/lib/e11y/devtools/overlay/middleware.rb +80 -0
  93. data/gems/e11y-devtools/lib/e11y/devtools/overlay/rails_controller.rb +42 -0
  94. data/gems/e11y-devtools/lib/e11y/devtools/tui/app.rb +262 -0
  95. data/gems/e11y-devtools/lib/e11y/devtools/tui/grouping.rb +66 -0
  96. data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_detail.rb +62 -0
  97. data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_list.rb +70 -0
  98. data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/interaction_list.rb +47 -0
  99. data/gems/e11y-devtools/lib/e11y/devtools/version.rb +8 -0
  100. data/gems/e11y-devtools/lib/e11y/devtools.rb +13 -0
  101. data/gems/e11y-devtools/spec/e11y/devtools/mcp/tools_spec.rb +107 -0
  102. data/gems/e11y-devtools/spec/e11y/devtools/overlay/controller_spec.rb +58 -0
  103. data/gems/e11y-devtools/spec/e11y/devtools/overlay/middleware_spec.rb +46 -0
  104. data/gems/e11y-devtools/spec/e11y/devtools/tui/app_spec.rb +85 -0
  105. data/gems/e11y-devtools/spec/e11y/devtools/tui/grouping_spec.rb +64 -0
  106. data/gems/e11y-devtools/spec/spec_helper.rb +5 -0
  107. data/gems/e11y-devtools/spec/tui/widgets/event_list_spec.rb +44 -0
  108. data/gems/e11y-devtools/spec/tui/widgets/interaction_list_spec.rb +62 -0
  109. data/lib/e11y/adapters/audit_encrypted.rb +53 -11
  110. data/lib/e11y/adapters/base.rb +33 -34
  111. data/lib/e11y/adapters/dev_log/file_store.rb +143 -0
  112. data/lib/e11y/adapters/dev_log/query.rb +219 -0
  113. data/lib/e11y/adapters/dev_log.rb +118 -0
  114. data/lib/e11y/adapters/file.rb +3 -6
  115. data/lib/e11y/adapters/in_memory.rb +52 -5
  116. data/lib/e11y/adapters/in_memory_test.rb +29 -0
  117. data/lib/e11y/adapters/loki.rb +58 -23
  118. data/lib/e11y/adapters/null.rb +82 -0
  119. data/lib/e11y/adapters/opentelemetry_collector.rb +183 -0
  120. data/lib/e11y/adapters/otel_logs.rb +136 -23
  121. data/lib/e11y/adapters/sentry.rb +4 -7
  122. data/lib/e11y/adapters/stdout.rb +73 -7
  123. data/lib/e11y/adapters/yabeda.rb +153 -29
  124. data/lib/e11y/buffers/adaptive_buffer.rb +3 -17
  125. data/lib/e11y/buffers/{request_scoped_buffer.rb → ephemeral_buffer.rb} +72 -58
  126. data/lib/e11y/buffers/ring_buffer.rb +3 -16
  127. data/lib/e11y/configuration.rb +272 -0
  128. data/lib/e11y/console.rb +10 -17
  129. data/lib/e11y/current.rb +53 -1
  130. data/lib/e11y/debug/pipeline_inspector.rb +96 -0
  131. data/lib/e11y/documentation/generator.rb +48 -0
  132. data/lib/e11y/event/base.rb +176 -82
  133. data/lib/e11y/event/value_sampling_config.rb +1 -5
  134. data/lib/e11y/events/rails/database/query.rb +1 -4
  135. data/lib/e11y/events/rails/job/failed.rb +2 -0
  136. data/lib/e11y/instruments/active_job.rb +46 -12
  137. data/lib/e11y/instruments/rails_instrumentation.rb +49 -24
  138. data/lib/e11y/instruments/sidekiq.rb +137 -31
  139. data/lib/e11y/linters/base.rb +11 -0
  140. data/lib/e11y/linters/pii/pii_declaration_linter.rb +120 -0
  141. data/lib/e11y/linters/slo/config_consistency_linter.rb +76 -0
  142. data/lib/e11y/linters/slo/explicit_declaration_linter.rb +36 -0
  143. data/lib/e11y/linters/slo/slo_status_from_linter.rb +41 -0
  144. data/lib/e11y/logger/bridge.rb +26 -7
  145. data/lib/e11y/metrics/cardinality_protection.rb +10 -15
  146. data/lib/e11y/metrics/cardinality_tracker.rb +16 -6
  147. data/lib/e11y/metrics/registry.rb +3 -5
  148. data/lib/e11y/metrics/test_backend.rb +62 -0
  149. data/lib/e11y/metrics.rb +56 -10
  150. data/lib/e11y/middleware/adapter_resolver.rb +40 -0
  151. data/lib/e11y/middleware/audit_signing.rb +43 -6
  152. data/lib/e11y/middleware/baggage_protection.rb +75 -0
  153. data/lib/e11y/middleware/dev_log_source.rb +24 -0
  154. data/lib/e11y/middleware/event_slo.rb +23 -9
  155. data/lib/e11y/middleware/otel_span.rb +23 -0
  156. data/lib/e11y/middleware/pii_filter.rb +104 -75
  157. data/lib/e11y/middleware/rate_limiting.rb +54 -27
  158. data/lib/e11y/middleware/request.rb +70 -23
  159. data/lib/e11y/middleware/routing.rb +78 -21
  160. data/lib/e11y/middleware/sampling.rb +66 -17
  161. data/lib/e11y/middleware/self_monitoring_emit.rb +39 -0
  162. data/lib/e11y/middleware/trace_context.rb +45 -10
  163. data/lib/e11y/middleware/track_latency.rb +34 -0
  164. data/lib/e11y/middleware/validation.rb +7 -16
  165. data/lib/e11y/middleware/versioning.rb +26 -22
  166. data/lib/e11y/opentelemetry/semantic_conventions.rb +109 -0
  167. data/lib/e11y/opentelemetry/span_creator.rb +142 -0
  168. data/lib/e11y/pii/patterns.rb +12 -1
  169. data/lib/e11y/pipeline/builder.rb +1 -1
  170. data/lib/e11y/presets/audit_event.rb +13 -2
  171. data/lib/e11y/railtie.rb +52 -15
  172. data/lib/e11y/registry.rb +306 -0
  173. data/lib/e11y/reliability/circuit_breaker.rb +19 -21
  174. data/lib/e11y/reliability/dlq/base.rb +71 -0
  175. data/lib/e11y/reliability/dlq/file_adapter.rb +301 -0
  176. data/lib/e11y/reliability/dlq/file_storage.rb +63 -34
  177. data/lib/e11y/reliability/dlq/filter.rb +37 -54
  178. data/lib/e11y/reliability/retry_handler.rb +26 -29
  179. data/lib/e11y/reliability/retry_rate_limiter.rb +3 -11
  180. data/lib/e11y/sampling/error_spike_detector.rb +0 -2
  181. data/lib/e11y/sampling/load_monitor.rb +5 -9
  182. data/lib/e11y/sampling/stratified_tracker.rb +18 -0
  183. data/lib/e11y/self_monitoring/buffer_monitor.rb +2 -0
  184. data/lib/e11y/self_monitoring/performance_monitor.rb +19 -61
  185. data/lib/e11y/self_monitoring/reliability_monitor.rb +4 -74
  186. data/lib/e11y/slo/config_loader.rb +40 -0
  187. data/lib/e11y/slo/config_validator.rb +58 -0
  188. data/lib/e11y/slo/dashboard_generator.rb +122 -0
  189. data/lib/e11y/slo/event_driven.rb +8 -0
  190. data/lib/e11y/slo/tracker.rb +31 -4
  191. data/lib/e11y/testing/have_tracked_event_matcher.rb +190 -0
  192. data/lib/e11y/testing/rspec_matchers.rb +21 -0
  193. data/lib/e11y/testing/snapshot_matcher.rb +86 -0
  194. data/lib/e11y/trace_context/sampler.rb +35 -0
  195. data/lib/e11y/tracing/faraday_middleware.rb +31 -0
  196. data/lib/e11y/tracing/net_http_patch.rb +33 -0
  197. data/lib/e11y/tracing/propagator.rb +116 -0
  198. data/lib/e11y/tracing.rb +47 -0
  199. data/lib/e11y/version.rb +1 -1
  200. data/lib/e11y/versioning/version_extractor.rb +32 -0
  201. data/lib/e11y.rb +141 -265
  202. data/lib/generators/e11y/event/event_generator.rb +22 -0
  203. data/lib/generators/e11y/event/templates/event.rb.tt +16 -0
  204. data/lib/generators/e11y/grafana_dashboard/grafana_dashboard_generator.rb +30 -0
  205. data/lib/generators/e11y/grafana_dashboard/templates/e11y_dashboard.json +81 -0
  206. data/lib/generators/e11y/install/install_generator.rb +34 -0
  207. data/lib/generators/e11y/install/templates/e11y.rb +239 -0
  208. data/lib/generators/e11y/prometheus_alerts/prometheus_alerts_generator.rb +29 -0
  209. data/lib/generators/e11y/prometheus_alerts/templates/e11y_alerts.yml +28 -0
  210. data/lib/tasks/e11y_docs.rake +30 -0
  211. data/lib/tasks/e11y_events.rake +71 -0
  212. data/lib/tasks/e11y_lint.rake +91 -0
  213. data/lib/tasks/e11y_slo.rake +29 -0
  214. metadata +129 -39
  215. data/docs/ADR-010-developer-experience.md +0 -2166
  216. data/docs/API-REFERENCE-L28.md +0 -914
  217. data/docs/COMPREHENSIVE-CONFIGURATION.md +0 -2366
  218. data/docs/CONTRIBUTING.md +0 -312
  219. data/docs/IMPLEMENTATION_NOTES.md +0 -2804
  220. data/docs/IMPLEMENTATION_PLAN.md +0 -1971
  221. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +0 -586
  222. data/docs/PLAN.md +0 -148
  223. data/docs/README.md +0 -296
  224. data/docs/design/00-memory-optimization.md +0 -593
  225. data/docs/guides/MIGRATION-L27-L28.md +0 -692
  226. data/docs/guides/PERFORMANCE-BENCHMARKS.md +0 -434
  227. data/docs/guides/README.md +0 -44
  228. data/docs/use_cases/UC-003-pattern-based-metrics.md +0 -1627
  229. data/lib/e11y/adapters/registry.rb +0 -141
  230. /data/docs/{ADR-012-event-evolution.md → architecture/ADR-012-event-evolution.md} +0 -0
@@ -30,7 +30,7 @@ module E11y
30
30
  #
31
31
  # @example Critical Event Bypass (C02)
32
32
  # # Payment events bypass rate limiting → DLQ if limited
33
- # config.dlq_filter.always_save_patterns = [/^payment\./]
33
+ # config.dlq_filter.should_save?(event_data) # Event DSL: use_dlq
34
34
  #
35
35
  # # Result: Rate-limited payment events go to DLQ, not dropped
36
36
  #
@@ -40,22 +40,33 @@ module E11y
40
40
  # Initialize rate limiting middleware
41
41
  #
42
42
  # @param app [Object] Next middleware in pipeline
43
- # @param global_limit [Integer] Max events/sec globally (default: 10_000)
44
- # @param per_event_limit [Integer] Max events/sec per event type (default: 1_000)
45
- # @param window [Float] Time window in seconds (default: 1.0)
46
- def initialize(app, global_limit: 10_000, per_event_limit: 1_000, window: 1.0)
43
+ # @param global_limit [Integer] Max events/sec globally (default: from E11y.config)
44
+ # @param per_event_limit [Integer] Max events/sec per event type (default: from E11y.config)
45
+ # @param window [Float] Time window in seconds (default: from E11y.config)
46
+ def initialize(app, global_limit: nil, per_event_limit: nil, window: nil)
47
47
  super(app)
48
- @global_limit = global_limit
49
- @per_event_limit = per_event_limit
50
- @window = window
48
+ config = E11y.config
49
+ # When explicit limits are passed (e.g. from pipeline options), enable for this instance
50
+ explicit_opts = global_limit || per_event_limit || window
51
+ @enabled = explicit_opts ? true : config.rate_limiting_enabled
52
+ @global_limit = global_limit || config.rate_limiting_global_limit
53
+ @global_window = window || config.rate_limiting_global_window
54
+ @window = @global_window # Alias for spec compatibility
55
+ @per_event_limit = per_event_limit || config.rate_limiting_per_event_limit
56
+ @explicit_per_event = per_event_limit && window
51
57
 
52
58
  # Token buckets for rate limiting
53
- @global_bucket = TokenBucket.new(capacity: @global_limit, refill_rate: @global_limit, window: @window)
59
+ @global_bucket = TokenBucket.new(
60
+ capacity: @global_limit,
61
+ refill_rate: @global_limit,
62
+ window: @global_window
63
+ )
54
64
  @per_event_buckets = Hash.new do |hash, event_name|
65
+ limit_cfg = @explicit_per_event ? { limit: @per_event_limit, window: @window } : config.rate_limit_for(event_name)
55
66
  hash[event_name] = TokenBucket.new(
56
- capacity: @per_event_limit,
57
- refill_rate: @per_event_limit,
58
- window: @window
67
+ capacity: limit_cfg[:limit],
68
+ refill_rate: limit_cfg[:limit],
69
+ window: limit_cfg[:window]
59
70
  )
60
71
  end
61
72
 
@@ -67,6 +78,8 @@ module E11y
67
78
  # @param event_data [Hash] Event payload
68
79
  # @return [Hash, nil] Event data if allowed, nil if rate limited
69
80
  def call(event_data)
81
+ return @app.call(event_data) unless @enabled
82
+
70
83
  event_name = event_data[:event_name]
71
84
 
72
85
  # Check global rate limit
@@ -83,7 +96,7 @@ module E11y
83
96
  end
84
97
 
85
98
  # Rate limit not exceeded - continue pipeline
86
- event_data
99
+ @app.call(event_data)
87
100
  end
88
101
 
89
102
  private
@@ -97,16 +110,31 @@ module E11y
97
110
  def handle_rate_limited(event_data, limit_type)
98
111
  event_name = event_data[:event_name]
99
112
 
100
- # Log rate limiting
101
- warn "[E11y] Rate limit exceeded (#{limit_type}) for event: #{event_name}"
113
+ # Log rate limiting (via E11y.logger so it respects Rails.logger in test env)
114
+ E11y.logger&.warn("[E11y] Rate limit exceeded (#{limit_type}) for event: #{event_name}")
102
115
 
103
116
  # C02 Resolution: Check if event should be saved to DLQ
104
- return unless should_save_to_dlq?(event_data)
105
-
106
- save_to_dlq(event_data, limit_type)
117
+ if should_save_to_dlq?(event_data)
118
+ record_dropped_metric(event_data, "rate_limited_#{limit_type}_dlq")
119
+ save_to_dlq(event_data, limit_type)
120
+ else
121
+ record_dropped_metric(event_data, "rate_limited_#{limit_type}")
122
+ end
123
+ end
107
124
 
108
- # Non-critical events are dropped (no DLQ)
109
- # TODO: Track metric e11y.rate_limiter.dropped
125
+ # Record e11y_events_dropped_total metric (non-fatal, safe when Metrics unavailable)
126
+ #
127
+ # @param event_data [Hash] Event payload
128
+ # @param reason [String] Drop reason (e.g., sampled_out, rate_limited_global)
129
+ def record_dropped_metric(event_data, reason)
130
+ return unless defined?(E11y::Metrics) && E11y::Metrics.respond_to?(:increment)
131
+
132
+ E11y::Metrics.increment(:e11y_events_dropped_total, {
133
+ reason: reason,
134
+ event_type: event_data[:event_name].to_s
135
+ })
136
+ rescue StandardError
137
+ # non-fatal
110
138
  end
111
139
 
112
140
  # Check if rate-limited event should be saved to DLQ (C02 Resolution)
@@ -120,9 +148,8 @@ module E11y
120
148
  dlq_filter = E11y.config.dlq_filter
121
149
  return false unless dlq_filter
122
150
 
123
- # Check if event matches always_save_patterns
124
- event_name = event_data[:event_name]
125
- dlq_filter.always_save_patterns&.any? { |pattern| pattern.match?(event_name) }
151
+ # Use DLQ filter (Event DSL: use_dlq, severity, default)
152
+ dlq_filter.should_save?(event_data)
126
153
  end
127
154
 
128
155
  # Save rate-limited critical event to DLQ (C02 Resolution)
@@ -135,19 +162,19 @@ module E11y
135
162
  dlq_storage = E11y.config.dlq_storage
136
163
  return unless dlq_storage
137
164
 
165
+ per_event_limit = limit_type == :per_event ? E11y.config.rate_limit_for(event_data[:event_name])[:limit] : @per_event_limit
138
166
  dlq_storage.save(event_data, metadata: {
139
167
  reason: "rate_limited_#{limit_type}",
140
168
  limit_type: limit_type,
141
169
  global_limit: @global_limit,
142
- per_event_limit: @per_event_limit,
170
+ per_event_limit: per_event_limit,
143
171
  timestamp: Time.now.utc.iso8601
144
172
  })
145
173
 
146
- warn "[E11y] Rate-limited critical event saved to DLQ: #{event_data[:event_name]}"
147
- # TODO: Track metric e11y.rate_limiter.dlq_saved
174
+ E11y.logger&.warn("[E11y] Rate-limited critical event saved to DLQ: #{event_data[:event_name]}")
148
175
  rescue StandardError => e
149
176
  # Don't fail if DLQ save fails (C18 Resolution)
150
- warn "[E11y] Failed to save rate-limited event to DLQ: #{e.message}"
177
+ E11y.logger&.warn("[E11y] Failed to save rate-limited event to DLQ: #{e.message}")
151
178
  end
152
179
 
153
180
  # Token Bucket implementation for rate limiting
@@ -2,6 +2,8 @@
2
2
 
3
3
  require "rack/request"
4
4
  require "securerandom"
5
+ require "e11y/tracing/propagator"
6
+ require "e11y/trace_context/sampler"
5
7
 
6
8
  module E11y
7
9
  module Middleware
@@ -32,13 +34,14 @@ module E11y
32
34
  # Process request
33
35
  # @param env [Hash] Rack environment
34
36
  # @return [Array] Rack response [status, headers, body]
35
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
37
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
36
38
  # Rack middleware request processing requires sequential setup of tracing, context, buffer, and SLO tracking
37
39
  def call(env)
38
40
  request = Rack::Request.new(env)
39
41
 
40
- # Extract or generate trace_id
41
- trace_id = extract_trace_id(request) || generate_trace_id
42
+ # Extract or generate trace context (trace_id, sampled from traceparent)
43
+ trace_ctx = extract_trace_context(request)
44
+ trace_id = trace_ctx[:trace_id] || generate_trace_id
42
45
  span_id = generate_span_id
43
46
 
44
47
  # Set request context (ActiveSupport::CurrentAttributes)
@@ -50,9 +53,10 @@ module E11y
50
53
  E11y::Current.user_agent = request.user_agent
51
54
  E11y::Current.request_method = request.request_method
52
55
  E11y::Current.request_path = request.path
56
+ E11y::Current.sampled = resolve_sampled(trace_ctx)
53
57
 
54
58
  # Start request-scoped buffer (for debug events)
55
- E11y::Buffers::RequestScopedBuffer.initialize! if E11y.config.request_buffer&.enabled
59
+ E11y::Buffers::EphemeralBuffer.initialize! if E11y.config.ephemeral_buffer_enabled
56
60
 
57
61
  # Track request start time for SLO
58
62
  start_time = Time.now
@@ -60,6 +64,9 @@ module E11y
60
64
  # Call next middleware/app
61
65
  status, headers, body = @app.call(env)
62
66
 
67
+ # Flush buffer if status matches configured flush_on_statuses (default: 5xx only)
68
+ E11y::Buffers::EphemeralBuffer.flush_on_error if should_flush_buffer?(status)
69
+
63
70
  # Track SLO metrics (if enabled)
64
71
  track_http_request_slo(env, status, start_time)
65
72
 
@@ -70,38 +77,80 @@ module E11y
70
77
  [status, headers, body]
71
78
  rescue StandardError
72
79
  # Flush request buffer on error (includes debug events)
73
- E11y::Buffers::RequestScopedBuffer.flush_on_error if E11y.config.request_buffer&.enabled
80
+ E11y::Buffers::EphemeralBuffer.flush_on_error if E11y.config.ephemeral_buffer_enabled
74
81
 
75
82
  raise # Re-raise original exception
76
83
  ensure
77
84
  # Discard request buffer on success (not on error, already flushed above)
78
85
  # We need to check if we're here from normal completion or exception
79
86
  # If there was an exception, buffer was already flushed in rescue block
80
- if !$ERROR_INFO && E11y.config.request_buffer&.enabled # No exception occurred
81
- E11y::Buffers::RequestScopedBuffer.discard
82
- end
87
+ E11y::Buffers::EphemeralBuffer.discard if !$ERROR_INFO && E11y.config.ephemeral_buffer_enabled # No exception occurred
83
88
 
84
89
  # Reset context
85
90
  E11y::Current.reset
86
91
  end
87
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
92
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
88
93
 
89
94
  private
90
95
 
91
- # Extract trace_id from request headers (W3C Trace Context or custom headers)
96
+ # Determine whether the request-scoped buffer should be flushed for this status code.
97
+ #
98
+ # Two independent conditions (either is sufficient):
99
+ # - +flush_on_error+ (default: true) — flushes on any 5xx server error
100
+ # - +flush_on_statuses+ (default: []) — extra status codes/ranges, e.g. [403]
101
+ #
102
+ # @example Default behaviour — flush on 5xx only
103
+ # config.ephemeral_buffer_flush_on_error = true # default
104
+ # config.ephemeral_buffer_flush_on_statuses = [] # default
105
+ #
106
+ # @example Flush on 403 in addition to 5xx
107
+ # config.ephemeral_buffer_flush_on_statuses = [403]
108
+ #
109
+ # @example Flush only on explicit statuses (disable 5xx default)
110
+ # config.ephemeral_buffer_flush_on_error = false
111
+ # config.ephemeral_buffer_flush_on_statuses = [403, 422]
112
+ #
113
+ # @param status [Integer] HTTP response status code
114
+ # @return [Boolean]
115
+ def should_flush_buffer?(status)
116
+ return false unless E11y.config.ephemeral_buffer_enabled
117
+
118
+ # Condition 1: server error flush (5xx)
119
+ return true if E11y.config.ephemeral_buffer_flush_on_error && status >= 500
120
+
121
+ # Condition 2: explicit extra statuses
122
+ extra = E11y.config.ephemeral_buffer_flush_on_statuses
123
+ extra&.any? { |s| s === status } || false # rubocop:disable Style/CaseEquality
124
+ end
125
+
126
+ # Extract trace context from request headers (W3C Trace Context or custom).
127
+ # Also extracts tracestate into E11y::Current.baggage (F-014).
92
128
  # @param request [Rack::Request] Rack request
93
- # @return [String, nil] Trace ID or nil if not found
94
- def extract_trace_id(request)
95
- # W3C Trace Context (traceparent header)
96
- # Format: version-trace_id-span_id-flags
97
- # Example: 00-0af7651916cd43dd8448eb211c80319c-00f067aa0ba902b7-01
129
+ # @return [Hash] { trace_id:, sampled: (from traceparent, or nil if new trace) }
130
+ def extract_trace_context(request)
98
131
  traceparent = request.get_header("HTTP_TRACEPARENT")
99
- return traceparent.split("-")[1] if traceparent
132
+ tracestate = request.get_header("HTTP_TRACESTATE")
133
+
134
+ if tracestate && E11y::Current.respond_to?(:baggage=)
135
+ baggage = E11y::Tracing::Propagator.parse_tracestate(tracestate)
136
+ E11y::Current.baggage = baggage if baggage.any?
137
+ end
138
+
139
+ if traceparent
140
+ parsed = E11y::Tracing::Propagator.parse(traceparent)
141
+ return { trace_id: parsed[:trace_id], sampled: parsed[:sampled] } if parsed
142
+ end
143
+
144
+ trace_id = request.get_header("HTTP_X_REQUEST_ID") || request.get_header("HTTP_X_TRACE_ID")
145
+ { trace_id: trace_id, sampled: nil }
146
+ end
147
+
148
+ # Resolve sampling decision: from parent (traceparent) or Sampler for new trace.
149
+ # Context for Sampler = E11y::Current.to_context (already set above).
150
+ def resolve_sampled(trace_ctx)
151
+ return trace_ctx[:sampled] if trace_ctx.key?(:sampled) && !trace_ctx[:sampled].nil?
100
152
 
101
- # X-Request-ID (Rails default)
102
- request.get_header("HTTP_X_REQUEST_ID") ||
103
- # X-Trace-Id (custom)
104
- request.get_header("HTTP_X_TRACE_ID")
153
+ E11y::TraceContext::Sampler.should_sample?(E11y::Current.to_context)
105
154
  end
106
155
 
107
156
  # Extract request_id from Rack env
@@ -141,10 +190,9 @@ module E11y
141
190
  # @param start_time [Time] Request start time
142
191
  # @return [void]
143
192
  # @api private
144
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
145
193
  # SLO tracking requires extracting controller/action, calculating duration, and error handling
146
194
  def track_http_request_slo(env, status, start_time)
147
- return unless E11y.config.slo_tracking&.enabled
195
+ return unless E11y.config.respond_to?(:slo_tracking_enabled) && E11y.config.slo_tracking_enabled
148
196
 
149
197
  duration_ms = ((Time.now - start_time) * 1000).round(2)
150
198
 
@@ -163,7 +211,6 @@ module E11y
163
211
  # Don't fail if SLO tracking fails
164
212
  warn "[E11y] SLO tracking error: #{e.message}"
165
213
  end
166
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity
167
214
  end
168
215
  end
169
216
  end
@@ -40,13 +40,8 @@ module E11y
40
40
  # # Rule: ->(e) { :audit_encrypted if e[:audit_event] }
41
41
  # # Routes to: [:audit_encrypted]
42
42
  #
43
- # @example Retention-based routing
44
- # event_data = {
45
- # event_name: 'order.placed',
46
- # retention_until: '2026-04-21T...' # 90 days
47
- # }
48
- # # Rule: ->(e) { days > 30 ? :s3_standard : :loki }
49
- # # Routes to: [:s3_standard]
43
+ # Note: retention_until is for archival jobs (run separately), not for routing.
44
+ # Archival happens later — cron/Loki compaction filters by retention_until.
50
45
  class Routing < Base
51
46
  middleware_zone :adapters
52
47
 
@@ -58,10 +53,23 @@ module E11y
58
53
  # @option event_data [Boolean] :audit_event Audit event flag (optional, for routing rules)
59
54
  # @option event_data [Symbol] :severity Event severity (optional, for routing rules)
60
55
  # @return [Hash, nil] Event data (passed to next middleware), or nil if dropped
61
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
56
+ # rubocop:disable Metrics/PerceivedComplexity
62
57
  # Routing logic requires adapter selection, iteration with error handling,
63
58
  # metadata enrichment, and metrics tracking
64
59
  def call(event_data)
60
+ # Handle nil from upstream middleware (e.g., rate limiting, sampling)
61
+ return nil unless event_data
62
+
63
+ # 0. Request-scoped buffer: buffer debug events instead of writing when enabled
64
+ # Skip when event is from a flush (avoid re-buffering)
65
+ if !event_data[:from_ephemeral_buffer_flush] &&
66
+ event_data[:severity] == :debug &&
67
+ E11y.config.ephemeral_buffer_enabled &&
68
+ E11y::Buffers::EphemeralBuffer.active? && E11y::Buffers::EphemeralBuffer.add_event(event_data)
69
+ # Buffered — skip adapter writes, pass through
70
+ return @app&.call(event_data)
71
+ end
72
+
65
73
  # 1. Determine target adapters (explicit or via routing rules)
66
74
  target_adapters = if event_data[:adapters]&.any?
67
75
  # Explicit adapters bypass routing rules
@@ -71,18 +79,28 @@ module E11y
71
79
  apply_routing_rules(event_data)
72
80
  end
73
81
 
82
+ # 1.5. Validate audit events have proper routing (UC-012 compliance requirement)
83
+ validate_audit_routing!(event_data, target_adapters)
84
+
74
85
  # 2. Write to selected adapters
75
86
  target_adapters.each do |adapter_name|
76
87
  adapter = E11y.configuration.adapters[adapter_name]
77
88
  next unless adapter
78
89
 
90
+ # Per-adapter payload: merge payload_rewrites only when present (explicit_pii exclude_adapters)
91
+ data_to_write = if event_data[:payload_rewrites] && event_data[:payload_rewrites][adapter_name]
92
+ payload = event_data[:payload]&.dup || {}
93
+ payload.merge!(event_data[:payload_rewrites][adapter_name])
94
+ event_data.merge(payload: payload)
95
+ else
96
+ event_data
97
+ end
98
+
79
99
  begin
80
- adapter.write(event_data)
81
- increment_metric("e11y.middleware.routing.write_success", adapter: adapter_name)
100
+ adapter.write(data_to_write)
82
101
  rescue StandardError => e
83
102
  # Log routing error but don't fail pipeline
84
103
  warn "E11y routing error for adapter #{adapter_name}: #{e.message}"
85
- increment_metric("e11y.middleware.routing.write_error", adapter: adapter_name)
86
104
  end
87
105
  end
88
106
 
@@ -94,9 +112,9 @@ module E11y
94
112
  }
95
113
 
96
114
  # 4. Increment metrics
97
- increment_metric("e11y.middleware.routing.routed",
98
- adapters_count: target_adapters.size,
99
- routing_type: event_data[:routing][:routing_type])
115
+ E11y::Metrics.increment("e11y.middleware.routing.routed",
116
+ adapters_count: target_adapters.size,
117
+ routing_type: event_data[:routing][:routing_type])
100
118
 
101
119
  # 5. Log routing decision (for debugging)
102
120
  log_routing_decision(event_data, target_adapters) if debug_enabled?
@@ -104,7 +122,7 @@ module E11y
104
122
  # 6. Pass to next app (if any)
105
123
  @app&.call(event_data)
106
124
  end
107
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
125
+ # rubocop:enable Metrics/PerceivedComplexity
108
126
 
109
127
  private
110
128
 
@@ -124,12 +142,12 @@ module E11y
124
142
  # ->(event) { :audit_encrypted if event[:audit_event] },
125
143
  # ->(event) {
126
144
  # days = (Time.parse(event[:retention_until]) - Time.now) / 86400
127
- # days > 90 ? :s3_glacier : :loki
145
+ # days > 90 ? :archive : :loki
128
146
  # }
129
147
  # ]
130
148
  #
131
149
  # apply_routing_rules(event_data)
132
- # # => [:audit_encrypted] or [:loki] or [:s3_glacier]
150
+ # # => [:audit_encrypted] or [:loki] or [:archive]
133
151
  def apply_routing_rules(event_data)
134
152
  matched_adapters = []
135
153
 
@@ -143,10 +161,12 @@ module E11y
143
161
  warn "E11y routing rule error: #{e.message}"
144
162
  end
145
163
 
146
- # Return unique adapters or fallback
164
+ # Track whether fallback was used (for audit validation)
147
165
  if matched_adapters.any?
166
+ event_data[:routing_used_fallback] = false
148
167
  matched_adapters.uniq
149
168
  else
169
+ event_data[:routing_used_fallback] = true
150
170
  E11y.configuration.fallback_adapters || [:stdout]
151
171
  end
152
172
  end
@@ -175,9 +195,46 @@ module E11y
175
195
  # @param metric_name [String] Metric name
176
196
  # @param tags [Hash] Metric tags
177
197
  # @return [void]
178
- def increment_metric(_metric_name, **_tags)
179
- # TODO: Integrate with Yabeda/Prometheus
180
- # Yabeda.e11y.middleware_routing_routed.increment(tags)
198
+ # Validate audit events have proper routing configuration.
199
+ #
200
+ # Audit events MUST be routed via explicit adapters OR routing rules.
201
+ # Relying on fallback routing (no rule matched) is a compliance configuration error.
202
+ #
203
+ # @param event_data [Hash] Event data
204
+ # @param target_adapters [Array<Symbol>] Target adapters
205
+ # @raise [E11y::Error] if audit event misconfigured
206
+ # @return [void]
207
+ def validate_audit_routing!(event_data, target_adapters)
208
+ return unless event_data[:audit_event]
209
+
210
+ # Audit events are valid if:
211
+ # 1. They have explicit adapters (non-empty), OR
212
+ # 2. They matched a routing rule (routing_used_fallback = false)
213
+
214
+ has_explicit_adapters = event_data[:adapters]&.any?
215
+ return if has_explicit_adapters # Explicit adapters → valid
216
+
217
+ # Check if fallback was used (set by apply_routing_rules)
218
+ used_fallback = event_data[:routing_used_fallback]
219
+ return unless used_fallback
220
+
221
+ # CRITICAL: Audit event using fallback routing (no rule matched!)
222
+ error_message = <<~ERROR
223
+ [E11y] CRITICAL: Audit event has no routing configuration!
224
+
225
+ Event: #{event_data[:event_name]}
226
+ Routed to: #{target_adapters.inspect} (fallback adapters)
227
+
228
+ Audit events MUST be explicitly routed to compliance-grade storage.
229
+
230
+ Fix options:
231
+ 1. Add explicit adapters: `adapters :audit_encrypted`
232
+ 2. Configure routing rule: `config.routing_rules = [->(e) { :audit_encrypted if e[:audit_event] }]`
233
+
234
+ See UC-012 Audit Trail documentation for details.
235
+ ERROR
236
+
237
+ raise E11y::Error, error_message
181
238
  end
182
239
  end
183
240
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "e11y/middleware/base"
4
+ require "e11y/sampling/stratified_tracker"
4
5
 
5
6
  module E11y
6
7
  module Middleware
@@ -51,6 +52,8 @@ module E11y
51
52
  # }
52
53
  # }
53
54
  # end
55
+ # rubocop:disable Metrics/ClassLength
56
+ # Class has 6 adaptive sampling strategies each requiring dedicated setup + private methods
54
57
  class Sampling < Base
55
58
  middleware_zone :routing
56
59
 
@@ -79,6 +82,9 @@ module E11y
79
82
  # @param event_data [Hash] The event payload
80
83
  # @return [Hash, nil] The event payload if sampled, nil if dropped
81
84
  def call(event_data)
85
+ # Handle nil from upstream middleware (e.g., rate limiting)
86
+ return nil unless event_data
87
+
82
88
  event_class = event_data[:event_class]
83
89
 
84
90
  # Track errors for error-based adaptive sampling (FEAT-4838)
@@ -87,13 +93,34 @@ module E11y
87
93
  # Track events for load-based adaptive sampling (FEAT-4842)
88
94
  @load_monitor&.record_event
89
95
 
96
+ # C11: Get sample rate and severity before decision (for StratifiedTracker)
97
+ sample_rate = determine_sample_rate(event_class, event_data)
98
+ severity = event_data[:severity] || (event_class.respond_to?(:severity) ? event_class.severity : :info)
99
+
90
100
  # Determine if event should be sampled
91
101
  # Drop event if not sampled
92
- return nil unless should_sample?(event_data, event_class)
102
+ unless should_sample?(event_data, event_class)
103
+ # C11: Record dropped event to StratifiedTracker for sampling correction
104
+ E11y::Sampling.stratified_tracker.record_sample(severity: severity, sample_rate: sample_rate, sampled: false)
105
+ begin
106
+ if defined?(E11y::Metrics) && E11y::Metrics.respond_to?(:increment)
107
+ E11y::Metrics.increment(:e11y_events_dropped_total, {
108
+ reason: "sampled_out",
109
+ event_type: event_data[:event_name].to_s
110
+ })
111
+ end
112
+ rescue StandardError
113
+ # non-fatal
114
+ end
115
+ return nil
116
+ end
93
117
 
94
118
  # Mark as sampled for downstream middleware
95
119
  event_data[:sampled] = true
96
- event_data[:sample_rate] = determine_sample_rate(event_class, event_data)
120
+ event_data[:sample_rate] = sample_rate
121
+
122
+ # C11: Record sampled event to StratifiedTracker for sampling correction
123
+ E11y::Sampling.stratified_tracker.record_sample(severity: severity, sample_rate: sample_rate, sampled: true)
97
124
 
98
125
  # Pass to next middleware
99
126
  @app.call(event_data)
@@ -121,6 +148,7 @@ module E11y
121
148
  @default_sample_rate = config.fetch(:default_sample_rate, 1.0)
122
149
  @trace_aware = config.fetch(:trace_aware, true)
123
150
  @severity_rates = config.fetch(:severity_rates, {})
151
+ @pattern_rates = config.fetch(:pattern_rates, []) # [[Regexp, Float], ...]
124
152
  @trace_decisions = {} # Cache for trace-level sampling decisions
125
153
  @trace_decisions_mutex = Mutex.new
126
154
  end
@@ -158,8 +186,10 @@ module E11y
158
186
  # 1. Check if audit event (never sample audit events!)
159
187
  return true if event_class.respond_to?(:audit_event?) && event_class.audit_event?
160
188
 
161
- # 2. Check trace-aware sampling (C05)
189
+ # 2. Trace-consistent sampling (ADR-005 §7): prefer E11y::Current.sampled when trace_aware
162
190
  if @trace_aware && event_data[:trace_id]
191
+ return E11y::Current.sampled if E11y::Current.respond_to?(:sampled) && !E11y::Current.sampled.nil?
192
+
163
193
  return trace_sampling_decision(event_data[:trace_id], event_class, event_data)
164
194
  end
165
195
 
@@ -183,22 +213,32 @@ module E11y
183
213
  # @param event_class [Class] The event class
184
214
  # @param event_data [Hash] Event payload (for value-based sampling)
185
215
  # @return [Float] Sample rate (0.0-1.0)
186
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
187
- # Sample rate determination follows priority chain: error spike → value-based →
188
- # load-based → severity → event-level → default
216
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
217
+ # Sample rate determination follows a 6-step priority chain:
218
+ # error spike (0) → pattern-based (0.5) → value-based (1) →
219
+ # load-based (2) → severity (3) → event-level (4) → default (5)
189
220
  def determine_sample_rate(event_class, event_data = nil)
190
221
  # 0. Error-based adaptive sampling (FEAT-4838) - highest priority!
191
222
  if @error_based_adaptive && @error_spike_detector&.error_spike?
192
223
  return 1.0 # 100% sampling during error spike
193
224
  end
194
225
 
226
+ # 0.5. Pattern-based sampling (by event_name) - overrides event-level config
227
+ if event_data && !@pattern_rates.empty?
228
+ event_name = event_data[:event_name].to_s
229
+ @pattern_rates.each do |pattern, rate|
230
+ return rate if pattern.match?(event_name)
231
+ end
232
+ end
233
+
195
234
  # 1. Value-based sampling (FEAT-4849) - high-value events always sampled
196
235
  if event_data && event_class.respond_to?(:value_sampling_configs)
197
236
  configs = event_class.value_sampling_configs
198
237
  unless configs.empty?
199
238
  require "e11y/sampling/value_extractor"
200
239
  extractor = E11y::Sampling::ValueExtractor.new
201
- if configs.any? { |config| config.matches?(event_data, extractor) }
240
+ payload = event_data[:payload] || event_data
241
+ if configs.any? { |config| config.matches?(payload, extractor) }
202
242
  return 1.0 # 100% sampling for high-value events
203
243
  end
204
244
  end
@@ -228,7 +268,7 @@ module E11y
228
268
  # 4. Default/load-based rate
229
269
  base_rate
230
270
  end
231
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
271
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
232
272
 
233
273
  # Trace-aware sampling decision (C05 Resolution)
234
274
  #
@@ -241,15 +281,21 @@ module E11y
241
281
  # @return [Boolean] true if trace should be sampled
242
282
  def trace_sampling_decision(trace_id, event_class, event_data = nil)
243
283
  @trace_decisions_mutex.synchronize do
284
+ # Use monotonic clock (Float) to avoid Time object allocation — prevents memory leak in hot path
285
+ now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
286
+
244
287
  # Check if decision already made for this trace
245
- return @trace_decisions[trace_id] if @trace_decisions.key?(trace_id)
288
+ if (entry = @trace_decisions[trace_id])
289
+ entry[:last_access] = now # LRU touch
290
+ return entry[:decision]
291
+ end
246
292
 
247
293
  # Make new sampling decision
248
294
  sample_rate = determine_sample_rate(event_class, event_data)
249
295
  decision = rand < sample_rate
250
296
 
251
- # Cache decision (TTL handled by periodic cleanup)
252
- @trace_decisions[trace_id] = decision
297
+ # Cache decision with LRU metadata (evict oldest on cleanup)
298
+ @trace_decisions[trace_id] = { decision: decision, last_access: now }
253
299
 
254
300
  # Cleanup old decisions periodically (every 1000 traces)
255
301
  cleanup_trace_decisions if @trace_decisions.size > 1000
@@ -260,14 +306,17 @@ module E11y
260
306
 
261
307
  # Cleanup old trace decisions to prevent memory leaks
262
308
  #
263
- # Removes random 50% of cached decisions when cache grows too large.
264
- # This is a simple heuristic - traces typically complete in <10 seconds,
265
- # so old decisions are likely stale.
309
+ # Evicts oldest 50% by last_access (LRU). Active traces stay in cache
310
+ # because they are touched on each lookup, preserving trace-level consistency.
266
311
  def cleanup_trace_decisions
267
- # Remove random 50% of decisions
268
- keys_to_remove = @trace_decisions.keys.sample(@trace_decisions.size / 2)
269
- keys_to_remove.each { |key| @trace_decisions.delete(key) }
312
+ return if @trace_decisions.size <= 100
313
+
314
+ size_to_remove = @trace_decisions.size / 2
315
+ sorted = @trace_decisions.to_a.sort_by { |_, v| v[:last_access] }
316
+ keys_to_remove = sorted.first(size_to_remove).map(&:first)
317
+ keys_to_remove.each { |k| @trace_decisions.delete(k) }
270
318
  end
271
319
  end
320
+ # rubocop:enable Metrics/ClassLength
272
321
  end
273
322
  end