e11y 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +130 -10
- data/CHANGELOG.md +56 -1
- data/CLAUDE.md +168 -0
- data/CONTRIBUTING.md +640 -0
- data/README.md +134 -702
- data/RELEASE.md +18 -3
- data/Rakefile +108 -29
- data/config/README.md +1 -1
- data/config/loki-local-config.yaml +12 -0
- data/config/otel-collector-config.yaml +44 -0
- data/cucumber.yml +1 -0
- data/docker-compose.yml +18 -2
- data/docs/ADAPTERS.md +76 -0
- data/docs/ADAPTIVE_SAMPLING.md +59 -0
- data/docs/COMPARISON.md +104 -0
- data/docs/CONFIGURATION.md +52 -0
- data/docs/DISTRIBUTED_TRACING.md +44 -0
- data/docs/LIMITATIONS.md +13 -0
- data/docs/METRICS_DSL.md +84 -0
- data/docs/PERFORMANCE.md +60 -0
- data/docs/PII_FILTERING.md +40 -0
- data/docs/PRESETS.md +65 -0
- data/docs/QUICK-START.md +546 -587
- data/docs/RAILS_INTEGRATION.md +29 -0
- data/docs/SCHEMA_VALIDATION.md +63 -0
- data/docs/SLO-PROMQL-ALERTS.md +161 -0
- data/docs/TESTING.md +69 -0
- data/docs/{ADR-001-architecture.md → architecture/ADR-001-architecture.md} +35 -64
- data/docs/{ADR-002-metrics-yabeda.md → architecture/ADR-002-metrics-yabeda.md} +62 -236
- data/docs/{ADR-003-slo-observability.md → architecture/ADR-003-slo-observability.md} +27 -466
- data/docs/{ADR-004-adapter-architecture.md → architecture/ADR-004-adapter-architecture.md} +163 -146
- data/docs/{ADR-005-tracing-context.md → architecture/ADR-005-tracing-context.md} +10 -9
- data/docs/{ADR-006-security-compliance.md → architecture/ADR-006-security-compliance.md} +184 -191
- data/docs/{ADR-007-opentelemetry-integration.md → architecture/ADR-007-opentelemetry-integration.md} +3 -21
- data/docs/{ADR-008-rails-integration.md → architecture/ADR-008-rails-integration.md} +209 -339
- data/docs/{ADR-009-cost-optimization.md → architecture/ADR-009-cost-optimization.md} +45 -54
- data/docs/architecture/ADR-010-developer-experience.md +522 -0
- data/docs/{ADR-011-testing-strategy.md → architecture/ADR-011-testing-strategy.md} +41 -83
- data/docs/{ADR-013-reliability-error-handling.md → architecture/ADR-013-reliability-error-handling.md} +37 -12
- data/docs/{ADR-014-event-driven-slo.md → architecture/ADR-014-event-driven-slo.md} +12 -24
- data/docs/{ADR-015-middleware-order.md → architecture/ADR-015-middleware-order.md} +23 -41
- data/docs/{ADR-016-self-monitoring-slo.md → architecture/ADR-016-self-monitoring-slo.md} +52 -349
- data/docs/{ADR-017-multi-rails-compatibility.md → architecture/ADR-017-multi-rails-compatibility.md} +4 -11
- data/docs/architecture/ADR-018-memory-optimization.md +366 -0
- data/docs/{ADR-INDEX.md → architecture/ADR-INDEX.md} +11 -6
- data/docs/{00-ICP-AND-TIMELINE.md → prd/00-ICP-AND-TIMELINE.md} +6 -6
- data/docs/{01-SCALE-REQUIREMENTS.md → prd/01-SCALE-REQUIREMENTS.md} +6 -6
- data/docs/prd/01-overview-vision.md +19 -14
- data/docs/use_cases/README.md +22 -23
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +50 -44
- data/docs/use_cases/UC-002-business-event-tracking.md +26 -95
- data/docs/use_cases/UC-003-event-metrics.md +66 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +42 -101
- data/docs/use_cases/UC-005-sentry-integration.md +13 -15
- data/docs/use_cases/UC-006-trace-context-management.md +30 -28
- data/docs/use_cases/UC-007-pii-filtering.md +35 -87
- data/docs/use_cases/UC-008-opentelemetry-integration.md +51 -89
- data/docs/use_cases/UC-009-multi-service-tracing.md +4 -4
- data/docs/use_cases/UC-010-background-job-tracking.md +5 -5
- data/docs/use_cases/UC-011-rate-limiting.md +95 -168
- data/docs/use_cases/UC-012-audit-trail.md +21 -46
- data/docs/use_cases/UC-013-high-cardinality-protection.md +29 -167
- data/docs/use_cases/UC-014-adaptive-sampling.md +2 -2
- data/docs/use_cases/UC-015-cost-optimization.md +46 -99
- data/docs/use_cases/UC-016-rails-logger-migration.md +39 -213
- data/docs/use_cases/UC-017-local-development.md +203 -777
- data/docs/use_cases/UC-018-testing-events.md +3 -3
- data/docs/use_cases/UC-019-retention-based-routing.md +53 -106
- data/docs/use_cases/UC-020-event-versioning.md +8 -9
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +18 -22
- data/docs/use_cases/UC-022-event-registry.md +15 -21
- data/docs/use_cases/backlog.md +119 -87
- data/e11y.gemspec +2 -2
- data/gems/e11y-devtools/README.md +136 -0
- data/gems/e11y-devtools/config/routes.rb +8 -0
- data/gems/e11y-devtools/e11y-devtools.gemspec +25 -0
- data/gems/e11y-devtools/exe/e11y +34 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/server.rb +96 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tool_base.rb +25 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/clear.rb +31 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/errors.rb +35 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/event_detail.rb +33 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/events_by_trace.rb +33 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/interactions.rb +40 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/recent_events.rb +34 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/search.rb +34 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/stats.rb +30 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/assets/overlay.js +115 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/controller.rb +54 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/engine.rb +26 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/middleware.rb +80 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/rails_controller.rb +42 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/app.rb +262 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/grouping.rb +66 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_detail.rb +62 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_list.rb +70 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/interaction_list.rb +47 -0
- data/gems/e11y-devtools/lib/e11y/devtools/version.rb +8 -0
- data/gems/e11y-devtools/lib/e11y/devtools.rb +13 -0
- data/gems/e11y-devtools/spec/e11y/devtools/mcp/tools_spec.rb +107 -0
- data/gems/e11y-devtools/spec/e11y/devtools/overlay/controller_spec.rb +58 -0
- data/gems/e11y-devtools/spec/e11y/devtools/overlay/middleware_spec.rb +46 -0
- data/gems/e11y-devtools/spec/e11y/devtools/tui/app_spec.rb +85 -0
- data/gems/e11y-devtools/spec/e11y/devtools/tui/grouping_spec.rb +64 -0
- data/gems/e11y-devtools/spec/spec_helper.rb +5 -0
- data/gems/e11y-devtools/spec/tui/widgets/event_list_spec.rb +44 -0
- data/gems/e11y-devtools/spec/tui/widgets/interaction_list_spec.rb +62 -0
- data/lib/e11y/adapters/audit_encrypted.rb +53 -11
- data/lib/e11y/adapters/base.rb +33 -34
- data/lib/e11y/adapters/dev_log/file_store.rb +143 -0
- data/lib/e11y/adapters/dev_log/query.rb +219 -0
- data/lib/e11y/adapters/dev_log.rb +118 -0
- data/lib/e11y/adapters/file.rb +3 -6
- data/lib/e11y/adapters/in_memory.rb +52 -5
- data/lib/e11y/adapters/in_memory_test.rb +29 -0
- data/lib/e11y/adapters/loki.rb +58 -23
- data/lib/e11y/adapters/null.rb +82 -0
- data/lib/e11y/adapters/opentelemetry_collector.rb +183 -0
- data/lib/e11y/adapters/otel_logs.rb +136 -23
- data/lib/e11y/adapters/sentry.rb +4 -7
- data/lib/e11y/adapters/stdout.rb +73 -7
- data/lib/e11y/adapters/yabeda.rb +153 -29
- data/lib/e11y/buffers/adaptive_buffer.rb +3 -17
- data/lib/e11y/buffers/{request_scoped_buffer.rb → ephemeral_buffer.rb} +72 -58
- data/lib/e11y/buffers/ring_buffer.rb +3 -16
- data/lib/e11y/configuration.rb +272 -0
- data/lib/e11y/console.rb +10 -17
- data/lib/e11y/current.rb +53 -1
- data/lib/e11y/debug/pipeline_inspector.rb +96 -0
- data/lib/e11y/documentation/generator.rb +48 -0
- data/lib/e11y/event/base.rb +176 -82
- data/lib/e11y/event/value_sampling_config.rb +1 -5
- data/lib/e11y/events/rails/database/query.rb +1 -4
- data/lib/e11y/events/rails/job/failed.rb +2 -0
- data/lib/e11y/instruments/active_job.rb +46 -12
- data/lib/e11y/instruments/rails_instrumentation.rb +49 -24
- data/lib/e11y/instruments/sidekiq.rb +137 -31
- data/lib/e11y/linters/base.rb +11 -0
- data/lib/e11y/linters/pii/pii_declaration_linter.rb +120 -0
- data/lib/e11y/linters/slo/config_consistency_linter.rb +76 -0
- data/lib/e11y/linters/slo/explicit_declaration_linter.rb +36 -0
- data/lib/e11y/linters/slo/slo_status_from_linter.rb +41 -0
- data/lib/e11y/logger/bridge.rb +26 -7
- data/lib/e11y/metrics/cardinality_protection.rb +10 -15
- data/lib/e11y/metrics/cardinality_tracker.rb +16 -6
- data/lib/e11y/metrics/registry.rb +3 -5
- data/lib/e11y/metrics/test_backend.rb +62 -0
- data/lib/e11y/metrics.rb +56 -10
- data/lib/e11y/middleware/adapter_resolver.rb +40 -0
- data/lib/e11y/middleware/audit_signing.rb +43 -6
- data/lib/e11y/middleware/baggage_protection.rb +75 -0
- data/lib/e11y/middleware/dev_log_source.rb +24 -0
- data/lib/e11y/middleware/event_slo.rb +23 -9
- data/lib/e11y/middleware/otel_span.rb +23 -0
- data/lib/e11y/middleware/pii_filter.rb +104 -75
- data/lib/e11y/middleware/rate_limiting.rb +54 -27
- data/lib/e11y/middleware/request.rb +70 -23
- data/lib/e11y/middleware/routing.rb +78 -21
- data/lib/e11y/middleware/sampling.rb +66 -17
- data/lib/e11y/middleware/self_monitoring_emit.rb +39 -0
- data/lib/e11y/middleware/trace_context.rb +45 -10
- data/lib/e11y/middleware/track_latency.rb +34 -0
- data/lib/e11y/middleware/validation.rb +7 -16
- data/lib/e11y/middleware/versioning.rb +26 -22
- data/lib/e11y/opentelemetry/semantic_conventions.rb +109 -0
- data/lib/e11y/opentelemetry/span_creator.rb +142 -0
- data/lib/e11y/pii/patterns.rb +12 -1
- data/lib/e11y/pipeline/builder.rb +1 -1
- data/lib/e11y/presets/audit_event.rb +13 -2
- data/lib/e11y/railtie.rb +52 -15
- data/lib/e11y/registry.rb +306 -0
- data/lib/e11y/reliability/circuit_breaker.rb +19 -21
- data/lib/e11y/reliability/dlq/base.rb +71 -0
- data/lib/e11y/reliability/dlq/file_adapter.rb +301 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +63 -34
- data/lib/e11y/reliability/dlq/filter.rb +37 -54
- data/lib/e11y/reliability/retry_handler.rb +26 -29
- data/lib/e11y/reliability/retry_rate_limiter.rb +3 -11
- data/lib/e11y/sampling/error_spike_detector.rb +0 -2
- data/lib/e11y/sampling/load_monitor.rb +5 -9
- data/lib/e11y/sampling/stratified_tracker.rb +18 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +2 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +19 -61
- data/lib/e11y/self_monitoring/reliability_monitor.rb +4 -74
- data/lib/e11y/slo/config_loader.rb +40 -0
- data/lib/e11y/slo/config_validator.rb +58 -0
- data/lib/e11y/slo/dashboard_generator.rb +122 -0
- data/lib/e11y/slo/event_driven.rb +8 -0
- data/lib/e11y/slo/tracker.rb +31 -4
- data/lib/e11y/testing/have_tracked_event_matcher.rb +190 -0
- data/lib/e11y/testing/rspec_matchers.rb +21 -0
- data/lib/e11y/testing/snapshot_matcher.rb +86 -0
- data/lib/e11y/trace_context/sampler.rb +35 -0
- data/lib/e11y/tracing/faraday_middleware.rb +31 -0
- data/lib/e11y/tracing/net_http_patch.rb +33 -0
- data/lib/e11y/tracing/propagator.rb +116 -0
- data/lib/e11y/tracing.rb +47 -0
- data/lib/e11y/version.rb +1 -1
- data/lib/e11y/versioning/version_extractor.rb +32 -0
- data/lib/e11y.rb +141 -265
- data/lib/generators/e11y/event/event_generator.rb +22 -0
- data/lib/generators/e11y/event/templates/event.rb.tt +16 -0
- data/lib/generators/e11y/grafana_dashboard/grafana_dashboard_generator.rb +30 -0
- data/lib/generators/e11y/grafana_dashboard/templates/e11y_dashboard.json +81 -0
- data/lib/generators/e11y/install/install_generator.rb +34 -0
- data/lib/generators/e11y/install/templates/e11y.rb +239 -0
- data/lib/generators/e11y/prometheus_alerts/prometheus_alerts_generator.rb +29 -0
- data/lib/generators/e11y/prometheus_alerts/templates/e11y_alerts.yml +28 -0
- data/lib/tasks/e11y_docs.rake +30 -0
- data/lib/tasks/e11y_events.rake +71 -0
- data/lib/tasks/e11y_lint.rake +91 -0
- data/lib/tasks/e11y_slo.rake +29 -0
- metadata +129 -39
- data/docs/ADR-010-developer-experience.md +0 -2166
- data/docs/API-REFERENCE-L28.md +0 -914
- data/docs/COMPREHENSIVE-CONFIGURATION.md +0 -2366
- data/docs/CONTRIBUTING.md +0 -312
- data/docs/IMPLEMENTATION_NOTES.md +0 -2804
- data/docs/IMPLEMENTATION_PLAN.md +0 -1971
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +0 -586
- data/docs/PLAN.md +0 -148
- data/docs/README.md +0 -296
- data/docs/design/00-memory-optimization.md +0 -593
- data/docs/guides/MIGRATION-L27-L28.md +0 -692
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +0 -434
- data/docs/guides/README.md +0 -44
- data/docs/use_cases/UC-003-pattern-based-metrics.md +0 -1627
- data/lib/e11y/adapters/registry.rb +0 -141
- /data/docs/{ADR-012-event-evolution.md → architecture/ADR-012-event-evolution.md} +0 -0
|
@@ -30,7 +30,7 @@ module E11y
|
|
|
30
30
|
#
|
|
31
31
|
# @example Critical Event Bypass (C02)
|
|
32
32
|
# # Payment events bypass rate limiting → DLQ if limited
|
|
33
|
-
# config.dlq_filter.
|
|
33
|
+
# config.dlq_filter.should_save?(event_data) # Event DSL: use_dlq
|
|
34
34
|
#
|
|
35
35
|
# # Result: Rate-limited payment events go to DLQ, not dropped
|
|
36
36
|
#
|
|
@@ -40,22 +40,33 @@ module E11y
|
|
|
40
40
|
# Initialize rate limiting middleware
|
|
41
41
|
#
|
|
42
42
|
# @param app [Object] Next middleware in pipeline
|
|
43
|
-
# @param global_limit [Integer] Max events/sec globally (default:
|
|
44
|
-
# @param per_event_limit [Integer] Max events/sec per event type (default:
|
|
45
|
-
# @param window [Float] Time window in seconds (default:
|
|
46
|
-
def initialize(app, global_limit:
|
|
43
|
+
# @param global_limit [Integer] Max events/sec globally (default: from E11y.config)
|
|
44
|
+
# @param per_event_limit [Integer] Max events/sec per event type (default: from E11y.config)
|
|
45
|
+
# @param window [Float] Time window in seconds (default: from E11y.config)
|
|
46
|
+
def initialize(app, global_limit: nil, per_event_limit: nil, window: nil)
|
|
47
47
|
super(app)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
config = E11y.config
|
|
49
|
+
# When explicit limits are passed (e.g. from pipeline options), enable for this instance
|
|
50
|
+
explicit_opts = global_limit || per_event_limit || window
|
|
51
|
+
@enabled = explicit_opts ? true : config.rate_limiting_enabled
|
|
52
|
+
@global_limit = global_limit || config.rate_limiting_global_limit
|
|
53
|
+
@global_window = window || config.rate_limiting_global_window
|
|
54
|
+
@window = @global_window # Alias for spec compatibility
|
|
55
|
+
@per_event_limit = per_event_limit || config.rate_limiting_per_event_limit
|
|
56
|
+
@explicit_per_event = per_event_limit && window
|
|
51
57
|
|
|
52
58
|
# Token buckets for rate limiting
|
|
53
|
-
@global_bucket = TokenBucket.new(
|
|
59
|
+
@global_bucket = TokenBucket.new(
|
|
60
|
+
capacity: @global_limit,
|
|
61
|
+
refill_rate: @global_limit,
|
|
62
|
+
window: @global_window
|
|
63
|
+
)
|
|
54
64
|
@per_event_buckets = Hash.new do |hash, event_name|
|
|
65
|
+
limit_cfg = @explicit_per_event ? { limit: @per_event_limit, window: @window } : config.rate_limit_for(event_name)
|
|
55
66
|
hash[event_name] = TokenBucket.new(
|
|
56
|
-
capacity:
|
|
57
|
-
refill_rate:
|
|
58
|
-
window:
|
|
67
|
+
capacity: limit_cfg[:limit],
|
|
68
|
+
refill_rate: limit_cfg[:limit],
|
|
69
|
+
window: limit_cfg[:window]
|
|
59
70
|
)
|
|
60
71
|
end
|
|
61
72
|
|
|
@@ -67,6 +78,8 @@ module E11y
|
|
|
67
78
|
# @param event_data [Hash] Event payload
|
|
68
79
|
# @return [Hash, nil] Event data if allowed, nil if rate limited
|
|
69
80
|
def call(event_data)
|
|
81
|
+
return @app.call(event_data) unless @enabled
|
|
82
|
+
|
|
70
83
|
event_name = event_data[:event_name]
|
|
71
84
|
|
|
72
85
|
# Check global rate limit
|
|
@@ -83,7 +96,7 @@ module E11y
|
|
|
83
96
|
end
|
|
84
97
|
|
|
85
98
|
# Rate limit not exceeded - continue pipeline
|
|
86
|
-
event_data
|
|
99
|
+
@app.call(event_data)
|
|
87
100
|
end
|
|
88
101
|
|
|
89
102
|
private
|
|
@@ -97,16 +110,31 @@ module E11y
|
|
|
97
110
|
def handle_rate_limited(event_data, limit_type)
|
|
98
111
|
event_name = event_data[:event_name]
|
|
99
112
|
|
|
100
|
-
# Log rate limiting
|
|
101
|
-
warn
|
|
113
|
+
# Log rate limiting (via E11y.logger so it respects Rails.logger in test env)
|
|
114
|
+
E11y.logger&.warn("[E11y] Rate limit exceeded (#{limit_type}) for event: #{event_name}")
|
|
102
115
|
|
|
103
116
|
# C02 Resolution: Check if event should be saved to DLQ
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
117
|
+
if should_save_to_dlq?(event_data)
|
|
118
|
+
record_dropped_metric(event_data, "rate_limited_#{limit_type}_dlq")
|
|
119
|
+
save_to_dlq(event_data, limit_type)
|
|
120
|
+
else
|
|
121
|
+
record_dropped_metric(event_data, "rate_limited_#{limit_type}")
|
|
122
|
+
end
|
|
123
|
+
end
|
|
107
124
|
|
|
108
|
-
|
|
109
|
-
|
|
125
|
+
# Record e11y_events_dropped_total metric (non-fatal, safe when Metrics unavailable)
|
|
126
|
+
#
|
|
127
|
+
# @param event_data [Hash] Event payload
|
|
128
|
+
# @param reason [String] Drop reason (e.g., sampled_out, rate_limited_global)
|
|
129
|
+
def record_dropped_metric(event_data, reason)
|
|
130
|
+
return unless defined?(E11y::Metrics) && E11y::Metrics.respond_to?(:increment)
|
|
131
|
+
|
|
132
|
+
E11y::Metrics.increment(:e11y_events_dropped_total, {
|
|
133
|
+
reason: reason,
|
|
134
|
+
event_type: event_data[:event_name].to_s
|
|
135
|
+
})
|
|
136
|
+
rescue StandardError
|
|
137
|
+
# non-fatal
|
|
110
138
|
end
|
|
111
139
|
|
|
112
140
|
# Check if rate-limited event should be saved to DLQ (C02 Resolution)
|
|
@@ -120,9 +148,8 @@ module E11y
|
|
|
120
148
|
dlq_filter = E11y.config.dlq_filter
|
|
121
149
|
return false unless dlq_filter
|
|
122
150
|
|
|
123
|
-
#
|
|
124
|
-
|
|
125
|
-
dlq_filter.always_save_patterns&.any? { |pattern| pattern.match?(event_name) }
|
|
151
|
+
# Use DLQ filter (Event DSL: use_dlq, severity, default)
|
|
152
|
+
dlq_filter.should_save?(event_data)
|
|
126
153
|
end
|
|
127
154
|
|
|
128
155
|
# Save rate-limited critical event to DLQ (C02 Resolution)
|
|
@@ -135,19 +162,19 @@ module E11y
|
|
|
135
162
|
dlq_storage = E11y.config.dlq_storage
|
|
136
163
|
return unless dlq_storage
|
|
137
164
|
|
|
165
|
+
per_event_limit = limit_type == :per_event ? E11y.config.rate_limit_for(event_data[:event_name])[:limit] : @per_event_limit
|
|
138
166
|
dlq_storage.save(event_data, metadata: {
|
|
139
167
|
reason: "rate_limited_#{limit_type}",
|
|
140
168
|
limit_type: limit_type,
|
|
141
169
|
global_limit: @global_limit,
|
|
142
|
-
per_event_limit:
|
|
170
|
+
per_event_limit: per_event_limit,
|
|
143
171
|
timestamp: Time.now.utc.iso8601
|
|
144
172
|
})
|
|
145
173
|
|
|
146
|
-
warn
|
|
147
|
-
# TODO: Track metric e11y.rate_limiter.dlq_saved
|
|
174
|
+
E11y.logger&.warn("[E11y] Rate-limited critical event saved to DLQ: #{event_data[:event_name]}")
|
|
148
175
|
rescue StandardError => e
|
|
149
176
|
# Don't fail if DLQ save fails (C18 Resolution)
|
|
150
|
-
warn
|
|
177
|
+
E11y.logger&.warn("[E11y] Failed to save rate-limited event to DLQ: #{e.message}")
|
|
151
178
|
end
|
|
152
179
|
|
|
153
180
|
# Token Bucket implementation for rate limiting
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require "rack/request"
|
|
4
4
|
require "securerandom"
|
|
5
|
+
require "e11y/tracing/propagator"
|
|
6
|
+
require "e11y/trace_context/sampler"
|
|
5
7
|
|
|
6
8
|
module E11y
|
|
7
9
|
module Middleware
|
|
@@ -32,13 +34,14 @@ module E11y
|
|
|
32
34
|
# Process request
|
|
33
35
|
# @param env [Hash] Rack environment
|
|
34
36
|
# @return [Array] Rack response [status, headers, body]
|
|
35
|
-
# rubocop:disable Metrics/AbcSize, Metrics/
|
|
37
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
36
38
|
# Rack middleware request processing requires sequential setup of tracing, context, buffer, and SLO tracking
|
|
37
39
|
def call(env)
|
|
38
40
|
request = Rack::Request.new(env)
|
|
39
41
|
|
|
40
|
-
# Extract or generate trace_id
|
|
41
|
-
|
|
42
|
+
# Extract or generate trace context (trace_id, sampled from traceparent)
|
|
43
|
+
trace_ctx = extract_trace_context(request)
|
|
44
|
+
trace_id = trace_ctx[:trace_id] || generate_trace_id
|
|
42
45
|
span_id = generate_span_id
|
|
43
46
|
|
|
44
47
|
# Set request context (ActiveSupport::CurrentAttributes)
|
|
@@ -50,9 +53,10 @@ module E11y
|
|
|
50
53
|
E11y::Current.user_agent = request.user_agent
|
|
51
54
|
E11y::Current.request_method = request.request_method
|
|
52
55
|
E11y::Current.request_path = request.path
|
|
56
|
+
E11y::Current.sampled = resolve_sampled(trace_ctx)
|
|
53
57
|
|
|
54
58
|
# Start request-scoped buffer (for debug events)
|
|
55
|
-
E11y::Buffers::
|
|
59
|
+
E11y::Buffers::EphemeralBuffer.initialize! if E11y.config.ephemeral_buffer_enabled
|
|
56
60
|
|
|
57
61
|
# Track request start time for SLO
|
|
58
62
|
start_time = Time.now
|
|
@@ -60,6 +64,9 @@ module E11y
|
|
|
60
64
|
# Call next middleware/app
|
|
61
65
|
status, headers, body = @app.call(env)
|
|
62
66
|
|
|
67
|
+
# Flush buffer if status matches configured flush_on_statuses (default: 5xx only)
|
|
68
|
+
E11y::Buffers::EphemeralBuffer.flush_on_error if should_flush_buffer?(status)
|
|
69
|
+
|
|
63
70
|
# Track SLO metrics (if enabled)
|
|
64
71
|
track_http_request_slo(env, status, start_time)
|
|
65
72
|
|
|
@@ -70,38 +77,80 @@ module E11y
|
|
|
70
77
|
[status, headers, body]
|
|
71
78
|
rescue StandardError
|
|
72
79
|
# Flush request buffer on error (includes debug events)
|
|
73
|
-
E11y::Buffers::
|
|
80
|
+
E11y::Buffers::EphemeralBuffer.flush_on_error if E11y.config.ephemeral_buffer_enabled
|
|
74
81
|
|
|
75
82
|
raise # Re-raise original exception
|
|
76
83
|
ensure
|
|
77
84
|
# Discard request buffer on success (not on error, already flushed above)
|
|
78
85
|
# We need to check if we're here from normal completion or exception
|
|
79
86
|
# If there was an exception, buffer was already flushed in rescue block
|
|
80
|
-
if !$ERROR_INFO && E11y.config.
|
|
81
|
-
E11y::Buffers::RequestScopedBuffer.discard
|
|
82
|
-
end
|
|
87
|
+
E11y::Buffers::EphemeralBuffer.discard if !$ERROR_INFO && E11y.config.ephemeral_buffer_enabled # No exception occurred
|
|
83
88
|
|
|
84
89
|
# Reset context
|
|
85
90
|
E11y::Current.reset
|
|
86
91
|
end
|
|
87
|
-
# rubocop:enable Metrics/AbcSize, Metrics/
|
|
92
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
|
88
93
|
|
|
89
94
|
private
|
|
90
95
|
|
|
91
|
-
#
|
|
96
|
+
# Determine whether the request-scoped buffer should be flushed for this status code.
|
|
97
|
+
#
|
|
98
|
+
# Two independent conditions (either is sufficient):
|
|
99
|
+
# - +flush_on_error+ (default: true) — flushes on any 5xx server error
|
|
100
|
+
# - +flush_on_statuses+ (default: []) — extra status codes/ranges, e.g. [403]
|
|
101
|
+
#
|
|
102
|
+
# @example Default behaviour — flush on 5xx only
|
|
103
|
+
# config.ephemeral_buffer_flush_on_error = true # default
|
|
104
|
+
# config.ephemeral_buffer_flush_on_statuses = [] # default
|
|
105
|
+
#
|
|
106
|
+
# @example Flush on 403 in addition to 5xx
|
|
107
|
+
# config.ephemeral_buffer_flush_on_statuses = [403]
|
|
108
|
+
#
|
|
109
|
+
# @example Flush only on explicit statuses (disable 5xx default)
|
|
110
|
+
# config.ephemeral_buffer_flush_on_error = false
|
|
111
|
+
# config.ephemeral_buffer_flush_on_statuses = [403, 422]
|
|
112
|
+
#
|
|
113
|
+
# @param status [Integer] HTTP response status code
|
|
114
|
+
# @return [Boolean]
|
|
115
|
+
def should_flush_buffer?(status)
|
|
116
|
+
return false unless E11y.config.ephemeral_buffer_enabled
|
|
117
|
+
|
|
118
|
+
# Condition 1: server error flush (5xx)
|
|
119
|
+
return true if E11y.config.ephemeral_buffer_flush_on_error && status >= 500
|
|
120
|
+
|
|
121
|
+
# Condition 2: explicit extra statuses
|
|
122
|
+
extra = E11y.config.ephemeral_buffer_flush_on_statuses
|
|
123
|
+
extra&.any? { |s| s === status } || false # rubocop:disable Style/CaseEquality
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Extract trace context from request headers (W3C Trace Context or custom).
|
|
127
|
+
# Also extracts tracestate into E11y::Current.baggage (F-014).
|
|
92
128
|
# @param request [Rack::Request] Rack request
|
|
93
|
-
# @return [
|
|
94
|
-
def
|
|
95
|
-
# W3C Trace Context (traceparent header)
|
|
96
|
-
# Format: version-trace_id-span_id-flags
|
|
97
|
-
# Example: 00-0af7651916cd43dd8448eb211c80319c-00f067aa0ba902b7-01
|
|
129
|
+
# @return [Hash] { trace_id:, sampled: (from traceparent, or nil if new trace) }
|
|
130
|
+
def extract_trace_context(request)
|
|
98
131
|
traceparent = request.get_header("HTTP_TRACEPARENT")
|
|
99
|
-
|
|
132
|
+
tracestate = request.get_header("HTTP_TRACESTATE")
|
|
133
|
+
|
|
134
|
+
if tracestate && E11y::Current.respond_to?(:baggage=)
|
|
135
|
+
baggage = E11y::Tracing::Propagator.parse_tracestate(tracestate)
|
|
136
|
+
E11y::Current.baggage = baggage if baggage.any?
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
if traceparent
|
|
140
|
+
parsed = E11y::Tracing::Propagator.parse(traceparent)
|
|
141
|
+
return { trace_id: parsed[:trace_id], sampled: parsed[:sampled] } if parsed
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
trace_id = request.get_header("HTTP_X_REQUEST_ID") || request.get_header("HTTP_X_TRACE_ID")
|
|
145
|
+
{ trace_id: trace_id, sampled: nil }
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
# Resolve sampling decision: from parent (traceparent) or Sampler for new trace.
|
|
149
|
+
# Context for Sampler = E11y::Current.to_context (already set above).
|
|
150
|
+
def resolve_sampled(trace_ctx)
|
|
151
|
+
return trace_ctx[:sampled] if trace_ctx.key?(:sampled) && !trace_ctx[:sampled].nil?
|
|
100
152
|
|
|
101
|
-
|
|
102
|
-
request.get_header("HTTP_X_REQUEST_ID") ||
|
|
103
|
-
# X-Trace-Id (custom)
|
|
104
|
-
request.get_header("HTTP_X_TRACE_ID")
|
|
153
|
+
E11y::TraceContext::Sampler.should_sample?(E11y::Current.to_context)
|
|
105
154
|
end
|
|
106
155
|
|
|
107
156
|
# Extract request_id from Rack env
|
|
@@ -141,10 +190,9 @@ module E11y
|
|
|
141
190
|
# @param start_time [Time] Request start time
|
|
142
191
|
# @return [void]
|
|
143
192
|
# @api private
|
|
144
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
|
|
145
193
|
# SLO tracking requires extracting controller/action, calculating duration, and error handling
|
|
146
194
|
def track_http_request_slo(env, status, start_time)
|
|
147
|
-
return unless E11y.config.
|
|
195
|
+
return unless E11y.config.respond_to?(:slo_tracking_enabled) && E11y.config.slo_tracking_enabled
|
|
148
196
|
|
|
149
197
|
duration_ms = ((Time.now - start_time) * 1000).round(2)
|
|
150
198
|
|
|
@@ -163,7 +211,6 @@ module E11y
|
|
|
163
211
|
# Don't fail if SLO tracking fails
|
|
164
212
|
warn "[E11y] SLO tracking error: #{e.message}"
|
|
165
213
|
end
|
|
166
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity
|
|
167
214
|
end
|
|
168
215
|
end
|
|
169
216
|
end
|
|
@@ -40,13 +40,8 @@ module E11y
|
|
|
40
40
|
# # Rule: ->(e) { :audit_encrypted if e[:audit_event] }
|
|
41
41
|
# # Routes to: [:audit_encrypted]
|
|
42
42
|
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
# event_name: 'order.placed',
|
|
46
|
-
# retention_until: '2026-04-21T...' # 90 days
|
|
47
|
-
# }
|
|
48
|
-
# # Rule: ->(e) { days > 30 ? :s3_standard : :loki }
|
|
49
|
-
# # Routes to: [:s3_standard]
|
|
43
|
+
# Note: retention_until is for archival jobs (run separately), not for routing.
|
|
44
|
+
# Archival happens later — cron/Loki compaction filters by retention_until.
|
|
50
45
|
class Routing < Base
|
|
51
46
|
middleware_zone :adapters
|
|
52
47
|
|
|
@@ -58,10 +53,23 @@ module E11y
|
|
|
58
53
|
# @option event_data [Boolean] :audit_event Audit event flag (optional, for routing rules)
|
|
59
54
|
# @option event_data [Symbol] :severity Event severity (optional, for routing rules)
|
|
60
55
|
# @return [Hash, nil] Event data (passed to next middleware), or nil if dropped
|
|
61
|
-
# rubocop:disable Metrics/
|
|
56
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
|
62
57
|
# Routing logic requires adapter selection, iteration with error handling,
|
|
63
58
|
# metadata enrichment, and metrics tracking
|
|
64
59
|
def call(event_data)
|
|
60
|
+
# Handle nil from upstream middleware (e.g., rate limiting, sampling)
|
|
61
|
+
return nil unless event_data
|
|
62
|
+
|
|
63
|
+
# 0. Request-scoped buffer: buffer debug events instead of writing when enabled
|
|
64
|
+
# Skip when event is from a flush (avoid re-buffering)
|
|
65
|
+
if !event_data[:from_ephemeral_buffer_flush] &&
|
|
66
|
+
event_data[:severity] == :debug &&
|
|
67
|
+
E11y.config.ephemeral_buffer_enabled &&
|
|
68
|
+
E11y::Buffers::EphemeralBuffer.active? && E11y::Buffers::EphemeralBuffer.add_event(event_data)
|
|
69
|
+
# Buffered — skip adapter writes, pass through
|
|
70
|
+
return @app&.call(event_data)
|
|
71
|
+
end
|
|
72
|
+
|
|
65
73
|
# 1. Determine target adapters (explicit or via routing rules)
|
|
66
74
|
target_adapters = if event_data[:adapters]&.any?
|
|
67
75
|
# Explicit adapters bypass routing rules
|
|
@@ -71,18 +79,28 @@ module E11y
|
|
|
71
79
|
apply_routing_rules(event_data)
|
|
72
80
|
end
|
|
73
81
|
|
|
82
|
+
# 1.5. Validate audit events have proper routing (UC-012 compliance requirement)
|
|
83
|
+
validate_audit_routing!(event_data, target_adapters)
|
|
84
|
+
|
|
74
85
|
# 2. Write to selected adapters
|
|
75
86
|
target_adapters.each do |adapter_name|
|
|
76
87
|
adapter = E11y.configuration.adapters[adapter_name]
|
|
77
88
|
next unless adapter
|
|
78
89
|
|
|
90
|
+
# Per-adapter payload: merge payload_rewrites only when present (explicit_pii exclude_adapters)
|
|
91
|
+
data_to_write = if event_data[:payload_rewrites] && event_data[:payload_rewrites][adapter_name]
|
|
92
|
+
payload = event_data[:payload]&.dup || {}
|
|
93
|
+
payload.merge!(event_data[:payload_rewrites][adapter_name])
|
|
94
|
+
event_data.merge(payload: payload)
|
|
95
|
+
else
|
|
96
|
+
event_data
|
|
97
|
+
end
|
|
98
|
+
|
|
79
99
|
begin
|
|
80
|
-
adapter.write(
|
|
81
|
-
increment_metric("e11y.middleware.routing.write_success", adapter: adapter_name)
|
|
100
|
+
adapter.write(data_to_write)
|
|
82
101
|
rescue StandardError => e
|
|
83
102
|
# Log routing error but don't fail pipeline
|
|
84
103
|
warn "E11y routing error for adapter #{adapter_name}: #{e.message}"
|
|
85
|
-
increment_metric("e11y.middleware.routing.write_error", adapter: adapter_name)
|
|
86
104
|
end
|
|
87
105
|
end
|
|
88
106
|
|
|
@@ -94,9 +112,9 @@ module E11y
|
|
|
94
112
|
}
|
|
95
113
|
|
|
96
114
|
# 4. Increment metrics
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
115
|
+
E11y::Metrics.increment("e11y.middleware.routing.routed",
|
|
116
|
+
adapters_count: target_adapters.size,
|
|
117
|
+
routing_type: event_data[:routing][:routing_type])
|
|
100
118
|
|
|
101
119
|
# 5. Log routing decision (for debugging)
|
|
102
120
|
log_routing_decision(event_data, target_adapters) if debug_enabled?
|
|
@@ -104,7 +122,7 @@ module E11y
|
|
|
104
122
|
# 6. Pass to next app (if any)
|
|
105
123
|
@app&.call(event_data)
|
|
106
124
|
end
|
|
107
|
-
# rubocop:enable Metrics/
|
|
125
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
|
108
126
|
|
|
109
127
|
private
|
|
110
128
|
|
|
@@ -124,12 +142,12 @@ module E11y
|
|
|
124
142
|
# ->(event) { :audit_encrypted if event[:audit_event] },
|
|
125
143
|
# ->(event) {
|
|
126
144
|
# days = (Time.parse(event[:retention_until]) - Time.now) / 86400
|
|
127
|
-
# days > 90 ? :
|
|
145
|
+
# days > 90 ? :archive : :loki
|
|
128
146
|
# }
|
|
129
147
|
# ]
|
|
130
148
|
#
|
|
131
149
|
# apply_routing_rules(event_data)
|
|
132
|
-
# # => [:audit_encrypted] or [:loki] or [:
|
|
150
|
+
# # => [:audit_encrypted] or [:loki] or [:archive]
|
|
133
151
|
def apply_routing_rules(event_data)
|
|
134
152
|
matched_adapters = []
|
|
135
153
|
|
|
@@ -143,10 +161,12 @@ module E11y
|
|
|
143
161
|
warn "E11y routing rule error: #{e.message}"
|
|
144
162
|
end
|
|
145
163
|
|
|
146
|
-
#
|
|
164
|
+
# Track whether fallback was used (for audit validation)
|
|
147
165
|
if matched_adapters.any?
|
|
166
|
+
event_data[:routing_used_fallback] = false
|
|
148
167
|
matched_adapters.uniq
|
|
149
168
|
else
|
|
169
|
+
event_data[:routing_used_fallback] = true
|
|
150
170
|
E11y.configuration.fallback_adapters || [:stdout]
|
|
151
171
|
end
|
|
152
172
|
end
|
|
@@ -175,9 +195,46 @@ module E11y
|
|
|
175
195
|
# @param metric_name [String] Metric name
|
|
176
196
|
# @param tags [Hash] Metric tags
|
|
177
197
|
# @return [void]
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
198
|
+
# Validate audit events have proper routing configuration.
|
|
199
|
+
#
|
|
200
|
+
# Audit events MUST be routed via explicit adapters OR routing rules.
|
|
201
|
+
# Relying on fallback routing (no rule matched) is a compliance configuration error.
|
|
202
|
+
#
|
|
203
|
+
# @param event_data [Hash] Event data
|
|
204
|
+
# @param target_adapters [Array<Symbol>] Target adapters
|
|
205
|
+
# @raise [E11y::Error] if audit event misconfigured
|
|
206
|
+
# @return [void]
|
|
207
|
+
def validate_audit_routing!(event_data, target_adapters)
|
|
208
|
+
return unless event_data[:audit_event]
|
|
209
|
+
|
|
210
|
+
# Audit events are valid if:
|
|
211
|
+
# 1. They have explicit adapters (non-empty), OR
|
|
212
|
+
# 2. They matched a routing rule (routing_used_fallback = false)
|
|
213
|
+
|
|
214
|
+
has_explicit_adapters = event_data[:adapters]&.any?
|
|
215
|
+
return if has_explicit_adapters # Explicit adapters → valid
|
|
216
|
+
|
|
217
|
+
# Check if fallback was used (set by apply_routing_rules)
|
|
218
|
+
used_fallback = event_data[:routing_used_fallback]
|
|
219
|
+
return unless used_fallback
|
|
220
|
+
|
|
221
|
+
# CRITICAL: Audit event using fallback routing (no rule matched!)
|
|
222
|
+
error_message = <<~ERROR
|
|
223
|
+
[E11y] CRITICAL: Audit event has no routing configuration!
|
|
224
|
+
|
|
225
|
+
Event: #{event_data[:event_name]}
|
|
226
|
+
Routed to: #{target_adapters.inspect} (fallback adapters)
|
|
227
|
+
|
|
228
|
+
Audit events MUST be explicitly routed to compliance-grade storage.
|
|
229
|
+
|
|
230
|
+
Fix options:
|
|
231
|
+
1. Add explicit adapters: `adapters :audit_encrypted`
|
|
232
|
+
2. Configure routing rule: `config.routing_rules = [->(e) { :audit_encrypted if e[:audit_event] }]`
|
|
233
|
+
|
|
234
|
+
See UC-012 Audit Trail documentation for details.
|
|
235
|
+
ERROR
|
|
236
|
+
|
|
237
|
+
raise E11y::Error, error_message
|
|
181
238
|
end
|
|
182
239
|
end
|
|
183
240
|
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "e11y/middleware/base"
|
|
4
|
+
require "e11y/sampling/stratified_tracker"
|
|
4
5
|
|
|
5
6
|
module E11y
|
|
6
7
|
module Middleware
|
|
@@ -51,6 +52,8 @@ module E11y
|
|
|
51
52
|
# }
|
|
52
53
|
# }
|
|
53
54
|
# end
|
|
55
|
+
# rubocop:disable Metrics/ClassLength
|
|
56
|
+
# Class has 6 adaptive sampling strategies each requiring dedicated setup + private methods
|
|
54
57
|
class Sampling < Base
|
|
55
58
|
middleware_zone :routing
|
|
56
59
|
|
|
@@ -79,6 +82,9 @@ module E11y
|
|
|
79
82
|
# @param event_data [Hash] The event payload
|
|
80
83
|
# @return [Hash, nil] The event payload if sampled, nil if dropped
|
|
81
84
|
def call(event_data)
|
|
85
|
+
# Handle nil from upstream middleware (e.g., rate limiting)
|
|
86
|
+
return nil unless event_data
|
|
87
|
+
|
|
82
88
|
event_class = event_data[:event_class]
|
|
83
89
|
|
|
84
90
|
# Track errors for error-based adaptive sampling (FEAT-4838)
|
|
@@ -87,13 +93,34 @@ module E11y
|
|
|
87
93
|
# Track events for load-based adaptive sampling (FEAT-4842)
|
|
88
94
|
@load_monitor&.record_event
|
|
89
95
|
|
|
96
|
+
# C11: Get sample rate and severity before decision (for StratifiedTracker)
|
|
97
|
+
sample_rate = determine_sample_rate(event_class, event_data)
|
|
98
|
+
severity = event_data[:severity] || (event_class.respond_to?(:severity) ? event_class.severity : :info)
|
|
99
|
+
|
|
90
100
|
# Determine if event should be sampled
|
|
91
101
|
# Drop event if not sampled
|
|
92
|
-
|
|
102
|
+
unless should_sample?(event_data, event_class)
|
|
103
|
+
# C11: Record dropped event to StratifiedTracker for sampling correction
|
|
104
|
+
E11y::Sampling.stratified_tracker.record_sample(severity: severity, sample_rate: sample_rate, sampled: false)
|
|
105
|
+
begin
|
|
106
|
+
if defined?(E11y::Metrics) && E11y::Metrics.respond_to?(:increment)
|
|
107
|
+
E11y::Metrics.increment(:e11y_events_dropped_total, {
|
|
108
|
+
reason: "sampled_out",
|
|
109
|
+
event_type: event_data[:event_name].to_s
|
|
110
|
+
})
|
|
111
|
+
end
|
|
112
|
+
rescue StandardError
|
|
113
|
+
# non-fatal
|
|
114
|
+
end
|
|
115
|
+
return nil
|
|
116
|
+
end
|
|
93
117
|
|
|
94
118
|
# Mark as sampled for downstream middleware
|
|
95
119
|
event_data[:sampled] = true
|
|
96
|
-
event_data[:sample_rate] =
|
|
120
|
+
event_data[:sample_rate] = sample_rate
|
|
121
|
+
|
|
122
|
+
# C11: Record sampled event to StratifiedTracker for sampling correction
|
|
123
|
+
E11y::Sampling.stratified_tracker.record_sample(severity: severity, sample_rate: sample_rate, sampled: true)
|
|
97
124
|
|
|
98
125
|
# Pass to next middleware
|
|
99
126
|
@app.call(event_data)
|
|
@@ -121,6 +148,7 @@ module E11y
|
|
|
121
148
|
@default_sample_rate = config.fetch(:default_sample_rate, 1.0)
|
|
122
149
|
@trace_aware = config.fetch(:trace_aware, true)
|
|
123
150
|
@severity_rates = config.fetch(:severity_rates, {})
|
|
151
|
+
@pattern_rates = config.fetch(:pattern_rates, []) # [[Regexp, Float], ...]
|
|
124
152
|
@trace_decisions = {} # Cache for trace-level sampling decisions
|
|
125
153
|
@trace_decisions_mutex = Mutex.new
|
|
126
154
|
end
|
|
@@ -158,8 +186,10 @@ module E11y
|
|
|
158
186
|
# 1. Check if audit event (never sample audit events!)
|
|
159
187
|
return true if event_class.respond_to?(:audit_event?) && event_class.audit_event?
|
|
160
188
|
|
|
161
|
-
# 2.
|
|
189
|
+
# 2. Trace-consistent sampling (ADR-005 §7): prefer E11y::Current.sampled when trace_aware
|
|
162
190
|
if @trace_aware && event_data[:trace_id]
|
|
191
|
+
return E11y::Current.sampled if E11y::Current.respond_to?(:sampled) && !E11y::Current.sampled.nil?
|
|
192
|
+
|
|
163
193
|
return trace_sampling_decision(event_data[:trace_id], event_class, event_data)
|
|
164
194
|
end
|
|
165
195
|
|
|
@@ -183,22 +213,32 @@ module E11y
|
|
|
183
213
|
# @param event_class [Class] The event class
|
|
184
214
|
# @param event_data [Hash] Event payload (for value-based sampling)
|
|
185
215
|
# @return [Float] Sample rate (0.0-1.0)
|
|
186
|
-
# rubocop:disable Metrics/
|
|
187
|
-
# Sample rate determination follows priority chain:
|
|
188
|
-
#
|
|
216
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
217
|
+
# Sample rate determination follows a 6-step priority chain:
|
|
218
|
+
# error spike (0) → pattern-based (0.5) → value-based (1) →
|
|
219
|
+
# load-based (2) → severity (3) → event-level (4) → default (5)
|
|
189
220
|
def determine_sample_rate(event_class, event_data = nil)
|
|
190
221
|
# 0. Error-based adaptive sampling (FEAT-4838) - highest priority!
|
|
191
222
|
if @error_based_adaptive && @error_spike_detector&.error_spike?
|
|
192
223
|
return 1.0 # 100% sampling during error spike
|
|
193
224
|
end
|
|
194
225
|
|
|
226
|
+
# 0.5. Pattern-based sampling (by event_name) - overrides event-level config
|
|
227
|
+
if event_data && !@pattern_rates.empty?
|
|
228
|
+
event_name = event_data[:event_name].to_s
|
|
229
|
+
@pattern_rates.each do |pattern, rate|
|
|
230
|
+
return rate if pattern.match?(event_name)
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
195
234
|
# 1. Value-based sampling (FEAT-4849) - high-value events always sampled
|
|
196
235
|
if event_data && event_class.respond_to?(:value_sampling_configs)
|
|
197
236
|
configs = event_class.value_sampling_configs
|
|
198
237
|
unless configs.empty?
|
|
199
238
|
require "e11y/sampling/value_extractor"
|
|
200
239
|
extractor = E11y::Sampling::ValueExtractor.new
|
|
201
|
-
|
|
240
|
+
payload = event_data[:payload] || event_data
|
|
241
|
+
if configs.any? { |config| config.matches?(payload, extractor) }
|
|
202
242
|
return 1.0 # 100% sampling for high-value events
|
|
203
243
|
end
|
|
204
244
|
end
|
|
@@ -228,7 +268,7 @@ module E11y
|
|
|
228
268
|
# 4. Default/load-based rate
|
|
229
269
|
base_rate
|
|
230
270
|
end
|
|
231
|
-
# rubocop:enable Metrics/
|
|
271
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
232
272
|
|
|
233
273
|
# Trace-aware sampling decision (C05 Resolution)
|
|
234
274
|
#
|
|
@@ -241,15 +281,21 @@ module E11y
|
|
|
241
281
|
# @return [Boolean] true if trace should be sampled
|
|
242
282
|
def trace_sampling_decision(trace_id, event_class, event_data = nil)
|
|
243
283
|
@trace_decisions_mutex.synchronize do
|
|
284
|
+
# Use monotonic clock (Float) to avoid Time object allocation — prevents memory leak in hot path
|
|
285
|
+
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
286
|
+
|
|
244
287
|
# Check if decision already made for this trace
|
|
245
|
-
|
|
288
|
+
if (entry = @trace_decisions[trace_id])
|
|
289
|
+
entry[:last_access] = now # LRU touch
|
|
290
|
+
return entry[:decision]
|
|
291
|
+
end
|
|
246
292
|
|
|
247
293
|
# Make new sampling decision
|
|
248
294
|
sample_rate = determine_sample_rate(event_class, event_data)
|
|
249
295
|
decision = rand < sample_rate
|
|
250
296
|
|
|
251
|
-
# Cache decision (
|
|
252
|
-
@trace_decisions[trace_id] = decision
|
|
297
|
+
# Cache decision with LRU metadata (evict oldest on cleanup)
|
|
298
|
+
@trace_decisions[trace_id] = { decision: decision, last_access: now }
|
|
253
299
|
|
|
254
300
|
# Cleanup old decisions periodically (every 1000 traces)
|
|
255
301
|
cleanup_trace_decisions if @trace_decisions.size > 1000
|
|
@@ -260,14 +306,17 @@ module E11y
|
|
|
260
306
|
|
|
261
307
|
# Cleanup old trace decisions to prevent memory leaks
|
|
262
308
|
#
|
|
263
|
-
#
|
|
264
|
-
#
|
|
265
|
-
# so old decisions are likely stale.
|
|
309
|
+
# Evicts oldest 50% by last_access (LRU). Active traces stay in cache
|
|
310
|
+
# because they are touched on each lookup, preserving trace-level consistency.
|
|
266
311
|
def cleanup_trace_decisions
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
312
|
+
return if @trace_decisions.size <= 100
|
|
313
|
+
|
|
314
|
+
size_to_remove = @trace_decisions.size / 2
|
|
315
|
+
sorted = @trace_decisions.to_a.sort_by { |_, v| v[:last_access] }
|
|
316
|
+
keys_to_remove = sorted.first(size_to_remove).map(&:first)
|
|
317
|
+
keys_to_remove.each { |k| @trace_decisions.delete(k) }
|
|
270
318
|
end
|
|
271
319
|
end
|
|
320
|
+
# rubocop:enable Metrics/ClassLength
|
|
272
321
|
end
|
|
273
322
|
end
|