e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Reliability
5
+ module DLQ
6
+ # DLQ Filter determines which failed events should be saved to DLQ.
7
+ #
8
+ # Supports:
9
+ # - Always save patterns (e.g., payment.*, audit.*)
10
+ # - Always discard patterns (e.g., debug.*, test.*)
11
+ # - Severity-based filtering (e.g., always save :error, :fatal)
12
+ #
13
+ # @example Configuration
14
+ # filter = Filter.new(
15
+ # always_save_patterns: [/^payment\./, /^audit\./],
16
+ # always_discard_patterns: [/^debug\./, /^test\./],
17
+ # save_severities: [:error, :fatal]
18
+ # )
19
+ #
20
+ # filter.should_save?(event_data) # => true/false
21
+ #
22
+ # @see ADR-013 §4.3 (DLQ Filter)
23
+ # @see UC-021 §3.2 (DLQ Filter Configuration)
24
class Filter
  # Build a filter from pattern, severity and default rules.
  #
  # Severities are normalized to Symbols so that a configuration loaded from
  # text (e.g. ["error"]) behaves the same as %i[error].
  #
  # @param always_save_patterns [Array<Regexp>] Event patterns to always save
  # @param always_discard_patterns [Array<Regexp>] Event patterns to always discard
  # @param save_severities [Array<Symbol, String>] Severities to always save (:error, :fatal)
  # @param default_behavior [Symbol] Default behavior when no rule matches (:save or :discard)
  def initialize(
    always_save_patterns: [],
    always_discard_patterns: [],
    save_severities: %i[error fatal],
    default_behavior: :save
  )
    @always_save_patterns = always_save_patterns
    @always_discard_patterns = always_discard_patterns
    @save_severities = Array(save_severities).map { |severity| normalize_severity(severity) }
    @default_behavior = default_behavior
  end

  # Check if event should be saved to DLQ.
  #
  # Priority order:
  # 1. Always discard patterns (highest priority)
  # 2. Always save patterns
  # 3. Severity-based rules
  # 4. Default behavior (anything other than :save discards)
  #
  # @param event_data [Hash] Event data; reads :event_name and :severity
  # @return [Boolean] true if event should be saved to DLQ
  def should_save?(event_data)
    event_name = event_data[:event_name].to_s
    # A String severity ("error") must match the Symbol rules (:error),
    # otherwise the event would silently fall through to the default.
    severity = normalize_severity(event_data[:severity])

    # Priority 1: Always discard (highest priority)
    if matches_patterns?(event_name, @always_discard_patterns)
      increment_metric("e11y.dlq.filter.discarded", reason: "always_discard_pattern")
      return false
    end

    # Priority 2: Always save
    if matches_patterns?(event_name, @always_save_patterns)
      increment_metric("e11y.dlq.filter.saved", reason: "always_save_pattern")
      return true
    end

    # Priority 3: Severity-based
    if @save_severities.include?(severity)
      increment_metric("e11y.dlq.filter.saved", reason: "severity")
      return true
    end

    # Priority 4: Default behavior
    save_by_default = @default_behavior == :save
    metric = save_by_default ? "e11y.dlq.filter.saved" : "e11y.dlq.filter.discarded"
    increment_metric(metric, reason: "default")
    save_by_default
  end

  # Get filter statistics.
  #
  # @return [Hash] Filter configuration: inspected patterns, the (normalized)
  #   severities and the default behavior
  def stats
    {
      always_save_patterns: @always_save_patterns.map(&:inspect),
      always_discard_patterns: @always_discard_patterns.map(&:inspect),
      save_severities: @save_severities,
      default_behavior: @default_behavior
    }
  end

  private

  # Convert a severity to a Symbol when it supports it (String, Symbol);
  # other values such as nil are returned untouched.
  #
  # @param severity [Object] Raw severity value
  # @return [Object] Symbol when convertible, otherwise the original value
  def normalize_severity(severity)
    severity.respond_to?(:to_sym) ? severity.to_sym : severity
  end

  # Check if event name matches any of the patterns.
  #
  # @param event_name [String] Event name
  # @param patterns [Array<Regexp>] Patterns to match
  # @return [Boolean] true if event matches any pattern
  def matches_patterns?(event_name, patterns)
    patterns.any? { |pattern| pattern.match?(event_name) }
  end

  # Increment DLQ filter metric. Currently a no-op placeholder.
  #
  # @param metric_name [String] Metric name
  # @param tags [Hash] Additional tags
  def increment_metric(metric_name, tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags)
  end
end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,207 @@
1
+ # frozen_string_literal: true
2
+
3
require "securerandom"
require "timeout"
4
+
5
+ module E11y
6
+ module Reliability
7
+ # Retry handler with exponential backoff and jitter.
8
+ #
9
+ # Automatically retries transient failures with increasing delays.
10
+ # Integrates with CircuitBreaker and DLQ for comprehensive error handling.
11
+ #
12
+ # @example Usage
13
+ # retry_handler = RetryHandler.new(config: config)
14
+ #
15
+ # retry_handler.with_retry(adapter: adapter, event: event_data) do
16
+ # adapter.send(event_data)
17
+ # end
18
+ #
19
+ # @see ADR-013 §3 (Retry Policy)
20
+ # @see UC-021 §2 (Exponential Backoff with Jitter)
21
class RetryHandler
  # Raised when a block could not be completed: either every attempt failed
  # with a retriable error, or a non-retriable error occurred.
  class RetryExhaustedError < StandardError
    attr_reader :original_error, :retry_count

    # @param original_error [StandardError] The last error raised by the block
    # @param retry_count [Integer] Number of attempts made
    def initialize(original_error, retry_count:)
      @original_error = original_error
      @retry_count = retry_count
      super("Retry exhausted after #{retry_count} attempts: #{original_error.message}")
    end
  end

  # Transient errors that should be retried
  TRANSIENT_ERRORS = [
    Timeout::Error,
    Errno::ECONNREFUSED,
    Errno::ECONNRESET,
    Errno::ETIMEDOUT,
    Errno::EHOSTUNREACH,
    Errno::ENETUNREACH
  ].freeze

  # HTTP status codes that should be retried (5xx server errors)
  RETRIABLE_HTTP_STATUS_CODES = (500..599)

  # @param config [Hash] Configuration options
  # @option config [Integer] :max_attempts Maximum retry attempts (default: 3)
  # @option config [Float] :base_delay_ms Initial delay in milliseconds (default: 100)
  # @option config [Float] :max_delay_ms Maximum delay in milliseconds (default: 5000)
  # @option config [Float] :jitter_factor Jitter factor (0.0-1.0, default: 0.1)
  # @option config [Boolean] :fail_on_error Raise error after max retries (default: true)
  def initialize(config: {})
    @max_attempts = config[:max_attempts] || 3
    @base_delay_ms = config[:base_delay_ms] || 100.0
    @max_delay_ms = config[:max_delay_ms] || 5000.0
    @jitter_factor = config[:jitter_factor] || 0.1
    @fail_on_error = config.fetch(:fail_on_error, true)
  end

  # Execute block with retry logic.
  #
  # The block is re-run after a backoff sleep while it raises a retriable
  # error and fewer than max_attempts attempts were made. Non-retriable
  # errors stop immediately.
  #
  # @param adapter [E11y::Adapters::Base] Adapter instance (used for metric tags)
  # @param event [Hash] Event data
  # @yield Block to execute (adapter send)
  # @return [Object, nil] Result of the block; nil when the block failed and
  #   fail_on_error is false
  # @raise [RetryExhaustedError] if the block failed and fail_on_error is true
  def with_retry(adapter:, event:)
    attempt = 0

    loop do
      attempt += 1

      begin
        result = yield
      rescue StandardError => e
        # Non-retriable errors are reported and surfaced right away.
        unless retriable_error?(e)
          on_permanent_failure(adapter, event, e, attempt)
          raise RetryExhaustedError.new(e, retry_count: attempt) if @fail_on_error

          return nil
        end

        # Retriable, but the attempt budget is used up.
        if attempt >= @max_attempts
          on_max_retries_exhausted(adapter, event, e, attempt)
          raise RetryExhaustedError.new(e, retry_count: attempt) if @fail_on_error

          return nil
        end

        delay_ms = calculate_backoff_delay(attempt)
        on_retry_attempt(adapter, event, e, attempt, delay_ms)

        sleep(delay_ms / 1000.0)
        next
      end

      # Deliberately outside the rescue scope: an error raised by the success
      # hook must not cause the already-completed block to be executed again.
      on_success(adapter, event, attempt)
      return result # Return actual result, not true
    end
  end

  private

  # Check if error should be retried.
  #
  # @param error [StandardError] The error that occurred
  # @return [Boolean] true if the error is one of TRANSIENT_ERRORS, or exposes
  #   a response whose code is in RETRIABLE_HTTP_STATUS_CODES
  def retriable_error?(error)
    return true if TRANSIENT_ERRORS.any? { |klass| error.is_a?(klass) }

    if error.respond_to?(:response) && error.response.respond_to?(:code)
      status_code = error.response.code.to_i
      return true if RETRIABLE_HTTP_STATUS_CODES.cover?(status_code)
    end

    false
  end

  # Calculate exponential backoff delay with jitter.
  #
  # Formula: min(base_delay * 2^(attempt - 1), max_delay) +/- jitter, where
  # jitter is at most jitter_factor * delay. The jittered value is drawn
  # uniformly from a range clipped to [0, max_delay], so the configured
  # maximum is never exceeded.
  #
  # @param attempt [Integer] Current attempt number (1-indexed)
  # @return [Float] Delay in milliseconds
  def calculate_backoff_delay(attempt)
    max_delay = @max_delay_ms.to_f
    capped = [@base_delay_ms * (2**(attempt - 1)), max_delay].min.to_f

    spread = (capped * @jitter_factor).abs
    lower = [capped - spread, 0.0].max
    upper = [capped + spread, max_delay].min

    # rand with no argument yields a Float in [0, 1); scaling it avoids
    # Range edge cases (empty/reversed ranges) in Kernel#rand.
    lower + (rand * (upper - lower))
  end

  # Handle successful execution.
  def on_success(adapter, _event, attempt)
    increment_metric("e11y.retry.success", adapter: adapter.class.name, attempts: attempt)

    # Only count a recovery when at least one retry was needed
    return unless attempt > 1

    increment_metric("e11y.retry.recovered", adapter: adapter.class.name, attempts: attempt)
  end

  # Handle permanent failure (non-retriable error).
  def on_permanent_failure(adapter, _event, error, attempt)
    increment_metric(
      "e11y.retry.permanent_failure",
      adapter: adapter.class.name,
      error: error.class.name,
      attempt: attempt
    )
  end

  # Handle max retries exhausted (all attempts failed).
  def on_max_retries_exhausted(adapter, _event, error, attempt)
    increment_metric(
      "e11y.retry.exhausted",
      adapter: adapter.class.name,
      error: error.class.name,
      attempts: attempt
    )
  end

  # Handle retry attempt.
  def on_retry_attempt(adapter, _event, error, attempt, delay_ms)
    increment_metric(
      "e11y.retry.attempt",
      adapter: adapter.class.name,
      error: error.class.name,
      attempt: attempt
    )

    # Track backoff delay histogram
    track_histogram("e11y.retry.backoff_delay_ms", delay_ms, adapter: adapter.class.name)
  end

  # Increment retry metric. Currently a no-op placeholder.
  #
  # @param metric_name [String] Metric name
  # @param tags [Hash] Additional tags
  def increment_metric(metric_name, tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags)
  end

  # Track histogram metric. Currently a no-op placeholder.
  #
  # @param metric_name [String] Metric name
  # @param value [Numeric] Value to track
  # @param tags [Hash] Additional tags
  def track_histogram(metric_name, value, tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.histogram(metric_name, value, tags)
  end
end
206
+ end
207
+ end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Reliability
5
+ # Retry Rate Limiter prevents thundering herd on adapter recovery.
6
+ #
7
+ # Implements staged batching with jitter to smooth retry load.
8
+ # Prevents retry storms when adapters recover from failures.
9
+ #
10
+ # @example Usage
11
+ # limiter = RetryRateLimiter.new(limit: 50, window: 1.0)
12
+ #
13
+ # limiter.allow?(adapter_name, event_data) # => true/false
14
+ #
15
+ # @see ADR-013 §3.5 (C06 Resolution: Retry Rate Limiting)
16
+ # @see UC-021 §5 (Retry Storm Prevention)
17
class RetryRateLimiter
  # @param limit [Integer] Max retries per window (default: 50 retries/sec)
  # @param window [Float] Window size in seconds (default: 1.0)
  # @param on_limit_exceeded [Symbol] Action when limit exceeded (:delay or :dlq, default: :delay)
  # @param jitter_range [Float] Jitter factor (0.0-1.0, default: 0.2 = ±20%)
  def initialize(limit: 50, window: 1.0, on_limit_exceeded: :delay, jitter_range: 0.2)
    @limit = limit
    @window = window
    @on_limit_exceeded = on_limit_exceeded
    @jitter_range = jitter_range

    # adapter name => timestamps of the retries granted inside the window
    @retry_counts = Hash.new { |store, adapter| store[adapter] = [] }
    @mutex = Mutex.new
  end

  # Decide whether one more retry may be issued for the adapter.
  #
  # Expired timestamps are dropped first; a granted retry is recorded with
  # the current time.
  #
  # @param adapter_name [String] Adapter name
  # @param event_data [Hash] Event data (optional, for metrics)
  # @return [Boolean] true if retry allowed
  def allow?(adapter_name, event_data = {})
    @mutex.synchronize do
      cleanup_old_entries(adapter_name)
      granted = @retry_counts[adapter_name]

      if granted.size >= @limit
        on_limit_exceeded(adapter_name, event_data)
        next false
      end

      granted << Time.now
      increment_metric("e11y.retry_rate_limiter.allowed", adapter: adapter_name)
      true
    end
  end

  # Report the current retry usage for an adapter.
  #
  # @param adapter_name [String] Adapter name
  # @return [Hash] :adapter, :current_count, :limit, :window and
  #   :utilization (percentage of the limit, rounded to 2 decimals)
  def stats(adapter_name)
    @mutex.synchronize do
      cleanup_old_entries(adapter_name)
      in_window = @retry_counts[adapter_name].size
      percent_used = (@retry_counts[adapter_name].size.to_f / @limit * 100).round(2)

      {
        adapter: adapter_name,
        current_count: in_window,
        limit: @limit,
        window: @window,
        utilization: percent_used
      }
    end
  end

  # Forget recorded retries (for testing).
  #
  # @param adapter_name [String, nil] Adapter to reset; nil resets all adapters
  def reset!(adapter_name = nil)
    @mutex.synchronize do
      adapter_name ? @retry_counts.delete(adapter_name) : @retry_counts.clear
    end
  end

  private

  # Drop the adapter's timestamps that are older than the window.
  def cleanup_old_entries(adapter_name)
    oldest_allowed = Time.now - @window
    @retry_counts[adapter_name].reject! { |recorded_at| recorded_at < oldest_allowed }
  end

  # Emit the metrics for the configured limit-exceeded strategy.
  # The action itself (sleeping or saving to DLQ) is left to the caller.
  def on_limit_exceeded(adapter_name, _event_data)
    increment_metric("e11y.retry_rate_limiter.exceeded", adapter: adapter_name)

    if @on_limit_exceeded == :delay
      # Suggested wait: one window, shifted by up to ±jitter_range of it
      delay_sec = @window + rand((-@jitter_range * @window)..(@jitter_range * @window))
      increment_metric("e11y.retry_rate_limiter.delayed", adapter: adapter_name, delay_sec: delay_sec)
    elsif @on_limit_exceeded == :dlq
      increment_metric("e11y.retry_rate_limiter.dlq", adapter: adapter_name)
    end
  end

  # Increment retry rate limiter metric. Currently a no-op placeholder.
  def increment_metric(_metric_name, _tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags)
  end
end
116
+ end
117
+ end
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Sampling
5
+ # Error Spike Detector for Adaptive Sampling (FEAT-4838.1)
6
+ #
7
+ # Detects sudden increases in error rates and adjusts sampling accordingly.
8
+ # Implements error-based adaptive sampling strategy from ADR-009 §3.2.
9
+ #
10
+ # Features:
11
+ # - Sliding window for error rate calculation
12
+ # - Absolute threshold (errors/minute)
13
+ # - Relative threshold (ratio to baseline)
14
+ # - Per-event and global error tracking
15
+ #
16
+ # @example Configuration
17
+ # detector = E11y::Sampling::ErrorSpikeDetector.new(
18
+ # window: 60, # 60 seconds sliding window
19
+ # absolute_threshold: 100, # 100 errors/min triggers spike
20
+ # relative_threshold: 3.0, # 3x normal rate triggers spike
21
+ # spike_duration: 300 # Keep 100% sampling for 5 minutes
22
+ # )
23
+ #
24
+ # @example Usage
25
+ # if detector.error_spike?
26
+ # sample_rate = 1.0 # 100% sampling during spike
27
+ # else
28
+ # sample_rate = 0.1 # 10% normal sampling
29
+ # end
30
+ #
31
+ # detector.record_event(event_name: "payment.processed", severity: :error)
32
class ErrorSpikeDetector
  # Default configuration
  DEFAULT_WINDOW = 60 # 60 seconds sliding window
  DEFAULT_ABSOLUTE_THRESHOLD = 100 # 100 errors/min triggers spike
  DEFAULT_RELATIVE_THRESHOLD = 3.0 # 3x normal rate triggers spike
  DEFAULT_SPIKE_DURATION = 300 # Keep elevated sampling for 5 minutes

  # Severities counted as errors
  ERROR_SEVERITIES = %i[error fatal].freeze

  # Shared empty list returned for event names without recorded errors
  NO_EVENTS = [].freeze
  private_constant :NO_EVENTS

  attr_reader :window, :absolute_threshold, :relative_threshold, :spike_duration

  # Initialize error spike detector
  #
  # @param config [Hash] Configuration options
  # @option config [Integer] :window (60) Sliding window in seconds
  # @option config [Integer] :absolute_threshold (100) Errors/min to trigger spike
  # @option config [Float] :relative_threshold (3.0) Multiplier vs baseline to trigger spike
  # @option config [Integer] :spike_duration (300) Seconds to keep elevated sampling
  def initialize(config = {})
    @window = config.fetch(:window, DEFAULT_WINDOW)
    @absolute_threshold = config.fetch(:absolute_threshold, DEFAULT_ABSOLUTE_THRESHOLD)
    @relative_threshold = config.fetch(:relative_threshold, DEFAULT_RELATIVE_THRESHOLD)
    @spike_duration = config.fetch(:spike_duration, DEFAULT_SPIKE_DURATION)

    # Event tracking (per event name)
    @error_events = Hash.new { |h, k| h[k] = [] } # event_name => [timestamp, ...]
    @all_errors = [] # All errors (global)
    @baseline_rates = Hash.new(0.0) # event_name => baseline error rate

    # Spike state
    @spike_started_at = nil
    @mutex = Mutex.new
  end

  # Check if currently in error spike state
  #
  # A spike that started less than spike_duration seconds ago stays active
  # without re-evaluation. Otherwise the thresholds are checked again: a
  # match (re)starts the spike timer, a miss clears it.
  #
  # @return [Boolean] true if error spike detected
  def error_spike?
    @mutex.synchronize do
      next true if @spike_started_at && (Time.now - @spike_started_at) < @spike_duration

      # No spike yet, or the previous one expired: start/extend or end it.
      @spike_started_at = spike_detected? ? Time.now : nil
      !@spike_started_at.nil?
    end
  end

  # Record an event for error rate tracking
  #
  # Events whose severity is not error/fatal are ignored. The baseline of the
  # event name is only updated while no spike is active.
  #
  # @param event_data [Hash] Event payload
  # @option event_data [String] :event_name Event name
  # @option event_data [Symbol, String] :severity Event severity
  def record_event(event_data)
    return unless error_severity?(event_data[:severity])

    @mutex.synchronize do
      now = Time.now
      event_name = event_data[:event_name]

      # Record error
      @error_events[event_name] << now
      @all_errors << now

      # Cleanup old events (outside window)
      cleanup_old_events(now)

      # Update baseline (if not in spike)
      update_baseline(event_name) unless @spike_started_at
    end
  end

  # Get current error rate (errors per minute)
  #
  # Reading the rate of an event name that has no recorded errors returns
  # 0.0 and does not create any tracking state for it.
  #
  # @param event_name [String, nil] Event name, or nil for global rate
  # @return [Float] Errors per minute
  def current_error_rate(event_name = nil)
    @mutex.synchronize do
      now = Time.now
      cleanup_old_events(now)
      current_error_rate_unsafe(event_name, now)
    end
  end

  # Get baseline error rate
  #
  # @param event_name [String] Event name
  # @return [Float] Baseline errors per minute (0.0 when none was recorded)
  def baseline_error_rate(event_name)
    @mutex.synchronize { @baseline_rates[event_name] }
  end

  # Reset detector state (useful for testing)
  def reset!
    @mutex.synchronize do
      @error_events.clear
      @all_errors.clear
      @baseline_rates.clear
      @spike_started_at = nil
    end
  end

  private

  # Check if severity is an error
  #
  # String severities are converted to Symbols before the comparison.
  #
  # @param severity [Symbol, String, nil] Severity level
  # @return [Boolean] true if error or fatal
  def error_severity?(severity)
    severity = severity.to_sym if severity.is_a?(String)
    ERROR_SEVERITIES.include?(severity)
  end

  # Detect if spike conditions are met (must be called within mutex)
  #
  # @return [Boolean] true if the global rate exceeds the absolute threshold,
  #   or any event's rate exceeds its positive baseline times the relative
  #   threshold
  def spike_detected?
    # Check absolute threshold (global)
    return true if current_error_rate_unsafe > @absolute_threshold

    # Check relative threshold (per event name); events without a baseline
    # are skipped.
    @error_events.each_key.any? do |event_name|
      baseline = @baseline_rates[event_name]
      baseline.positive? && current_error_rate_unsafe(event_name) > (baseline * @relative_threshold)
    end
  end

  # Get current error rate (unsafe - must be called within mutex)
  #
  # @param event_name [String, nil] Event name, or nil for global
  # @param now [Time] Reference time for the sliding window
  # @return [Float] Errors per minute
  def current_error_rate_unsafe(event_name = nil, now = Time.now)
    # fetch (not []) so that a lookup never triggers the Hash default proc
    # and leaves an empty array behind for unknown event names.
    events = event_name ? @error_events.fetch(event_name, NO_EVENTS) : @all_errors
    count = events.count { |ts| (now - ts) <= @window }
    (count.to_f / @window) * 60
  end

  # Update baseline error rate (unsafe - must be called within mutex)
  #
  # The first observation seeds the baseline; later ones are blended in with
  # an exponential moving average (alpha = 0.1).
  #
  # @param event_name [String] Event name
  def update_baseline(event_name)
    current_rate = current_error_rate_unsafe(event_name)
    old_baseline = @baseline_rates[event_name]

    @baseline_rates[event_name] = if old_baseline.zero?
                                    current_rate
                                  else
                                    (0.1 * current_rate) + (0.9 * old_baseline)
                                  end
  end

  # Cleanup events outside the sliding window
  #
  # Event names whose list becomes empty are removed entirely so the
  # per-event table does not keep entries for names that stopped failing.
  #
  # @param now [Time] Current timestamp
  def cleanup_old_events(now)
    cutoff = now - @window

    # Cleanup per-event errors
    @error_events.delete_if do |_event_name, timestamps|
      timestamps.reject! { |ts| ts < cutoff }
      timestamps.empty?
    end

    # Cleanup global errors
    @all_errors.reject! { |ts| ts < cutoff }
  end
end
224
+ end
225
+ end