e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
data/lib/e11y/reliability/dlq/filter.rb
@@ -0,0 +1,117 @@
# frozen_string_literal: true

module E11y
  module Reliability
    module DLQ
      # DLQ Filter determines which failed events should be saved to DLQ.
      #
      # Supports:
      # - Always save patterns (e.g., payment.*, audit.*)
      # - Always discard patterns (e.g., debug.*, test.*)
      # - Severity-based filtering (e.g., always save :error, :fatal)
      #
      # @example Configuration
      #   filter = Filter.new(
      #     always_save_patterns: [/^payment\./, /^audit\./],
      #     always_discard_patterns: [/^debug\./, /^test\./],
      #     save_severities: [:error, :fatal]
      #   )
      #
      #   filter.should_save?(event_data) # => true/false
      #
      # @see ADR-013 §4.3 (DLQ Filter)
      # @see UC-021 §3.2 (DLQ Filter Configuration)
      class Filter
        # @param always_save_patterns [Array<Regexp>] Event patterns to always save
        # @param always_discard_patterns [Array<Regexp>] Event patterns to always discard
        # @param save_severities [Array<Symbol>] Severities to always save (:error, :fatal)
        # @param default_behavior [Symbol] Default behavior when no rule matches (:save or :discard)
        def initialize(
          always_save_patterns: [],
          always_discard_patterns: [],
          save_severities: %i[error fatal],
          default_behavior: :save
        )
          @always_save_patterns = always_save_patterns
          @always_discard_patterns = always_discard_patterns
          @save_severities = save_severities
          @default_behavior = default_behavior
        end

        # Check if event should be saved to DLQ.
        #
        # Priority order:
        # 1. Always discard patterns (highest priority)
        # 2. Always save patterns
        # 3. Severity-based rules
        # 4. Default behavior
        #
        # @param event_data [Hash] Event data
        # @return [Boolean] true if event should be saved to DLQ
        def should_save?(event_data)
          event_name = event_data[:event_name].to_s
          severity = event_data[:severity]

          # Priority 1: Always discard (highest priority)
          if matches_patterns?(event_name, @always_discard_patterns)
            increment_metric("e11y.dlq.filter.discarded", reason: "always_discard_pattern")
            return false
          end

          # Priority 2: Always save
          if matches_patterns?(event_name, @always_save_patterns)
            increment_metric("e11y.dlq.filter.saved", reason: "always_save_pattern")
            return true
          end

          # Priority 3: Severity-based
          if @save_severities.include?(severity)
            increment_metric("e11y.dlq.filter.saved", reason: "severity")
            return true
          end

          # Priority 4: Default behavior
          if @default_behavior == :save
            increment_metric("e11y.dlq.filter.saved", reason: "default")
            true
          else
            increment_metric("e11y.dlq.filter.discarded", reason: "default")
            false
          end
        end

        # Get filter statistics.
        #
        # @return [Hash] Filter configuration stats
        def stats
          {
            always_save_patterns: @always_save_patterns.map(&:inspect),
            always_discard_patterns: @always_discard_patterns.map(&:inspect),
            save_severities: @save_severities,
            default_behavior: @default_behavior
          }
        end

        private

        # Check if event name matches any of the patterns.
        #
        # @param event_name [String] Event name
        # @param patterns [Array<Regexp>] Patterns to match
        # @return [Boolean] true if event matches any pattern
        def matches_patterns?(event_name, patterns)
          patterns.any? { |pattern| pattern.match?(event_name) }
        end

        # Increment DLQ filter metric.
        #
        # @param metric_name [String] Metric name
        # @param tags [Hash] Additional tags
        def increment_metric(metric_name, tags = {})
          # TODO: Integrate with Yabeda metrics
          # E11y::Metrics.increment(metric_name, tags)
        end
      end
    end
  end
end
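
As a quick orientation to the priority rules above, here is a minimal sketch of the filter in use. It relies only on the constructor arguments and should_save? shown in this file; the event hashes themselves are made up for illustration.

filter = E11y::Reliability::DLQ::Filter.new(
  always_save_patterns: [/^payment\./, /^audit\./],
  always_discard_patterns: [/^debug\./],
  save_severities: %i[error fatal],
  default_behavior: :discard
)

filter.should_save?(event_name: "debug.cache_miss", severity: :error)  # => false (discard pattern outranks severity)
filter.should_save?(event_name: "payment.captured", severity: :info)   # => true  (always-save pattern)
filter.should_save?(event_name: "user.signed_in", severity: :fatal)    # => true  (severity rule)
filter.should_save?(event_name: "user.signed_in", severity: :info)     # => false (default_behavior: :discard)
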
data/lib/e11y/reliability/retry_handler.rb
@@ -0,0 +1,207 @@
# frozen_string_literal: true

require "securerandom"

module E11y
  module Reliability
    # Retry handler with exponential backoff and jitter.
    #
    # Automatically retries transient failures with increasing delays.
    # Integrates with CircuitBreaker and DLQ for comprehensive error handling.
    #
    # @example Usage
    #   retry_handler = RetryHandler.new(config: config)
    #
    #   retry_handler.with_retry(adapter: adapter, event: event_data) do
    #     adapter.send(event_data)
    #   end
    #
    # @see ADR-013 §3 (Retry Policy)
    # @see UC-021 §2 (Exponential Backoff with Jitter)
    class RetryHandler
      # Retry exhausted error (all retries failed)
      class RetryExhaustedError < StandardError
        attr_reader :original_error, :retry_count

        def initialize(original_error, retry_count:)
          @original_error = original_error
          @retry_count = retry_count
          super("Retry exhausted after #{retry_count} attempts: #{original_error.message}")
        end
      end

      # Transient errors that should be retried
      TRANSIENT_ERRORS = [
        Timeout::Error,
        Errno::ECONNREFUSED,
        Errno::ECONNRESET,
        Errno::ETIMEDOUT,
        Errno::EHOSTUNREACH,
        Errno::ENETUNREACH
      ].freeze

      # HTTP status codes that should be retried (5xx server errors)
      RETRIABLE_HTTP_STATUS_CODES = (500..599)

      # @param config [Hash] Configuration options
      # @option config [Integer] :max_attempts Maximum retry attempts (default: 3)
      # @option config [Float] :base_delay_ms Initial delay in milliseconds (default: 100)
      # @option config [Float] :max_delay_ms Maximum delay in milliseconds (default: 5000)
      # @option config [Float] :jitter_factor Jitter factor (0.0-1.0, default: 0.1)
      # @option config [Boolean] :fail_on_error Raise error after max retries (default: true)
      def initialize(config: {})
        @max_attempts = config[:max_attempts] || 3
        @base_delay_ms = config[:base_delay_ms] || 100.0
        @max_delay_ms = config[:max_delay_ms] || 5000.0
        @jitter_factor = config[:jitter_factor] || 0.1
        @fail_on_error = config.fetch(:fail_on_error, true)
      end

      # Execute block with retry logic.
      #
      # @param adapter [E11y::Adapters::Base] Adapter instance
      # @param event [Hash] Event data
      # @yield Block to execute (adapter send)
      # @return [Object] Result of block execution
      # @raise [RetryExhaustedError] if all retries fail and fail_on_error is true
      def with_retry(adapter:, event:)
        attempt = 0

        loop do
          attempt += 1

          begin
            result = yield
            on_success(adapter, event, attempt)
            return result # Return actual result, not true
          rescue StandardError => e
            # Check if error is retriable
            unless retriable_error?(e)
              on_permanent_failure(adapter, event, e, attempt)
              raise RetryExhaustedError.new(e, retry_count: attempt) if @fail_on_error

              return nil
            end

            # Check if max attempts reached
            if attempt >= @max_attempts
              on_max_retries_exhausted(adapter, event, e, attempt)
              raise RetryExhaustedError.new(e, retry_count: attempt) if @fail_on_error

              return nil
            end

            # Calculate backoff delay
            delay_ms = calculate_backoff_delay(attempt)
            on_retry_attempt(adapter, event, e, attempt, delay_ms)

            # Sleep with backoff
            sleep(delay_ms / 1000.0)
          end
        end
      end

      private

      # Check if error should be retried.
      #
      # @param error [StandardError] The error that occurred
      # @return [Boolean] true if error is retriable
      def retriable_error?(error)
        # Check if error class is in transient errors list
        return true if TRANSIENT_ERRORS.any? { |klass| error.is_a?(klass) }

        # Check HTTP status codes (if error has response)
        if error.respond_to?(:response) && error.response.respond_to?(:code)
          status_code = error.response.code.to_i
          return true if RETRIABLE_HTTP_STATUS_CODES.cover?(status_code)
        end

        false
      end

      # Calculate exponential backoff delay with jitter.
      #
      # Formula: base_delay * (2 ^ (attempt - 1)) + jitter
      # Jitter: random value between [-jitter_factor * delay, +jitter_factor * delay]
      #
      # @param attempt [Integer] Current attempt number (1-indexed)
      # @return [Float] Delay in milliseconds
      def calculate_backoff_delay(attempt)
        # Exponential backoff: base * 2^(attempt-1)
        exponential_delay = @base_delay_ms * (2**(attempt - 1))

        # Cap at max_delay
        exponential_delay = [@max_delay_ms, exponential_delay].min

        # Add jitter: +/- jitter_factor * delay
        jitter_range = exponential_delay * @jitter_factor
        jitter = rand(-jitter_range..jitter_range)

        exponential_delay + jitter
      end

      # Handle successful execution.
      def on_success(adapter, _event, attempt)
        increment_metric("e11y.retry.success", adapter: adapter.class.name, attempts: attempt)

        # Log if retry was needed
        return unless attempt > 1

        increment_metric("e11y.retry.recovered", adapter: adapter.class.name, attempts: attempt)
      end

      # Handle permanent failure (non-retriable error).
      def on_permanent_failure(adapter, _event, error, attempt)
        increment_metric(
          "e11y.retry.permanent_failure",
          adapter: adapter.class.name,
          error: error.class.name,
          attempt: attempt
        )
      end

      # Handle max retries exhausted (all attempts failed).
      def on_max_retries_exhausted(adapter, _event, error, attempt)
        increment_metric(
          "e11y.retry.exhausted",
          adapter: adapter.class.name,
          error: error.class.name,
          attempts: attempt
        )
      end

      # Handle retry attempt.
      def on_retry_attempt(adapter, _event, error, attempt, delay_ms)
        increment_metric(
          "e11y.retry.attempt",
          adapter: adapter.class.name,
          error: error.class.name,
          attempt: attempt
        )

        # Track backoff delay histogram
        track_histogram("e11y.retry.backoff_delay_ms", delay_ms, adapter: adapter.class.name)
      end

      # Increment retry metric.
      #
      # @param metric_name [String] Metric name
      # @param tags [Hash] Additional tags
      def increment_metric(metric_name, tags = {})
        # TODO: Integrate with Yabeda metrics
        # E11y::Metrics.increment(metric_name, tags)
      end

      # Track histogram metric.
      #
      # @param metric_name [String] Metric name
      # @param value [Numeric] Value to track
      # @param tags [Hash] Additional tags
      def track_histogram(metric_name, value, tags = {})
        # TODO: Integrate with Yabeda metrics
        # E11y::Metrics.histogram(metric_name, value, tags)
      end
    end
  end
end
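
To make the backoff schedule concrete: with the defaults above (100 ms base, doubling per attempt, ±10% jitter, 5 s cap), the handler sleeps roughly 90-110 ms after the first failure and 180-220 ms after the second, and the third consecutive failure raises RetryExhaustedError when fail_on_error is true. The following is a self-contained sketch; the FlakyBackend stub is invented purely for illustration, and the adapter argument is only used for metric tags.

# Hypothetical stand-in for an adapter backend that fails twice, then succeeds.
class FlakyBackend
  def initialize
    @calls = 0
  end

  def deliver(event)
    @calls += 1
    raise Errno::ECONNREFUSED, "collector unreachable" if @calls < 3

    { delivered: event[:event_name] }
  end
end

backend = FlakyBackend.new
handler = E11y::Reliability::RetryHandler.new(
  config: { max_attempts: 3, base_delay_ms: 100.0, jitter_factor: 0.1 }
)

event = { event_name: "payment.processed", severity: :error }
result = handler.with_retry(adapter: backend, event: event) { backend.deliver(event) }
# Sleeps ~100 ms after the 1st failure, ~200 ms after the 2nd, then returns the block's value:
# result => { delivered: "payment.processed" }
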
data/lib/e11y/reliability/retry_rate_limiter.rb
@@ -0,0 +1,117 @@
# frozen_string_literal: true

module E11y
  module Reliability
    # Retry Rate Limiter prevents thundering herd on adapter recovery.
    #
    # Implements staged batching with jitter to smooth retry load.
    # Prevents retry storms when adapters recover from failures.
    #
    # @example Usage
    #   limiter = RetryRateLimiter.new(limit: 50, window: 1.0)
    #
    #   limiter.allow?(adapter_name, event_data) # => true/false
    #
    # @see ADR-013 §3.5 (C06 Resolution: Retry Rate Limiting)
    # @see UC-021 §5 (Retry Storm Prevention)
    class RetryRateLimiter
      # @param limit [Integer] Max retries per window (default: 50 retries/sec)
      # @param window [Float] Window size in seconds (default: 1.0)
      # @param on_limit_exceeded [Symbol] Action when limit exceeded (:delay or :dlq, default: :delay)
      # @param jitter_range [Float] Jitter factor (0.0-1.0, default: 0.2 = ±20%)
      def initialize(limit: 50, window: 1.0, on_limit_exceeded: :delay, jitter_range: 0.2)
        @limit = limit
        @window = window
        @on_limit_exceeded = on_limit_exceeded
        @jitter_range = jitter_range

        # Track retry counts per adapter per window
        @retry_counts = Hash.new { |h, k| h[k] = [] }
        @mutex = Mutex.new
      end

      # Check if retry is allowed for adapter.
      #
      # @param adapter_name [String] Adapter name
      # @param event_data [Hash] Event data (optional, for metrics)
      # @return [Boolean] true if retry allowed
      def allow?(adapter_name, event_data = {})
        @mutex.synchronize do
          cleanup_old_entries(adapter_name)

          current_count = @retry_counts[adapter_name].size

          if current_count >= @limit
            on_limit_exceeded(adapter_name, event_data)
            false
          else
            @retry_counts[adapter_name] << Time.now
            increment_metric("e11y.retry_rate_limiter.allowed", adapter: adapter_name)
            true
          end
        end
      end

      # Get current retry rate for adapter.
      #
      # @param adapter_name [String] Adapter name
      # @return [Hash] Current stats (count, limit, window)
      def stats(adapter_name)
        @mutex.synchronize do
          cleanup_old_entries(adapter_name)

          {
            adapter: adapter_name,
            current_count: @retry_counts[adapter_name].size,
            limit: @limit,
            window: @window,
            utilization: (@retry_counts[adapter_name].size.to_f / @limit * 100).round(2)
          }
        end
      end

      # Reset retry counts for adapter (for testing).
      #
      # @param adapter_name [String] Adapter name
      def reset!(adapter_name = nil)
        @mutex.synchronize do
          if adapter_name
            @retry_counts.delete(adapter_name)
          else
            @retry_counts.clear
          end
        end
      end

      private

      # Remove retry entries outside current window.
      def cleanup_old_entries(adapter_name)
        cutoff_time = Time.now - @window
        @retry_counts[adapter_name].reject! { |timestamp| timestamp < cutoff_time }
      end

      # Handle limit exceeded based on configured strategy.
      def on_limit_exceeded(adapter_name, _event_data)
        increment_metric("e11y.retry_rate_limiter.exceeded", adapter: adapter_name)

        case @on_limit_exceeded
        when :delay
          # Calculate delay with jitter
          delay_sec = @window + rand((-@jitter_range * @window)..(@jitter_range * @window))
          increment_metric("e11y.retry_rate_limiter.delayed", adapter: adapter_name, delay_sec: delay_sec)
          # Caller should sleep(delay_sec) before retry
        when :dlq
          # Caller should save to DLQ instead of retrying
          increment_metric("e11y.retry_rate_limiter.dlq", adapter: adapter_name)
        end
      end

      # Increment retry rate limiter metric.
      def increment_metric(metric_name, tags = {})
        # TODO: Integrate with Yabeda metrics
        # E11y::Metrics.increment(metric_name, tags)
      end
    end
  end
end
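
A small sketch of the limiter gating retries, using the public allow? and stats calls above. The tiny limit is chosen only so the cut-off is visible, and the "loki" label is just an adapter name used for bookkeeping; the limiter itself only counts, so the caller decides whether to delay or route to the DLQ.

limiter = E11y::Reliability::RetryRateLimiter.new(limit: 2, window: 1.0, on_limit_exceeded: :dlq)

# With limit: 2, only the first two retries within the same 1-second window get through.
3.times.map { limiter.allow?("loki") } # => [true, true, false]

limiter.stats("loki")
# => { adapter: "loki", current_count: 2, limit: 2, window: 1.0, utilization: 100.0 }
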
data/lib/e11y/sampling/error_spike_detector.rb
@@ -0,0 +1,225 @@
# frozen_string_literal: true

module E11y
  module Sampling
    # Error Spike Detector for Adaptive Sampling (FEAT-4838.1)
    #
    # Detects sudden increases in error rates and adjusts sampling accordingly.
    # Implements error-based adaptive sampling strategy from ADR-009 §3.2.
    #
    # Features:
    # - Sliding window for error rate calculation
    # - Absolute threshold (errors/minute)
    # - Relative threshold (ratio to baseline)
    # - Per-event and global error tracking
    #
    # @example Configuration
    #   detector = E11y::Sampling::ErrorSpikeDetector.new(
    #     window: 60,               # 60 seconds sliding window
    #     absolute_threshold: 100,  # 100 errors/min triggers spike
    #     relative_threshold: 3.0,  # 3x normal rate triggers spike
    #     spike_duration: 300       # Keep 100% sampling for 5 minutes
    #   )
    #
    # @example Usage
    #   if detector.error_spike?
    #     sample_rate = 1.0 # 100% sampling during spike
    #   else
    #     sample_rate = 0.1 # 10% normal sampling
    #   end
    #
    #   detector.record_event(event_name: "payment.processed", severity: :error)
    class ErrorSpikeDetector
      # Default configuration
      DEFAULT_WINDOW = 60               # 60 seconds sliding window
      DEFAULT_ABSOLUTE_THRESHOLD = 100  # 100 errors/min triggers spike
      DEFAULT_RELATIVE_THRESHOLD = 3.0  # 3x normal rate triggers spike
      DEFAULT_SPIKE_DURATION = 300      # Keep elevated sampling for 5 minutes

      attr_reader :window, :absolute_threshold, :relative_threshold, :spike_duration

      # Initialize error spike detector
      #
      # @param config [Hash] Configuration options
      # @option config [Integer] :window (60) Sliding window in seconds
      # @option config [Integer] :absolute_threshold (100) Errors/min to trigger spike
      # @option config [Float] :relative_threshold (3.0) Multiplier vs baseline to trigger spike
      # @option config [Integer] :spike_duration (300) Seconds to keep elevated sampling
      def initialize(config = {})
        @window = config.fetch(:window, DEFAULT_WINDOW)
        @absolute_threshold = config.fetch(:absolute_threshold, DEFAULT_ABSOLUTE_THRESHOLD)
        @relative_threshold = config.fetch(:relative_threshold, DEFAULT_RELATIVE_THRESHOLD)
        @spike_duration = config.fetch(:spike_duration, DEFAULT_SPIKE_DURATION)

        # Event tracking (per event name)
        @error_events = Hash.new { |h, k| h[k] = [] } # event_name => [timestamp, ...]
        @all_errors = []                              # All errors (global)
        @baseline_rates = Hash.new(0.0)               # event_name => baseline error rate

        # Spike state
        @spike_started_at = nil
        @mutex = Mutex.new
      end

      # Check if currently in error spike state
      #
      # @return [Boolean] true if error spike detected
      def error_spike?
        @mutex.synchronize do
          # Check if spike is still active (within spike_duration)
          if @spike_started_at
            elapsed = Time.now - @spike_started_at
            return true if elapsed < @spike_duration

            # Spike expired - check if it should continue
            if spike_detected?
              @spike_started_at = Time.now # Extend spike
              return true
            else
              @spike_started_at = nil # End spike
              return false
            end
          end

          # Check for new spike
          if spike_detected?
            @spike_started_at = Time.now
            return true
          end

          false
        end
      end

      # Record an event for error rate tracking
      #
      # @param event_data [Hash] Event payload
      # @option event_data [String] :event_name Event name
      # @option event_data [Symbol] :severity Event severity
      def record_event(event_data)
        return unless error_severity?(event_data[:severity])

        @mutex.synchronize do
          now = Time.now
          event_name = event_data[:event_name]

          # Record error
          @error_events[event_name] << now
          @all_errors << now

          # Cleanup old events (outside window)
          cleanup_old_events(now)

          # Update baseline (if not in spike)
          update_baseline(event_name) unless @spike_started_at
        end
      end

      # Get current error rate (errors per minute)
      #
      # @param event_name [String, nil] Event name, or nil for global rate
      # @return [Float] Errors per minute
      def current_error_rate(event_name = nil)
        @mutex.synchronize do
          now = Time.now
          cleanup_old_events(now)

          events = event_name ? @error_events[event_name] : @all_errors
          count = events.count { |ts| (now - ts) <= @window }

          # Convert to per-minute rate
          (count.to_f / @window) * 60
        end
      end

      # Get baseline error rate
      #
      # @param event_name [String] Event name
      # @return [Float] Baseline errors per minute
      def baseline_error_rate(event_name)
        @mutex.synchronize { @baseline_rates[event_name] }
      end

      # Reset detector state (useful for testing)
      def reset!
        @mutex.synchronize do
          @error_events.clear
          @all_errors.clear
          @baseline_rates.clear
          @spike_started_at = nil
        end
      end

      private

      # Check if severity is an error
      #
      # @param severity [Symbol, nil] Severity level
      # @return [Boolean] true if error or fatal
      def error_severity?(severity)
        %i[error fatal].include?(severity)
      end

      # Detect if spike conditions are met
      #
      # @return [Boolean] true if spike detected
      def spike_detected?
        # Check absolute threshold (global)
        global_rate = current_error_rate_unsafe
        return true if global_rate > @absolute_threshold

        # Check relative threshold (per event name)
        @error_events.each_key do |event_name|
          current_rate = current_error_rate_unsafe(event_name)
          baseline = @baseline_rates[event_name]

          # Only check relative if we have a baseline
          return true if baseline.positive? && current_rate > (baseline * @relative_threshold)
        end

        false
      end

      # Get current error rate (unsafe - must be called within mutex)
      #
      # @param event_name [String, nil] Event name, or nil for global
      # @return [Float] Errors per minute
      def current_error_rate_unsafe(event_name = nil)
        now = Time.now
        events = event_name ? @error_events[event_name] : @all_errors
        count = events.count { |ts| (now - ts) <= @window }
        (count.to_f / @window) * 60
      end

      # Update baseline error rate (unsafe - must be called within mutex)
      #
      # @param event_name [String] Event name
      def update_baseline(event_name)
        # Exponential moving average (EMA) with alpha = 0.1
        current_rate = current_error_rate_unsafe(event_name)
        old_baseline = @baseline_rates[event_name]

        @baseline_rates[event_name] = if old_baseline.zero?
                                        current_rate
                                      else
                                        (0.1 * current_rate) + (0.9 * old_baseline)
                                      end
      end

      # Cleanup events outside the sliding window
      #
      # @param now [Time] Current timestamp
      def cleanup_old_events(now)
        cutoff = now - @window

        # Cleanup per-event errors
        @error_events.each_value do |events|
          events.reject! { |ts| ts < cutoff }
        end

        # Cleanup global errors
        @all_errors.reject! { |ts| ts < cutoff }
      end
    end
  end
end
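
Following the @example blocks above, here is a sketch of how a sampler might consult the detector; the simulated burst and the 0.1 base sample rate are illustrative only.

detector = E11y::Sampling::ErrorSpikeDetector.new(
  window: 60, absolute_threshold: 100, relative_threshold: 3.0, spike_duration: 300
)

# Simulate a burst well above the absolute threshold (100 errors/min over the 60s window).
150.times { detector.record_event(event_name: "payment.processed", severity: :error) }

detector.current_error_rate("payment.processed") # => 150.0 errors/min
detector.error_spike?                            # => true (remains true for spike_duration seconds)

sample_rate = detector.error_spike? ? 1.0 : 0.1  # 100% sampling during a spike, 10% otherwise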