e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../reliability/retry_handler"
|
|
4
|
+
require_relative "../reliability/circuit_breaker"
|
|
5
|
+
|
|
6
|
+
module E11y
|
|
7
|
+
module Adapters
|
|
8
|
+
# Base class for all E11y adapters
|
|
9
|
+
#
|
|
10
|
+
# Provides standard interface for event destinations following ADR-004.
|
|
11
|
+
# All adapters must implement {#write} method, optionally override {#write_batch}
|
|
12
|
+
# for performance optimization.
|
|
13
|
+
#
|
|
14
|
+
# @abstract Subclass and implement {#write}, optionally {#write_batch}
|
|
15
|
+
#
|
|
16
|
+
# @example Define custom adapter
|
|
17
|
+
# class CustomAdapter < E11y::Adapters::Base
|
|
18
|
+
#     def initialize(config = {})
#       # Assign before calling super: Base#initialize already runs
#       # validate_config!, which needs @url to be set.
#       @url = config.fetch(:url)
#       super
#     end
|
|
23
|
+
#
|
|
24
|
+
# def write(event_data)
|
|
25
|
+
# # Send single event to external system
|
|
26
|
+
# send_to_api(event_data)
|
|
27
|
+
# true
|
|
28
|
+
# rescue => e
|
|
29
|
+
# warn "Adapter error: #{e.message}"
|
|
30
|
+
# false
|
|
31
|
+
# end
|
|
32
|
+
#
|
|
33
|
+
# def capabilities
|
|
34
|
+
# {
|
|
35
|
+
# batching: false,
|
|
36
|
+
# compression: false,
|
|
37
|
+
# async: false,
|
|
38
|
+
# streaming: false
|
|
39
|
+
# }
|
|
40
|
+
# end
|
|
41
|
+
#
|
|
42
|
+
# private
|
|
43
|
+
#
|
|
44
|
+
# def validate_config!
|
|
45
|
+
# raise ArgumentError, "url is required" unless @url
|
|
46
|
+
# end
|
|
47
|
+
# end
|
|
48
|
+
#
|
|
49
|
+
# @see ADR-004 Section 3.1 (Base Adapter Contract)
|
|
50
|
+
class Base
|
|
51
|
+
attr_reader :config
|
|
52
|
+
|
|
53
|
+
# Build the adapter from a configuration hash.
#
# Stores the hash, decides whether the reliability layer is active
# (`config[:reliability][:enabled]`, true when absent), wires that layer up
# when it is active, and finally runs {#validate_config!}.
#
# @param config [Hash] Adapter-specific configuration
# @option config [Hash] :reliability Reliability settings (retry, circuit_breaker, dlq)
def initialize(config = {})
  @config = config

  reliability_settings = config.fetch(:reliability, {})
  @reliability_enabled = reliability_settings.fetch(:enabled, true)
  setup_reliability_layer if @reliability_enabled

  # Runs last so subclass validation sees the stored config.
  validate_config!
end
|
|
65
|
+
|
|
66
|
+
# Write a single event (synchronous).
#
# Abstract hook: every concrete adapter must override it to deliver one
# event to its destination. The base implementation only raises.
#
# @param event_data [Hash] Event payload with keys:
#   - :event_name [String] Event name (e.g., "order.paid")
#   - :severity [Symbol] Severity level (:debug, :info, :success, :warn, :error, :fatal)
#   - :timestamp [Time] Event timestamp
#   - :payload [Hash] Event-specific data
#   - :trace_id [String, nil] Trace ID (if tracing enabled)
#   - :span_id [String, nil] Span ID (if tracing enabled)
# @return [Boolean] true on success, false on failure (failures should be logged)
# @raise [NotImplementedError] if not overridden in subclass
#
# @example
#   def write(event_data)
#     send_to_api(event_data)
#     true
#   rescue => e
#     warn "Adapter error: #{e.message}"
#     false
#   end
def write(_event_data)
  message = "#{self.class}#write must be implemented"
  raise NotImplementedError, message
end
|
|
93
|
+
|
|
94
|
+
# Write an event through the reliability layer (retry + circuit breaker).
#
# When the reliability layer is disabled this is a plain call to {#write}
# and its result is returned unchanged. Otherwise {#write} runs inside the
# circuit breaker, which in turn runs inside the retry handler.
#
# Retry exhaustion and an open circuit are both recorded through
# track_adapter_failure and then passed to handle_reliability_error, which
# either re-raises or returns false depending on
# `E11y.config.error_handling.fail_on_error`.
#
# @param event_data [Hash] Event payload
# @return [Boolean] true on success; the handle_reliability_error result on failure
# @raise [RetryExhaustedError, CircuitOpenError] if fail_on_error=true
def write_with_reliability(event_data)
  return write(event_data) unless @reliability_enabled

  started_at = Time.now
  begin
    @retry_handler.with_retry(adapter: self, event: event_data) do
      @circuit_breaker.call { write(event_data) }
    end

    # Reached only when no reliability error was raised.
    track_adapter_success(event_data, started_at)
    true
  rescue E11y::Reliability::RetryHandler::RetryExhaustedError => e
    track_adapter_failure(event_data, e, started_at)
    handle_reliability_error(event_data, e, :retry_exhausted)
  rescue E11y::Reliability::CircuitBreaker::CircuitOpenError => e
    track_adapter_failure(event_data, e, started_at)
    handle_reliability_error(event_data, e, :circuit_open)
  end
end
|
|
128
|
+
|
|
129
|
+
# Write a batch of events (preferred for performance).
#
# Default implementation calls {#write} once for every event in the batch,
# even after an earlier event has failed, so a single failed write never
# causes the rest of the batch to be dropped. Subclasses should override
# this when the destination offers a real batch API.
#
# @param events [Array<Hash>] Array of event payloads (same format as {#write})
# @return [Boolean] true if all events written successfully (also for an
#   empty batch), false otherwise
#
# @example Override for batch API
#   def write_batch(events)
#     send_batch_to_api(events)
#     true
#   rescue => e
#     warn "Batch error: #{e.message}"
#     false
#   end
def write_batch(events)
  # `map` (not `all?`) so every event is attempted before the verdict is computed.
  outcomes = events.map { |event| write(event) }
  outcomes.all?
end
|
|
149
|
+
|
|
150
|
+
# Report whether the adapter can currently accept events.
#
# The base adapter has nothing to probe, so it always reports healthy.
# Subclasses can override this to run a real check (e.g., ping the
# destination).
#
# @return [Boolean] Health status (true = healthy, false = unhealthy)
#
# @example
#   def healthy?
#     ping_api
#     true
#   rescue
#     false
#   end
def healthy?
  # No destination to probe at this level.
  true
end
|
|
167
|
+
|
|
168
|
+
# Release adapter resources.
#
# Intended for graceful shutdown. The base implementation does nothing;
# subclasses should override it to close HTTP connections, flush internal
# buffers and release any other resources they hold.
#
# @return [void]
#
# @example
#   def close
#     @buffer.flush! if @buffer.any?
#     @connection.close
#   end
def close
  # Intentionally empty: the base adapter holds no resources.
end
|
|
185
|
+
|
|
186
|
+
# Declare which optional features this adapter supports.
#
# The base adapter supports none of them, so every flag is false.
# Subclasses should override and switch on what they implement.
#
# @return [Hash] Capability flags with keys:
#   - :batching [Boolean] Supports efficient batch writes
#   - :compression [Boolean] Supports compression
#   - :async [Boolean] Non-blocking writes
#   - :streaming [Boolean] Supports streaming
#
# @example
#   def capabilities
#     { batching: true, compression: true, async: false, streaming: false }
#   end
def capabilities
  %i[batching compression async streaming].each_with_object({}) do |flag, flags|
    flags[flag] = false
  end
end
|
|
214
|
+
|
|
215
|
+
private
|
|
216
|
+
|
|
217
|
+
# Validation hook invoked at the end of #initialize.
#
# The base adapter accepts any configuration. Subclasses should override
# this and raise ArgumentError when their configuration is invalid.
#
# @raise [ArgumentError] if configuration is invalid (subclasses only)
#
# @example
#   def validate_config!
#     raise ArgumentError, "url is required" unless @config[:url]
#   end
def validate_config!
  # Nothing to validate at the base level.
end
|
|
231
|
+
|
|
232
|
+
# Convert an event into the representation this adapter sends.
#
# The base implementation is the identity: it hands back the very object it
# was given. Subclasses can override it to produce an adapter-specific
# format.
#
# @param event_data [Hash] Event payload
# @return [Hash, String] Formatted event
#
# @example
#   def format_event(event_data)
#     {
#       timestamp: event_data[:timestamp].iso8601,
#       message: event_data[:event_name],
#       level: event_data[:severity]
#     }
#   end
def format_event(event_data)
  # Pass-through: no copy, no transformation.
  event_data
end
|
|
250
|
+
|
|
251
|
+
# Run the given block, retrying it when it fails with a transient error.
#
# The block is always executed at least once. After a StandardError the
# block is re-run only if {#retriable_error?} accepts the error and fewer
# than `max_attempts` attempts have been made; otherwise the error is
# re-raised. Before each retry a warning is emitted and the thread sleeps
# for the delay computed by calculate_backoff_delay.
#
# @param max_attempts [Integer] Maximum retry attempts (default: 3)
# @param base_delay [Float] Initial retry delay in seconds (default: 1.0)
# @param max_delay [Float] Maximum retry delay in seconds (default: 16.0)
# @param jitter [Float] Jitter factor (0.0-1.0, default: 0.2 for ±20%)
# @yield Block to execute with retry
# @return [Object] Block result
# @raise Last exception if all retries exhausted
#
# @example Retry HTTP request
#   def write(event_data)
#     with_retry(max_attempts: 5) do
#       http_client.post(event_data)
#     end
#     true
#   rescue => e
#     warn "Failed after retries: #{e.message}"
#     false
#   end
#
# @see ADR-004 Section 7.1 (Retry Policy)
def with_retry(max_attempts: 3, base_delay: 1.0, max_delay: 16.0, jitter: 0.2)
  tries = 0

  begin
    tries += 1
    yield
  rescue StandardError => error
    # Give up on non-transient errors or once the attempt budget is spent.
    raise unless retriable_error?(error) && tries < max_attempts

    pause = calculate_backoff_delay(tries, base_delay, max_delay, jitter)
    warn "[E11y] #{self.class.name} retry #{tries}/#{max_attempts} after #{pause.round(2)}s: #{error.message}"
    sleep(pause)
    retry
  end
end
|
|
291
|
+
|
|
292
|
+
# Decide whether an error is transient and therefore worth retrying.
#
# An error is retriable when it is an instance of one of the known
# timeout/connection error classes (each consulted only if the constant is
# loaded, so an absent library can never cause a NameError here), or when
# it exposes a `response` Hash whose `:status` is a number in the HTTP 5xx
# range. A missing or non-numeric status is treated as not retriable
# instead of raising, because this method runs inside a rescue handler and
# must not mask the original error.
#
# Override in subclasses to customize retriable error detection.
#
# @param error [Exception] Error to check
# @return [Boolean] true if error is retriable
#
# @example Add custom retriable errors
#   def retriable_error?(error)
#     super || error.is_a?(CustomTransientError)
#   end
def retriable_error?(error)
  transient = []

  # Network timeout errors
  transient << Timeout::Error if defined?(Timeout::Error)
  transient << Net::ReadTimeout if defined?(Net::ReadTimeout)
  transient << Net::OpenTimeout if defined?(Net::OpenTimeout)

  # Connection errors
  transient << Errno::ECONNREFUSED if defined?(Errno::ECONNREFUSED)
  transient << Errno::ECONNRESET if defined?(Errno::ECONNRESET)
  transient << Errno::ETIMEDOUT if defined?(Errno::ETIMEDOUT)
  transient << Errno::EHOSTUNREACH if defined?(Errno::EHOSTUNREACH)

  # HTTP client errors (Faraday)
  transient << Faraday::TimeoutError if defined?(Faraday::TimeoutError)
  transient << Faraday::ConnectionFailed if defined?(Faraday::ConnectionFailed)

  return true if transient.any? { |klass| error.is_a?(klass) }

  # HTTP 5xx errors (server errors are retriable)
  return false unless error.respond_to?(:response) && error.response.is_a?(Hash)

  # Range#cover? returns false for nil or non-comparable values.
  (500...600).cover?(error.response[:status])
end
|
|
330
|
+
|
|
331
|
+
# Calculate exponential backoff delay with jitter.
#
# The un-jittered delay doubles with every attempt
# (`base_delay * 2**(attempt - 1)`) and is capped at `max_delay`. A random
# offset of up to ±`jitter` times that delay is then added. The result is
# floored at zero so an oversized jitter factor can never yield a negative
# value, which Kernel#sleep would reject with ArgumentError.
#
# @param attempt [Integer] Current attempt number (1-based)
# @param base_delay [Float] Base delay in seconds
# @param max_delay [Float] Maximum delay in seconds (applied before jitter)
# @param jitter [Float] Jitter factor (0.0-1.0)
# @return [Float] Delay in seconds, never negative
#
# @api private
def calculate_backoff_delay(attempt, base_delay, max_delay, jitter)
  # Exponential: 1s, 2s, 4s, 8s, 16s... then capped.
  capped_delay = [base_delay * (2**(attempt - 1)), max_delay].min

  # (rand * 2) - 1 is uniform in [-1, 1), giving an offset within ±jitter.
  offset = capped_delay * jitter * ((rand * 2) - 1)

  [capped_delay + offset, 0.0].max
end
|
|
349
|
+
|
|
350
|
+
# Run the given block under this adapter's built-in circuit breaker.
#
# Breaker state is created lazily on first use. While the circuit is open
# the block is not executed and CircuitOpenError is raised, unless
# `timeout` seconds have passed since the last failure; in that case the
# circuit moves to half-open and the block is allowed through. The outcome
# of the block is then reported to on_circuit_success or
# on_circuit_failure, and any error from the block is re-raised.
#
# Note: state lives in instance variables of this adapter, so the breaker
# is scoped to a single adapter instance.
#
# @param failure_threshold [Integer] Failures before opening circuit (default: 5)
# @param timeout [Integer] Seconds before testing half-open (default: 60)
# @yield Block to execute
# @return [Object] Block result
# @raise [CircuitOpenError] if circuit is open
#
# @example Wrap HTTP calls
#   def write(event_data)
#     with_circuit_breaker do
#       http_client.post(event_data)
#     end
#     true
#   rescue CircuitOpenError => e
#     warn "Circuit open: #{e.message}"
#     false
#   end
#
# @see ADR-004 Section 7.2 (Circuit Breaker)
def with_circuit_breaker(failure_threshold: 5, timeout: 60)
  init_circuit_breaker! unless @circuit_state

  @circuit_mutex.synchronize do
    next unless @circuit_state == :open

    still_cooling_down = !circuit_timeout_expired?(timeout)
    raise CircuitOpenError, "Circuit breaker open for #{self.class.name}" if still_cooling_down

    # Cool-down elapsed: let calls through again on probation.
    @circuit_state = :half_open
    @circuit_success_count = 0
  end

  begin
    outcome = yield
    on_circuit_success
    outcome
  rescue StandardError
    on_circuit_failure(failure_threshold)
    raise
  end
end
|
|
400
|
+
|
|
401
|
+
# Create the built-in circuit breaker's state: a fresh mutex, a closed
# circuit, zeroed failure/success counters and no recorded failure time.
#
# @api private
def init_circuit_breaker!
  @circuit_mutex = Mutex.new
  @circuit_state = :closed
  @circuit_failure_count = @circuit_success_count = 0
  @circuit_last_failure_time = nil
end
|
|
411
|
+
|
|
412
|
+
# Record a successful call in the built-in circuit breaker.
#
# Always clears the failure counter. While half-open it also counts the
# success, and the second consecutive success closes the circuit and emits
# a recovery warning.
#
# @api private
def on_circuit_success
  @circuit_mutex.synchronize do
    @circuit_failure_count = 0
    next unless @circuit_state == :half_open

    @circuit_success_count += 1
    next if @circuit_success_count < 2 # 2 successes → close

    @circuit_state = :closed
    warn "[E11y] #{self.class.name} circuit breaker closed (recovered)"
  end
end
|
|
428
|
+
|
|
429
|
+
# Record a failed call in the built-in circuit breaker.
#
# Increments the failure counter, clears the half-open success counter and
# stamps the failure time. The circuit is then opened when either:
#
# - it is closed and the failure counter has reached `threshold`, or
# - it is half-open, i.e. the probe call made after the cool-down failed.
#
# The half-open rule matters: without it a failed probe would leave the
# circuit half-open indefinitely and every later call would be let through
# to a destination that is still failing.
#
# @param threshold [Integer] Failure threshold
# @api private
def on_circuit_failure(threshold)
  @circuit_mutex.synchronize do
    @circuit_failure_count += 1
    @circuit_success_count = 0
    @circuit_last_failure_time = Time.now

    probe_failed = @circuit_state == :half_open
    threshold_reached = @circuit_state == :closed && @circuit_failure_count >= threshold
    next unless probe_failed || threshold_reached

    @circuit_state = :open
    warn "[E11y] #{self.class.name} circuit breaker opened (#{@circuit_failure_count} failures)"
  end
end
|
|
445
|
+
|
|
446
|
+
# Tell whether at least `timeout` seconds have passed since the last
# failure recorded by the built-in circuit breaker.
#
# @param timeout [Integer] Timeout in seconds
# @return [Boolean, nil] true/false once a failure has been recorded;
#   the (falsy) stored value when no failure time is set
# @api private
def circuit_timeout_expired?(timeout)
  last_failure = @circuit_last_failure_time
  return last_failure unless last_failure

  Time.now - last_failure >= timeout
end
|
|
454
|
+
|
|
455
|
+
# Build the collaborators used by #write_with_reliability.
#
# Reads the `:reliability` section of the adapter config and creates a
# RetryHandler from its `:retry` sub-hash and a CircuitBreaker (named after
# this adapter's class) from its `:circuit_breaker` sub-hash. Missing
# sections default to empty hashes. The DLQ filter and storage are left
# unset (nil) here.
#
# @api private
def setup_reliability_layer
  settings = @config.fetch(:reliability, {})

  @retry_handler = E11y::Reliability::RetryHandler.new(
    config: settings.fetch(:retry, {})
  )

  @circuit_breaker = E11y::Reliability::CircuitBreaker.new(
    adapter_name: self.class.name,
    config: settings.fetch(:circuit_breaker, {})
  )

  # DLQ components (will be initialized from E11y.config later)
  @dlq_filter = nil
  @dlq_storage = nil
end
|
|
476
|
+
|
|
477
|
+
# React to a reliability failure (retry exhausted / circuit breaker open).
#
# The event is first offered to the DLQ and a warning is emitted. What
# happens next depends on `E11y.config.error_handling.fail_on_error`
# (C18 Resolution):
# - truthy: the error is re-raised (fast feedback for web requests)
# - falsy: the error is swallowed and false is returned (don't fail
#   background jobs)
#
# @param event_data [Hash] Event payload
# @param error [StandardError] Error that occurred
# @param reason [Symbol] Error reason (:retry_exhausted, :circuit_open)
# @return [Boolean] false (event failed)
# @raise [StandardError] Re-raises if fail_on_error=true
#
# @api private
def handle_reliability_error(event_data, error, reason)
  # Runs before the raise below, so both modes offer the event to the DLQ.
  save_to_dlq_if_needed(event_data, error, reason)

  warn "[E11y] #{self.class.name} #{reason} for event #{event_data[:event_name]}: #{error.message}"

  # TODO: Track metric e11y.event.tracking_failed_silent
  return false unless E11y.config.error_handling.fail_on_error

  raise error
end
|
|
506
|
+
|
|
507
|
+
# Save event to DLQ if filter allows.
#
# Does nothing when no DLQ filter is set or the filter rejects the event.
# Otherwise the event is handed to the DLQ storage (if one is set) together
# with metadata describing the failure. Any StandardError raised along the
# way is reported with a warning and swallowed (C18: a DLQ problem must not
# fail the caller).
#
# The timestamp is formatted with Time#strftime rather than Time#iso8601.
# The output for a UTC time is the same ("YYYY-MM-DDTHH:MM:SSZ"), but
# strftime is always available, whereas iso8601 needs the "time" library on
# older Rubies. A missing method here would be swallowed by the rescue
# below, silently losing the event.
#
# @param event_data [Hash] Event payload
# @param error [StandardError] Error that caused the failure
# @param reason [Symbol] Error reason (:retry_exhausted, :circuit_open)
# @return [Object, nil] storage result, or nil when nothing was saved
#
# @api private
def save_to_dlq_if_needed(event_data, error, reason)
  return unless @dlq_filter&.should_save?(event_data, error)

  failure_metadata = {
    error: error.message,
    error_class: error.class.name,
    reason: reason,
    adapter: self.class.name,
    timestamp: Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
  }

  @dlq_storage&.save(event_data, metadata: failure_metadata)
rescue StandardError => e
  # C18: Don't fail if DLQ save fails
  warn "[E11y] Failed to save event to DLQ: #{e.message}"
end
|
|
524
|
+
|
|
525
|
+
# Track successful adapter write (self-monitoring).
#
# Measures the time elapsed since `start_time` in milliseconds, loads the
# self-monitoring modules, then reports the latency to PerformanceMonitor
# and the success to ReliabilityMonitor under this adapter's class name
# ("AnonymousAdapter" when the class has no name). A StandardError raised
# while reporting is downgraded to a warning so monitoring can never fail
# the write itself.
#
# @param _event_data [Hash] Event payload (unused)
# @param start_time [Time] When the write started
#
# @api private
def track_adapter_success(_event_data, start_time)
  elapsed_ms = ((Time.now - start_time) * 1000).round(2)

  require "e11y/self_monitoring/performance_monitor"
  require "e11y/self_monitoring/reliability_monitor"

  # Use class name or "AnonymousAdapter" for anonymous classes
  reported_name = self.class.name || "AnonymousAdapter"

  E11y::SelfMonitoring::PerformanceMonitor.track_adapter_latency(reported_name, elapsed_ms)
  E11y::SelfMonitoring::ReliabilityMonitor.track_adapter_success(adapter_name: reported_name)
rescue StandardError => e
  # Don't fail if monitoring fails
  warn "[E11y] Self-monitoring error: #{e.message}"
end
|
|
549
|
+
|
|
550
|
+
# Track failed adapter write (self-monitoring).
#
# Measures the time elapsed since `start_time` in milliseconds, loads the
# self-monitoring modules, then reports the latency to PerformanceMonitor
# and the failure (with the error's class name) to ReliabilityMonitor under
# this adapter's class name ("AnonymousAdapter" when the class has no
# name). A StandardError raised while reporting is downgraded to a warning
# so monitoring can never fail the caller.
#
# @param _event_data [Hash] Event payload (unused)
# @param error [StandardError] Error that made the write fail
# @param start_time [Time] When the write started
#
# @api private
def track_adapter_failure(_event_data, error, start_time)
  elapsed_ms = ((Time.now - start_time) * 1000).round(2)

  require "e11y/self_monitoring/performance_monitor"
  require "e11y/self_monitoring/reliability_monitor"

  # Use class name or "AnonymousAdapter" for anonymous classes
  reported_name = self.class.name || "AnonymousAdapter"

  E11y::SelfMonitoring::PerformanceMonitor.track_adapter_latency(reported_name, elapsed_ms)
  E11y::SelfMonitoring::ReliabilityMonitor.track_adapter_failure(
    adapter_name: reported_name,
    error_class: error.class.name
  )
rescue StandardError => e
  # Don't fail if monitoring fails
  warn "[E11y] Self-monitoring error: #{e.message}"
end
|
|
575
|
+
end
|
|
576
|
+
|
|
577
|
+
# Circuit breaker open error.
#
# Signals that a call was rejected because a circuit breaker is open.
# NOTE(review): the `Error` superclass is not defined in this file; it is
# presumably resolved from the enclosing E11y namespace — confirm it is
# loaded before this file.
class CircuitOpenError < Error
end
|
|
579
|
+
end
|
|
580
|
+
end
|