e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
data/lib/e11y/railtie.rb
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Skip Railtie if Rails is not available
|
|
4
|
+
return unless defined?(Rails)
|
|
5
|
+
|
|
6
|
+
require "rails/railtie"
|
|
7
|
+
|
|
8
|
+
module E11y
|
|
9
|
+
# Rails integration via Railtie
|
|
10
|
+
#
|
|
11
|
+
# Provides zero-config Rails integration:
|
|
12
|
+
# - Auto-initialization on Rails boot
|
|
13
|
+
# - Middleware insertion (request context, tracing)
|
|
14
|
+
# - ActiveSupport::Notifications integration
|
|
15
|
+
# - Rails.logger bridge (optional)
|
|
16
|
+
# - Console helpers
|
|
17
|
+
#
|
|
18
|
+
# @example Basic usage (no config needed)
|
|
19
|
+
# # In Rails app, E11y auto-configures:
|
|
20
|
+
#   # - Service name from the Rails application's module name, underscored (e.g. "my_app")
|
|
21
|
+
# # - Environment from Rails.env
|
|
22
|
+
# # - Adapters: stdout (dev), loki (prod)
|
|
23
|
+
#
|
|
24
|
+
# @example Custom configuration
|
|
25
|
+
# # config/initializers/e11y.rb
|
|
26
|
+
# E11y.configure do |config|
|
|
27
|
+
# config.service_name = "my-app"
|
|
28
|
+
# config.adapters.register :loki, E11y::Adapters::Loki.new(url: ENV['LOKI_URL'])
|
|
29
|
+
# end
|
|
30
|
+
#
|
|
31
|
+
# @see ADR-008 §3 (Railtie & Initialization)
|
|
32
|
+
class Railtie < Rails::Railtie
  # Before framework initialization: seed E11y with values taken from Rails.
  config.before_initialize do
    E11y.configure do |e11y_config|
      e11y_config.environment = Rails.env.to_s
      e11y_config.service_name = derive_service_name
      # Disabled in tests by default
      e11y_config.enabled = !Rails.env.test?
    end
  end

  # After framework initialization: wire up the optional instruments.
  # Each one is switched on by its own config section; Sidekiq and ActiveJob
  # are additionally skipped when the library constant is not defined.
  config.after_initialize do
    if E11y.config.enabled
      setup_rails_instrumentation if E11y.config.rails_instrumentation&.enabled
      setup_logger_bridge if E11y.config.logger_bridge&.enabled
      setup_sidekiq if defined?(::Sidekiq) && E11y.config.sidekiq&.enabled
      setup_active_job if defined?(::ActiveJob) && E11y.config.active_job&.enabled
    end
  end

  # Middleware insertion.
  initializer "e11y.middleware" do |app|
    if E11y.config.enabled
      # The request middleware goes ahead of Rails::Rack::Logger so that
      # trace context is set up before any Rails logging.
      app.middleware.insert_before(Rails::Rack::Logger, E11y::Middleware::Request)
    end
  end

  # Console helpers.
  console do
    if E11y.config.enabled
      require "e11y/console"
      E11y::Console.enable!

      puts "E11y loaded. Try: E11y.stats"
    end
  end

  # Rake task helpers.
  rake_tasks do
    next unless E11y.config.enabled

    # TODO: Add rake tasks (e11y:stats, e11y:test_event, etc.)
    # load 'e11y/tasks.rake'
  end

  # Derive the service name from the Rails application class.
  #
  # Any StandardError raised while deriving it (for example a nil module
  # name) yields the fallback name.
  #
  # @return [String] Service name (e.g., "my_app"), or "rails_app" as fallback
  def self.derive_service_name
    app_namespace = Rails.application.class.module_parent_name
    app_namespace.underscore
  rescue StandardError
    "rails_app"
  end

  # Load and activate the Rails instrumentation
  # (ActiveSupport::Notifications → E11y).
  #
  # @return [void]
  def self.setup_rails_instrumentation
    require "e11y/instruments/rails_instrumentation"
    E11y::Instruments::RailsInstrumentation.setup!
  end

  # Load and activate the Rails.logger bridge (optional).
  #
  # @return [void]
  def self.setup_logger_bridge
    require "e11y/logger/bridge"
    E11y::Logger::Bridge.setup!
  end

  # Register the E11y middleware on both Sidekiq chains (server, then client).
  #
  # @return [void]
  def self.setup_sidekiq
    require "e11y/instruments/sidekiq"

    ::Sidekiq.configure_server do |sidekiq|
      sidekiq.server_middleware { |chain| chain.add E11y::Instruments::Sidekiq::ServerMiddleware }
    end

    ::Sidekiq.configure_client do |sidekiq|
      sidekiq.client_middleware { |chain| chain.add E11y::Instruments::Sidekiq::ClientMiddleware }
    end
  end

  # Mix the E11y callbacks into the ActiveJob classes.
  #
  # @return [void]
  def self.setup_active_job
    require "e11y/instruments/active_job"

    callbacks = E11y::Instruments::ActiveJob::Callbacks

    # ApplicationJob first (only when the constant is defined) ...
    ::ApplicationJob.include(callbacks) if defined?(::ApplicationJob)

    # ... then ActiveJob::Base as the fallback.
    ::ActiveJob::Base.include(callbacks)
  end
end
|
|
138
|
+
end
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module E11y
|
|
4
|
+
module Reliability
|
|
5
|
+
# Circuit Breaker pattern implementation for adapter reliability.
|
|
6
|
+
#
|
|
7
|
+
# Prevents cascading failures by opening circuit when adapter fails repeatedly.
|
|
8
|
+
# Three states: CLOSED (healthy), OPEN (failing), HALF_OPEN (testing recovery).
|
|
9
|
+
#
|
|
10
|
+
# @example Usage in adapter
|
|
11
|
+
# circuit_breaker = CircuitBreaker.new(adapter_name: "loki", config: config)
|
|
12
|
+
#
|
|
13
|
+
# circuit_breaker.call do
|
|
14
|
+
# # Send event to adapter
|
|
15
|
+
# adapter.send(event)
|
|
16
|
+
# end
|
|
17
|
+
#
|
|
18
|
+
# @see ADR-013 §5 (Circuit Breaker)
|
|
19
|
+
# @see UC-021 §4 (Circuit Breaker for Adapters)
|
|
20
|
+
class CircuitBreaker
  # Circuit is closed (healthy) - all requests pass through
  STATE_CLOSED = :closed

  # Circuit is open (failing) - all requests fail fast
  STATE_OPEN = :open

  # Circuit is half-open (testing recovery) - requests pass through as
  # recovery probes; one failure re-opens, enough successes close
  STATE_HALF_OPEN = :half_open

  # Circuit breaker opened error (fast fail)
  class CircuitOpenError < StandardError; end

  # @param adapter_name [String] Name of the adapter (for metrics)
  # @param config [Hash] Configuration options
  # @option config [Integer] :failure_threshold Number of failures before opening circuit (default: 5)
  # @option config [Integer] :timeout_seconds Seconds before transitioning to half-open (default: 60)
  # @option config [Integer] :half_open_attempts Success attempts needed in half-open to close (default: 2)
  def initialize(adapter_name:, config: {})
    @adapter_name = adapter_name
    @failure_threshold = config[:failure_threshold] || 5
    @timeout_seconds = config[:timeout_seconds] || 60
    @half_open_attempts = config[:half_open_attempts] || 2

    @state = STATE_CLOSED
    @failure_count = 0
    @success_count = 0
    @last_failure_time = nil
    @opened_at = nil            # wall-clock time, for stats and error messages only
    @opened_at_monotonic = nil  # monotonic reading, used to measure the open timeout
    @mutex = Mutex.new
  end

  # Execute block with circuit breaker protection.
  #
  # @yield Block to execute (adapter send)
  # @return [Object] Result of block execution
  # @raise [CircuitOpenError] if circuit is open
  # @raise [StandardError] whatever the block raises (re-raised after being recorded)
  def call(&)
    check_state_transition

    case @state
    when STATE_CLOSED
      execute_with_closed_circuit(&)
    when STATE_OPEN
      handle_open_circuit
    when STATE_HALF_OPEN
      execute_with_half_open_circuit(&)
    end
  end

  # Check if circuit is healthy (closed state).
  #
  # @return [Boolean] true if circuit is closed
  def healthy?
    @state == STATE_CLOSED
  end

  # Get current circuit breaker statistics.
  #
  # The hash is built under the lock so all fields belong to the same moment.
  #
  # @return [Hash] Statistics hash (:adapter, :state, :failure_count,
  #   :success_count, :last_failure, :opened_at)
  def stats
    @mutex.synchronize do
      {
        adapter: @adapter_name,
        state: @state,
        failure_count: @failure_count,
        success_count: @success_count,
        last_failure: @last_failure_time,
        opened_at: @opened_at
      }
    end
  end

  private

  # Execute block in CLOSED state (normal operation).
  def execute_with_closed_circuit
    result = yield
    on_success
    result
  rescue StandardError => e
    on_failure(e)
    raise
  end

  # Execute block in HALF_OPEN state (testing recovery).
  def execute_with_half_open_circuit
    result = yield
    on_half_open_success
    result
  rescue StandardError => e
    on_half_open_failure(e)
    raise
  end

  # Handle OPEN state (fast fail).
  def handle_open_circuit
    increment_metric("e11y.circuit_breaker.rejected")

    raise CircuitOpenError, "Circuit breaker open for #{@adapter_name} " \
                            "(opened at #{@opened_at}, timeout: #{@timeout_seconds}s)"
  end

  # Check if circuit should transition OPEN → HALF_OPEN.
  def check_state_transition
    return unless @state == STATE_OPEN

    @mutex.synchronize do
      # Re-check under the lock: another thread may have moved the circuit
      # (and cleared the open timestamp) since the unlocked check above.
      next unless @state == STATE_OPEN && @opened_at_monotonic

      # Elapsed time uses the monotonic clock so wall-clock adjustments
      # cannot shorten or prolong the open period.
      transition_to_half_open if monotonic_now - @opened_at_monotonic >= @timeout_seconds
    end
  end

  # Handle successful execution in CLOSED state.
  def on_success
    @mutex.synchronize do
      @failure_count = 0
      @success_count += 1
    end

    increment_metric("e11y.circuit_breaker.success")
  end

  # Handle failed execution in CLOSED state.
  def on_failure(error)
    @mutex.synchronize do
      @failure_count += 1
      @last_failure_time = Time.now

      # Transition CLOSED → OPEN once the threshold is reached
      transition_to_open if @failure_count >= @failure_threshold
    end

    increment_metric("e11y.circuit_breaker.failure", error: error.class.name)
  end

  # Handle successful execution in HALF_OPEN state.
  def on_half_open_success
    @mutex.synchronize do
      @success_count += 1

      # Transition HALF_OPEN → CLOSED after enough successes. The state guard
      # keeps a late probe success from closing a circuit that a concurrent
      # probe failure has already re-opened.
      transition_to_closed if @state == STATE_HALF_OPEN && @success_count >= @half_open_attempts
    end

    increment_metric("e11y.circuit_breaker.half_open_success")
  end

  # Handle failed execution in HALF_OPEN state.
  def on_half_open_failure(error)
    @mutex.synchronize do
      # Single failure in HALF_OPEN → back to OPEN
      transition_to_open
    end

    increment_metric("e11y.circuit_breaker.half_open_failure", error: error.class.name)
  end

  # Transition to OPEN state. Callers hold @mutex.
  def transition_to_open
    @state = STATE_OPEN
    @opened_at = Time.now
    @opened_at_monotonic = monotonic_now
    @failure_count = 0 # Reset for next cycle
    @success_count = 0

    increment_metric("e11y.circuit_breaker.opened")
  end

  # Transition to HALF_OPEN state. Callers hold @mutex.
  def transition_to_half_open
    @state = STATE_HALF_OPEN
    @success_count = 0 # Reset success counter for testing

    increment_metric("e11y.circuit_breaker.half_opened")
  end

  # Transition to CLOSED state. Callers hold @mutex.
  def transition_to_closed
    @state = STATE_CLOSED
    @failure_count = 0
    @success_count = 0
    @opened_at = nil
    @opened_at_monotonic = nil
    @last_failure_time = nil

    increment_metric("e11y.circuit_breaker.closed")
  end

  # Current monotonic clock reading in seconds.
  #
  # @return [Float]
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Increment circuit breaker metric (currently a no-op placeholder).
  #
  # @param _metric_name [String] Metric name
  # @param _tags [Hash] Additional tags
  def increment_metric(_metric_name, _tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags.merge(adapter: @adapter_name, state: @state))
  end
end
|
|
215
|
+
end
|
|
216
|
+
end
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
require "json"
require "securerandom"
require "time"
|
|
6
|
+
|
|
7
|
+
module E11y
|
|
8
|
+
module Reliability
|
|
9
|
+
module DLQ
|
|
10
|
+
# File-based Dead Letter Queue storage.
|
|
11
|
+
#
|
|
12
|
+
# Stores failed events to a JSONL file for later analysis/replay.
|
|
13
|
+
# Each line is a JSON object representing a failed event with metadata.
|
|
14
|
+
#
|
|
15
|
+
# @example Usage
|
|
16
|
+
# dlq = FileStorage.new(file_path: "log/e11y_dlq.jsonl")
|
|
17
|
+
# dlq.save(event_data, metadata: { error: "Timeout", retry_count: 3 })
|
|
18
|
+
#
|
|
19
|
+
# @see ADR-013 §4 (Dead Letter Queue)
|
|
20
|
+
# @see UC-021 §3 (DLQ File Storage)
|
|
21
|
+
class FileStorage
  # strftime format embedded in rotated file names.
  ROTATION_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

  # Glob fragment matching the timestamp produced by ROTATION_TIME_FORMAT
  # (plus any collision suffix), so cleanup only touches rotated files.
  ROTATED_STAMP_GLOB = "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T*"

  SECONDS_PER_DAY = 86_400

  # @param file_path [String] Path to DLQ file (default: log/e11y_dlq.jsonl)
  # @param max_file_size_mb [Integer] Maximum file size in MB before rotation (default: 100)
  # @param retention_days [Integer] Days to retain rotated DLQ files (default: 30)
  def initialize(file_path: nil, max_file_size_mb: 100, retention_days: 30)
    @file_path = file_path || default_file_path
    @max_file_size_bytes = max_file_size_mb * 1024 * 1024
    @retention_days = retention_days
    @mutex = Mutex.new

    ensure_directory_exists
  end

  # Save failed event to DLQ.
  #
  # Appends one JSON line, then rotates the file if it reached the size limit
  # and removes rotated files older than the retention period.
  #
  # @param event_data [Hash] Event data
  # @param metadata [Hash] Failure metadata (error, retry_count, adapter, etc.).
  #   +:error+ may be an exception or a plain message such as a String.
  # @return [String] Event ID (UUID)
  def save(event_data, metadata: {})
    event_id = SecureRandom.uuid
    failed_at = Time.now.utc.iso8601(3)
    error = metadata[:error]

    dlq_entry = {
      id: event_id,
      timestamp: failed_at,
      event_name: event_data[:event_name],
      event_data: event_data,
      metadata: metadata.merge(
        failed_at: failed_at,
        retry_count: metadata[:retry_count] || 0,
        error_message: error_message_for(error),
        error_class: error_class_for(error)
      )
    }

    write_entry(dlq_entry)
    rotate_if_needed
    cleanup_old_files

    increment_metric("e11y.dlq.saved", event_name: event_data[:event_name])

    event_id
  end

  # List DLQ entries with optional filters.
  #
  # Lines that are not valid JSON objects are skipped instead of ending the
  # listing. +offset+ and +limit+ count entries that pass the filters, so
  # consecutive pages of a filtered listing do not overlap.
  #
  # @param limit [Integer] Maximum entries to return
  # @param offset [Integer] Number of matching entries to skip
  # @param filters [Hash] Filter options (event_name, after, before)
  # @return [Array<Hash>] Array of DLQ entries
  def list(limit: 100, offset: 0, filters: {})
    entries = []

    return entries unless File.exist?(@file_path)

    skipped = 0
    File.foreach(@file_path) do |line|
      break if entries.size >= limit

      entry = parse_line(line)
      next unless entry && matches_filters?(entry, filters)

      if skipped < offset
        skipped += 1
        next
      end

      entries << entry
    end

    entries
  end

  # Get DLQ statistics.
  #
  # The file is read once; unparseable lines are not counted.
  #
  # @return [Hash] Statistics (total_entries, file_size_mb, oldest_entry, newest_entry, file_path)
  def stats
    return default_stats unless File.exist?(@file_path)

    total_entries = 0
    oldest_entry = nil
    newest_entry = nil

    File.foreach(@file_path) do |line|
      entry = parse_line(line)
      next unless entry

      total_entries += 1
      oldest_entry ||= entry[:timestamp]
      newest_entry = entry[:timestamp]
    end

    {
      total_entries: total_entries,
      file_size_mb: (File.size(@file_path) / 1024.0 / 1024.0).round(2),
      oldest_entry: oldest_entry,
      newest_entry: newest_entry,
      file_path: @file_path
    }
  rescue StandardError => e
    increment_metric("e11y.dlq.stats_error", error: e.class.name)
    default_stats
  end

  # Replay single event from DLQ.
  #
  # NOTE: re-dispatch is not implemented yet (see TODO); at present this only
  # confirms the entry exists in the current DLQ file.
  #
  # @param event_id [String] Event ID to replay
  # @return [Boolean] true if the entry was found, false otherwise
  def replay(event_id)
    entry = find_entry(event_id)
    return false unless entry

    # Re-dispatch event through E11y pipeline
    # TODO: Implement E11y::Pipeline.dispatch
    # E11y::Pipeline.dispatch(entry[:event_data], metadata: entry[:metadata].merge(replayed: true))

    # For now, just mark as replayed
    increment_metric("e11y.dlq.replayed", event_name: entry[:event_name])
    true
  rescue StandardError => e
    increment_metric("e11y.dlq.replay_failed", error: e.class.name)
    false
  end

  # Replay batch of events from DLQ.
  #
  # @param event_ids [Array<String>] Event IDs to replay
  # @return [Hash] Result summary (success_count, failure_count)
  def replay_batch(event_ids)
    replayed, failed = event_ids.partition { |event_id| replay(event_id) }

    { success_count: replayed.size, failure_count: failed.size }
  end

  # Delete entry from DLQ.
  #
  # Note: This is a simplified implementation.
  # In production, consider using a database or append-only log with tombstones.
  #
  # @param event_id [String] Event ID to delete
  # @return [Boolean] true if deleted (currently always false: not implemented)
  def delete(_event_id)
    # TODO: Implement deletion (requires rewriting file)
    # For JSONL, deletion is expensive (requires full file rewrite)
    # Consider marking as deleted instead or using database
    false
  end

  private

  # Get default file path (log/e11y_dlq.jsonl), under Rails.root when available.
  def default_file_path
    if defined?(Rails) && Rails.respond_to?(:root) && Rails.root
      Rails.root.join("log", "e11y_dlq.jsonl").to_s
    else
      File.join("log", "e11y_dlq.jsonl")
    end
  end

  # Ensure log directory exists.
  def ensure_directory_exists
    FileUtils.mkdir_p(File.dirname(@file_path))
  end

  # Message to record for +error+: an exception-like object's message, or the
  # string form of anything else (nil stays nil).
  def error_message_for(error)
    error.respond_to?(:message) ? error.message : error&.to_s
  end

  # Class name to record for +error+; nil for plain messages and nil.
  def error_class_for(error)
    error.class.name if error.respond_to?(:message)
  end

  # Write DLQ entry to file (thread-safe).
  def write_entry(entry)
    @mutex.synchronize do
      File.open(@file_path, "a") do |f|
        f.flock(File::LOCK_EX)
        f.puts(JSON.generate(entry))
      end
    end
  end

  # Parse one JSONL line.
  #
  # @return [Hash, nil] the entry, or nil when the line is not a JSON object
  def parse_line(line)
    entry = JSON.parse(line, symbolize_names: true)
    entry if entry.is_a?(Hash)
  rescue JSON::ParserError => e
    # Record the parsing error but don't crash
    increment_metric("e11y.dlq.parse_error", error: e.class.name)
    nil
  end

  # Whether +entry+ passes the event_name / after / before filters.
  # With a time filter set, entries lacking a parseable timestamp do not match.
  def matches_filters?(entry, filters)
    return false if filters[:event_name] && entry[:event_name] != filters[:event_name]
    return true unless filters[:after] || filters[:before]

    recorded_at = parse_timestamp(entry[:timestamp])
    return false unless recorded_at
    return false if filters[:after] && recorded_at < filters[:after]
    return false if filters[:before] && recorded_at > filters[:before]

    true
  end

  # @return [Time, nil] parsed timestamp, or nil when missing/unparseable
  def parse_timestamp(value)
    Time.parse(value)
  rescue ArgumentError, TypeError
    nil
  end

  # Split the DLQ path into [path without extension, extension].
  def path_stem_and_extension
    path = @file_path.to_s
    extension = File.extname(path)
    [path.delete_suffix(extension), extension]
  end

  # Rotate file if size reached max_file_size.
  #
  # The size check runs under the lock so two threads cannot both decide to
  # rotate the same file.
  def rotate_if_needed
    @mutex.synchronize do
      next unless File.exist?(@file_path)
      next if File.size(@file_path) < @max_file_size_bytes

      # Rotate: log/e11y_dlq.jsonl → log/e11y_dlq.2026-01-20T12:34:56Z.jsonl
      rotated_path = next_rotated_path
      FileUtils.mv(@file_path, rotated_path)

      increment_metric("e11y.dlq.rotated", new_file: rotated_path)
    end
  rescue Errno::ENOENT
    # The file vanished between the check and the move (e.g. rotated by
    # another process); nothing left to rotate.
    nil
  end

  # Build a rotated file name that does not exist yet. A numeric suffix is
  # added when a file for the same second is already present, so an earlier
  # rotation is never overwritten.
  def next_rotated_path
    stem, extension = path_stem_and_extension
    stamp = Time.now.utc.strftime(ROTATION_TIME_FORMAT)

    candidate = "#{stem}.#{stamp}#{extension}"
    attempt = 0
    while File.exist?(candidate)
      attempt += 1
      candidate = "#{stem}.#{stamp}.#{attempt}#{extension}"
    end

    candidate
  end

  # Cleanup rotated files older than the retention period.
  def cleanup_old_files
    stem, extension = path_stem_and_extension
    cutoff = Time.now - (@retention_days * SECONDS_PER_DAY)

    # Only timestamp-named siblings are considered, never the live file
    Dir.glob("#{stem}.#{ROTATED_STAMP_GLOB}#{extension}").each do |file|
      next unless File.file?(file)
      next unless File.mtime(file) < cutoff

      File.delete(file)
      increment_metric("e11y.dlq.cleaned_up", file: file)
    rescue Errno::ENOENT
      # Already removed by someone else
      nil
    end
  end

  # Find DLQ entry by ID, skipping unparseable lines.
  def find_entry(event_id)
    return nil unless File.exist?(@file_path)

    File.foreach(@file_path) do |line|
      entry = parse_line(line)
      return entry if entry && entry[:id] == event_id
    end

    nil
  end

  # Default stats when file doesn't exist (or stats could not be computed).
  def default_stats
    {
      total_entries: 0,
      file_size_mb: 0.0,
      oldest_entry: nil,
      newest_entry: nil,
      file_path: @file_path
    }
  end

  # Increment DLQ metric (currently a no-op placeholder).
  def increment_metric(_metric_name, _tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags)
  end
end
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
end
|