e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Skip Railtie if Rails is not available
4
+ return unless defined?(Rails)
5
+
6
+ require "rails/railtie"
7
+
8
+ module E11y
9
# Railtie that wires E11y into a Rails application.
#
# What it registers with Rails:
# - a +before_initialize+ hook that seeds E11y's configuration from Rails
#   (environment, service name, and an +enabled+ flag that is off in the test env)
# - an +after_initialize+ hook that activates the optional instruments
#   (Rails instrumentation, logger bridge, Sidekiq, ActiveJob), each behind its
#   own +enabled+ switch
# - an initializer that inserts E11y::Middleware::Request ahead of Rails::Rack::Logger
# - console helpers, plus a placeholder hook for rake tasks
#
# Everything except the +before_initialize+ hook is skipped when
# +E11y.config.enabled+ is falsy.
#
# @example Custom configuration
#   # config/initializers/e11y.rb
#   E11y.configure do |config|
#     config.service_name = "my-app"
#     config.adapters.register :loki, E11y::Adapters::Loki.new(url: ENV['LOKI_URL'])
#   end
#
# @see ADR-008 §3 (Railtie & Initialization)
class Railtie < Rails::Railtie
  class << self
    # Service name derived from the Rails application class.
    #
    # @return [String] underscored parent module name of the application class
    #   (e.g., "my_app"), or "rails_app" when it cannot be derived
    def derive_service_name
      app_module_name = Rails.application.class.module_parent_name
      app_module_name.underscore
    rescue StandardError
      "rails_app"
    end

    # Loads and activates the ActiveSupport::Notifications → E11y instrumentation.
    #
    # @return [void]
    def setup_rails_instrumentation
      require "e11y/instruments/rails_instrumentation"
      E11y::Instruments::RailsInstrumentation.setup!
    end

    # Loads and activates the Rails.logger bridge.
    #
    # @return [void]
    def setup_logger_bridge
      require "e11y/logger/bridge"
      E11y::Logger::Bridge.setup!
    end

    # Registers E11y's Sidekiq middleware on both the server and the client chain.
    #
    # @return [void]
    def setup_sidekiq
      require "e11y/instruments/sidekiq"

      ::Sidekiq.configure_server do |sidekiq|
        sidekiq.server_middleware { |chain| chain.add E11y::Instruments::Sidekiq::ServerMiddleware }
      end

      ::Sidekiq.configure_client do |sidekiq|
        sidekiq.client_middleware { |chain| chain.add E11y::Instruments::Sidekiq::ClientMiddleware }
      end
    end

    # Mixes E11y's ActiveJob callbacks into the job base classes.
    #
    # @return [void]
    def setup_active_job
      require "e11y/instruments/active_job"

      # ApplicationJob first (only when the host app defines it) ...
      if defined?(::ApplicationJob)
        ::ApplicationJob.include(E11y::Instruments::ActiveJob::Callbacks)
      end

      # ... then ActiveJob::Base, so jobs that bypass ApplicationJob are covered too.
      ::ActiveJob::Base.include(E11y::Instruments::ActiveJob::Callbacks)
    end
  end

  # Before framework initialization: seed E11y's configuration from Rails.
  config.before_initialize do
    E11y.configure do |e11y|
      e11y.environment = Rails.env.to_s
      e11y.service_name = derive_service_name
      e11y.enabled = !Rails.env.test? # Disabled in tests by default
    end
  end

  # After framework initialization: switch on each instrument that is enabled.
  config.after_initialize do
    if E11y.config.enabled
      setup_rails_instrumentation if E11y.config.rails_instrumentation&.enabled
      setup_logger_bridge if E11y.config.logger_bridge&.enabled
      setup_sidekiq if defined?(::Sidekiq) && E11y.config.sidekiq&.enabled
      setup_active_job if defined?(::ActiveJob) && E11y.config.active_job&.enabled
    end
  end

  # Request middleware goes in front of the Rails logger middleware so that
  # trace context exists before Rails logs anything for the request.
  initializer "e11y.middleware" do |app|
    app.middleware.insert_before(Rails::Rack::Logger, E11y::Middleware::Request) if E11y.config.enabled
  end

  # Console helpers
  console do
    if E11y.config.enabled
      require "e11y/console"
      E11y::Console.enable!

      puts "E11y loaded. Try: E11y.stats"
    end
  end

  # Rake task helpers
  rake_tasks do
    next unless E11y.config.enabled

    # TODO: Add rake tasks (e11y:stats, e11y:test_event, etc.)
    # load 'e11y/tasks.rake'
  end
end
138
+ end
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Reliability
5
# Circuit breaker that guards calls to a single adapter.
#
# The breaker moves between three states:
# - CLOSED (healthy): calls run normally. +failure_threshold+ consecutive failures
#   open the circuit; any success resets the failure counter.
# - OPEN (failing): calls are rejected with {CircuitOpenError} without running the block.
# - HALF_OPEN (testing recovery): entered by the first call made at least
#   +timeout_seconds+ after the circuit opened. Calls run again;
#   +half_open_attempts+ successes close the circuit, a single failure re-opens it.
#
# State changes are serialized through an internal Mutex.
#
# @example Usage in adapter
#   circuit_breaker = CircuitBreaker.new(adapter_name: "loki", config: config)
#
#   circuit_breaker.call do
#     # Send event to adapter
#     adapter.send(event)
#   end
#
# @see ADR-013 §5 (Circuit Breaker)
# @see UC-021 §4 (Circuit Breaker for Adapters)
class CircuitBreaker
  # Circuit is closed (healthy) - all requests pass through
  STATE_CLOSED = :closed

  # Circuit is open (failing) - all requests fail fast
  STATE_OPEN = :open

  # Circuit is half-open (testing recovery) - requests run again as recovery probes
  STATE_HALF_OPEN = :half_open

  # Circuit breaker opened error (fast fail)
  class CircuitOpenError < StandardError; end

  # @param adapter_name [String] Name of the adapter (used in error messages and stats)
  # @param config [Hash, nil] Configuration options; Hash keys may be symbols or strings,
  #   and +nil+ means "use all defaults"
  # @option config [Integer] :failure_threshold Number of failures before opening circuit (default: 5)
  # @option config [Integer] :timeout_seconds Seconds before transitioning to half-open (default: 60)
  # @option config [Integer] :half_open_attempts Success attempts needed in half-open to close (default: 2)
  def initialize(adapter_name:, config: {})
    @adapter_name = adapter_name
    @failure_threshold = setting(config, :failure_threshold, 5)
    @timeout_seconds = setting(config, :timeout_seconds, 60)
    @half_open_attempts = setting(config, :half_open_attempts, 2)

    @state = STATE_CLOSED
    @failure_count = 0
    @success_count = 0
    @last_failure_time = nil
    @opened_at = nil
    @mutex = Mutex.new
  end

  # Execute block with circuit breaker protection.
  #
  # @yield Block to execute (adapter send)
  # @return [Object] Result of block execution
  # @raise [CircuitOpenError] if circuit is open
  # @raise [StandardError] whatever the block raises (recorded as a failure, then re-raised)
  def call(&block)
    # Dispatch on the state observed under the lock, not on a second unsynchronized read.
    case check_state_transition
    when STATE_CLOSED
      execute_with_closed_circuit(&block)
    when STATE_OPEN
      handle_open_circuit
    when STATE_HALF_OPEN
      execute_with_half_open_circuit(&block)
    end
  end

  # Check if circuit is healthy (closed state).
  #
  # @return [Boolean] true if circuit is closed
  def healthy?
    @state == STATE_CLOSED
  end

  # Get current circuit breaker statistics.
  #
  # @return [Hash] :adapter, :state, :failure_count, :success_count, :last_failure, :opened_at
  def stats
    {
      adapter: @adapter_name,
      state: @state,
      failure_count: @failure_count,
      success_count: @success_count,
      last_failure: @last_failure_time,
      opened_at: @opened_at
    }
  end

  private

  # Read one option, falling back to +default+ when it is missing (or falsy).
  # String keys are honoured for Hash configs so settings loaded from YAML/JSON
  # are not silently replaced by the defaults.
  def setting(config, key, default)
    return default if config.nil?

    value = config[key]
    value = config[key.to_s] if value.nil? && config.is_a?(Hash)
    value || default
  end

  # Execute block in CLOSED state (normal operation).
  def execute_with_closed_circuit
    result = yield
    on_success
    result
  rescue StandardError => e
    on_failure(e)
    raise
  end

  # Execute block in HALF_OPEN state (testing recovery).
  def execute_with_half_open_circuit
    result = yield
    on_half_open_success
    result
  rescue StandardError => e
    on_half_open_failure(e)
    raise
  end

  # Handle OPEN state (fast fail).
  def handle_open_circuit
    increment_metric("e11y.circuit_breaker.rejected")

    raise CircuitOpenError, "Circuit breaker open for #{@adapter_name} " \
                            "(opened at #{@opened_at}, timeout: #{@timeout_seconds}s)"
  end

  # Apply a pending OPEN → HALF_OPEN transition and report the resulting state.
  #
  # Both the state test and the timeout test happen inside the lock: a thread that
  # saw OPEN just before another thread moved the circuit on must not transition
  # again (that would reset the probe success count) or subtract from an
  # +@opened_at+ that has already been cleared.
  #
  # @return [Symbol] the state the current call should be handled in
  def check_state_transition
    @mutex.synchronize do
      transition_to_half_open if @state == STATE_OPEN && Time.now - @opened_at >= @timeout_seconds
      @state
    end
  end

  # Handle successful execution in CLOSED state.
  def on_success
    @mutex.synchronize do
      @failure_count = 0
      @success_count += 1
    end

    increment_metric("e11y.circuit_breaker.success")
  end

  # Handle failed execution in CLOSED state.
  def on_failure(error)
    @mutex.synchronize do
      @failure_count += 1
      @last_failure_time = Time.now

      # Transition CLOSED → OPEN once the threshold is reached
      transition_to_open if @failure_count >= @failure_threshold
    end

    increment_metric("e11y.circuit_breaker.failure", error: error.class.name)
  end

  # Handle successful execution in HALF_OPEN state.
  def on_half_open_success
    @mutex.synchronize do
      @success_count += 1

      # Transition HALF_OPEN → CLOSED after enough successes
      transition_to_closed if @success_count >= @half_open_attempts
    end

    increment_metric("e11y.circuit_breaker.half_open_success")
  end

  # Handle failed execution in HALF_OPEN state.
  def on_half_open_failure(error)
    @mutex.synchronize do
      # Single failure in HALF_OPEN → back to OPEN
      transition_to_open
    end

    increment_metric("e11y.circuit_breaker.half_open_failure", error: error.class.name)
  end

  # Transition to OPEN state. Callers hold the mutex.
  def transition_to_open
    @state = STATE_OPEN
    @opened_at = Time.now
    @failure_count = 0 # Reset for next cycle
    @success_count = 0

    increment_metric("e11y.circuit_breaker.opened")
  end

  # Transition to HALF_OPEN state. Callers hold the mutex.
  def transition_to_half_open
    @state = STATE_HALF_OPEN
    @success_count = 0 # Reset success counter for testing

    increment_metric("e11y.circuit_breaker.half_opened")
  end

  # Transition to CLOSED state. Callers hold the mutex.
  def transition_to_closed
    @state = STATE_CLOSED
    @failure_count = 0
    @success_count = 0
    @opened_at = nil
    @last_failure_time = nil

    increment_metric("e11y.circuit_breaker.closed")
  end

  # Increment circuit breaker metric (currently a no-op placeholder).
  #
  # @param metric_name [String] Metric name
  # @param tags [Hash] Additional tags
  def increment_metric(metric_name, tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags.merge(adapter: @adapter_name, state: @state))
  end
end
215
+ end
216
+ end
@@ -0,0 +1,277 @@
1
+ # frozen_string_literal: true
2
+
3
require "json"
require "fileutils"
require "securerandom"
require "time"
6
+
7
+ module E11y
8
+ module Reliability
9
+ module DLQ
10
# File-based Dead Letter Queue storage.
#
# Stores failed events to a JSONL file for later analysis/replay.
# Each line is a JSON object representing a failed event with metadata.
# When the file reaches the configured size it is renamed to
# "<name>.<UTC timestamp><ext>", and rotated files older than the retention
# period are deleted.
#
# Lines that are blank, are not valid JSON, or are not JSON objects are skipped
# when reading, so one damaged line does not hide the rest of the queue.
#
# @example Usage
#   dlq = FileStorage.new(file_path: "log/e11y_dlq.jsonl")
#   dlq.save(event_data, metadata: { error: "Timeout", retry_count: 3 })
#
# @see ADR-013 §4 (Dead Letter Queue)
# @see UC-021 §3 (DLQ File Storage)
class FileStorage
  # strftime format of the timestamp embedded in rotated file names.
  ROTATION_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
  # Matches a timestamp produced by ROTATION_TIMESTAMP_FORMAT.
  ROTATION_TIMESTAMP_PATTERN = /\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z/
  private_constant :ROTATION_TIMESTAMP_FORMAT, :ROTATION_TIMESTAMP_PATTERN

  # @param file_path [String] Path to DLQ file (default: log/e11y_dlq.jsonl)
  # @param max_file_size_mb [Integer] Maximum file size in MB before rotation (default: 100)
  # @param retention_days [Integer] Days to retain DLQ files (default: 30)
  def initialize(file_path: nil, max_file_size_mb: 100, retention_days: 30)
    @file_path = (file_path || default_file_path).to_s
    @max_file_size_bytes = max_file_size_mb * 1024 * 1024
    @retention_days = retention_days
    @mutex = Mutex.new

    ensure_directory_exists
  end

  # Save failed event to DLQ.
  #
  # @param event_data [Hash] Event data (+:event_name+ or "event_name" is used as the entry name)
  # @param metadata [Hash] Failure metadata (error, retry_count, adapter, etc.).
  #   +:error+ may be an exception or a plain message string.
  # @return [String] Event ID (UUID)
  def save(event_data, metadata: {})
    event_id = SecureRandom.uuid
    failed_at = Time.now.utc.iso8601(3)
    event_name = event_data[:event_name] || event_data["event_name"]
    error = metadata[:error]

    dlq_entry = {
      id: event_id,
      timestamp: failed_at,
      event_name: event_name,
      event_data: event_data,
      metadata: metadata.merge(
        failed_at: failed_at,
        retry_count: metadata[:retry_count] || 0,
        error_message: error_message_for(error),
        error_class: error_class_for(error)
      )
    }

    write_entry(dlq_entry)
    rotate_if_needed
    cleanup_old_files

    increment_metric("e11y.dlq.saved", event_name: event_name)

    event_id
  end

  # List DLQ entries with optional filters.
  #
  # +offset+ and +limit+ count entries that pass the filters, so paging through a
  # filtered listing neither skips nor repeats entries.
  #
  # @param limit [Integer] Maximum entries to return
  # @param offset [Integer] Number of matching entries to skip
  # @param filters [Hash] Filter options (event_name, after, before)
  # @return [Array<Hash>] Array of DLQ entries (symbolized keys)
  def list(limit: 100, offset: 0, filters: {})
    entries = []

    return entries unless File.exist?(@file_path)
    return entries unless limit.positive?

    skipped = 0
    each_entry do |entry|
      next unless matches_filters?(entry, filters)

      if skipped < offset
        skipped += 1
        next
      end

      entries << entry
      break if entries.size >= limit
    end

    entries
  end

  # Get DLQ statistics.
  #
  # +total_entries+ is the number of lines in the current file; the oldest/newest
  # timestamps come from the first and last readable entries.
  #
  # @return [Hash] Statistics (total_entries, file_size_mb, oldest_entry, newest_entry, file_path)
  def stats
    return default_stats unless File.exist?(@file_path)

    file_size_bytes = File.size(@file_path)
    total_entries = 0
    oldest_entry = nil
    newest_entry = nil

    # Single pass: count lines and track first/last readable timestamps.
    File.foreach(@file_path) do |line|
      total_entries += 1
      entry = parse_line(line)
      next unless entry

      oldest_entry ||= entry[:timestamp]
      newest_entry = entry[:timestamp]
    end

    {
      total_entries: total_entries,
      file_size_mb: (file_size_bytes / 1024.0 / 1024.0).round(2),
      oldest_entry: oldest_entry,
      newest_entry: newest_entry,
      file_path: @file_path
    }
  rescue StandardError => e
    increment_metric("e11y.dlq.stats_error", error: e.class.name)
    default_stats
  end

  # Replay single event from DLQ.
  #
  # NOTE: re-dispatch is not implemented yet; this only reports whether the
  # entry exists in the current DLQ file.
  #
  # @param event_id [String] Event ID to replay
  # @return [Boolean] true if the entry was found, false otherwise
  def replay(event_id)
    entry = find_entry(event_id)
    return false unless entry

    # Re-dispatch event through E11y pipeline
    # TODO: Implement E11y::Pipeline.dispatch
    # E11y::Pipeline.dispatch(entry[:event_data], metadata: entry[:metadata].merge(replayed: true))

    # For now, just mark as replayed
    increment_metric("e11y.dlq.replayed", event_name: entry[:event_name])
    true
  rescue StandardError => e
    increment_metric("e11y.dlq.replay_failed", error: e.class.name)
    false
  end

  # Replay batch of events from DLQ.
  #
  # @param event_ids [Array<String>] Event IDs to replay
  # @return [Hash] Result summary (success_count, failure_count)
  def replay_batch(event_ids)
    replayed, failed = event_ids.partition { |event_id| replay(event_id) }

    { success_count: replayed.size, failure_count: failed.size }
  end

  # Delete entry from DLQ.
  #
  # Note: This is a simplified implementation.
  # In production, consider using a database or append-only log with tombstones.
  #
  # @param event_id [String] Event ID to delete
  # @return [Boolean] always false (deletion is not implemented)
  def delete(_event_id)
    # TODO: Implement deletion (requires rewriting file)
    # For JSONL, deletion is expensive (requires full file rewrite)
    # Consider marking as deleted instead or using database
    false
  end

  private

  # Get default file path (log/e11y_dlq.jsonl).
  def default_file_path
    if defined?(Rails) && Rails.root
      Rails.root.join("log", "e11y_dlq.jsonl").to_s
    else
      File.join("log", "e11y_dlq.jsonl")
    end
  end

  # Ensure log directory exists.
  def ensure_directory_exists
    FileUtils.mkdir_p(File.dirname(@file_path))
  end

  # Message to record for +metadata[:error]+, which may be an exception,
  # a plain string, or absent.
  def error_message_for(error)
    return nil if error.nil?

    error.respond_to?(:message) ? error.message : error.to_s
  end

  # Class name to record for an exception-like error; nil for plain messages.
  def error_class_for(error)
    error.class.name if error.respond_to?(:message)
  end

  # Write DLQ entry to file (mutex within the process, flock across processes).
  def write_entry(entry)
    @mutex.synchronize do
      File.open(@file_path, "a") do |f|
        f.flock(File::LOCK_EX)
        f.puts(JSON.generate(entry))
      end
    end
  end

  # Yield every readable entry of the current DLQ file, in file order.
  def each_entry
    File.foreach(@file_path) do |line|
      entry = parse_line(line)
      yield entry if entry
    end
  end

  # Parse one JSONL line.
  #
  # @return [Hash, nil] the entry, or nil for blank, malformed or non-object lines
  def parse_line(line)
    return nil if line.strip.empty?

    entry = JSON.parse(line, symbolize_names: true)
    entry if entry.is_a?(Hash)
  rescue JSON::ParserError => e
    increment_metric("e11y.dlq.parse_error", error: e.class.name)
    nil
  end

  # Whether +entry+ passes the :event_name / :after / :before filters.
  # An entry without a parseable timestamp never passes a time filter.
  def matches_filters?(entry, filters)
    return false if filters[:event_name] && entry[:event_name] != filters[:event_name]
    return true unless filters[:after] || filters[:before]

    recorded_at = entry_time(entry)
    return false unless recorded_at
    return false if filters[:after] && recorded_at < filters[:after]
    return false if filters[:before] && recorded_at > filters[:before]

    true
  end

  # Entry timestamp as a Time, or nil when it is missing or unparseable.
  def entry_time(entry)
    Time.parse(entry[:timestamp])
  rescue ArgumentError, TypeError
    nil
  end

  # Rotate file if size has reached max_file_size.
  #
  # The size check runs under the same lock as the rename so two threads cannot
  # both decide to rotate the same file.
  def rotate_if_needed
    @mutex.synchronize do
      next unless File.exist?(@file_path)
      next if File.size(@file_path) < @max_file_size_bytes

      # Rotate: log/e11y_dlq.jsonl → log/e11y_dlq.2026-01-20T12:34:56Z.jsonl
      rotated_path = rotated_path_for(Time.now.utc)
      # The timestamp has one-second resolution: never rename onto an earlier
      # rotation. The file is rotated by a later save instead.
      next if File.exist?(rotated_path)

      FileUtils.mv(@file_path, rotated_path)

      increment_metric("e11y.dlq.rotated", new_file: rotated_path)
    end
  rescue Errno::ENOENT
    # The file disappeared between the checks and the rename (e.g. another
    # process rotated it first); nothing left to rotate.
    nil
  end

  # Rotated name for the DLQ file: the timestamp goes in front of the extension,
  # whatever that extension is (or at the end when there is none).
  def rotated_path_for(time)
    ext = File.extname(@file_path)
    stem = @file_path.delete_suffix(ext)
    "#{stem}.#{time.strftime(ROTATION_TIMESTAMP_FORMAT)}#{ext}"
  end

  # Delete rotated files older than the retention period.
  #
  # Only names of the exact shape produced by rotation are considered, so other
  # files that happen to share the prefix are left alone.
  def cleanup_old_files
    dir = File.dirname(@file_path)
    ext = File.extname(@file_path)
    base_name = File.basename(@file_path, ext)
    rotated_name = /\A#{Regexp.escape(base_name)}\.#{ROTATION_TIMESTAMP_PATTERN}#{Regexp.escape(ext)}\z/

    Dir.each_child(dir) do |name|
      next unless rotated_name.match?(name)

      file = File.join(dir, name)
      next unless File.file?(file)

      file_age_days = (Time.now - File.mtime(file)) / 86_400
      next unless file_age_days > @retention_days

      File.delete(file)
      increment_metric("e11y.dlq.cleaned_up", file: file)
    end
  rescue Errno::ENOENT
    # A file or the directory vanished while scanning; cleanup is best-effort.
    nil
  end

  # Find DLQ entry by ID.
  def find_entry(event_id)
    return nil unless File.exist?(@file_path)

    each_entry { |entry| return entry if entry[:id] == event_id }

    nil
  end

  # Default stats when file doesn't exist.
  def default_stats
    {
      total_entries: 0,
      file_size_mb: 0.0,
      oldest_entry: nil,
      newest_entry: nil,
      file_path: @file_path
    }
  end

  # Increment DLQ metric (currently a no-op placeholder).
  def increment_metric(metric_name, tags = {})
    # TODO: Integrate with Yabeda metrics
    # E11y::Metrics.increment(metric_name, tags)
  end
end
275
+ end
276
+ end
277
+ end