e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/concern"
4
+
5
+ module E11y
6
+ module Instruments
7
+ # ActiveJob integration for job-scoped context and trace propagation.
8
+ #
9
+ # Provides callbacks to:
10
+ # 1. Inject trace context when job is enqueued (before_enqueue)
11
+ # 2. Set up job-scoped context when job executes (around_perform)
12
+ #
13
+ # @example Setup (automatic via Railtie)
14
+ # class ApplicationJob < ActiveJob::Base
15
+ # include E11y::Instruments::ActiveJob::Callbacks
16
+ # end
17
+ #
18
+ # @see ADR-008 §10 (ActiveJob Integration)
19
+ module ActiveJob
20
+ # Callbacks module to be included into ActiveJob classes.
21
+ # Provides before_enqueue and around_perform callbacks for trace propagation.
22
+ module Callbacks
23
+ extend ActiveSupport::Concern
24
+
25
included do
  # Record the enqueuing context on the job so the executing job can link
  # back to it (C17 Hybrid Tracing). Values are only written when present.
  before_enqueue do |job|
    job.e11y_parent_trace_id = E11y::Current.trace_id if E11y::Current.trace_id
    job.e11y_parent_span_id = E11y::Current.span_id if E11y::Current.span_id
  end

  # Wrap job execution with job-scoped context, buffering and SLO tracking
  # (C17 Hybrid Tracing + C18 Non-Failing).
  around_perform do |job, block|
    # Initialise the SLO state before anything that can raise, so the
    # `ensure` clause always has a real start time and status to report.
    start_time = Time.now
    job_status = :success

    # C18: Disable fail_on_error for jobs (observability should not block business logic).
    # NOTE(review): this toggles a setting read from E11y.config; if that object is
    # shared between concurrently running jobs the save/restore pairs can interleave.
    original_fail_on_error = E11y.config.error_handling.fail_on_error
    E11y.config.error_handling.fail_on_error = false

    setup_job_context_active_job(job)
    setup_job_buffer_active_job

    # Execute job (business logic)
    block.call
  rescue StandardError => e
    job_status = :failed
    handle_job_error_active_job(e)

    raise # Always re-raise the original exception
  ensure
    track_job_slo_active_job(job, job_status, start_time)

    cleanup_job_context_active_job

    # Restore the setting captured at the top of the callback
    E11y.config.error_handling.fail_on_error = original_fail_on_error
  end
end
65
+
66
+ private
67
+
68
# Establishes the tracing context for a job that is about to run (C17 Hybrid Tracing).
#
# A fresh trace/span pair is generated for the job. The trace id stored on the
# job at enqueue time is kept only as the parent link, and the job id doubles
# as the request id.
#
# @param job [#e11y_parent_trace_id, #job_id] job being executed
# @return [void]
def setup_job_context_active_job(job)
  inherited_trace_id = job.e11y_parent_trace_id

  context = E11y::Current
  context.trace_id = generate_trace_id
  context.span_id = generate_span_id
  context.parent_trace_id = inherited_trace_id
  context.request_id = job.job_id
end
83
+
84
# Starts the request-scoped buffer for the job when buffering is enabled.
#
# C18: any error raised while starting the buffer is reported via `warn`
# and swallowed so it cannot fail the job.
#
# @return [Object, nil] result of `start!`, or nil when disabled or on error
def setup_job_buffer_active_job
  buffer_config = E11y.config.request_buffer
  E11y::Buffers::RequestScopedBuffer.start! if buffer_config&.enabled
rescue StandardError => buffer_error
  warn "[E11y] Failed to start job buffer: #{buffer_error.message}"
end
93
+
94
# Flushes the request-scoped buffer after the job raised (C18: Non-Failing Event Tracking).
#
# The error itself is not inspected. A failure during the flush is reported
# via `warn` and swallowed.
#
# @param _error [StandardError] exception raised by the job (unused)
# @return [Object, nil] result of `flush_on_error!`, or nil when disabled or on error
def handle_job_error_active_job(_error)
  buffer_config = E11y.config.request_buffer
  E11y::Buffers::RequestScopedBuffer.flush_on_error! if buffer_config&.enabled
rescue StandardError => flush_error
  warn "[E11y] Failed to flush job buffer on error: #{flush_error.message}"
end
103
+
104
# Flushes the job buffer after a successful run and always resets E11y::Current.
#
# Whether the job failed is read from `$!` (the exception currently being
# handled or propagated). `$ERROR_INFO` is only an alias for `$!` once the
# stdlib "English" library has been required, which this file does not do;
# without it `$ERROR_INFO` is an ordinary, always-nil global and the buffer
# would be flushed a second time after a failed job.
#
# C18: errors raised while flushing or resetting are reported via `warn`
# and swallowed so they cannot fail the job.
#
# @return [void]
def cleanup_job_context_active_job
  # Capture before any rescue below can change the current exception.
  job_failed = !$!.nil? # rubocop:disable Style/SpecialGlobalVars

  # Flush buffer on success only (on error it was already flushed by the error handler)
  if !job_failed && E11y.config.request_buffer&.enabled
    begin
      E11y::Buffers::RequestScopedBuffer.flush!
    rescue StandardError => e
      warn "[E11y] Failed to flush job buffer: #{e.message}"
    end
  end

  # Reset context (always, even if the flush failed)
  E11y::Current.reset
rescue StandardError => e
  warn "[E11y] Failed to reset job context: #{e.message}"
end
122
+
123
# Builds a random trace identifier.
#
# @return [String] 32 hexadecimal characters (16 random bytes)
def generate_trace_id
  trace_id_bytes = 16
  SecureRandom.hex(trace_id_bytes)
end
128
+
129
# Builds a random span identifier.
#
# @return [String] 16 hexadecimal characters (8 random bytes)
def generate_span_id
  span_id_bytes = 8
  SecureRandom.hex(span_id_bytes)
end
134
+
135
# Track ActiveJob for SLO metrics (if enabled).
#
# Failures are reported with Kernel#warn, like every other handler in this
# module. The previous `E11y.logger.warn(message, error: ...)` call passes two
# arguments; on a logger with the stdlib `Logger#warn(progname = nil)`
# signature that raises ArgumentError from inside the rescue handler, which
# would escape the caller's `ensure` and fail the job (violating C18).
#
# @param job [ActiveJob::Base] Job instance
# @param status [Symbol] Job status (:success or :failed)
# @param start_time [Time] Job start time
# @return [void]
# @api private
def track_job_slo_active_job(job, status, start_time)
  return unless E11y.config.slo_tracking&.enabled

  duration_ms = ((Time.now - start_time) * 1000).round(2)

  # Deliberate lazy load: the tracker is only needed when SLO tracking is on.
  require "e11y/slo/tracker"
  E11y::SLO::Tracker.track_background_job(
    job_class: job.class.name,
    status: status,
    duration_ms: duration_ms,
    queue: job.queue_name
  )
rescue StandardError => e
  # C18: Don't fail if SLO tracking fails
  warn "[E11y] SLO tracking error: #{e.message}"
end
158
+ end
159
+
160
# Trace-context attributes mixed into job classes (C17 Hybrid Tracing).
#
# Besides the plain accessors, the module hooks into `serialize` and
# `deserialize` so the parent ids written at enqueue time travel inside the
# job payload. Instance variables alone are not part of the serialized job,
# so without these hooks the parent link is lost whenever the job is executed
# from its serialized form (for example by a separate worker process).
module TraceAttributes
  # Parent trace/span recorded when the job was enqueued.
  attr_accessor :e11y_parent_trace_id, :e11y_parent_span_id

  # Deprecated: Jobs should create NEW trace_id (C17)
  # These are kept for backward compatibility but should not be used.
  attr_accessor :e11y_trace_id, :e11y_span_id

  # Adds the parent trace context to the serialized job data.
  # Keys are only emitted for values that are set.
  #
  # @return [Hash] serialized job data, including E11y parent ids when present
  def serialize
    job_data = defined?(super) ? super : {}

    job_data.merge(
      {
        "e11y_parent_trace_id" => e11y_parent_trace_id,
        "e11y_parent_span_id" => e11y_parent_span_id
      }.compact
    )
  end

  # Restores the parent trace context from serialized job data.
  #
  # @param job_data [Hash] serialized job data (string keys)
  # @return [void]
  def deserialize(job_data)
    super if defined?(super)

    self.e11y_parent_trace_id = job_data["e11y_parent_trace_id"]
    self.e11y_parent_span_id = job_data["e11y_parent_span_id"]
  end
end
196
+ end
197
+ end
198
+ end
199
+
200
+ # Extend ActiveJob::Base with trace attributes
201
+ ActiveJob::Base.include(E11y::Instruments::ActiveJob::TraceAttributes) if defined?(ActiveJob::Base)
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Instruments
5
+ # Rails Instrumentation (ActiveSupport::Notifications → E11y)
6
+ #
7
+ # Subscribes to Rails internal events (ActiveSupport::Notifications)
8
+ # and converts them to E11y events for unified observability.
9
+ #
10
+ # **Unidirectional Flow:** ASN → E11y
11
+ #
12
+ # @example Basic usage
13
+ # # Automatically enabled by E11y::Railtie if config.rails_instrumentation.enabled = true
14
+ # E11y::Instruments::RailsInstrumentation.setup!
15
+ #
16
+ # @example Custom event mapping
17
+ # E11y.configure do |config|
18
+ # config.rails_instrumentation do
19
+ # event_class_for 'sql.active_record', MyApp::CustomQueryEvent
20
+ # ignore_event 'cache_read.active_support'
21
+ # end
22
+ # end
23
+ #
24
+ # @see ADR-008 §4.1 (Unidirectional Flow ASN → E11y)
25
+ # @see UC-016 (Rails Logger Migration)
26
+ class RailsInstrumentation
27
+ # Built-in event mappings (ASN pattern → E11y Event class)
28
+ #
29
+ # These are defaults that can be overridden via config.event_class_for
30
+ #
31
+ # @return [Hash<String, Class>] Event mappings
32
+ DEFAULT_RAILS_EVENT_MAPPING = {
33
+ "sql.active_record" => "Events::Rails::Database::Query",
34
+ "process_action.action_controller" => "Events::Rails::Http::Request",
35
+ "render_template.action_view" => "Events::Rails::View::Render",
36
+ "send_file.action_controller" => "Events::Rails::Http::SendFile",
37
+ "redirect_to.action_controller" => "Events::Rails::Http::Redirect",
38
+ "cache_read.active_support" => "Events::Rails::Cache::Read",
39
+ "cache_write.active_support" => "Events::Rails::Cache::Write",
40
+ "cache_delete.active_support" => "Events::Rails::Cache::Delete",
41
+ "enqueue.active_job" => "Events::Rails::Job::Enqueued",
42
+ "enqueue_at.active_job" => "Events::Rails::Job::Scheduled",
43
+ "perform_start.active_job" => "Events::Rails::Job::Started",
44
+ "perform.active_job" => "Events::Rails::Job::Completed"
45
+ }.freeze
46
+
47
# Installs the Rails instrumentation subscribers.
#
# Does nothing unless `config.rails_instrumentation.enabled` is truthy.
# Otherwise every pattern in the effective event mapping that is not on the
# ignore list gets a subscriber.
#
# @return [void]
def self.setup!
  instrumentation = E11y.config.rails_instrumentation
  return unless instrumentation&.enabled

  event_mapping.each_pair do |asn_pattern, e11y_event_class_name|
    subscribe_to_event(asn_pattern, e11y_event_class_name) unless ignored?(asn_pattern)
  end
end
62
+
63
# Registers an ActiveSupport::Notifications subscriber that forwards each
# matching notification to the mapped E11y event class.
#
# The event class is resolved on every notification. When it cannot be
# resolved the notification is skipped. Any StandardError raised while
# converting or tracking is reported via `warn` and swallowed so
# instrumentation never crashes the app.
#
# @param asn_pattern [String] ActiveSupport::Notifications pattern
# @param e11y_event_class_name [String] E11y event class name
# @return [void]
def self.subscribe_to_event(asn_pattern, e11y_event_class_name)
  ActiveSupport::Notifications.subscribe(asn_pattern) do |name, started_at, finished_at, _id, payload|
    # Notification timestamps differ in seconds; events carry milliseconds.
    elapsed_ms = (finished_at - started_at) * 1000

    event_class = resolve_event_class(e11y_event_class_name)
    if event_class
      event_class.track(
        event_name: name,
        duration: elapsed_ms,
        **extract_relevant_payload(payload)
      )
    end
  rescue StandardError => tracking_error
    warn "[E11y] Failed to track Rails event #{name}: #{tracking_error.message}"
  end
end
87
+
88
# Effective ASN pattern → event class name mapping.
#
# Starts from a copy of DEFAULT_RAILS_EVENT_MAPPING and lets every entry of
# `config.rails_instrumentation.custom_mappings` override or extend it, storing
# the custom class by its `name`. The result is memoized on first use.
#
# @return [Hash<String, String>] Event mappings
def self.event_mapping
  return @event_mapping if @event_mapping

  overrides = E11y.config.rails_instrumentation&.custom_mappings || {}
  defaults = DEFAULT_RAILS_EVENT_MAPPING.dup

  @event_mapping = overrides.each_with_object(defaults) do |(pattern, event_class), merged|
    merged[pattern] = event_class.name
  end
end
103
+
104
# Tells whether an ASN pattern is listed in `config.rails_instrumentation.ignore_events`.
#
# @param pattern [String] ASN event pattern
# @return [Boolean] true when the pattern is on the ignore list; false when
#   it is not or when no list is configured
def self.ignored?(pattern)
  ignored_patterns = E11y.config.rails_instrumentation&.ignore_events
  return false unless ignored_patterns

  ignored_patterns.include?(pattern)
end
111
+
112
# Reduces an ASN payload to a fixed allow-list of keys.
#
# Keys outside the list are dropped; listed keys missing from the payload are
# simply absent from the result. This is a basic implementation - specific
# event classes can override.
#
# @param payload [Hash] ASN event payload
# @return [Hash] Filtered payload
def self.extract_relevant_payload(payload)
  allowed_keys = %i[
    controller action format status
    allocations db_runtime view_runtime
    name sql connection_id
    key hit
    job_class job_id queue
  ]

  payload.slice(*allowed_keys)
end
129
+
130
# Resolve event class from string name.
#
# The name is first resolved exactly as given. If that fails and the name is
# not already rooted at `E11y::` (or `::`), a second lookup is made under the
# `E11y` namespace. The built-in mapping uses names such as
# "Events::Rails::Database::Query", which a plain `constantize` looks up as a
# top-level `::Events` constant.
# NOTE(review): assumes the built-in event classes are defined under
# `E11y::Events` — confirm against lib/e11y/events.
#
# @param class_name [String] Event class name
# @return [Class, nil] Event class or nil if not found
def self.resolve_event_class(class_name)
  class_name.constantize
rescue NameError => e
  name = class_name.to_s

  unless name.empty? || name.start_with?("E11y::", "::")
    begin
      return "E11y::#{name}".constantize
    rescue NameError
      # Fall through and report the original lookup failure.
    end
  end

  warn "[E11y] Event class not found: #{class_name} (#{e.message})"
  nil
end
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ module E11y
4
+ module Instruments
5
+ # Sidekiq integration for job-scoped context and trace propagation.
6
+ #
7
+ # Provides two middleware:
8
+ # 1. ClientMiddleware - Injects trace context when job is enqueued
9
+ # 2. ServerMiddleware - Sets up job-scoped context when job executes
10
+ #
11
+ # @example Setup (automatic via Railtie)
12
+ # Sidekiq.configure_server do |config|
13
+ # config.server_middleware do |chain|
14
+ # chain.add E11y::Instruments::Sidekiq::ServerMiddleware
15
+ # end
16
+ # end
17
+ #
18
+ # Sidekiq.configure_client do |config|
19
+ # config.client_middleware do |chain|
20
+ # chain.add E11y::Instruments::Sidekiq::ClientMiddleware
21
+ # end
22
+ # end
23
+ #
24
+ # @see ADR-008 §9 (Sidekiq Integration)
25
+ module Sidekiq
26
# Client-side middleware: stamps the enqueuing trace context onto the job hash.
#
# **C17 Hybrid Tracing**: the current trace/span are stored as the job's
# *parent* ids; the job generates its own trace id when it runs.
class ClientMiddleware
  # @param _worker_class [Object] unused
  # @param job [Hash] Sidekiq job hash, mutated in place
  # @param _queue [Object] unused
  # @param _redis_pool [Object] unused
  # @return [Object] whatever the rest of the middleware chain returns
  def call(_worker_class, job, _queue, _redis_pool)
    parent_trace_id = E11y::Current.trace_id
    parent_span_id = E11y::Current.span_id

    # Only write keys for values that are actually present.
    job["e11y_parent_trace_id"] = parent_trace_id if parent_trace_id
    job["e11y_parent_span_id"] = parent_span_id if parent_span_id

    yield
  end
end
40
+
41
+ # Server-side middleware: Set up job-scoped context when executing job
42
+ #
43
+ # **C17 Hybrid Tracing**: Creates NEW trace_id for job, but preserves parent link.
44
+ # **C18 Non-Failing**: E11y errors don't fail jobs (observability is secondary to business logic).
45
+ class ServerMiddleware
46
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
# Runs the job inside a job-scoped E11y context.
#
# Sets up trace context and buffer, yields to the job, flushes the buffer on
# error, then always tracks the SLO, cleans up the context and restores the
# `fail_on_error` setting. The job's own exception is always re-raised.
#
# @param _worker [Object] unused
# @param job [Hash] Sidekiq job hash
# @param queue [String] Queue name
# @return [Object] whatever the job (the given block) returns
def call(_worker, job, queue)
  # Initialise the SLO state before anything that can raise, so the `ensure`
  # clause always has a real start time and status to report.
  start_time = Time.now
  job_status = :success

  # C18: Disable fail_on_error for jobs (observability should not block business logic).
  # NOTE(review): this toggles a setting read from E11y.config; if that object is
  # shared between concurrently running jobs the save/restore pairs can interleave.
  original_fail_on_error = E11y.config.error_handling.fail_on_error
  E11y.config.error_handling.fail_on_error = false

  setup_job_context(job)
  setup_job_buffer

  # Execute job (business logic)
  yield
rescue StandardError => e
  job_status = :failed
  handle_job_error(e)

  raise # Always re-raise the original exception
ensure
  track_job_slo(job, queue, job_status, start_time)

  cleanup_job_context

  # Restore the setting captured at the top of the method
  E11y.config.error_handling.fail_on_error = original_fail_on_error
end
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
77
+
78
+ private
79
+
80
# Establishes the tracing context for a Sidekiq job that is about to run
# (C17 Hybrid Tracing).
#
# A fresh trace/span pair is generated for the job. The trace id stored in the
# job hash at enqueue time is kept only as the parent link, and the jid doubles
# as the request id.
#
# @param job [Hash] Sidekiq job hash
# @return [void]
def setup_job_context(job)
  inherited_trace_id = job["e11y_parent_trace_id"]

  context = E11y::Current
  context.trace_id = generate_trace_id
  context.span_id = generate_span_id
  context.parent_trace_id = inherited_trace_id
  context.request_id = job["jid"]
end
95
+
96
# Starts the request-scoped buffer for the job when buffering is enabled.
#
# C18: any error raised while starting the buffer is reported via `warn`
# and swallowed so it cannot fail the job.
#
# @return [Object, nil] result of `start!`, or nil when disabled or on error
def setup_job_buffer
  buffer_config = E11y.config.request_buffer
  E11y::Buffers::RequestScopedBuffer.start! if buffer_config&.enabled
rescue StandardError => buffer_error
  warn "[E11y] Failed to start job buffer: #{buffer_error.message}"
end
105
+
106
# Flushes the request-scoped buffer after the job raised (C18: Non-Failing Event Tracking).
#
# The error itself is not inspected. A failure during the flush is reported
# via `warn` and swallowed.
#
# @param error [StandardError] exception raised by the job (currently unused)
# @return [Object, nil] result of `flush_on_error!`, or nil when disabled or on error
def handle_job_error(error)
  buffer_config = E11y.config.request_buffer
  E11y::Buffers::RequestScopedBuffer.flush_on_error! if buffer_config&.enabled
rescue StandardError => flush_error
  warn "[E11y] Failed to flush job buffer on error: #{flush_error.message}"
end
116
+
117
# Flushes the job buffer after a successful run and always resets E11y::Current.
#
# Whether the job failed is read from `$!` (the exception currently being
# handled or propagated). `$ERROR_INFO` is only an alias for `$!` once the
# stdlib "English" library has been required, which this file does not do;
# without it `$ERROR_INFO` is an ordinary, always-nil global and the buffer
# would be flushed a second time after a failed job.
#
# C18: errors raised while flushing or resetting are reported via `warn`
# and swallowed so they cannot fail the job.
#
# @return [void]
def cleanup_job_context
  # Capture before any rescue below can change the current exception.
  job_failed = !$!.nil? # rubocop:disable Style/SpecialGlobalVars

  # Flush buffer on success only (on error it was already flushed by the error handler)
  if !job_failed && E11y.config.request_buffer&.enabled
    begin
      E11y::Buffers::RequestScopedBuffer.flush!
    rescue StandardError => e
      warn "[E11y] Failed to flush job buffer: #{e.message}"
    end
  end

  # Reset context (always, even if the flush failed)
  E11y::Current.reset
rescue StandardError => e
  warn "[E11y] Failed to reset job context: #{e.message}"
end
135
+
136
# Builds a random trace identifier.
#
# @return [String] 32 hexadecimal characters (16 random bytes)
def generate_trace_id
  trace_id_bytes = 16
  SecureRandom.hex(trace_id_bytes)
end
141
+
142
# Builds a random span identifier.
#
# @return [String] 16 hexadecimal characters (8 random bytes)
def generate_span_id
  span_id_bytes = 8
  SecureRandom.hex(span_id_bytes)
end
147
+
148
# Reports a finished Sidekiq job to the SLO tracker (no-op unless SLO tracking is enabled).
#
# The duration is the wall-clock time since `start_time`, in milliseconds
# rounded to two decimals. C18: any StandardError raised while tracking is
# reported via `warn` and swallowed.
#
# @param job [Hash] Sidekiq job hash
# @param queue [String] Queue name
# @param status [Symbol] Job status (:success or :failed)
# @param start_time [Time] Job start time
# @return [void]
# @api private
def track_job_slo(job, queue, status, start_time)
  slo_config = E11y.config.slo_tracking
  return unless slo_config&.enabled

  elapsed_ms = (Time.now - start_time) * 1000

  # Loaded lazily: only needed once SLO tracking is switched on.
  require "e11y/slo/tracker"
  E11y::SLO::Tracker.track_background_job(
    job_class: job["class"],
    status: status,
    duration_ms: elapsed_ms.round(2),
    queue: queue
  )
rescue StandardError => tracking_error
  warn "[E11y] SLO tracking error: #{tracking_error.message}"
end
172
+ end
173
+ end
174
+ end
175
+ end