rails_error_dashboard 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +22 -0
  3. data/app/controllers/rails_error_dashboard/application_controller.rb +5 -0
  4. data/app/controllers/rails_error_dashboard/errors_controller.rb +12 -0
  5. data/app/jobs/rails_error_dashboard/storm_flush_job.rb +19 -0
  6. data/app/jobs/rails_error_dashboard/storm_notification_job.rb +74 -0
  7. data/app/models/rails_error_dashboard/storm_event.rb +34 -0
  8. data/app/views/layouts/rails_error_dashboard.html.erb +21 -0
  9. data/app/views/rails_error_dashboard/errors/storms.html.erb +91 -0
  10. data/config/routes.rb +1 -0
  11. data/db/migrate/20260306000002_add_instance_variables_to_error_logs.rb +7 -1
  12. data/db/migrate/20260306000003_create_rails_error_dashboard_swallowed_exceptions.rb +4 -0
  13. data/db/migrate/20260307000001_create_rails_error_dashboard_diagnostic_dumps.rb +4 -0
  14. data/db/migrate/20260613000001_create_storm_events.rb +28 -0
  15. data/lib/generators/rails_error_dashboard/install/templates/initializer.rb +36 -0
  16. data/lib/rails_error_dashboard/commands/flush_storm_counts.rb +188 -0
  17. data/lib/rails_error_dashboard/commands/log_error.rb +70 -12
  18. data/lib/rails_error_dashboard/configuration.rb +60 -0
  19. data/lib/rails_error_dashboard/queries/storm_history.rb +39 -0
  20. data/lib/rails_error_dashboard/services/storm_protection/circuit_breaker.rb +195 -0
  21. data/lib/rails_error_dashboard/services/storm_protection/count_buffer.rb +100 -0
  22. data/lib/rails_error_dashboard/services/storm_protection/fingerprint_buckets.rb +123 -0
  23. data/lib/rails_error_dashboard/services/storm_protection/gate.rb +258 -0
  24. data/lib/rails_error_dashboard/subscribers/issue_tracker_subscriber.rb +12 -0
  25. data/lib/rails_error_dashboard/version.rb +1 -1
  26. data/lib/rails_error_dashboard.rb +6 -0
  27. metadata +13 -2
@@ -6,15 +6,59 @@ module RailsErrorDashboard
6
6
  # This is a write operation that creates an ErrorLog record
7
7
  class LogError
8
8
  def self.call(exception, context = {})
9
- # Check if async logging is enabled
9
+ # Filter FIRST (ignore list + static sampling) so ignored exceptions
10
+ # never count toward storm state. _pre_filtered prevents the sync path
11
+ # from re-rolling the sampling dice (rate would square otherwise).
12
+ # The filter + gate run inside this method's rescue: nothing in the
13
+ # capture path may ever raise into the host app.
14
+ begin
15
+ unless Services::ExceptionFilter.should_log?(exception)
16
+ # Preserve the OTel contract: filtered captures still emit a span
17
+ # tagged filtered=true (no-op when OTel export is disabled).
18
+ Integrations::Tracer.in_span(
19
+ "capture_error",
20
+ kind: :capture,
21
+ attributes: build_capture_span_attributes(exception, was_async: false)
22
+ ) do |span|
23
+ span&.set_attribute("rails_error_dashboard.filtered", true)
24
+ end
25
+ return nil
26
+ end
27
+ context = context.merge(_pre_filtered: true)
28
+
29
+ # Storm protection gate — BEFORE the async branch, because with
30
+ # SolidQueue the enqueue itself is a DB write. :count_only events are
31
+ # tallied in memory and reconciled by StormFlushJob; nothing else
32
+ # happens for them (that's the point).
33
+ storm_decision = Services::StormProtection::Gate.admit!(exception, context)
34
+ return nil if storm_decision == :count_only
35
+ context = context.merge(_storm_decision: storm_decision) if storm_decision == :lite
36
+ rescue => e
37
+ RailsErrorDashboard::Logger.error(
38
+ "[RailsErrorDashboard] Capture pre-checks failed: #{e.class} - #{e.message}"
39
+ )
40
+ # Fall through and attempt full capture — fail open, never raise
41
+ end
42
+
10
43
  if RailsErrorDashboard.configuration.async_logging
11
44
  # For async logging, just enqueue the job
12
- # All filtering happens when the job runs
13
45
  call_async(exception, context)
14
46
  else
15
47
  # For sync logging, execute immediately
16
48
  new(exception, context).call
17
49
  end
50
+ rescue => e
51
+ RailsErrorDashboard::Logger.error(
52
+ "[RailsErrorDashboard] LogError.call failed: #{e.class} - #{e.message}"
53
+ )
54
+ nil
55
+ end
56
+
57
# Whether this capture was admitted as :lite by the storm gate.
#
# :lite captures shed context (breadcrumbs/health/locals/ivars) — the storm
# shedding ladder's first economy. The async job round-trips context through
# the queue serializer, so the marker may come back under a symbol OR a
# string key, and its value as a Symbol OR a String; all four combinations
# are accepted.
#
# @param context [Hash, nil] capture context (nil is treated as "not lite")
# @return [Boolean] true only when the stored decision is "lite"
def self.storm_lite?(context)
  return false if context.nil?

  decision = context[:_storm_decision]
  # Fall back to the string key for contexts whose keys were stringified.
  decision = context["_storm_decision"] if decision.nil?
  decision.to_s == "lite"
end
19
63
 
20
64
  # Build the base OTel span attributes available before any work happens.
@@ -42,18 +86,22 @@ module RailsErrorDashboard
42
86
  cause_chain: serialize_cause_chain(exception)
43
87
  }
44
88
 
89
+ # Storm shedding: :lite captures skip ALL pre-enqueue context harvest —
90
+ # this is request-thread CPU, the most valuable thing to shed.
91
+ lite = storm_lite?(context)
92
+
45
93
  # Harvest breadcrumbs NOW (before job dispatch — different thread won't have them)
46
- if RailsErrorDashboard.configuration.enable_breadcrumbs
94
+ if !lite && RailsErrorDashboard.configuration.enable_breadcrumbs
47
95
  context = context.merge(_serialized_breadcrumbs: Services::BreadcrumbCollector.harvest)
48
96
  end
49
97
 
50
98
  # Capture system health NOW (metrics are time-sensitive, different thread = different state)
51
- if RailsErrorDashboard.configuration.enable_system_health
99
+ if !lite && RailsErrorDashboard.configuration.enable_system_health
52
100
  context = context.merge(_serialized_system_health: Services::SystemHealthSnapshot.capture)
53
101
  end
54
102
 
55
103
  # Capture local variables NOW (TracePoint attaches to exception, must extract before job dispatch)
56
- if RailsErrorDashboard.configuration.enable_local_variables
104
+ if !lite && RailsErrorDashboard.configuration.enable_local_variables
57
105
  begin
58
106
  raw_locals = Services::LocalVariableCapturer.extract(exception)
59
107
  if raw_locals.is_a?(Hash) && raw_locals.any?
@@ -65,7 +113,7 @@ module RailsErrorDashboard
65
113
  end
66
114
 
67
115
  # Capture instance variables NOW (same reason — attached to exception object)
68
- if RailsErrorDashboard.configuration.enable_instance_variables
116
+ if !lite && RailsErrorDashboard.configuration.enable_instance_variables
69
117
  begin
70
118
  raw_ivars = Services::LocalVariableCapturer.extract_instance_vars(exception)
71
119
  if raw_ivars.is_a?(Hash) && raw_ivars.any?
@@ -157,12 +205,19 @@ module RailsErrorDashboard
157
205
  kind: :capture,
158
206
  attributes: self.class.build_capture_span_attributes(@exception, was_async: false)
159
207
  ) do |span|
160
- # Check if this exception should be logged (ignore list + sampling)
161
- if !Services::ExceptionFilter.should_log?(@exception)
208
+ # Check if this exception should be logged (ignore list + sampling).
209
+ # Skipped when self.call already filtered (re-rolling the sampling
210
+ # dice here would square the effective rate).
211
+ if !@context[:_pre_filtered] && !Services::ExceptionFilter.should_log?(@exception)
162
212
  span&.set_attribute("rails_error_dashboard.filtered", true)
163
213
  next nil
164
214
  end
165
215
 
216
+ # Storm shedding: :lite captures keep the error + occurrence row but
217
+ # shed context payloads (breadcrumbs/health/locals/ivars).
218
+ storm_lite = self.class.storm_lite?(@context)
219
+ span&.set_attribute("rails_error_dashboard.storm_degraded", true) if storm_lite
220
+
166
221
  error_context = ValueObjects::ErrorContext.new(@context, @context[:source])
167
222
 
168
223
  # Find or create application (cached lookup)
@@ -239,7 +294,7 @@ module RailsErrorDashboard
239
294
  attributes = Services::SensitiveDataFilter.filter_attributes(attributes)
240
295
 
241
296
  # Harvest breadcrumbs (if enabled and column exists)
242
- if ErrorLog.column_names.include?("breadcrumbs") && RailsErrorDashboard.configuration.enable_breadcrumbs
297
+ if !storm_lite && ErrorLog.column_names.include?("breadcrumbs") && RailsErrorDashboard.configuration.enable_breadcrumbs
243
298
  # Sync path: harvest from current thread
244
299
  raw_breadcrumbs = Services::BreadcrumbCollector.harvest
245
300
 
@@ -256,13 +311,13 @@ module RailsErrorDashboard
256
311
  end
257
312
 
258
313
  # Capture system health snapshot (if enabled and column exists)
259
- if ErrorLog.column_names.include?("system_health") && RailsErrorDashboard.configuration.enable_system_health
314
+ if !storm_lite && ErrorLog.column_names.include?("system_health") && RailsErrorDashboard.configuration.enable_system_health
260
315
  health_data = @context[:_serialized_system_health] || Services::SystemHealthSnapshot.capture
261
316
  attributes[:system_health] = health_data.to_json
262
317
  end
263
318
 
264
319
  # Capture local variables (if enabled and column exists)
265
- if ErrorLog.column_names.include?("local_variables") && RailsErrorDashboard.configuration.enable_local_variables
320
+ if !storm_lite && ErrorLog.column_names.include?("local_variables") && RailsErrorDashboard.configuration.enable_local_variables
266
321
  begin
267
322
  # Sync path: extract from exception ivar
268
323
  raw_locals = Services::LocalVariableCapturer.extract(@exception)
@@ -278,7 +333,7 @@ module RailsErrorDashboard
278
333
  end
279
334
 
280
335
  # Capture instance variables (if enabled and column exists)
281
- if ErrorLog.column_names.include?("instance_variables") && RailsErrorDashboard.configuration.enable_instance_variables
336
+ if !storm_lite && ErrorLog.column_names.include?("instance_variables") && RailsErrorDashboard.configuration.enable_instance_variables
282
337
  begin
283
338
  # Sync path: extract from exception ivar
284
339
  raw_ivars = Services::LocalVariableCapturer.extract_instance_vars(@exception)
@@ -364,8 +419,11 @@ module RailsErrorDashboard
364
419
 
365
420
  # Dispatch notification if error is not muted and the throttle check passes.
366
421
  # Muted errors skip notifications but still fire plugin events/callbacks.
422
+ # During a storm (breaker not closed) per-error notifications are
423
+ # suppressed — a single storm notification replaces them.
367
424
  def maybe_notify(error_log)
368
425
  return if error_log.muted?
426
+ return if Services::StormProtection::Gate.notifications_suppressed?
369
427
  return unless yield
370
428
 
371
429
  Services::ErrorNotificationDispatcher.call(error_log)
@@ -67,6 +67,25 @@ module RailsErrorDashboard
67
67
  # Sampling rate for non-critical errors (0.0 to 1.0, default 1.0 = 100%)
68
68
  attr_accessor :sampling_rate
69
69
 
70
+ # Storm protection — circuit breaker + adaptive sampling for error floods.
71
+ # Protects the HOST APP from the gem's own writes during an error storm
72
+ # (bad deploy throwing thousands of errors/minute). Default ON: this is
73
+ # the feature that makes the gem quieter, so ON is the conservative choice.
74
+ # All thresholds are PER PROCESS (no cross-process coordination by design).
75
+ attr_accessor :enable_storm_protection # Master switch (default: true)
76
+ attr_accessor :storm_fingerprint_full_per_minute # Full-fidelity captures per fingerprint per minute (default: 30)
77
+ attr_accessor :storm_occurrence_sample_keep_every # Past the cap, keep every Nth occurrence (default: 10)
78
+ attr_accessor :storm_shedding_threshold_per_second # Global rate that enters shedding state (default: 10)
79
+ attr_accessor :storm_open_threshold_per_second # Global rate that opens the breaker = count-only (default: 50)
80
+ attr_accessor :storm_cooldown_seconds # Open → half-open probe delay (default: 60)
81
+ attr_accessor :storm_max_tracked_fingerprints # Bounded in-memory map size; beyond = overflow bucket (default: 1000)
82
+ attr_accessor :storm_flush_interval_seconds # Count-buffer flush cadence (default: 30)
83
+ attr_accessor :storm_notification # Single "storm in progress" notification per episode (default: true)
84
+ attr_accessor :auto_issue_rate_limit_count # Max auto-created issues per window — applies always (default: 5)
85
+ attr_accessor :auto_issue_rate_limit_window_minutes # Window for the above (default: 10)
86
+ attr_accessor :context_sampling_threshold_per_day # Full-context captures per fingerprint per day before sampling (default: 25)
87
+ attr_accessor :context_sampling_keep_every # After threshold, keep full context every Nth (default: 10)
88
+
70
89
  # Async logging configuration
71
90
  attr_accessor :async_logging
72
91
  attr_accessor :async_adapter # :sidekiq, :solid_queue, or :async
@@ -268,6 +287,21 @@ module RailsErrorDashboard
268
287
  @ignored_exceptions = []
269
288
  @custom_fingerprint = nil # Lambda: ->(exception, context) { "custom_key" }
270
289
  @sampling_rate = 1.0 # 100% by default
290
+
291
+ # Storm protection defaults (thresholds tuned via chaos Phase G — see ROADMAP)
292
+ @enable_storm_protection = true
293
+ @storm_fingerprint_full_per_minute = 30
294
+ @storm_occurrence_sample_keep_every = 10
295
+ @storm_shedding_threshold_per_second = 10
296
+ @storm_open_threshold_per_second = 50
297
+ @storm_cooldown_seconds = 60
298
+ @storm_max_tracked_fingerprints = 1000
299
+ @storm_flush_interval_seconds = 30
300
+ @storm_notification = true
301
+ @auto_issue_rate_limit_count = 5
302
+ @auto_issue_rate_limit_window_minutes = 10
303
+ @context_sampling_threshold_per_day = 25
304
+ @context_sampling_keep_every = 10
271
305
  @async_logging = false
272
306
  @async_adapter = :sidekiq # Battle-tested default
273
307
  @max_backtrace_lines = 100 # Matches industry standard (Rollbar, Airbrake)
@@ -683,6 +717,32 @@ module RailsErrorDashboard
683
717
  end
684
718
  end
685
719
 
720
+ # Validate storm protection thresholds (all must be positive when protection is on)
721
+ if enable_storm_protection
722
+ {
723
+ storm_fingerprint_full_per_minute: storm_fingerprint_full_per_minute,
724
+ storm_occurrence_sample_keep_every: storm_occurrence_sample_keep_every,
725
+ storm_shedding_threshold_per_second: storm_shedding_threshold_per_second,
726
+ storm_open_threshold_per_second: storm_open_threshold_per_second,
727
+ storm_cooldown_seconds: storm_cooldown_seconds,
728
+ storm_max_tracked_fingerprints: storm_max_tracked_fingerprints,
729
+ storm_flush_interval_seconds: storm_flush_interval_seconds,
730
+ auto_issue_rate_limit_count: auto_issue_rate_limit_count,
731
+ auto_issue_rate_limit_window_minutes: auto_issue_rate_limit_window_minutes,
732
+ context_sampling_threshold_per_day: context_sampling_threshold_per_day,
733
+ context_sampling_keep_every: context_sampling_keep_every
734
+ }.each do |name, value|
735
+ if value.nil? || value.to_i < 1
736
+ errors << "#{name} must be a positive integer (got: #{value.inspect})"
737
+ end
738
+ end
739
+
740
+ if storm_open_threshold_per_second.to_i < storm_shedding_threshold_per_second.to_i
741
+ errors << "storm_open_threshold_per_second (#{storm_open_threshold_per_second}) must be >= " \
742
+ "storm_shedding_threshold_per_second (#{storm_shedding_threshold_per_second})"
743
+ end
744
+ end
745
+
686
746
  # Validate total_users_for_impact (must be positive if set)
687
747
  if total_users_for_impact && total_users_for_impact < 1
688
748
  errors << "total_users_for_impact must be at least 1 (got: #{total_users_for_impact})"
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module RailsErrorDashboard
  module Queries
    # Query: Storm protection episode history + active-storm lookup.
    #
    # Read-only. Powers the /errors/storms page and the layout banner.
    # Both entry points are best-effort: any failure is logged and an empty
    # result is returned instead of raising into the view.
    class StormHistory
      # How long after a storm ends it is still eligible for the banner.
      RECENT_BANNER_WINDOW = 24.hours

      # Full history payload for the storms page.
      #
      # @param limit [Integer] maximum number of events in :events
      # @return [Hash] { active:, recent:, events: } where :active and :recent
      #   are a single StormEvent or nil and :events is an Array. All three
      #   are empty (nil/nil/[]) when the table is missing or the query fails.
      def self.call(limit: 50)
        return empty_result unless StormEvent.table_exists?

        {
          active: StormEvent.active.recent_first.first,
          recent: StormEvent.ended_within(RECENT_BANNER_WINDOW).recent_first.first,
          events: StormEvent.recent_first.limit(limit).to_a
        }
      rescue => e
        RailsErrorDashboard::Logger.error(
          "[RailsErrorDashboard] StormHistory query failed: #{e.message}"
        )
        empty_result
      end

      # Banner lookup for the layout. Short-circuits before touching the
      # database when storm protection is disabled. Otherwise it issues one
      # query when an active storm exists, and two (active, then recently
      # ended) when there is no active storm.
      #
      # @return [StormEvent, nil] the active storm, else the most recent storm
      #   that ended within RECENT_BANNER_WINDOW, else nil (also nil on error)
      def self.banner_event
        return nil unless RailsErrorDashboard.configuration.enable_storm_protection
        return nil unless StormEvent.table_exists?

        StormEvent.active.recent_first.first ||
          StormEvent.ended_within(RECENT_BANNER_WINDOW).recent_first.first
      rescue => e
        # Still best-effort (the layout must render), but no longer silent —
        # mirrors the logging done by .call.
        RailsErrorDashboard::Logger.error(
          "[RailsErrorDashboard] StormHistory banner lookup failed: #{e.class} - #{e.message}"
        )
        nil
      end

      # Fresh empty payload (a new Hash/Array each time so callers may mutate it).
      def self.empty_result
        { active: nil, recent: nil, events: [] }
      end
      private_class_method :empty_result
    end
  end
end
@@ -0,0 +1,195 @@
1
+ # frozen_string_literal: true
2
+
3
module RailsErrorDashboard
  module Services
    module StormProtection
      # Per-process circuit breaker for the error capture path.
      #
      # Counts capture attempts in fixed 10-second buckets and transitions
      # between states based on the completed bucket's rate:
      #
      #   :closed    — normal operation, per-fingerprint buckets decide fidelity
      #   :shedding  — elevated rate: context shed, notifications suppressed
      #   :open      — storm: count-only mode, zero per-event I/O
      #   :half_open — post-cooldown probe: small sample admitted, watching rate
      #
      # Hysteresis: opens FAST (a single hot bucket, or mid-bucket fast-trip),
      # closes SLOW (two consecutive calm buckets) to prevent flapping.
      #
      # Idle time: buckets only roll when an event arrives, so a quiet period
      # produces no rolls at all. When a roll finally happens while the
      # breaker is degraded, every fully elapsed bucket that saw no events is
      # credited as a calm bucket; otherwise the first errors after a long
      # silence would still be treated as part of the old storm.
      #
      # Concurrency: the hot path is one AtomicFixnum increment plus a float
      # comparison. The mutex is taken only on bucket roll (once per 10s) and
      # for state transitions — never per event.
      class CircuitBreaker
        BUCKET_SECONDS = 10
        CALM_BUCKETS_TO_CLOSE = 2

        attr_reader :state

        # @param clock [#call] returns monotonic seconds; injectable for tests
        def initialize(clock: -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) })
          @clock = clock
          @mutex = Mutex.new
          reset!
        end

        # Return to :closed with a fresh bucket and no episode.
        def reset!
          @mutex.synchronize do
            @state = :closed
            @bucket_start = @clock.call
            @bucket_count = Concurrent::AtomicFixnum.new(0)
            @calm_buckets = 0
            @opened_at = nil
            @episode = nil
          end
        end

        # Count one capture attempt and return the state that should govern it.
        # Called on EVERY capture — must stay allocation-free on the fast path.
        #
        # @return [Symbol] :closed, :shedding, :open or :half_open
        def record!
          now = @clock.call
          roll!(now) if now - @bucket_start >= BUCKET_SECONDS

          count = @bucket_count.increment

          # Fast-trip: don't wait for the bucket to complete if it's already
          # over the open threshold — at 50k errors/min a full 10s bucket
          # would let ~8k events through before reacting.
          if count >= open_threshold * BUCKET_SECONDS && @state != :open
            trip_open!(now, count)
          end

          @state
        end

        # Episode metadata for the honesty layer (storm_events row).
        # @return [Hash, nil] nil when no episode is active or recently closed
        def episode_snapshot
          @mutex.synchronize { @episode&.dup }
        end

        # Forget a closed episode once it has been persisted by the flush job.
        def clear_closed_episode!
          @mutex.synchronize do
            @episode = nil if @episode && @episode[:ended_at]
          end
        end

        private

        # Close the current bucket, start a new one and run the transition
        # table on the finished bucket's rate.
        def roll!(now)
          @mutex.synchronize do
            elapsed = now - @bucket_start
            return if elapsed < BUCKET_SECONDS # another thread already rolled

            was_degraded = @state != :closed
            rate = @bucket_count.value / elapsed.to_f
            @bucket_start = now
            @bucket_count = Concurrent::AtomicFixnum.new(0)
            transition!(rate, now)

            # Only a breaker that was already degraded gets idle credit; a
            # closed breaker keeps its one-transition-per-roll behavior.
            credit_silent_buckets!(elapsed, now) if was_degraded
          end
        end

        # Replay the event-free buckets contained in a late roll as zero-rate
        # transitions. record! rolls as soon as an event arrives
        # BUCKET_SECONDS after the bucket started, so everything past the
        # first BUCKET_SECONDS of `elapsed` was silent. Capped at
        # CALM_BUCKETS_TO_CLOSE — more could not change the outcome.
        # Runs inside @mutex.
        def credit_silent_buckets!(elapsed, now)
          silent = (elapsed / BUCKET_SECONDS).floor - 1
          return if silent < 1

          [ silent, CALM_BUCKETS_TO_CLOSE ].min.times do
            break if @state == :closed

            transition!(0.0, now)
          end
        end

        # Transition table — runs inside @mutex, once per bucket roll.
        # track_peak runs AFTER the case: a transition out of :closed creates
        # the episode, and the triggering bucket's rate must be its first peak.
        def transition!(rate, now)
          case @state
          when :closed
            if rate >= open_threshold
              open!(now)
            elsif rate >= shedding_threshold
              enter!(:shedding, now)
            end
          when :shedding
            if rate >= open_threshold
              open!(now)
            elsif rate < shedding_threshold / 2.0
              calm_step!(now)
            else
              @calm_buckets = 0
            end
          when :open
            if now - @opened_at >= cooldown_seconds && rate < shedding_threshold
              @state = :half_open
              @calm_buckets = 0
            end
          when :half_open
            if rate >= shedding_threshold
              open!(now)
            else
              calm_step!(now)
            end
          end

          track_peak(rate)
        end

        # One more calm bucket; close once enough have accumulated in a row.
        def calm_step!(now)
          @calm_buckets += 1
          close!(now) if @calm_buckets >= CALM_BUCKETS_TO_CLOSE
        end

        # Move to a non-open degraded state, starting an episode if needed.
        def enter!(new_state, _now)
          begin_episode! if @state == :closed
          @state = new_state
          @calm_buckets = 0
        end

        # Move to :open, starting an episode if needed and stamping the time
        # the cooldown is measured from.
        def open!(now)
          begin_episode! if @state == :closed
          @state = :open
          @opened_at = now
          @calm_buckets = 0
          @episode[:reached_open] = true if @episode
        end

        # Mid-bucket fast trip — takes the mutex (rare: at most once per storm onset).
        def trip_open!(now, count)
          @mutex.synchronize do
            return if @state == :open

            open!(now)
            # Partial bucket: divide by at least one second so a burst in the
            # first instant does not produce an absurd rate.
            track_peak(count / [ now - @bucket_start, 1.0 ].max)
          end
        end

        # Back to normal operation; the episode is kept (with ended_at set)
        # until clear_closed_episode! is called.
        def close!(_now)
          @state = :closed
          @calm_buckets = 0
          @episode[:ended_at] = Time.current if @episode
        end

        def begin_episode!
          @episode = {
            started_at: Time.current,
            ended_at: nil,
            peak_rate_per_minute: 0,
            reached_open: false
          }
        end

        # Raise the episode's recorded peak if this rate exceeds it.
        def track_peak(rate_per_second)
          return unless @episode

          per_minute = (rate_per_second * 60).round
          @episode[:peak_rate_per_minute] = per_minute if per_minute > @episode[:peak_rate_per_minute]
        end

        def shedding_threshold
          RailsErrorDashboard.configuration.storm_shedding_threshold_per_second.to_f
        end

        def open_threshold
          RailsErrorDashboard.configuration.storm_open_threshold_per_second.to_f
        end

        def cooldown_seconds
          RailsErrorDashboard.configuration.storm_cooldown_seconds.to_i
        end
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
module RailsErrorDashboard
  module Services
    module StormProtection
      # In-memory tally of events that are counted but not stored one by one
      # (Layer 1 overflow and the breaker's count-only mode).
      #
      # Each tracked key keeps a counter plus the identity parts handed in by
      # the gate, so the flush side can reconcile the count onto the right
      # ErrorLog later. Nothing here touches the database.
      #
      # Memory: the map is capped at storm_max_tracked_fingerprints; once it
      # is full, events for unseen keys go to a single anonymous overflow
      # counter.
      #
      # Concurrency: snapshot! swaps the whole map out through an
      # AtomicReference, so record calls that start afterwards write to the
      # new map.
      # NOTE(review): a record call that fetched the old map just before the
      # swap can still increment an entry after snapshot! has read it —
      # confirm whether that window matters for the "exact count" goal.
      class CountBuffer
        Entry = Struct.new(
          :error_class, :message, :first_app_frame,
          :controller_name, :action_name, :custom_hash,
          :count, :first_seen_at, :last_seen_at
        )

        def initialize
          reset!
        end

        # Drop everything: fresh map, overflow back to zero.
        def reset!
          @map_ref = Concurrent::AtomicReference.new(Concurrent::Map.new)
          @overflow = Concurrent::AtomicFixnum.new(0)
        end

        # Record one counted-not-stored event.
        # @param gate_key [String] cheap in-process bucketing key
        # @param parts [Hash] identity parts captured at the gate
        # @return [Time, nil] the new last_seen_at, or nil when the event went
        #   to the overflow counter
        def record(gate_key, parts)
          live_map = @map_ref.get
          slot = live_map[gate_key] || admit(live_map, gate_key, parts)
          return unless slot

          slot.count.increment
          slot.last_seen_at = Time.current
        end

        # @return [Boolean] true when there is anything to flush
        def any?
          return true if @overflow.value.positive?

          !@map_ref.get.empty?
        end

        # Swap the buffer out and return serializable entry hashes.
        # @return [Hash] { entries: Array<Hash>, overflow: Integer }
        def snapshot!
          drained = @map_ref.get_and_set(Concurrent::Map.new)

          # Subtract exactly what was read so increments landing in between
          # stay in the counter for the next snapshot.
          overflow_total = @overflow.value
          @overflow.update { |current| current - overflow_total }

          rows = []
          drained.each_pair { |_key, entry| rows << serialize(entry) }

          { entries: rows, overflow: overflow_total }
        end

        private

        # Create the entry for an unseen key, or bump the overflow counter
        # and return nil when the map is already at capacity.
        def admit(map, gate_key, parts)
          if map.size >= max_tracked
            @overflow.increment
            return nil
          end

          map.compute_if_absent(gate_key) do
            Entry.new(
              *parts.values_at(
                :error_class, :message, :first_app_frame,
                :controller_name, :action_name, :custom_hash
              ),
              Concurrent::AtomicFixnum.new(0), Time.current, Time.current
            )
          end
        end

        # String-keyed, plain-value form of one entry.
        def serialize(entry)
          {
            "error_class" => entry.error_class,
            "message" => entry.message,
            "first_app_frame" => entry.first_app_frame,
            "controller_name" => entry.controller_name,
            "action_name" => entry.action_name,
            "custom_hash" => entry.custom_hash,
            "count" => entry.count.value,
            "first_seen_at" => entry.first_seen_at.iso8601,
            "last_seen_at" => entry.last_seen_at.iso8601
          }
        end

        def max_tracked
          RailsErrorDashboard.configuration.storm_max_tracked_fingerprints.to_i
        end
      end
    end
  end
end