rails_error_dashboard 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -0
- data/app/controllers/rails_error_dashboard/application_controller.rb +5 -0
- data/app/controllers/rails_error_dashboard/errors_controller.rb +12 -0
- data/app/jobs/rails_error_dashboard/storm_flush_job.rb +19 -0
- data/app/jobs/rails_error_dashboard/storm_notification_job.rb +74 -0
- data/app/models/rails_error_dashboard/storm_event.rb +34 -0
- data/app/views/layouts/rails_error_dashboard.html.erb +21 -0
- data/app/views/rails_error_dashboard/errors/storms.html.erb +91 -0
- data/config/routes.rb +1 -0
- data/db/migrate/20260306000002_add_instance_variables_to_error_logs.rb +7 -1
- data/db/migrate/20260306000003_create_rails_error_dashboard_swallowed_exceptions.rb +4 -0
- data/db/migrate/20260307000001_create_rails_error_dashboard_diagnostic_dumps.rb +4 -0
- data/db/migrate/20260613000001_create_storm_events.rb +28 -0
- data/lib/generators/rails_error_dashboard/install/templates/initializer.rb +36 -0
- data/lib/rails_error_dashboard/commands/flush_storm_counts.rb +188 -0
- data/lib/rails_error_dashboard/commands/log_error.rb +70 -12
- data/lib/rails_error_dashboard/configuration.rb +60 -0
- data/lib/rails_error_dashboard/queries/storm_history.rb +39 -0
- data/lib/rails_error_dashboard/services/storm_protection/circuit_breaker.rb +195 -0
- data/lib/rails_error_dashboard/services/storm_protection/count_buffer.rb +100 -0
- data/lib/rails_error_dashboard/services/storm_protection/fingerprint_buckets.rb +123 -0
- data/lib/rails_error_dashboard/services/storm_protection/gate.rb +258 -0
- data/lib/rails_error_dashboard/subscribers/issue_tracker_subscriber.rb +12 -0
- data/lib/rails_error_dashboard/version.rb +1 -1
- data/lib/rails_error_dashboard.rb +6 -0
- metadata +13 -2
|
@@ -6,15 +6,59 @@ module RailsErrorDashboard
|
|
|
6
6
|
# This is a write operation that creates an ErrorLog record
|
|
7
7
|
class LogError
|
|
8
8
|
# Entry point for capturing an exception.
#
# Order of operations:
#   1. Filter (ignore list + static sampling) so ignored exceptions never
#      count toward storm state.
#   2. Storm protection gate, BEFORE the async branch, because with
#      SolidQueue the enqueue itself is a DB write.
#   3. Dispatch: enqueue (async_logging) or run synchronously.
#
# Nothing in the capture path may raise into the host app: every failure
# is logged and swallowed.
#
# @param exception [Exception] the exception to capture
# @param context [Hash] capture context; internal `_pre_filtered` /
#   `_storm_decision` markers are merged in here
# @return [Object, nil] whatever the async/sync capture returns; nil when the
#   exception was filtered, counted-only by the storm gate, or capture failed
def self.call(exception, context = {})
  begin
    unless Services::ExceptionFilter.should_log?(exception)
      # Preserve the OTel contract: filtered captures still emit a span
      # tagged filtered=true (no-op when OTel export is disabled).
      # The emit is best-effort and isolated: if it raised into the outer
      # rescue we would "fail open" and capture an exception the filter
      # already rejected (and re-roll the sampling dice on the sync path).
      begin
        Integrations::Tracer.in_span(
          "capture_error",
          kind: :capture,
          attributes: build_capture_span_attributes(exception, was_async: false)
        ) do |span|
          span&.set_attribute("rails_error_dashboard.filtered", true)
        end
      rescue => e
        RailsErrorDashboard::Logger.error(
          "[RailsErrorDashboard] Filtered-capture span failed: #{e.class} - #{e.message}"
        )
      end
      return nil
    end

    # _pre_filtered prevents the sync path from re-rolling the sampling
    # dice (the effective rate would square otherwise).
    context = context.merge(_pre_filtered: true)

    # :count_only events are tallied in memory and reconciled by
    # StormFlushJob; nothing else happens for them (that's the point).
    storm_decision = Services::StormProtection::Gate.admit!(exception, context)
    return nil if storm_decision == :count_only
    context = context.merge(_storm_decision: storm_decision) if storm_decision == :lite
  rescue => e
    RailsErrorDashboard::Logger.error(
      "[RailsErrorDashboard] Capture pre-checks failed: #{e.class} - #{e.message}"
    )
    # Fall through and attempt full capture — fail open, never raise
  end

  if RailsErrorDashboard.configuration.async_logging
    # For async logging, just enqueue the job
    call_async(exception, context)
  else
    # For sync logging, execute immediately
    new(exception, context).call
  end
rescue => e
  RailsErrorDashboard::Logger.error(
    "[RailsErrorDashboard] LogError.call failed: #{e.class} - #{e.message}"
  )
  nil
end
|
|
56
|
+
|
|
57
|
+
# :lite captures shed context (breadcrumbs/health/locals/ivars) — the
# storm shedding ladder's first economy.
#
# The marker is looked up under both the symbol and the string key, and its
# value is compared as a string, because the async job round-trips context
# through the queue serializer and either may come back stringified.
#
# @param context [Hash, nil] capture context (may carry `_storm_decision`)
# @return [Boolean] true when the storm gate marked this capture as :lite
def self.storm_lite?(context)
  return false if context.nil?

  decision = context[:_storm_decision] || context["_storm_decision"]
  decision.to_s == "lite"
end
|
|
19
63
|
|
|
20
64
|
# Build the base OTel span attributes available before any work happens.
|
|
@@ -42,18 +86,22 @@ module RailsErrorDashboard
|
|
|
42
86
|
cause_chain: serialize_cause_chain(exception)
|
|
43
87
|
}
|
|
44
88
|
|
|
89
|
+
# Storm shedding: :lite captures skip ALL pre-enqueue context harvest —
|
|
90
|
+
# this is request-thread CPU, the most valuable thing to shed.
|
|
91
|
+
lite = storm_lite?(context)
|
|
92
|
+
|
|
45
93
|
# Harvest breadcrumbs NOW (before job dispatch — different thread won't have them)
|
|
46
|
-
if RailsErrorDashboard.configuration.enable_breadcrumbs
|
|
94
|
+
if !lite && RailsErrorDashboard.configuration.enable_breadcrumbs
|
|
47
95
|
context = context.merge(_serialized_breadcrumbs: Services::BreadcrumbCollector.harvest)
|
|
48
96
|
end
|
|
49
97
|
|
|
50
98
|
# Capture system health NOW (metrics are time-sensitive, different thread = different state)
|
|
51
|
-
if RailsErrorDashboard.configuration.enable_system_health
|
|
99
|
+
if !lite && RailsErrorDashboard.configuration.enable_system_health
|
|
52
100
|
context = context.merge(_serialized_system_health: Services::SystemHealthSnapshot.capture)
|
|
53
101
|
end
|
|
54
102
|
|
|
55
103
|
# Capture local variables NOW (TracePoint attaches to exception, must extract before job dispatch)
|
|
56
|
-
if RailsErrorDashboard.configuration.enable_local_variables
|
|
104
|
+
if !lite && RailsErrorDashboard.configuration.enable_local_variables
|
|
57
105
|
begin
|
|
58
106
|
raw_locals = Services::LocalVariableCapturer.extract(exception)
|
|
59
107
|
if raw_locals.is_a?(Hash) && raw_locals.any?
|
|
@@ -65,7 +113,7 @@ module RailsErrorDashboard
|
|
|
65
113
|
end
|
|
66
114
|
|
|
67
115
|
# Capture instance variables NOW (same reason — attached to exception object)
|
|
68
|
-
if RailsErrorDashboard.configuration.enable_instance_variables
|
|
116
|
+
if !lite && RailsErrorDashboard.configuration.enable_instance_variables
|
|
69
117
|
begin
|
|
70
118
|
raw_ivars = Services::LocalVariableCapturer.extract_instance_vars(exception)
|
|
71
119
|
if raw_ivars.is_a?(Hash) && raw_ivars.any?
|
|
@@ -157,12 +205,19 @@ module RailsErrorDashboard
|
|
|
157
205
|
kind: :capture,
|
|
158
206
|
attributes: self.class.build_capture_span_attributes(@exception, was_async: false)
|
|
159
207
|
) do |span|
|
|
160
|
-
# Check if this exception should be logged (ignore list + sampling)
|
|
161
|
-
|
|
208
|
+
# Check if this exception should be logged (ignore list + sampling).
|
|
209
|
+
# Skipped when self.call already filtered (re-rolling the sampling
|
|
210
|
+
# dice here would square the effective rate).
|
|
211
|
+
if !@context[:_pre_filtered] && !Services::ExceptionFilter.should_log?(@exception)
|
|
162
212
|
span&.set_attribute("rails_error_dashboard.filtered", true)
|
|
163
213
|
next nil
|
|
164
214
|
end
|
|
165
215
|
|
|
216
|
+
# Storm shedding: :lite captures keep the error + occurrence row but
|
|
217
|
+
# shed context payloads (breadcrumbs/health/locals/ivars).
|
|
218
|
+
storm_lite = self.class.storm_lite?(@context)
|
|
219
|
+
span&.set_attribute("rails_error_dashboard.storm_degraded", true) if storm_lite
|
|
220
|
+
|
|
166
221
|
error_context = ValueObjects::ErrorContext.new(@context, @context[:source])
|
|
167
222
|
|
|
168
223
|
# Find or create application (cached lookup)
|
|
@@ -239,7 +294,7 @@ module RailsErrorDashboard
|
|
|
239
294
|
attributes = Services::SensitiveDataFilter.filter_attributes(attributes)
|
|
240
295
|
|
|
241
296
|
# Harvest breadcrumbs (if enabled and column exists)
|
|
242
|
-
if ErrorLog.column_names.include?("breadcrumbs") && RailsErrorDashboard.configuration.enable_breadcrumbs
|
|
297
|
+
if !storm_lite && ErrorLog.column_names.include?("breadcrumbs") && RailsErrorDashboard.configuration.enable_breadcrumbs
|
|
243
298
|
# Sync path: harvest from current thread
|
|
244
299
|
raw_breadcrumbs = Services::BreadcrumbCollector.harvest
|
|
245
300
|
|
|
@@ -256,13 +311,13 @@ module RailsErrorDashboard
|
|
|
256
311
|
end
|
|
257
312
|
|
|
258
313
|
# Capture system health snapshot (if enabled and column exists)
|
|
259
|
-
if ErrorLog.column_names.include?("system_health") && RailsErrorDashboard.configuration.enable_system_health
|
|
314
|
+
if !storm_lite && ErrorLog.column_names.include?("system_health") && RailsErrorDashboard.configuration.enable_system_health
|
|
260
315
|
health_data = @context[:_serialized_system_health] || Services::SystemHealthSnapshot.capture
|
|
261
316
|
attributes[:system_health] = health_data.to_json
|
|
262
317
|
end
|
|
263
318
|
|
|
264
319
|
# Capture local variables (if enabled and column exists)
|
|
265
|
-
if ErrorLog.column_names.include?("local_variables") && RailsErrorDashboard.configuration.enable_local_variables
|
|
320
|
+
if !storm_lite && ErrorLog.column_names.include?("local_variables") && RailsErrorDashboard.configuration.enable_local_variables
|
|
266
321
|
begin
|
|
267
322
|
# Sync path: extract from exception ivar
|
|
268
323
|
raw_locals = Services::LocalVariableCapturer.extract(@exception)
|
|
@@ -278,7 +333,7 @@ module RailsErrorDashboard
|
|
|
278
333
|
end
|
|
279
334
|
|
|
280
335
|
# Capture instance variables (if enabled and column exists)
|
|
281
|
-
if ErrorLog.column_names.include?("instance_variables") && RailsErrorDashboard.configuration.enable_instance_variables
|
|
336
|
+
if !storm_lite && ErrorLog.column_names.include?("instance_variables") && RailsErrorDashboard.configuration.enable_instance_variables
|
|
282
337
|
begin
|
|
283
338
|
# Sync path: extract from exception ivar
|
|
284
339
|
raw_ivars = Services::LocalVariableCapturer.extract_instance_vars(@exception)
|
|
@@ -364,8 +419,11 @@ module RailsErrorDashboard
|
|
|
364
419
|
|
|
365
420
|
# Dispatch notification if error is not muted and the throttle check passes.
|
|
366
421
|
# Muted errors skip notifications but still fire plugin events/callbacks.
|
|
422
|
+
# During a storm (breaker not closed) per-error notifications are
|
|
423
|
+
# suppressed — a single storm notification replaces them.
|
|
367
424
|
def maybe_notify(error_log)
|
|
368
425
|
return if error_log.muted?
|
|
426
|
+
return if Services::StormProtection::Gate.notifications_suppressed?
|
|
369
427
|
return unless yield
|
|
370
428
|
|
|
371
429
|
Services::ErrorNotificationDispatcher.call(error_log)
|
|
@@ -67,6 +67,25 @@ module RailsErrorDashboard
|
|
|
67
67
|
# Sampling rate for non-critical errors (0.0 to 1.0, default 1.0 = 100%)
|
|
68
68
|
attr_accessor :sampling_rate
|
|
69
69
|
|
|
70
|
+
# Storm protection — circuit breaker + adaptive sampling for error floods.
|
|
71
|
+
# Protects the HOST APP from the gem's own writes during an error storm
|
|
72
|
+
# (bad deploy throwing thousands of errors/minute). Default ON: this is
|
|
73
|
+
# the feature that makes the gem quieter, so ON is the conservative choice.
|
|
74
|
+
# All thresholds are PER PROCESS (no cross-process coordination by design).
|
|
75
|
+
attr_accessor :enable_storm_protection # Master switch (default: true)
|
|
76
|
+
attr_accessor :storm_fingerprint_full_per_minute # Full-fidelity captures per fingerprint per minute (default: 30)
|
|
77
|
+
attr_accessor :storm_occurrence_sample_keep_every # Past the cap, keep every Nth occurrence (default: 10)
|
|
78
|
+
attr_accessor :storm_shedding_threshold_per_second # Global rate that enters shedding state (default: 10)
|
|
79
|
+
attr_accessor :storm_open_threshold_per_second # Global rate that opens the breaker = count-only (default: 50)
|
|
80
|
+
attr_accessor :storm_cooldown_seconds # Open → half-open probe delay (default: 60)
|
|
81
|
+
attr_accessor :storm_max_tracked_fingerprints # Bounded in-memory map size; beyond = overflow bucket (default: 1000)
|
|
82
|
+
attr_accessor :storm_flush_interval_seconds # Count-buffer flush cadence (default: 30)
|
|
83
|
+
attr_accessor :storm_notification # Single "storm in progress" notification per episode (default: true)
|
|
84
|
+
attr_accessor :auto_issue_rate_limit_count # Max auto-created issues per window — applies always (default: 5)
|
|
85
|
+
attr_accessor :auto_issue_rate_limit_window_minutes # Window for the above (default: 10)
|
|
86
|
+
attr_accessor :context_sampling_threshold_per_day # Full-context captures per fingerprint per day before sampling (default: 25)
|
|
87
|
+
attr_accessor :context_sampling_keep_every # After threshold, keep full context every Nth (default: 10)
|
|
88
|
+
|
|
70
89
|
# Async logging configuration
|
|
71
90
|
attr_accessor :async_logging
|
|
72
91
|
attr_accessor :async_adapter # :sidekiq, :solid_queue, or :async
|
|
@@ -268,6 +287,21 @@ module RailsErrorDashboard
|
|
|
268
287
|
@ignored_exceptions = []
|
|
269
288
|
@custom_fingerprint = nil # Lambda: ->(exception, context) { "custom_key" }
|
|
270
289
|
@sampling_rate = 1.0 # 100% by default
|
|
290
|
+
|
|
291
|
+
# Storm protection defaults (thresholds tuned via chaos Phase G — see ROADMAP)
|
|
292
|
+
@enable_storm_protection = true
|
|
293
|
+
@storm_fingerprint_full_per_minute = 30
|
|
294
|
+
@storm_occurrence_sample_keep_every = 10
|
|
295
|
+
@storm_shedding_threshold_per_second = 10
|
|
296
|
+
@storm_open_threshold_per_second = 50
|
|
297
|
+
@storm_cooldown_seconds = 60
|
|
298
|
+
@storm_max_tracked_fingerprints = 1000
|
|
299
|
+
@storm_flush_interval_seconds = 30
|
|
300
|
+
@storm_notification = true
|
|
301
|
+
@auto_issue_rate_limit_count = 5
|
|
302
|
+
@auto_issue_rate_limit_window_minutes = 10
|
|
303
|
+
@context_sampling_threshold_per_day = 25
|
|
304
|
+
@context_sampling_keep_every = 10
|
|
271
305
|
@async_logging = false
|
|
272
306
|
@async_adapter = :sidekiq # Battle-tested default
|
|
273
307
|
@max_backtrace_lines = 100 # Matches industry standard (Rollbar, Airbrake)
|
|
@@ -683,6 +717,32 @@ module RailsErrorDashboard
|
|
|
683
717
|
end
|
|
684
718
|
end
|
|
685
719
|
|
|
720
|
+
# Validate storm protection thresholds (all must be positive when protection is on)
|
|
721
|
+
if enable_storm_protection
|
|
722
|
+
{
|
|
723
|
+
storm_fingerprint_full_per_minute: storm_fingerprint_full_per_minute,
|
|
724
|
+
storm_occurrence_sample_keep_every: storm_occurrence_sample_keep_every,
|
|
725
|
+
storm_shedding_threshold_per_second: storm_shedding_threshold_per_second,
|
|
726
|
+
storm_open_threshold_per_second: storm_open_threshold_per_second,
|
|
727
|
+
storm_cooldown_seconds: storm_cooldown_seconds,
|
|
728
|
+
storm_max_tracked_fingerprints: storm_max_tracked_fingerprints,
|
|
729
|
+
storm_flush_interval_seconds: storm_flush_interval_seconds,
|
|
730
|
+
auto_issue_rate_limit_count: auto_issue_rate_limit_count,
|
|
731
|
+
auto_issue_rate_limit_window_minutes: auto_issue_rate_limit_window_minutes,
|
|
732
|
+
context_sampling_threshold_per_day: context_sampling_threshold_per_day,
|
|
733
|
+
context_sampling_keep_every: context_sampling_keep_every
|
|
734
|
+
}.each do |name, value|
|
|
735
|
+
if value.nil? || value.to_i < 1
|
|
736
|
+
errors << "#{name} must be a positive integer (got: #{value.inspect})"
|
|
737
|
+
end
|
|
738
|
+
end
|
|
739
|
+
|
|
740
|
+
if storm_open_threshold_per_second.to_i < storm_shedding_threshold_per_second.to_i
|
|
741
|
+
errors << "storm_open_threshold_per_second (#{storm_open_threshold_per_second}) must be >= " \
|
|
742
|
+
"storm_shedding_threshold_per_second (#{storm_shedding_threshold_per_second})"
|
|
743
|
+
end
|
|
744
|
+
end
|
|
745
|
+
|
|
686
746
|
# Validate total_users_for_impact (must be positive if set)
|
|
687
747
|
if total_users_for_impact && total_users_for_impact < 1
|
|
688
748
|
errors << "total_users_for_impact must be at least 1 (got: #{total_users_for_impact})"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Queries
    # Query: Storm protection episode history + active-storm lookup.
    #
    # Read-only. Powers the /errors/storms page and the layout banner.
    # Both entry points rescue StandardError, log it, and return an empty
    # value instead.
    class StormHistory
      # How long after a storm ends it is still surfaced as "recent".
      RECENT_BANNER_WINDOW = 24.hours

      # Episode history for the storms page.
      #
      # @param limit [Integer] maximum number of events to return
      # @return [Hash] `{ active:, recent:, events: }` — `active`/`recent` are
      #   a StormEvent or nil, `events` is an Array (empty when the table is
      #   missing or the query fails)
      def self.call(limit: 50)
        return empty_result unless StormEvent.table_exists?

        {
          active: StormEvent.active.recent_first.first,
          recent: StormEvent.ended_within(RECENT_BANNER_WINDOW).recent_first.first,
          events: StormEvent.recent_first.limit(limit).to_a
        }
      rescue => e
        RailsErrorDashboard::Logger.error(
          "[RailsErrorDashboard] StormHistory query failed: #{e.message}"
        )
        empty_result
      end

      # Cheap banner lookup for the layout: one query when a storm is
      # active; otherwise a second query looks for a recently ended storm.
      #
      # @return [StormEvent, nil] the active storm, else the most recent storm
      #   that ended within RECENT_BANNER_WINDOW, else nil (also nil when
      #   storm protection is disabled, the table is missing, or the lookup
      #   fails)
      def self.banner_event
        return nil unless RailsErrorDashboard.configuration.enable_storm_protection
        return nil unless StormEvent.table_exists?

        StormEvent.active.recent_first.first ||
          StormEvent.ended_within(RECENT_BANNER_WINDOW).recent_first.first
      rescue => e
        # Still best-effort (the layout must render), but leave a trace so a
        # permanently missing banner can be diagnosed.
        RailsErrorDashboard::Logger.error(
          "[RailsErrorDashboard] StormHistory banner lookup failed: #{e.class} - #{e.message}"
        )
        nil
      end

      # Fresh hash per call so callers may mutate the result safely.
      def self.empty_result
        { active: nil, recent: nil, events: [] }
      end
      private_class_method :empty_result
    end
  end
end
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Services
    module StormProtection
      # Per-process circuit breaker for the error capture path.
      #
      # Counts capture attempts in fixed 10-second buckets and transitions
      # between states based on the completed bucket's rate:
      #
      #   :closed    — normal operation, per-fingerprint buckets decide fidelity
      #   :shedding  — elevated rate: context shed, notifications suppressed
      #   :open      — storm: count-only mode, zero per-event I/O
      #   :half_open — post-cooldown probe: small sample admitted, watching rate
      #
      # Hysteresis: opens FAST (a single hot bucket, or mid-bucket fast-trip),
      # closes SLOW (two consecutive calm buckets) to prevent flapping.
      #
      # Time only advances inside record!, so a bucket "roll" can cover a long
      # quiet gap. While the breaker is not :closed, every whole idle bucket in
      # that gap is credited as a calm bucket; otherwise a storm followed by
      # silence would leave the breaker stuck and the first errors after
      # recovery would still be degraded.
      #
      # Concurrency: the hot path is one AtomicFixnum increment plus a float
      # comparison. The mutex is taken only on bucket roll (once per 10s) and
      # for state transitions — never per event.
      class CircuitBreaker
        BUCKET_SECONDS = 10
        CALM_BUCKETS_TO_CLOSE = 2
        # Most idle buckets ever worth replaying in one roll: one to leave
        # :open plus CALM_BUCKETS_TO_CLOSE to close. Bounds the replay loop.
        MAX_IDLE_BUCKET_CREDIT = CALM_BUCKETS_TO_CLOSE + 1

        attr_reader :state

        # @param clock [#call] returns monotonic seconds; injectable for tests
        def initialize(clock: -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) })
          @clock = clock
          @mutex = Mutex.new
          reset!
        end

        # Return to a pristine :closed breaker and drop any episode metadata.
        def reset!
          @mutex.synchronize do
            @state = :closed
            @bucket_start = @clock.call
            @bucket_count = Concurrent::AtomicFixnum.new(0)
            @calm_buckets = 0
            @opened_at = nil
            @episode = nil
          end
        end

        # Count one capture attempt and return the state that should govern it.
        # Called on EVERY capture — must stay allocation-free on the fast path.
        #
        # @return [Symbol] :closed, :shedding, :open or :half_open
        def record!
          now = @clock.call
          roll!(now) if now - @bucket_start >= BUCKET_SECONDS

          count = @bucket_count.increment

          # Fast-trip: don't wait for the bucket to complete if it's already
          # over the open threshold — at 50k errors/min a full 10s bucket
          # would let ~8k events through before reacting.
          if count >= open_threshold * BUCKET_SECONDS && @state != :open
            trip_open!(now, count)
          end

          @state
        end

        # Episode metadata for the honesty layer (storm_events row).
        # @return [Hash, nil] nil when no episode is active or recently closed
        def episode_snapshot
          @mutex.synchronize { @episode&.dup }
        end

        # Forget a closed episode once it has been persisted by the flush job.
        # An episode that has not ended yet is left untouched.
        def clear_closed_episode!
          @mutex.synchronize do
            @episode = nil if @episode && @episode[:ended_at]
          end
        end

        private

        # Close the current bucket, start a new one and run the transition
        # table on the finished bucket (plus any idle buckets, see below).
        def roll!(now)
          @mutex.synchronize do
            elapsed = now - @bucket_start
            return if elapsed < BUCKET_SECONDS # another thread already rolled

            count = @bucket_count.value
            # Whole buckets that passed with no capture at all. Every counted
            # event arrived within the first BUCKET_SECONDS of the window
            # (a later one would have triggered this roll itself).
            idle_buckets = (elapsed / BUCKET_SECONDS).floor - 1
            # Only credit idle time while an episode is in progress; a
            # :closed breaker keeps the original gap-diluted rate so a stale
            # burst never opens an episode after the fact.
            credit_idle = idle_buckets.positive? && @state != :closed
            window = credit_idle ? BUCKET_SECONDS : elapsed

            @bucket_start = now
            @bucket_count = Concurrent::AtomicFixnum.new(0)
            transition!(count / window.to_f, now)

            replay_idle_buckets!(idle_buckets, now) if credit_idle
          end
        end

        # Feed zero-rate buckets through the transition table for the quiet
        # time covered by a roll. Runs inside @mutex; bounded and stops as
        # soon as the breaker is closed.
        def replay_idle_buckets!(idle_buckets, now)
          [ idle_buckets, MAX_IDLE_BUCKET_CREDIT ].min.times do
            break if @state == :closed

            transition!(0.0, now)
          end
        end

        # Transition table — runs inside @mutex, once per bucket roll.
        # track_peak runs AFTER the case: a transition out of :closed creates
        # the episode, and the triggering bucket's rate must be its first peak.
        def transition!(rate, now)
          case @state
          when :closed
            if rate >= open_threshold
              open!(now)
            elsif rate >= shedding_threshold
              enter!(:shedding, now)
            end
          when :shedding
            if rate >= open_threshold
              open!(now)
            elsif rate < shedding_threshold / 2.0
              calm_step!(now)
            else
              @calm_buckets = 0
            end
          when :open
            if now - @opened_at >= cooldown_seconds && rate < shedding_threshold
              @state = :half_open
              @calm_buckets = 0
            end
          when :half_open
            if rate >= shedding_threshold
              open!(now)
            else
              calm_step!(now)
            end
          end

          track_peak(rate)
        end

        # One more calm bucket; closes once enough have accumulated.
        def calm_step!(now)
          @calm_buckets += 1
          close!(now) if @calm_buckets >= CALM_BUCKETS_TO_CLOSE
        end

        # Move to a non-open degraded state, starting an episode if needed.
        def enter!(new_state, _now)
          begin_episode! if @state == :closed
          @state = new_state
          @calm_buckets = 0
        end

        # Move to :open (starting an episode if needed) and restart cooldown.
        def open!(now)
          begin_episode! if @state == :closed
          @state = :open
          @opened_at = now
          @calm_buckets = 0
          @episode[:reached_open] = true if @episode
        end

        # Mid-bucket fast trip — takes the mutex (rare: at most once per storm onset).
        def trip_open!(now, count)
          @mutex.synchronize do
            return if @state == :open

            open!(now)
            # Rate over the partial bucket; the 1.0s floor avoids a huge or
            # infinite rate when the trip happens right at bucket start.
            track_peak(count / [ now - @bucket_start, 1.0 ].max)
          end
        end

        # Back to normal operation; stamps the episode's end time.
        def close!(_now)
          @state = :closed
          @calm_buckets = 0
          @episode[:ended_at] = Time.current if @episode
        end

        # Start a fresh episode record (replaces any previous one).
        def begin_episode!
          @episode = {
            started_at: Time.current,
            ended_at: nil,
            peak_rate_per_minute: 0,
            reached_open: false
          }
        end

        # Keep the highest per-minute rate seen during the current episode.
        def track_peak(rate_per_second)
          return unless @episode

          per_minute = (rate_per_second * 60).round
          @episode[:peak_rate_per_minute] = per_minute if per_minute > @episode[:peak_rate_per_minute]
        end

        def shedding_threshold
          RailsErrorDashboard.configuration.storm_shedding_threshold_per_second.to_f
        end

        def open_threshold
          RailsErrorDashboard.configuration.storm_open_threshold_per_second.to_f
        end

        def cooldown_seconds
          RailsErrorDashboard.configuration.storm_cooldown_seconds.to_i
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Services
    module StormProtection
      # In-memory tally of events that are counted but not stored one by one
      # (Layer 1 overflow and the breaker's count-only mode).
      #
      # Each tracked key keeps an exact counter plus the identity parts handed
      # in at record time, so the flush side can reconcile the count onto the
      # matching ErrorLog later.
      #
      # Memory: the map is capped at storm_max_tracked_fingerprints keys; a
      # new key arriving past the cap only bumps a single overflow counter
      # (exact in total, anonymous in identity).
      #
      # Concurrency: snapshot! replaces the whole map in one AtomicReference
      # get_and_set.
      # NOTE(review): a recorder that fetched the old map just before the swap
      # can still increment an entry after it was serialized — confirm whether
      # that small loss is acceptable.
      class CountBuffer
        Entry = Struct.new(
          :error_class, :message, :first_app_frame,
          :controller_name, :action_name, :custom_hash,
          :count, :first_seen_at, :last_seen_at
        )

        def initialize
          reset!
        end

        # Start over with an empty map and a zeroed overflow counter.
        def reset!
          @map_ref = Concurrent::AtomicReference.new(Concurrent::Map.new)
          @overflow = Concurrent::AtomicFixnum.new(0)
        end

        # Record one counted-not-stored event.
        # @param gate_key [String] cheap in-process bucketing key
        # @param parts [Hash] identity parts captured at the gate
        def record(gate_key, parts)
          live_map = @map_ref.get
          slot = live_map[gate_key]

          if slot.nil?
            # Unknown key past the cap: count it anonymously and stop.
            if live_map.size >= max_tracked
              @overflow.increment
              return
            end

            slot = live_map.compute_if_absent(gate_key) { build_entry(parts) }
          end

          slot.count.increment
          slot.last_seen_at = Time.current
        end

        # @return [Boolean] true when anything (tracked or overflow) is pending
        def any?
          @overflow.value.positive? || !@map_ref.get.empty?
        end

        # Swap the buffer out and return serializable entry hashes.
        # @return [Hash] { entries: Array<Hash>, overflow: Integer }
        def snapshot!
          drained = @map_ref.get_and_set(Concurrent::Map.new)

          # Subtract what we read rather than zeroing, so increments landing
          # in between stay in the counter.
          overflow_total = @overflow.value
          @overflow.update { |current| current - overflow_total }

          rows = []
          drained.each_pair { |_gate_key, entry| rows << serialize_entry(entry) }

          { entries: rows, overflow: overflow_total }
        end

        private

        # New entry seeded from the identity parts, with a zeroed counter.
        def build_entry(parts)
          Entry.new(
            parts[:error_class], parts[:message], parts[:first_app_frame],
            parts[:controller_name], parts[:action_name], parts[:custom_hash],
            Concurrent::AtomicFixnum.new(0), Time.current, Time.current
          )
        end

        # String-keyed plain hash for one entry (timestamps as ISO 8601).
        def serialize_entry(entry)
          {
            "error_class" => entry.error_class,
            "message" => entry.message,
            "first_app_frame" => entry.first_app_frame,
            "controller_name" => entry.controller_name,
            "action_name" => entry.action_name,
            "custom_hash" => entry.custom_hash,
            "count" => entry.count.value,
            "first_seen_at" => entry.first_seen_at.iso8601,
            "last_seen_at" => entry.last_seen_at.iso8601
          }
        end

        def max_tracked
          RailsErrorDashboard.configuration.storm_max_tracked_fingerprints.to_i
        end
      end
    end
  end
end
|