dead_bro 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,196 +1,216 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DeadBro
4
+ # Process-wide memory leak detector. Previously stored samples in
5
+ # `Thread.current[...]`, which meant each Puma worker thread saw only the
6
+ # handful of requests it served — far too few samples, and reset whenever a
7
+ # thread was recycled. History is now shared across all threads in the
8
+ # process behind a mutex, with a hard cap on the number of retained samples.
4
9
  class MemoryLeakDetector
5
- # Track memory patterns over time to detect leaks
6
- MEMORY_HISTORY_KEY = :dead_bro_memory_history
7
- LEAK_DETECTION_WINDOW = 300 # 5 minutes
8
- MEMORY_GROWTH_THRESHOLD = 50 # 50MB growth threshold
10
+ LEAK_DETECTION_WINDOW = 300 # seconds (5 minutes)
11
+ MEMORY_GROWTH_THRESHOLD = 50 # MB growth over the window
9
12
  MIN_SAMPLES_FOR_LEAK_DETECTION = 10
13
+ MAX_SAMPLES = 500 # hard cap so a long-running process can't grow unbounded
14
+ MAX_LEAK_ALERTS = 10
15
+
16
+ @mutex = Mutex.new
17
+ @history = {
18
+ samples: [],
19
+ last_cleanup: Time.now.utc.to_i,
20
+ leak_alerts: []
21
+ }
22
+
23
+ class << self
24
+ def record_memory_sample(sample_data)
25
+ sample = {
26
+ timestamp: Time.now.utc.to_i,
27
+ memory_usage: sample_data[:memory_usage] || 0,
28
+ gc_count: sample_data[:gc_count] || 0,
29
+ heap_pages: sample_data[:heap_pages] || 0,
30
+ object_count: sample_data[:object_count] || 0,
31
+ request_id: sample_data[:request_id],
32
+ controller: sample_data[:controller],
33
+ action: sample_data[:action]
34
+ }
35
+
36
+ samples_snapshot = @mutex.synchronize do
37
+ @history[:samples] << sample
38
+ cleanup_old_samples_unlocked
39
+ @history[:samples].dup
40
+ end
10
41
 
11
- def self.initialize_history
12
- Thread.current[MEMORY_HISTORY_KEY] = {
13
- samples: [],
14
- last_cleanup: Time.now.utc.to_i,
15
- leak_alerts: []
16
- }
17
- end
42
+ check_for_memory_leaks(samples_snapshot)
18
43
 
19
- def self.record_memory_sample(sample_data)
20
- history = Thread.current[MEMORY_HISTORY_KEY] || initialize_history
44
+ nil
45
+ end
21
46
 
22
- sample = {
23
- timestamp: Time.now.utc.to_i,
24
- memory_usage: sample_data[:memory_usage] || 0,
25
- gc_count: sample_data[:gc_count] || 0,
26
- heap_pages: sample_data[:heap_pages] || 0,
27
- object_count: sample_data[:object_count] || 0,
28
- request_id: sample_data[:request_id],
29
- controller: sample_data[:controller],
30
- action: sample_data[:action]
31
- }
47
+ def get_memory_analysis
48
+ samples_snapshot, leak_alerts_snapshot = @mutex.synchronize do
49
+ [@history[:samples].dup, @history[:leak_alerts].dup]
50
+ end
32
51
 
33
- history[:samples] << sample
52
+ if samples_snapshot.length < 5
53
+ return {status: "insufficient_data", sample_count: samples_snapshot.length}
54
+ end
34
55
 
35
- # Clean up old samples
36
- cleanup_old_samples(history)
56
+ memory_values = samples_snapshot.map { |s| s[:memory_usage] }
57
+ gc_counts = samples_snapshot.map { |s| s[:gc_count] }
58
+ object_counts = samples_snapshot.map { |s| s[:object_count] }
59
+
60
+ memory_stats = calculate_stats(memory_values)
61
+ gc_stats = calculate_stats(gc_counts)
62
+ object_stats = calculate_stats(object_counts)
63
+ memory_trend = calculate_memory_trend(memory_values, samples_snapshot.map { |s| s[:timestamp] })
64
+
65
+ recent_samples = samples_snapshot.last(10)
66
+ recent_controllers = recent_samples.map { |s| "#{s[:controller]}##{s[:action]}" }.tally
67
+
68
+ {
69
+ status: "analyzed",
70
+ sample_count: samples_snapshot.length,
71
+ time_window_seconds: samples_snapshot.last[:timestamp] - samples_snapshot.first[:timestamp],
72
+ memory_stats: memory_stats,
73
+ gc_stats: gc_stats,
74
+ object_stats: object_stats,
75
+ memory_trend: memory_trend,
76
+ recent_controllers: recent_controllers,
77
+ leak_alerts: leak_alerts_snapshot.last(5),
78
+ memory_efficiency: calculate_memory_efficiency(samples_snapshot)
79
+ }
80
+ end
37
81
 
38
- # Check for memory leaks
39
- check_for_memory_leaks(history)
82
+ def clear_history
83
+ @mutex.synchronize do
84
+ @history = {
85
+ samples: [],
86
+ last_cleanup: Time.now.utc.to_i,
87
+ leak_alerts: []
88
+ }
89
+ end
90
+ end
40
91
 
41
- history
42
- end
92
+ # Kept for backwards compatibility — history is now initialized at
93
+ # class-load time, so Railtie callers don't need to do anything.
94
+ def initialize_history
95
+ # no-op
96
+ end
43
97
 
44
- def self.cleanup_old_samples(history)
45
- cutoff_time = Time.now.utc.to_i - LEAK_DETECTION_WINDOW
46
- history[:samples] = history[:samples].select { |sample| sample[:timestamp] > cutoff_time }
47
- end
98
+ def calculate_memory_trend(memory_values, timestamps)
99
+ return {slope: 0, r_squared: 0} if memory_values.length < 2
48
100
 
49
- def self.check_for_memory_leaks(history)
50
- samples = history[:samples]
51
- return if samples.length < MIN_SAMPLES_FOR_LEAK_DETECTION
101
+ n = memory_values.length
102
+ sum_x = timestamps.sum
103
+ sum_y = memory_values.sum
104
+ sum_xy = timestamps.zip(memory_values).sum { |x, y| x * y }
105
+ sum_x2 = timestamps.sum { |x| x * x }
52
106
 
53
- # Calculate memory growth trend
54
- memory_values = samples.map { |s| s[:memory_usage] }
55
- timestamps = samples.map { |s| s[:timestamp] }
107
+ denominator = (n * sum_x2 - sum_x * sum_x)
108
+ return {slope: 0, r_squared: 0} if denominator.zero?
56
109
 
57
- # Use linear regression to detect upward trend
58
- trend = calculate_memory_trend(memory_values, timestamps)
110
+ slope = (n * sum_xy - sum_x * sum_y).to_f / denominator
111
+ intercept = (sum_y - slope * sum_x).to_f / n
59
112
 
60
- # Check if memory is growing consistently
61
- if trend[:slope] > 0.1 && trend[:r_squared] > 0.7 # Growing with good correlation
62
- memory_growth = memory_values.last - memory_values.first
113
+ y_mean = sum_y.to_f / n
114
+ ss_tot = memory_values.sum { |y| (y - y_mean)**2 }
115
+ ss_res = memory_values.zip(timestamps).sum { |y, x| (y - (slope * x + intercept))**2 }
116
+ r_squared = (ss_tot > 0) ? 1 - (ss_res / ss_tot) : 0
63
117
 
64
- if memory_growth > MEMORY_GROWTH_THRESHOLD
65
- leak_alert = {
66
- detected_at: Time.now.utc.to_i,
67
- memory_growth_mb: memory_growth.round(2),
68
- growth_rate_mb_per_second: trend[:slope],
69
- confidence: trend[:r_squared],
70
- sample_count: samples.length,
71
- time_window_seconds: timestamps.last - timestamps.first,
72
- recent_controllers: samples.last(5).map { |s| "#{s[:controller]}##{s[:action]}" }.uniq
73
- }
118
+ {slope: slope, intercept: intercept, r_squared: r_squared}
119
+ end
74
120
 
75
- history[:leak_alerts] << leak_alert
121
+ def calculate_stats(values)
122
+ return {} if values.empty?
76
123
 
77
- # Only keep recent leak alerts
78
- history[:leak_alerts] = history[:leak_alerts].last(10)
79
- end
124
+ {
125
+ min: values.min,
126
+ max: values.max,
127
+ mean: (values.sum.to_f / values.length).round(2),
128
+ median: values.sort[values.length / 2],
129
+ std_dev: calculate_standard_deviation(values)
130
+ }
80
131
  end
81
- end
82
132
 
83
- def self.calculate_memory_trend(memory_values, timestamps)
84
- return {slope: 0, r_squared: 0} if memory_values.length < 2
85
-
86
- n = memory_values.length
87
- sum_x = timestamps.sum
88
- sum_y = memory_values.sum
89
- sum_xy = timestamps.zip(memory_values).sum { |x, y| x * y }
90
- sum_x2 = timestamps.sum { |x| x * x }
91
- memory_values.sum { |y| y * y }
92
-
93
- # Calculate slope (m) and intercept (b) for y = mx + b
94
- slope = (n * sum_xy - sum_x * sum_y).to_f / (n * sum_x2 - sum_x * sum_x)
95
- intercept = (sum_y - slope * sum_x).to_f / n
96
-
97
- # Calculate R-squared (coefficient of determination)
98
- y_mean = sum_y.to_f / n
99
- ss_tot = memory_values.sum { |y| (y - y_mean)**2 }
100
- ss_res = memory_values.zip(timestamps).sum { |y, x| (y - (slope * x + intercept))**2 }
101
- r_squared = (ss_tot > 0) ? 1 - (ss_res / ss_tot) : 0
102
-
103
- {
104
- slope: slope,
105
- intercept: intercept,
106
- r_squared: r_squared
107
- }
108
- end
133
+ def calculate_standard_deviation(values)
134
+ return 0 if values.length < 2
109
135
 
110
- def self.get_memory_analysis
111
- history = Thread.current[MEMORY_HISTORY_KEY] || initialize_history
112
- samples = history[:samples]
113
-
114
- return {status: "insufficient_data", sample_count: samples.length} if samples.length < 5
115
-
116
- memory_values = samples.map { |s| s[:memory_usage] }
117
- gc_counts = samples.map { |s| s[:gc_count] }
118
- object_counts = samples.map { |s| s[:object_count] }
119
-
120
- # Calculate basic statistics
121
- memory_stats = calculate_stats(memory_values)
122
- gc_stats = calculate_stats(gc_counts)
123
- object_stats = calculate_stats(object_counts)
124
-
125
- # Detect patterns
126
- memory_trend = calculate_memory_trend(memory_values, samples.map { |s| s[:timestamp] })
127
-
128
- # Analyze recent activity
129
- recent_samples = samples.last(10)
130
- recent_controllers = recent_samples.map { |s| "#{s[:controller]}##{s[:action]}" }.tally
131
-
132
- {
133
- status: "analyzed",
134
- sample_count: samples.length,
135
- time_window_seconds: samples.last[:timestamp] - samples.first[:timestamp],
136
- memory_stats: memory_stats,
137
- gc_stats: gc_stats,
138
- object_stats: object_stats,
139
- memory_trend: memory_trend,
140
- recent_controllers: recent_controllers,
141
- leak_alerts: history[:leak_alerts].last(5),
142
- memory_efficiency: calculate_memory_efficiency(samples)
143
- }
144
- end
136
+ mean = values.sum.to_f / values.length
137
+ variance = values.sum { |v| (v - mean)**2 } / (values.length - 1)
138
+ Math.sqrt(variance).round(2)
139
+ end
145
140
 
146
- def self.calculate_stats(values)
147
- return {} if values.empty?
141
+ def calculate_memory_efficiency(samples)
142
+ return {} if samples.length < 2
148
143
 
149
- {
150
- min: values.min,
151
- max: values.max,
152
- mean: (values.sum.to_f / values.length).round(2),
153
- median: values.sort[values.length / 2],
154
- std_dev: calculate_standard_deviation(values)
155
- }
156
- end
144
+ memory_per_object = samples.map do |sample|
145
+ (sample[:object_count] > 0) ? sample[:memory_usage] / sample[:object_count] : 0
146
+ end
157
147
 
158
- def self.calculate_standard_deviation(values)
159
- return 0 if values.length < 2
148
+ gc_efficiency = []
149
+ (1...samples.length).each do |i|
150
+ gc_delta = samples[i][:gc_count] - samples[i - 1][:gc_count]
151
+ memory_delta = samples[i][:memory_usage] - samples[i - 1][:memory_usage]
160
152
 
161
- mean = values.sum.to_f / values.length
162
- variance = values.sum { |v| (v - mean)**2 } / (values.length - 1)
163
- Math.sqrt(variance).round(2)
164
- end
165
-
166
- def self.calculate_memory_efficiency(samples)
167
- return {} if samples.length < 2
153
+ if gc_delta > 0 && memory_delta < 0
154
+ gc_efficiency << (-memory_delta / gc_delta).round(2)
155
+ end
156
+ end
168
157
 
169
- # Calculate memory per object ratio
170
- memory_per_object = samples.map do |sample|
171
- (sample[:object_count] > 0) ? sample[:memory_usage] / sample[:object_count] : 0
158
+ {
159
+ average_memory_per_object_kb: (memory_per_object.sum / memory_per_object.length).round(2),
160
+ gc_efficiency_mb_per_cycle: gc_efficiency.any? ? (gc_efficiency.sum / gc_efficiency.length).round(2) : 0,
161
+ memory_volatility: calculate_standard_deviation(samples.map { |s| s[:memory_usage] })
162
+ }
172
163
  end
173
164
 
174
- # Calculate GC efficiency (objects collected per GC cycle)
175
- gc_efficiency = []
176
- (1...samples.length).each do |i|
177
- gc_delta = samples[i][:gc_count] - samples[i - 1][:gc_count]
178
- memory_delta = samples[i][:memory_usage] - samples[i - 1][:memory_usage]
165
+ private
179
166
 
180
- if gc_delta > 0 && memory_delta < 0
181
- gc_efficiency << (-memory_delta / gc_delta).round(2)
167
+ def cleanup_old_samples_unlocked
168
+ cutoff_time = Time.now.utc.to_i - LEAK_DETECTION_WINDOW
169
+ samples = @history[:samples]
170
+
171
+ # Drop stale samples by time window.
172
+ if samples.any? && samples.first[:timestamp] <= cutoff_time
173
+ @history[:samples] = samples.select { |s| s[:timestamp] > cutoff_time }
174
+ samples = @history[:samples]
182
175
  end
176
+
177
+ # Enforce hard cap so a burst of traffic can't grow the buffer forever.
178
+ if samples.length > MAX_SAMPLES
179
+ @history[:samples] = samples.last(MAX_SAMPLES)
180
+ end
181
+
182
+ @history[:last_cleanup] = Time.now.utc.to_i
183
183
  end
184
184
 
185
- {
186
- average_memory_per_object_kb: (memory_per_object.sum / memory_per_object.length).round(2),
187
- gc_efficiency_mb_per_cycle: gc_efficiency.any? ? (gc_efficiency.sum / gc_efficiency.length).round(2) : 0,
188
- memory_volatility: calculate_standard_deviation(samples.map { |s| s[:memory_usage] })
189
- }
190
- end
185
+ # Runs the O(N) regression on a pre-copied snapshot so the mutex is not
186
+ # held during the computation. The alert is appended inside a short lock.
187
+ def check_for_memory_leaks(samples)
188
+ return if samples.length < MIN_SAMPLES_FOR_LEAK_DETECTION
189
+
190
+ memory_values = samples.map { |s| s[:memory_usage] }
191
+ timestamps = samples.map { |s| s[:timestamp] }
191
192
 
192
- def self.clear_history
193
- Thread.current[MEMORY_HISTORY_KEY] = nil
193
+ trend = calculate_memory_trend(memory_values, timestamps)
194
+ return unless trend[:slope] > 0.1 && trend[:r_squared] > 0.7
195
+
196
+ memory_growth = memory_values.last - memory_values.first
197
+ return unless memory_growth > MEMORY_GROWTH_THRESHOLD
198
+
199
+ leak_alert = {
200
+ detected_at: Time.now.utc.to_i,
201
+ memory_growth_mb: memory_growth.round(2),
202
+ growth_rate_mb_per_second: trend[:slope],
203
+ confidence: trend[:r_squared],
204
+ sample_count: samples.length,
205
+ time_window_seconds: timestamps.last - timestamps.first,
206
+ recent_controllers: samples.last(5).map { |s| "#{s[:controller]}##{s[:action]}" }.uniq
207
+ }
208
+
209
+ @mutex.synchronize do
210
+ @history[:leak_alerts] << leak_alert
211
+ @history[:leak_alerts] = @history[:leak_alerts].last(MAX_LEAK_ALERTS)
212
+ end
213
+ end
194
214
  end
195
215
  end
196
216
  end
@@ -4,8 +4,10 @@ require "active_support/notifications"
4
4
 
5
5
  module DeadBro
6
6
  class MemoryTrackingSubscriber
7
- # Object allocation events
8
- ALLOCATION_EVENT = "object_allocations.active_support"
7
+ # Allocation counts come from the process_action event (Rails instruments
8
+ # allocations there via ActiveSupport::Notifications). The old
9
+ # "object_allocations.active_support" constant was never emitted by Rails,
10
+ # so that subscription was dead code — removed.
9
11
  PROCESS_ACTION_EVENT = "process_action.action_controller"
10
12
 
11
13
  THREAD_LOCAL_KEY = :dead_bro_memory_events
@@ -13,7 +15,6 @@ module DeadBro
13
15
  LARGE_OBJECT_THRESHOLD = 1_000_000 # 1MB threshold for large objects
14
16
 
15
17
  # Performance optimization settings
16
- ALLOCATION_SAMPLING_RATE = 1 # Track all when enabled (adjust in production)
17
18
  MAX_ALLOCATIONS_PER_REQUEST = 1000 # Limit allocations tracked per request
18
19
  LARGE_OBJECT_SAMPLE_RATE = 0.01 # Sample 1% of live objects to estimate large ones
19
20
  MAX_LARGE_OBJECTS = 50 # Cap number of large objects captured per request
@@ -23,13 +24,6 @@ module DeadBro
23
24
  return unless DeadBro.configuration.allocation_tracking_enabled
24
25
  if defined?(ActiveSupport::Notifications) && ActiveSupport::Notifications.notifier.respond_to?(:subscribe)
25
26
  begin
26
- # Subscribe to object allocation events with sampling
27
- ActiveSupport::Notifications.subscribe(ALLOCATION_EVENT) do |name, started, finished, _unique_id, data|
28
- # Sample allocations to reduce overhead
29
- next unless rand < ALLOCATION_SAMPLING_RATE
30
- track_allocation(data, started, finished)
31
- end
32
-
33
27
  # Subscribe to process_action to capture request-level allocation counters
34
28
  ActiveSupport::Notifications.subscribe(PROCESS_ACTION_EVENT) do |*args|
35
29
  event = if args.length == 1 && args.first.is_a?(ActiveSupport::Notifications::Event)
@@ -345,27 +339,9 @@ module DeadBro
345
339
  end
346
340
 
347
341
  def self.memory_usage_mb
348
- # Use cached memory calculation to avoid expensive system calls
349
- @memory_cache ||= {}
350
- cache_key = Process.pid
351
-
352
- # Cache memory usage for 1 second to avoid repeated system calls
353
- if @memory_cache[cache_key] && (Time.now - @memory_cache[cache_key][:timestamp]) < 1
354
- return @memory_cache[cache_key][:memory]
355
- end
356
-
357
- memory = if defined?(GC) && GC.respond_to?(:stat)
358
- # Use GC stats as a proxy for memory usage (much faster than ps)
359
- gc_stats = GC.stat
360
- # Estimate memory usage from heap pages (rough approximation)
361
- heap_pages = gc_stats[:heap_allocated_pages] || 0
362
- (heap_pages * 4 * 1024) / (1024 * 1024) # 4KB per page, convert to MB
363
- else
364
- 0
365
- end
366
-
367
- @memory_cache[cache_key] = {memory: memory, timestamp: Time.now}
368
- memory
342
+ # MemoryHelpers.rss_mb reads /proc/self/status on Linux and caches for
343
+ # ~1 second across threads, so this is safe to call per-request.
344
+ DeadBro::MemoryHelpers.rss_mb
369
345
  rescue
370
346
  0
371
347
  end
@@ -2,15 +2,23 @@
2
2
 
3
3
  module DeadBro
4
4
  class Monitor
5
+ SLEEP_INTERVAL_SECONDS = 60
6
+
5
7
  def initialize(client: DeadBro.client)
6
8
  @client = client
7
9
  @thread = nil
8
10
  @running = false
11
+ @stop_mutex = Mutex.new
12
+ @stop_cv = ConditionVariable.new
9
13
  end
10
14
 
11
15
  def start
12
- return if @running
13
- return unless DeadBro.configuration.job_queue_monitoring_enabled
16
+ # Live thread already running — nothing to do.
17
+ return if @running && @thread&.alive?
18
+
19
+ # Reset: handles post-fork where @running=true but the thread is dead.
20
+ @running = false
21
+
14
22
  return unless DeadBro.configuration.enabled
15
23
 
16
24
  @running = true
@@ -25,8 +33,12 @@ module DeadBro
25
33
  log_error("Error collecting stats: #{e.message}")
26
34
  end
27
35
 
28
- # Sleep for 60 seconds (1 minute)
29
- sleep(60)
36
+ # Interruptible sleep — stop() signals the CV so shutdown doesn't
37
+ # block up to a full minute. Still naps the full interval during
38
+ # normal operation.
39
+ @stop_mutex.synchronize do
40
+ @stop_cv.wait(@stop_mutex, SLEEP_INTERVAL_SECONDS) if @running
41
+ end
30
42
  end
31
43
  end
32
44
 
@@ -35,7 +47,8 @@ module DeadBro
35
47
 
36
48
  def stop
37
49
  @running = false
38
- @thread&.join(5) # Wait up to 5 seconds for thread to finish
50
+ @stop_mutex.synchronize { @stop_cv.broadcast }
51
+ @thread&.join(5) # Safety timeout in case the thread is mid-flight
39
52
  @thread = nil
40
53
  end
41
54
 
@@ -55,12 +55,12 @@ if defined?(Rails) && defined?(Rails::Railtie)
55
55
  DeadBro::JobSubscriber.subscribe!(client: shared_client)
56
56
  end
57
57
 
58
- # Start job queue monitoring if enabled
59
- if DeadBro.configuration.job_queue_monitoring_enabled
60
- require "dead_bro/monitor"
61
- DeadBro.monitor = DeadBro::Monitor.new(client: shared_client)
62
- DeadBro.monitor.start
63
- end
58
+ # Always start the monitor thread. The thread runs every 60s but
59
+ # post_monitor_stats skips the HTTP POST when job_queue_monitoring_enabled
60
+ # is false, so the backend can toggle monitoring on/off mid-process.
61
+ require "dead_bro/monitor"
62
+ DeadBro.monitor = DeadBro::Monitor.new(client: shared_client)
63
+ DeadBro.monitor.start
64
64
  rescue
65
65
  # Never raise in Railtie init
66
66
  end