dead_bro 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -43
- data/lib/dead_bro/circuit_breaker.rb +58 -38
- data/lib/dead_bro/client.rb +112 -143
- data/lib/dead_bro/configuration.rb +76 -40
- data/lib/dead_bro/dispatcher.rb +130 -0
- data/lib/dead_bro/error_middleware.rb +1 -1
- data/lib/dead_bro/job_subscriber.rb +35 -12
- data/lib/dead_bro/lightweight_memory_tracker.rb +5 -7
- data/lib/dead_bro/logger.rb +30 -11
- data/lib/dead_bro/memory_details.rb +71 -0
- data/lib/dead_bro/memory_helpers.rb +62 -0
- data/lib/dead_bro/memory_leak_detector.rb +178 -158
- data/lib/dead_bro/memory_tracking_subscriber.rb +7 -31
- data/lib/dead_bro/monitor.rb +18 -5
- data/lib/dead_bro/railtie.rb +6 -6
- data/lib/dead_bro/sql_subscriber.rb +103 -70
- data/lib/dead_bro/subscriber.rb +36 -14
- data/lib/dead_bro/version.rb +1 -1
- data/lib/dead_bro.rb +85 -88
- metadata +3 -1
|
@@ -1,196 +1,216 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module DeadBro
|
|
4
|
+
# Process-wide memory leak detector. Previously stored samples in
|
|
5
|
+
# `Thread.current[...]`, which meant each Puma worker thread saw only the
|
|
6
|
+
# handful of requests it served — far too few samples, and reset whenever a
|
|
7
|
+
# thread was recycled. History is now shared across all threads in the
|
|
8
|
+
# process behind a mutex, with a hard cap on the number of retained samples.
|
|
4
9
|
class MemoryLeakDetector
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
LEAK_DETECTION_WINDOW = 300 # 5 minutes
|
|
8
|
-
MEMORY_GROWTH_THRESHOLD = 50 # 50MB growth threshold
|
|
10
|
+
LEAK_DETECTION_WINDOW = 300 # seconds (5 minutes)
|
|
11
|
+
MEMORY_GROWTH_THRESHOLD = 50 # MB growth over the window
|
|
9
12
|
MIN_SAMPLES_FOR_LEAK_DETECTION = 10
|
|
13
|
+
MAX_SAMPLES = 500 # hard cap so a long-running process can't grow unbounded
|
|
14
|
+
MAX_LEAK_ALERTS = 10
|
|
15
|
+
|
|
16
|
+
@mutex = Mutex.new
|
|
17
|
+
@history = {
|
|
18
|
+
samples: [],
|
|
19
|
+
last_cleanup: Time.now.utc.to_i,
|
|
20
|
+
leak_alerts: []
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
class << self
|
|
24
|
+
def record_memory_sample(sample_data)
|
|
25
|
+
sample = {
|
|
26
|
+
timestamp: Time.now.utc.to_i,
|
|
27
|
+
memory_usage: sample_data[:memory_usage] || 0,
|
|
28
|
+
gc_count: sample_data[:gc_count] || 0,
|
|
29
|
+
heap_pages: sample_data[:heap_pages] || 0,
|
|
30
|
+
object_count: sample_data[:object_count] || 0,
|
|
31
|
+
request_id: sample_data[:request_id],
|
|
32
|
+
controller: sample_data[:controller],
|
|
33
|
+
action: sample_data[:action]
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
samples_snapshot = @mutex.synchronize do
|
|
37
|
+
@history[:samples] << sample
|
|
38
|
+
cleanup_old_samples_unlocked
|
|
39
|
+
@history[:samples].dup
|
|
40
|
+
end
|
|
10
41
|
|
|
11
|
-
|
|
12
|
-
Thread.current[MEMORY_HISTORY_KEY] = {
|
|
13
|
-
samples: [],
|
|
14
|
-
last_cleanup: Time.now.utc.to_i,
|
|
15
|
-
leak_alerts: []
|
|
16
|
-
}
|
|
17
|
-
end
|
|
42
|
+
check_for_memory_leaks(samples_snapshot)
|
|
18
43
|
|
|
19
|
-
|
|
20
|
-
|
|
44
|
+
nil
|
|
45
|
+
end
|
|
21
46
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
heap_pages: sample_data[:heap_pages] || 0,
|
|
27
|
-
object_count: sample_data[:object_count] || 0,
|
|
28
|
-
request_id: sample_data[:request_id],
|
|
29
|
-
controller: sample_data[:controller],
|
|
30
|
-
action: sample_data[:action]
|
|
31
|
-
}
|
|
47
|
+
def get_memory_analysis
|
|
48
|
+
samples_snapshot, leak_alerts_snapshot = @mutex.synchronize do
|
|
49
|
+
[@history[:samples].dup, @history[:leak_alerts].dup]
|
|
50
|
+
end
|
|
32
51
|
|
|
33
|
-
|
|
52
|
+
if samples_snapshot.length < 5
|
|
53
|
+
return {status: "insufficient_data", sample_count: samples_snapshot.length}
|
|
54
|
+
end
|
|
34
55
|
|
|
35
|
-
|
|
36
|
-
|
|
56
|
+
memory_values = samples_snapshot.map { |s| s[:memory_usage] }
|
|
57
|
+
gc_counts = samples_snapshot.map { |s| s[:gc_count] }
|
|
58
|
+
object_counts = samples_snapshot.map { |s| s[:object_count] }
|
|
59
|
+
|
|
60
|
+
memory_stats = calculate_stats(memory_values)
|
|
61
|
+
gc_stats = calculate_stats(gc_counts)
|
|
62
|
+
object_stats = calculate_stats(object_counts)
|
|
63
|
+
memory_trend = calculate_memory_trend(memory_values, samples_snapshot.map { |s| s[:timestamp] })
|
|
64
|
+
|
|
65
|
+
recent_samples = samples_snapshot.last(10)
|
|
66
|
+
recent_controllers = recent_samples.map { |s| "#{s[:controller]}##{s[:action]}" }.tally
|
|
67
|
+
|
|
68
|
+
{
|
|
69
|
+
status: "analyzed",
|
|
70
|
+
sample_count: samples_snapshot.length,
|
|
71
|
+
time_window_seconds: samples_snapshot.last[:timestamp] - samples_snapshot.first[:timestamp],
|
|
72
|
+
memory_stats: memory_stats,
|
|
73
|
+
gc_stats: gc_stats,
|
|
74
|
+
object_stats: object_stats,
|
|
75
|
+
memory_trend: memory_trend,
|
|
76
|
+
recent_controllers: recent_controllers,
|
|
77
|
+
leak_alerts: leak_alerts_snapshot.last(5),
|
|
78
|
+
memory_efficiency: calculate_memory_efficiency(samples_snapshot)
|
|
79
|
+
}
|
|
80
|
+
end
|
|
37
81
|
|
|
38
|
-
|
|
39
|
-
|
|
82
|
+
def clear_history
|
|
83
|
+
@mutex.synchronize do
|
|
84
|
+
@history = {
|
|
85
|
+
samples: [],
|
|
86
|
+
last_cleanup: Time.now.utc.to_i,
|
|
87
|
+
leak_alerts: []
|
|
88
|
+
}
|
|
89
|
+
end
|
|
90
|
+
end
|
|
40
91
|
|
|
41
|
-
history
|
|
42
|
-
|
|
92
|
+
# Kept for backwards compatibility — history is now initialized at
|
|
93
|
+
# class-load time, so Railtie callers don't need to do anything.
|
|
94
|
+
def initialize_history
|
|
95
|
+
# no-op
|
|
96
|
+
end
|
|
43
97
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
history[:samples] = history[:samples].select { |sample| sample[:timestamp] > cutoff_time }
|
|
47
|
-
end
|
|
98
|
+
def calculate_memory_trend(memory_values, timestamps)
|
|
99
|
+
return {slope: 0, r_squared: 0} if memory_values.length < 2
|
|
48
100
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
101
|
+
n = memory_values.length
|
|
102
|
+
sum_x = timestamps.sum
|
|
103
|
+
sum_y = memory_values.sum
|
|
104
|
+
sum_xy = timestamps.zip(memory_values).sum { |x, y| x * y }
|
|
105
|
+
sum_x2 = timestamps.sum { |x| x * x }
|
|
52
106
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
timestamps = samples.map { |s| s[:timestamp] }
|
|
107
|
+
denominator = (n * sum_x2 - sum_x * sum_x)
|
|
108
|
+
return {slope: 0, r_squared: 0} if denominator.zero?
|
|
56
109
|
|
|
57
|
-
|
|
58
|
-
|
|
110
|
+
slope = (n * sum_xy - sum_x * sum_y).to_f / denominator
|
|
111
|
+
intercept = (sum_y - slope * sum_x).to_f / n
|
|
59
112
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
113
|
+
y_mean = sum_y.to_f / n
|
|
114
|
+
ss_tot = memory_values.sum { |y| (y - y_mean)**2 }
|
|
115
|
+
ss_res = memory_values.zip(timestamps).sum { |y, x| (y - (slope * x + intercept))**2 }
|
|
116
|
+
r_squared = (ss_tot > 0) ? 1 - (ss_res / ss_tot) : 0
|
|
63
117
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
detected_at: Time.now.utc.to_i,
|
|
67
|
-
memory_growth_mb: memory_growth.round(2),
|
|
68
|
-
growth_rate_mb_per_second: trend[:slope],
|
|
69
|
-
confidence: trend[:r_squared],
|
|
70
|
-
sample_count: samples.length,
|
|
71
|
-
time_window_seconds: timestamps.last - timestamps.first,
|
|
72
|
-
recent_controllers: samples.last(5).map { |s| "#{s[:controller]}##{s[:action]}" }.uniq
|
|
73
|
-
}
|
|
118
|
+
{slope: slope, intercept: intercept, r_squared: r_squared}
|
|
119
|
+
end
|
|
74
120
|
|
|
75
|
-
|
|
121
|
+
def calculate_stats(values)
|
|
122
|
+
return {} if values.empty?
|
|
76
123
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
124
|
+
{
|
|
125
|
+
min: values.min,
|
|
126
|
+
max: values.max,
|
|
127
|
+
mean: (values.sum.to_f / values.length).round(2),
|
|
128
|
+
median: values.sort[values.length / 2],
|
|
129
|
+
std_dev: calculate_standard_deviation(values)
|
|
130
|
+
}
|
|
80
131
|
end
|
|
81
|
-
end
|
|
82
132
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
n = memory_values.length
|
|
87
|
-
sum_x = timestamps.sum
|
|
88
|
-
sum_y = memory_values.sum
|
|
89
|
-
sum_xy = timestamps.zip(memory_values).sum { |x, y| x * y }
|
|
90
|
-
sum_x2 = timestamps.sum { |x| x * x }
|
|
91
|
-
memory_values.sum { |y| y * y }
|
|
92
|
-
|
|
93
|
-
# Calculate slope (m) and intercept (b) for y = mx + b
|
|
94
|
-
slope = (n * sum_xy - sum_x * sum_y).to_f / (n * sum_x2 - sum_x * sum_x)
|
|
95
|
-
intercept = (sum_y - slope * sum_x).to_f / n
|
|
96
|
-
|
|
97
|
-
# Calculate R-squared (coefficient of determination)
|
|
98
|
-
y_mean = sum_y.to_f / n
|
|
99
|
-
ss_tot = memory_values.sum { |y| (y - y_mean)**2 }
|
|
100
|
-
ss_res = memory_values.zip(timestamps).sum { |y, x| (y - (slope * x + intercept))**2 }
|
|
101
|
-
r_squared = (ss_tot > 0) ? 1 - (ss_res / ss_tot) : 0
|
|
102
|
-
|
|
103
|
-
{
|
|
104
|
-
slope: slope,
|
|
105
|
-
intercept: intercept,
|
|
106
|
-
r_squared: r_squared
|
|
107
|
-
}
|
|
108
|
-
end
|
|
133
|
+
def calculate_standard_deviation(values)
|
|
134
|
+
return 0 if values.length < 2
|
|
109
135
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
return {status: "insufficient_data", sample_count: samples.length} if samples.length < 5
|
|
115
|
-
|
|
116
|
-
memory_values = samples.map { |s| s[:memory_usage] }
|
|
117
|
-
gc_counts = samples.map { |s| s[:gc_count] }
|
|
118
|
-
object_counts = samples.map { |s| s[:object_count] }
|
|
119
|
-
|
|
120
|
-
# Calculate basic statistics
|
|
121
|
-
memory_stats = calculate_stats(memory_values)
|
|
122
|
-
gc_stats = calculate_stats(gc_counts)
|
|
123
|
-
object_stats = calculate_stats(object_counts)
|
|
124
|
-
|
|
125
|
-
# Detect patterns
|
|
126
|
-
memory_trend = calculate_memory_trend(memory_values, samples.map { |s| s[:timestamp] })
|
|
127
|
-
|
|
128
|
-
# Analyze recent activity
|
|
129
|
-
recent_samples = samples.last(10)
|
|
130
|
-
recent_controllers = recent_samples.map { |s| "#{s[:controller]}##{s[:action]}" }.tally
|
|
131
|
-
|
|
132
|
-
{
|
|
133
|
-
status: "analyzed",
|
|
134
|
-
sample_count: samples.length,
|
|
135
|
-
time_window_seconds: samples.last[:timestamp] - samples.first[:timestamp],
|
|
136
|
-
memory_stats: memory_stats,
|
|
137
|
-
gc_stats: gc_stats,
|
|
138
|
-
object_stats: object_stats,
|
|
139
|
-
memory_trend: memory_trend,
|
|
140
|
-
recent_controllers: recent_controllers,
|
|
141
|
-
leak_alerts: history[:leak_alerts].last(5),
|
|
142
|
-
memory_efficiency: calculate_memory_efficiency(samples)
|
|
143
|
-
}
|
|
144
|
-
end
|
|
136
|
+
mean = values.sum.to_f / values.length
|
|
137
|
+
variance = values.sum { |v| (v - mean)**2 } / (values.length - 1)
|
|
138
|
+
Math.sqrt(variance).round(2)
|
|
139
|
+
end
|
|
145
140
|
|
|
146
|
-
|
|
147
|
-
|
|
141
|
+
def calculate_memory_efficiency(samples)
|
|
142
|
+
return {} if samples.length < 2
|
|
148
143
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
mean: (values.sum.to_f / values.length).round(2),
|
|
153
|
-
median: values.sort[values.length / 2],
|
|
154
|
-
std_dev: calculate_standard_deviation(values)
|
|
155
|
-
}
|
|
156
|
-
end
|
|
144
|
+
memory_per_object = samples.map do |sample|
|
|
145
|
+
(sample[:object_count] > 0) ? sample[:memory_usage] / sample[:object_count] : 0
|
|
146
|
+
end
|
|
157
147
|
|
|
158
|
-
|
|
159
|
-
|
|
148
|
+
gc_efficiency = []
|
|
149
|
+
(1...samples.length).each do |i|
|
|
150
|
+
gc_delta = samples[i][:gc_count] - samples[i - 1][:gc_count]
|
|
151
|
+
memory_delta = samples[i][:memory_usage] - samples[i - 1][:memory_usage]
|
|
160
152
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
def self.calculate_memory_efficiency(samples)
|
|
167
|
-
return {} if samples.length < 2
|
|
153
|
+
if gc_delta > 0 && memory_delta < 0
|
|
154
|
+
gc_efficiency << (-memory_delta / gc_delta).round(2)
|
|
155
|
+
end
|
|
156
|
+
end
|
|
168
157
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
158
|
+
{
|
|
159
|
+
average_memory_per_object_kb: (memory_per_object.sum / memory_per_object.length).round(2),
|
|
160
|
+
gc_efficiency_mb_per_cycle: gc_efficiency.any? ? (gc_efficiency.sum / gc_efficiency.length).round(2) : 0,
|
|
161
|
+
memory_volatility: calculate_standard_deviation(samples.map { |s| s[:memory_usage] })
|
|
162
|
+
}
|
|
172
163
|
end
|
|
173
164
|
|
|
174
|
-
|
|
175
|
-
gc_efficiency = []
|
|
176
|
-
(1...samples.length).each do |i|
|
|
177
|
-
gc_delta = samples[i][:gc_count] - samples[i - 1][:gc_count]
|
|
178
|
-
memory_delta = samples[i][:memory_usage] - samples[i - 1][:memory_usage]
|
|
165
|
+
private
|
|
179
166
|
|
|
180
|
-
|
|
181
|
-
|
|
167
|
+
def cleanup_old_samples_unlocked
|
|
168
|
+
cutoff_time = Time.now.utc.to_i - LEAK_DETECTION_WINDOW
|
|
169
|
+
samples = @history[:samples]
|
|
170
|
+
|
|
171
|
+
# Drop stale samples by time window.
|
|
172
|
+
if samples.any? && samples.first[:timestamp] <= cutoff_time
|
|
173
|
+
@history[:samples] = samples.select { |s| s[:timestamp] > cutoff_time }
|
|
174
|
+
samples = @history[:samples]
|
|
182
175
|
end
|
|
176
|
+
|
|
177
|
+
# Enforce hard cap so a burst of traffic can't grow the buffer forever.
|
|
178
|
+
if samples.length > MAX_SAMPLES
|
|
179
|
+
@history[:samples] = samples.last(MAX_SAMPLES)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
@history[:last_cleanup] = Time.now.utc.to_i
|
|
183
183
|
end
|
|
184
184
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
185
|
+
# Runs the O(N) regression on a pre-copied snapshot so the mutex is not
|
|
186
|
+
# held during the computation. The alert is appended inside a short lock.
|
|
187
|
+
def check_for_memory_leaks(samples)
|
|
188
|
+
return if samples.length < MIN_SAMPLES_FOR_LEAK_DETECTION
|
|
189
|
+
|
|
190
|
+
memory_values = samples.map { |s| s[:memory_usage] }
|
|
191
|
+
timestamps = samples.map { |s| s[:timestamp] }
|
|
191
192
|
|
|
192
|
-
|
|
193
|
-
|
|
193
|
+
trend = calculate_memory_trend(memory_values, timestamps)
|
|
194
|
+
return unless trend[:slope] > 0.1 && trend[:r_squared] > 0.7
|
|
195
|
+
|
|
196
|
+
memory_growth = memory_values.last - memory_values.first
|
|
197
|
+
return unless memory_growth > MEMORY_GROWTH_THRESHOLD
|
|
198
|
+
|
|
199
|
+
leak_alert = {
|
|
200
|
+
detected_at: Time.now.utc.to_i,
|
|
201
|
+
memory_growth_mb: memory_growth.round(2),
|
|
202
|
+
growth_rate_mb_per_second: trend[:slope],
|
|
203
|
+
confidence: trend[:r_squared],
|
|
204
|
+
sample_count: samples.length,
|
|
205
|
+
time_window_seconds: timestamps.last - timestamps.first,
|
|
206
|
+
recent_controllers: samples.last(5).map { |s| "#{s[:controller]}##{s[:action]}" }.uniq
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
@mutex.synchronize do
|
|
210
|
+
@history[:leak_alerts] << leak_alert
|
|
211
|
+
@history[:leak_alerts] = @history[:leak_alerts].last(MAX_LEAK_ALERTS)
|
|
212
|
+
end
|
|
213
|
+
end
|
|
194
214
|
end
|
|
195
215
|
end
|
|
196
216
|
end
|
|
@@ -4,8 +4,10 @@ require "active_support/notifications"
|
|
|
4
4
|
|
|
5
5
|
module DeadBro
|
|
6
6
|
class MemoryTrackingSubscriber
|
|
7
|
-
#
|
|
8
|
-
|
|
7
|
+
# Allocation counts come from the process_action event (Rails instruments
|
|
8
|
+
# allocations there via ActiveSupport::Notifications). The old
|
|
9
|
+
# "object_allocations.active_support" constant was never emitted by Rails,
|
|
10
|
+
# so that subscription was dead code — removed.
|
|
9
11
|
PROCESS_ACTION_EVENT = "process_action.action_controller"
|
|
10
12
|
|
|
11
13
|
THREAD_LOCAL_KEY = :dead_bro_memory_events
|
|
@@ -13,7 +15,6 @@ module DeadBro
|
|
|
13
15
|
LARGE_OBJECT_THRESHOLD = 1_000_000 # 1MB threshold for large objects
|
|
14
16
|
|
|
15
17
|
# Performance optimization settings
|
|
16
|
-
ALLOCATION_SAMPLING_RATE = 1 # Track all when enabled (adjust in production)
|
|
17
18
|
MAX_ALLOCATIONS_PER_REQUEST = 1000 # Limit allocations tracked per request
|
|
18
19
|
LARGE_OBJECT_SAMPLE_RATE = 0.01 # Sample 1% of live objects to estimate large ones
|
|
19
20
|
MAX_LARGE_OBJECTS = 50 # Cap number of large objects captured per request
|
|
@@ -23,13 +24,6 @@ module DeadBro
|
|
|
23
24
|
return unless DeadBro.configuration.allocation_tracking_enabled
|
|
24
25
|
if defined?(ActiveSupport::Notifications) && ActiveSupport::Notifications.notifier.respond_to?(:subscribe)
|
|
25
26
|
begin
|
|
26
|
-
# Subscribe to object allocation events with sampling
|
|
27
|
-
ActiveSupport::Notifications.subscribe(ALLOCATION_EVENT) do |name, started, finished, _unique_id, data|
|
|
28
|
-
# Sample allocations to reduce overhead
|
|
29
|
-
next unless rand < ALLOCATION_SAMPLING_RATE
|
|
30
|
-
track_allocation(data, started, finished)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
27
|
# Subscribe to process_action to capture request-level allocation counters
|
|
34
28
|
ActiveSupport::Notifications.subscribe(PROCESS_ACTION_EVENT) do |*args|
|
|
35
29
|
event = if args.length == 1 && args.first.is_a?(ActiveSupport::Notifications::Event)
|
|
@@ -345,27 +339,9 @@ module DeadBro
|
|
|
345
339
|
end
|
|
346
340
|
|
|
347
341
|
def self.memory_usage_mb
|
|
348
|
-
#
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
# Cache memory usage for 1 second to avoid repeated system calls
|
|
353
|
-
if @memory_cache[cache_key] && (Time.now - @memory_cache[cache_key][:timestamp]) < 1
|
|
354
|
-
return @memory_cache[cache_key][:memory]
|
|
355
|
-
end
|
|
356
|
-
|
|
357
|
-
memory = if defined?(GC) && GC.respond_to?(:stat)
|
|
358
|
-
# Use GC stats as a proxy for memory usage (much faster than ps)
|
|
359
|
-
gc_stats = GC.stat
|
|
360
|
-
# Estimate memory usage from heap pages (rough approximation)
|
|
361
|
-
heap_pages = gc_stats[:heap_allocated_pages] || 0
|
|
362
|
-
(heap_pages * 4 * 1024) / (1024 * 1024) # 4KB per page, convert to MB
|
|
363
|
-
else
|
|
364
|
-
0
|
|
365
|
-
end
|
|
366
|
-
|
|
367
|
-
@memory_cache[cache_key] = {memory: memory, timestamp: Time.now}
|
|
368
|
-
memory
|
|
342
|
+
# MemoryHelpers.rss_mb reads /proc/self/status on Linux and caches for
|
|
343
|
+
# ~1 second across threads, so this is safe to call per-request.
|
|
344
|
+
DeadBro::MemoryHelpers.rss_mb
|
|
369
345
|
rescue
|
|
370
346
|
0
|
|
371
347
|
end
|
data/lib/dead_bro/monitor.rb
CHANGED
|
@@ -2,15 +2,23 @@
|
|
|
2
2
|
|
|
3
3
|
module DeadBro
|
|
4
4
|
class Monitor
|
|
5
|
+
SLEEP_INTERVAL_SECONDS = 60
|
|
6
|
+
|
|
5
7
|
def initialize(client: DeadBro.client)
|
|
6
8
|
@client = client
|
|
7
9
|
@thread = nil
|
|
8
10
|
@running = false
|
|
11
|
+
@stop_mutex = Mutex.new
|
|
12
|
+
@stop_cv = ConditionVariable.new
|
|
9
13
|
end
|
|
10
14
|
|
|
11
15
|
def start
|
|
12
|
-
|
|
13
|
-
return
|
|
16
|
+
# Live thread already running — nothing to do.
|
|
17
|
+
return if @running && @thread&.alive?
|
|
18
|
+
|
|
19
|
+
# Reset: handles post-fork where @running=true but the thread is dead.
|
|
20
|
+
@running = false
|
|
21
|
+
|
|
14
22
|
return unless DeadBro.configuration.enabled
|
|
15
23
|
|
|
16
24
|
@running = true
|
|
@@ -25,8 +33,12 @@ module DeadBro
|
|
|
25
33
|
log_error("Error collecting stats: #{e.message}")
|
|
26
34
|
end
|
|
27
35
|
|
|
28
|
-
#
|
|
29
|
-
|
|
36
|
+
# Interruptible sleep — stop() signals the CV so shutdown doesn't
|
|
37
|
+
# block up to a full minute. Still naps the full interval during
|
|
38
|
+
# normal operation.
|
|
39
|
+
@stop_mutex.synchronize do
|
|
40
|
+
@stop_cv.wait(@stop_mutex, SLEEP_INTERVAL_SECONDS) if @running
|
|
41
|
+
end
|
|
30
42
|
end
|
|
31
43
|
end
|
|
32
44
|
|
|
@@ -35,7 +47,8 @@ module DeadBro
|
|
|
35
47
|
|
|
36
48
|
def stop
|
|
37
49
|
@running = false
|
|
38
|
-
@
|
|
50
|
+
@stop_mutex.synchronize { @stop_cv.broadcast }
|
|
51
|
+
@thread&.join(5) # Safety timeout in case the thread is mid-flight
|
|
39
52
|
@thread = nil
|
|
40
53
|
end
|
|
41
54
|
|
data/lib/dead_bro/railtie.rb
CHANGED
|
@@ -55,12 +55,12 @@ if defined?(Rails) && defined?(Rails::Railtie)
|
|
|
55
55
|
DeadBro::JobSubscriber.subscribe!(client: shared_client)
|
|
56
56
|
end
|
|
57
57
|
|
|
58
|
-
#
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
# Always start the monitor thread. The thread runs every 60s but
|
|
59
|
+
# post_monitor_stats skips the HTTP POST when job_queue_monitoring_enabled
|
|
60
|
+
# is false, so the backend can toggle monitoring on/off mid-process.
|
|
61
|
+
require "dead_bro/monitor"
|
|
62
|
+
DeadBro.monitor = DeadBro::Monitor.new(client: shared_client)
|
|
63
|
+
DeadBro.monitor.start
|
|
64
64
|
rescue
|
|
65
65
|
# Never raise in Railtie init
|
|
66
66
|
end
|