dead_bro 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -43
- data/lib/dead_bro/circuit_breaker.rb +58 -38
- data/lib/dead_bro/client.rb +131 -130
- data/lib/dead_bro/configuration.rb +155 -81
- data/lib/dead_bro/dispatcher.rb +130 -0
- data/lib/dead_bro/error_middleware.rb +1 -1
- data/lib/dead_bro/job_subscriber.rb +36 -13
- data/lib/dead_bro/lightweight_memory_tracker.rb +5 -7
- data/lib/dead_bro/logger.rb +30 -11
- data/lib/dead_bro/memory_details.rb +71 -0
- data/lib/dead_bro/memory_helpers.rb +62 -0
- data/lib/dead_bro/memory_leak_detector.rb +178 -158
- data/lib/dead_bro/memory_tracking_subscriber.rb +12 -36
- data/lib/dead_bro/monitor.rb +18 -5
- data/lib/dead_bro/railtie.rb +6 -6
- data/lib/dead_bro/redis_subscriber.rb +2 -2
- data/lib/dead_bro/sql_subscriber.rb +104 -71
- data/lib/dead_bro/subscriber.rb +41 -17
- data/lib/dead_bro/version.rb +1 -1
- data/lib/dead_bro.rb +87 -96
- metadata +4 -2
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DeadBro
  # Builds the "memory details" summary for a unit of work from before/after
  # GC statistics, memory readings (in MB) and per-type object counts.
  module MemoryDetails
    # Maps Ruby internal ObjectSpace type codes to human-readable names.
    # Types not listed here are shown with their "T_" prefix stripped.
    OBJECT_TYPE_NAMES = {
      T_STRING: "String",
      T_ARRAY: "Array",
      T_HASH: "Hash",
      T_OBJECT: "Object",
      T_DATA: "C Extension",
      T_CLASS: "Class",
      T_MODULE: "Module",
      T_STRUCT: "Struct",
      T_MATCH: "MatchData",
      T_REGEXP: "Regexp",
      T_SYMBOL: "Symbol",
      T_FLOAT: "Float",
      T_FILE: "File",
      T_BIGNUM: "Integer (big)"
    }.freeze

    # Noise types never shown to users.
    SKIP_TYPES = %i[FREE T_IMEMO TOTAL T_NODE T_ICLASS T_ZOMBIE T_MOVED].freeze

    # Maximum number of object types reported in the breakdown.
    MAX_BREAKDOWN_ENTRIES = 20

    # Turns raw per-type count deltas into a user-facing breakdown.
    #
    # Skipped (noise) types and non-positive deltas are dropped, type codes are
    # translated to readable names, and the result is ordered by count,
    # largest first.
    #
    # @param deltas [Hash{Symbol=>Integer}] object-count change per type code
    # @param limit [Integer, nil] keep only the `limit` largest entries
    #   (nil keeps everything, which is the historical behaviour)
    # @return [Hash{String=>Integer}] readable type name => growth
    def self.format_object_breakdown(deltas, limit: nil)
      growth = deltas.each_with_object({}) do |(type, count), acc|
        next if SKIP_TYPES.include?(type)
        next unless count.positive?

        name = OBJECT_TYPE_NAMES[type] || type.to_s.sub(/\AT_/, "")
        acc[name] = count
      end

      ranked = growth.sort_by { |_, count| -count }
      ranked = ranked.first(limit) if limit
      ranked.to_h
    end

    # Assembles the memory details hash.
    #
    # @param gc_before [Hash, nil] GC statistics captured before the work
    #   (reads :count, :heap_allocated_pages, :total_allocated_objects)
    # @param gc_after [Hash, nil] same statistics captured afterwards
    # @param memory_before_mb [Numeric] memory reading before, in MB
    # @param memory_after_mb [Numeric] memory reading after, in MB
    # @param object_counts_before [Hash, nil] per-type object counts before
    # @param object_counts_after [Hash, nil] per-type object counts after
    # @param large_objects [Array, nil] descriptions of large objects found
    # @return [Hash] :gc_collections, :heap_pages_added, :new_objects,
    #   :object_breakdown, :large_objects and :warnings
    def self.build(gc_before:, gc_after:, memory_before_mb:, memory_after_mb:,
      object_counts_before:, object_counts_after:, large_objects:)
      gc_before ||= {}
      gc_after ||= {}
      counts_before = object_counts_before || {}
      counts_after = object_counts_after || {}
      large_objects ||= []

      memory_delta_mb = (memory_after_mb - memory_before_mb).round(2)
      gc_collections = (gc_after[:count] || 0) - (gc_before[:count] || 0)
      heap_pages_added = (gc_after[:heap_allocated_pages] || 0) - (gc_before[:heap_allocated_pages] || 0)
      new_objects = (gc_after[:total_allocated_objects] || 0) - (gc_before[:total_allocated_objects] || 0)

      # A breakdown is only meaningful when both snapshots were taken.
      raw_deltas = {}
      if counts_before.any? && counts_after.any?
        (counts_before.keys | counts_after.keys).each do |type|
          diff = (counts_after[type] || 0) - (counts_before[type] || 0)
          raw_deltas[type] = diff unless diff.zero?
        end
      end

      warnings = []
      warnings << "Memory grew #{memory_delta_mb}MB — possible leak or large allocation" if memory_delta_mb > 20
      warnings << "GC ran #{gc_collections} times — many short-lived objects being created" if gc_collections > 5
      warnings << "Heap grew by #{heap_pages_added} pages — Ruby needed more memory from the OS" if heap_pages_added > 10
      warnings << "#{large_objects.length} object(s) over 1MB found in memory" if large_objects.any?

      {
        gc_collections: gc_collections,
        heap_pages_added: heap_pages_added,
        new_objects: new_objects,
        # The cap is applied AFTER noise types and shrinking types are removed,
        # so large negative deltas or TOTAL/FREE cannot crowd out real growth.
        object_breakdown: format_object_breakdown(raw_deltas, limit: MAX_BREAKDOWN_ENTRIES),
        large_objects: large_objects,
        warnings: warnings
      }
    end
  end
end
|
|
@@ -4,6 +4,68 @@ module DeadBro
|
|
|
4
4
|
module MemoryHelpers
|
|
5
5
|
# Helper methods for memory tracking and leak detection
|
|
6
6
|
|
|
7
|
+
RSS_CACHE_TTL_SECONDS = 1.0
|
|
8
|
+
@rss_cache_mutex = Mutex.new
|
|
9
|
+
@rss_cache = nil # [value_bytes, captured_at_monotonic]
|
|
10
|
+
|
|
11
|
+
# Current process RSS in bytes. Uses /proc/self/status on Linux (cheap read)
|
|
12
|
+
# and falls back to `ps` elsewhere. Result is cached for 1s across threads
|
|
13
|
+
# so this is safe to call from every request without flooding the kernel.
|
|
14
|
+
def self.rss_bytes
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # Fast path: serve the cached reading without taking the lock while it is
  # younger than the TTL.
  entry = @rss_cache
  return entry.first if entry && started_at - entry.last < RSS_CACHE_TTL_SECONDS

  fresh = read_rss_bytes
  @rss_cache_mutex.synchronize do
    # Another thread may have stored a reading in the meantime; only replace
    # the cache when it is missing or stale relative to our start time.
    current = @rss_cache
    still_fresh = current && started_at - current.last < RSS_CACHE_TTL_SECONDS
    @rss_cache = [fresh, started_at] unless still_fresh
  end
  fresh
rescue
  # Best effort: memory tracking must never raise into the caller.
  0
end
|
|
33
|
+
|
|
34
|
+
# Current process RSS in megabytes, rounded to two decimals.
# Returns 0.0 if the reading fails for any reason.
def self.rss_mb
  bytes_per_mb = 1024 * 1024
  megabytes = rss_bytes.to_f / bytes_per_mb
  megabytes.round(2)
rescue
  0.0
end
|
|
39
|
+
|
|
40
|
+
# Uncached RSS reading in bytes: /proc/self/status when that file is
# readable, otherwise the `ps` fallback. Returns 0 on any error.
def self.read_rss_bytes
  return read_rss_from_ps unless File.readable?("/proc/self/status")

  read_rss_from_proc_status
rescue
  0
end
|
|
49
|
+
|
|
50
|
+
# Scans /proc/self/status for the "VmRSS:" line and converts its kB value to
# bytes. Returns 0 when no positive value is found or the read fails.
def self.read_rss_from_proc_status
  File.foreach("/proc/self/status") do |status_line|
    if status_line.start_with?("VmRSS:")
      kilobytes = status_line.split[1].to_i
      return kilobytes * 1024 if kilobytes > 0
    end
  end
  0
rescue
  0
end
|
|
60
|
+
|
|
61
|
+
# Asks `ps` for this process's RSS (reported in kB) and converts it to bytes.
# Returns 0 when the output is not a positive number or the command fails.
def self.read_rss_from_ps
  ps_output = `ps -o rss= -p #{Process.pid}`
  kilobytes = ps_output.to_i
  (kilobytes > 0) ? kilobytes * 1024 : 0
rescue
  0
end
|
|
68
|
+
|
|
7
69
|
# Take a memory snapshot with a custom label
|
|
8
70
|
def self.snapshot(label)
|
|
9
71
|
DeadBro::MemoryTrackingSubscriber.take_memory_snapshot(label)
|
|
@@ -1,196 +1,216 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module DeadBro
  # Process-wide memory leak detector. Previously stored samples in
  # `Thread.current[...]`, which meant each Puma worker thread saw only the
  # handful of requests it served — far too few samples, and reset whenever a
  # thread was recycled. History is now shared across all threads in the
  # process behind a mutex, with a hard cap on the number of retained samples.
  class MemoryLeakDetector
    LEAK_DETECTION_WINDOW = 300 # seconds (5 minutes)
    MEMORY_GROWTH_THRESHOLD = 50 # MB growth over the window
    MIN_SAMPLES_FOR_LEAK_DETECTION = 10
    MAX_SAMPLES = 500 # hard cap so a long-running process can't grow unbounded
    MAX_LEAK_ALERTS = 10

    @mutex = Mutex.new
    @history = {
      samples: [],
      last_cleanup: Time.now.utc.to_i,
      leak_alerts: []
    }

    class << self
      # Appends one sample to the shared history, prunes old samples, then
      # checks the retained window for a leak.
      #
      # @param sample_data [Hash] reads :memory_usage (treated as MB by the
      #   leak check), :gc_count, :heap_pages, :object_count, :request_id,
      #   :controller and :action; missing numeric values default to 0
      # @return [nil]
      def record_memory_sample(sample_data)
        sample = {
          timestamp: Time.now.utc.to_i,
          memory_usage: sample_data[:memory_usage] || 0,
          gc_count: sample_data[:gc_count] || 0,
          heap_pages: sample_data[:heap_pages] || 0,
          object_count: sample_data[:object_count] || 0,
          request_id: sample_data[:request_id],
          controller: sample_data[:controller],
          action: sample_data[:action]
        }

        # Copy the samples under the lock so the analysis below runs on a
        # stable snapshot without holding the mutex.
        samples_snapshot = @mutex.synchronize do
          @history[:samples] << sample
          cleanup_old_samples_unlocked
          @history[:samples].dup
        end

        check_for_memory_leaks(samples_snapshot)

        nil
      end

      # Summarises the retained samples.
      #
      # @return [Hash] `{status: "insufficient_data", sample_count:}` when
      #   fewer than 5 samples are held; otherwise an "analyzed" report with
      #   statistics, trend, recent controllers, the last 5 leak alerts and
      #   efficiency metrics
      def get_memory_analysis
        samples_snapshot, leak_alerts_snapshot = @mutex.synchronize do
          [@history[:samples].dup, @history[:leak_alerts].dup]
        end

        if samples_snapshot.length < 5
          return {status: "insufficient_data", sample_count: samples_snapshot.length}
        end

        memory_values = samples_snapshot.map { |s| s[:memory_usage] }
        gc_counts = samples_snapshot.map { |s| s[:gc_count] }
        object_counts = samples_snapshot.map { |s| s[:object_count] }

        memory_stats = calculate_stats(memory_values)
        gc_stats = calculate_stats(gc_counts)
        object_stats = calculate_stats(object_counts)
        memory_trend = calculate_memory_trend(memory_values, samples_snapshot.map { |s| s[:timestamp] })

        recent_samples = samples_snapshot.last(10)
        recent_controllers = recent_samples.map { |s| "#{s[:controller]}##{s[:action]}" }.tally

        {
          status: "analyzed",
          sample_count: samples_snapshot.length,
          time_window_seconds: samples_snapshot.last[:timestamp] - samples_snapshot.first[:timestamp],
          memory_stats: memory_stats,
          gc_stats: gc_stats,
          object_stats: object_stats,
          memory_trend: memory_trend,
          recent_controllers: recent_controllers,
          leak_alerts: leak_alerts_snapshot.last(5),
          memory_efficiency: calculate_memory_efficiency(samples_snapshot)
        }
      end

      # Drops all samples and alerts.
      def clear_history
        @mutex.synchronize do
          @history = {
            samples: [],
            last_cleanup: Time.now.utc.to_i,
            leak_alerts: []
          }
        end
      end

      # Kept for backwards compatibility — history is now initialized at
      # class-load time, so Railtie callers don't need to do anything.
      def initialize_history
        # no-op
      end

      # Least-squares linear fit of memory over time.
      #
      # @param memory_values [Array<Numeric>] y values
      # @param timestamps [Array<Numeric>] x values, same length
      # @return [Hash] :slope (memory units per timestamp unit), :intercept
      #   and :r_squared; all zero when fewer than two points are given or
      #   every timestamp is identical
      def calculate_memory_trend(memory_values, timestamps)
        flat = {slope: 0, intercept: 0, r_squared: 0}
        return flat if memory_values.length < 2

        n = memory_values.length
        sum_x = timestamps.sum
        sum_y = memory_values.sum
        sum_xy = timestamps.zip(memory_values).sum { |x, y| x * y }
        sum_x2 = timestamps.sum { |x| x * x }

        # Zero when all timestamps are equal (e.g. every sample landed in the
        # same second) — the slope is undefined, so report a flat trend.
        denominator = (n * sum_x2 - sum_x * sum_x)
        return flat if denominator.zero?

        slope = (n * sum_xy - sum_x * sum_y).to_f / denominator
        intercept = (sum_y - slope * sum_x).to_f / n

        y_mean = sum_y.to_f / n
        ss_tot = memory_values.sum { |y| (y - y_mean)**2 }
        ss_res = memory_values.zip(timestamps).sum { |y, x| (y - (slope * x + intercept))**2 }
        r_squared = (ss_tot > 0) ? 1 - (ss_res / ss_tot) : 0

        {slope: slope, intercept: intercept, r_squared: r_squared}
      end

      # Basic descriptive statistics.
      #
      # @param values [Array<Numeric>]
      # @return [Hash] :min, :max, :mean, :median, :std_dev ({} when empty)
      def calculate_stats(values)
        return {} if values.empty?

        sorted = values.sort
        middle = sorted.length / 2
        # Even-sized input has no single middle element: average the two
        # central values instead of silently picking the upper one.
        median = sorted.length.odd? ? sorted[middle] : (sorted[middle - 1] + sorted[middle]) / 2.0

        {
          min: sorted.first,
          max: sorted.last,
          mean: (values.sum.to_f / values.length).round(2),
          median: median,
          std_dev: calculate_standard_deviation(values)
        }
      end

      # Sample standard deviation (n - 1 denominator), rounded to 2 decimals.
      # Returns 0 for fewer than two values.
      def calculate_standard_deviation(values)
        return 0 if values.length < 2

        mean = values.sum.to_f / values.length
        variance = values.sum { |v| (v - mean)**2 } / (values.length - 1)
        Math.sqrt(variance).round(2)
      end

      # Derives efficiency metrics from consecutive samples.
      #
      # @param samples [Array<Hash>] samples with :memory_usage (MB),
      #   :object_count and :gc_count
      # @return [Hash] :average_memory_per_object_kb,
      #   :gc_efficiency_mb_per_cycle, :memory_volatility ({} for < 2 samples)
      def calculate_memory_efficiency(samples)
        return {} if samples.length < 2

        # memory_usage is in MB, so scale by 1024 to report KB per object as
        # the key promises; the float factor also avoids integer division.
        memory_per_object_kb = samples.map do |sample|
          (sample[:object_count] > 0) ? sample[:memory_usage] * 1024.0 / sample[:object_count] : 0
        end

        # MB released per GC cycle, counted only across steps where GC ran
        # and memory actually dropped.
        gc_efficiency = []
        samples.each_cons(2) do |previous, current|
          gc_delta = current[:gc_count] - previous[:gc_count]
          memory_delta = current[:memory_usage] - previous[:memory_usage]

          if gc_delta > 0 && memory_delta < 0
            gc_efficiency << (-memory_delta.to_f / gc_delta).round(2)
          end
        end

        {
          average_memory_per_object_kb: (memory_per_object_kb.sum / memory_per_object_kb.length).round(2),
          gc_efficiency_mb_per_cycle: gc_efficiency.any? ? (gc_efficiency.sum / gc_efficiency.length).round(2) : 0,
          memory_volatility: calculate_standard_deviation(samples.map { |s| s[:memory_usage] })
        }
      end

      private

      # Prunes the sample buffer. Must be called with @mutex held.
      def cleanup_old_samples_unlocked
        cutoff_time = Time.now.utc.to_i - LEAK_DETECTION_WINDOW
        samples = @history[:samples]

        # Drop stale samples by time window.
        if samples.any? && samples.first[:timestamp] <= cutoff_time
          @history[:samples] = samples.select { |s| s[:timestamp] > cutoff_time }
          samples = @history[:samples]
        end

        # Enforce hard cap so a burst of traffic can't grow the buffer forever.
        if samples.length > MAX_SAMPLES
          @history[:samples] = samples.last(MAX_SAMPLES)
        end

        @history[:last_cleanup] = Time.now.utc.to_i
      end

      # Runs the O(N) regression on a pre-copied snapshot so the mutex is not
      # held during the computation. The alert is appended inside a short lock.
      #
      # An alert is recorded only when memory rises steadily (slope > 0.1 with
      # r_squared > 0.7) and grew by more than MEMORY_GROWTH_THRESHOLD between
      # the first and last sample.
      def check_for_memory_leaks(samples)
        return if samples.length < MIN_SAMPLES_FOR_LEAK_DETECTION

        memory_values = samples.map { |s| s[:memory_usage] }
        timestamps = samples.map { |s| s[:timestamp] }

        trend = calculate_memory_trend(memory_values, timestamps)
        return unless trend[:slope] > 0.1 && trend[:r_squared] > 0.7

        memory_growth = memory_values.last - memory_values.first
        return unless memory_growth > MEMORY_GROWTH_THRESHOLD

        leak_alert = {
          detected_at: Time.now.utc.to_i,
          memory_growth_mb: memory_growth.round(2),
          growth_rate_mb_per_second: trend[:slope],
          confidence: trend[:r_squared],
          sample_count: samples.length,
          time_window_seconds: timestamps.last - timestamps.first,
          recent_controllers: samples.last(5).map { |s| "#{s[:controller]}##{s[:action]}" }.uniq
        }

        @mutex.synchronize do
          @history[:leak_alerts] << leak_alert
          @history[:leak_alerts] = @history[:leak_alerts].last(MAX_LEAK_ALERTS)
        end
      end
    end
  end
end
|
|
@@ -4,8 +4,10 @@ require "active_support/notifications"
|
|
|
4
4
|
|
|
5
5
|
module DeadBro
|
|
6
6
|
class MemoryTrackingSubscriber
|
|
7
|
-
#
|
|
8
|
-
|
|
7
|
+
# Allocation counts come from the process_action event (Rails instruments
|
|
8
|
+
# allocations there via ActiveSupport::Notifications). The old
|
|
9
|
+
# "object_allocations.active_support" constant was never emitted by Rails,
|
|
10
|
+
# so that subscription was dead code — removed.
|
|
9
11
|
PROCESS_ACTION_EVENT = "process_action.action_controller"
|
|
10
12
|
|
|
11
13
|
THREAD_LOCAL_KEY = :dead_bro_memory_events
|
|
@@ -13,7 +15,6 @@ module DeadBro
|
|
|
13
15
|
LARGE_OBJECT_THRESHOLD = 1_000_000 # 1MB threshold for large objects
|
|
14
16
|
|
|
15
17
|
# Performance optimization settings
|
|
16
|
-
ALLOCATION_SAMPLING_RATE = 1 # Track all when enabled (adjust in production)
|
|
17
18
|
MAX_ALLOCATIONS_PER_REQUEST = 1000 # Limit allocations tracked per request
|
|
18
19
|
LARGE_OBJECT_SAMPLE_RATE = 0.01 # Sample 1% of live objects to estimate large ones
|
|
19
20
|
MAX_LARGE_OBJECTS = 50 # Cap number of large objects captured per request
|
|
@@ -23,13 +24,6 @@ module DeadBro
|
|
|
23
24
|
return unless DeadBro.configuration.allocation_tracking_enabled
|
|
24
25
|
if defined?(ActiveSupport::Notifications) && ActiveSupport::Notifications.notifier.respond_to?(:subscribe)
|
|
25
26
|
begin
|
|
26
|
-
# Subscribe to object allocation events with sampling
|
|
27
|
-
ActiveSupport::Notifications.subscribe(ALLOCATION_EVENT) do |name, started, finished, _unique_id, data|
|
|
28
|
-
# Sample allocations to reduce overhead
|
|
29
|
-
next unless rand < ALLOCATION_SAMPLING_RATE
|
|
30
|
-
track_allocation(data, started, finished)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
27
|
# Subscribe to process_action to capture request-level allocation counters
|
|
34
28
|
ActiveSupport::Notifications.subscribe(PROCESS_ACTION_EVENT) do |*args|
|
|
35
29
|
event = if args.length == 1 && args.first.is_a?(ActiveSupport::Notifications::Event)
|
|
@@ -80,7 +74,7 @@ module DeadBro
|
|
|
80
74
|
|
|
81
75
|
def self.stop_request_tracking
|
|
82
76
|
stack = Thread.current[THREAD_LOCAL_KEY]
|
|
83
|
-
events = stack.is_a?(Array) && stack.any? ? stack.pop : nil
|
|
77
|
+
events = (stack.is_a?(Array) && stack.any?) ? stack.pop : nil
|
|
84
78
|
Thread.current[THREAD_LOCAL_KEY] = nil if stack.nil? || stack.empty?
|
|
85
79
|
|
|
86
80
|
if events
|
|
@@ -191,10 +185,10 @@ module DeadBro
|
|
|
191
185
|
# Group allocations by class
|
|
192
186
|
allocations_by_class = allocations.group_by { |a| a[:class_name] }
|
|
193
187
|
.transform_values { |allocs|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
188
|
+
{
|
|
189
|
+
count: allocs.sum { |a| a[:count] },
|
|
190
|
+
size: allocs.sum { |a| a[:size] }
|
|
191
|
+
}
|
|
198
192
|
}
|
|
199
193
|
|
|
200
194
|
# Find top allocating classes
|
|
@@ -345,27 +339,9 @@ module DeadBro
|
|
|
345
339
|
end
|
|
346
340
|
|
|
347
341
|
# Current memory usage in MB, delegated to the shared RSS helper.
# Returns 0 if the helper raises.
def self.memory_usage_mb
  # MemoryHelpers.rss_mb reads /proc/self/status on Linux and caches for
  # ~1 second across threads, so this is safe to call per-request.
  helpers = DeadBro::MemoryHelpers
  helpers.rss_mb
rescue
  0
end
|
data/lib/dead_bro/monitor.rb
CHANGED
|
@@ -2,15 +2,23 @@
|
|
|
2
2
|
|
|
3
3
|
module DeadBro
|
|
4
4
|
class Monitor
|
|
5
|
+
SLEEP_INTERVAL_SECONDS = 60
|
|
6
|
+
|
|
5
7
|
# Prepares a stopped monitor: no worker thread yet, plus the mutex and
# condition variable used for the interruptible sleep between runs.
#
# @param client the client used by the monitor (defaults to DeadBro.client)
def initialize(client: DeadBro.client)
  @stop_cv = ConditionVariable.new
  @stop_mutex = Mutex.new
  @running = false
  @thread = nil
  @client = client
end
|
|
10
14
|
|
|
11
15
|
def start
|
|
12
|
-
|
|
13
|
-
return
|
|
16
|
+
# Live thread already running — nothing to do.
|
|
17
|
+
return if @running && @thread&.alive?
|
|
18
|
+
|
|
19
|
+
# Reset: handles post-fork where @running=true but the thread is dead.
|
|
20
|
+
@running = false
|
|
21
|
+
|
|
14
22
|
return unless DeadBro.configuration.enabled
|
|
15
23
|
|
|
16
24
|
@running = true
|
|
@@ -25,8 +33,12 @@ module DeadBro
|
|
|
25
33
|
log_error("Error collecting stats: #{e.message}")
|
|
26
34
|
end
|
|
27
35
|
|
|
28
|
-
#
|
|
29
|
-
|
|
36
|
+
# Interruptible sleep — stop() signals the CV so shutdown doesn't
|
|
37
|
+
# block up to a full minute. Still naps the full interval during
|
|
38
|
+
# normal operation.
|
|
39
|
+
@stop_mutex.synchronize do
|
|
40
|
+
@stop_cv.wait(@stop_mutex, SLEEP_INTERVAL_SECONDS) if @running
|
|
41
|
+
end
|
|
30
42
|
end
|
|
31
43
|
end
|
|
32
44
|
|
|
@@ -35,7 +47,8 @@ module DeadBro
|
|
|
35
47
|
|
|
36
48
|
# Stops the monitor: clears the running flag, wakes the worker if it is
# waiting on the condition variable, and waits up to 5 seconds for it to exit.
def stop
  @running = false

  # Wake a worker that is napping between runs so it can see the flag.
  @stop_mutex.synchronize do
    @stop_cv.broadcast
  end

  worker = @thread
  worker.join(5) unless worker.nil? # Safety timeout in case the thread is mid-flight
  @thread = nil
end
|
|
41
54
|
|
data/lib/dead_bro/railtie.rb
CHANGED
|
@@ -55,12 +55,12 @@ if defined?(Rails) && defined?(Rails::Railtie)
|
|
|
55
55
|
DeadBro::JobSubscriber.subscribe!(client: shared_client)
|
|
56
56
|
end
|
|
57
57
|
|
|
58
|
-
#
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
# Always start the monitor thread. The thread runs every 60s but
|
|
59
|
+
# post_monitor_stats skips the HTTP POST when job_queue_monitoring_enabled
|
|
60
|
+
# is false, so the backend can toggle monitoring on/off mid-process.
|
|
61
|
+
require "dead_bro/monitor"
|
|
62
|
+
DeadBro.monitor = DeadBro::Monitor.new(client: shared_client)
|
|
63
|
+
DeadBro.monitor.start
|
|
64
64
|
rescue
|
|
65
65
|
# Never raise in Railtie init
|
|
66
66
|
end
|