dead_bro 0.2.8 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,17 @@ module DeadBro
16
16
  THREAD_LOCAL_EXPLAIN_PENDING_KEY = :dead_bro_explain_pending
17
17
  MAX_TRACKED_QUERIES = 1000
18
18
 
19
+ # Regexes used by sanitize_sql, named and grouped here so each pattern is
20
+ # defined once and shared. (Ruby builds a non-interpolated regex literal a
21
+ # single time, so this is about readability and reuse, not per-call allocation.)
22
+ SENSITIVE_KV_QUOTED_RE = /\b(password|token|secret|key|ssn|credit_card)\s*=\s*['"][^'"]*['"]/i
23
+ SENSITIVE_KV_BARE_RE = /\b(password|token|secret|key|ssn|credit_card)\s*=\s*[^'",\s)]+/i
24
+ WHERE_EQ_QUOTED_RE = /WHERE\s+[^=]+=\s*['"][^'"]*['"]/i
25
+ WHERE_EQ_QUOTED_INNER_RE = /=\s*['"][^'"]*['"]/
26
+ SANITIZE_MAX_LENGTH = 1000
27
+ SANITIZE_SKIP_SENSITIVE_WHEN_NO_KEYWORDS = /password|token|secret|key|ssn|credit_card/i
28
+ SANITIZE_SKIP_WHERE_WHEN_NO_KEYWORD = /WHERE/i
29
+
19
30
  # True when there is at least one active tracking context (e.g. for nested jobs).
20
31
  def self.tracking_active?
21
32
  stack = Thread.current[THREAD_LOCAL_KEY]
@@ -62,27 +73,33 @@ module DeadBro
62
73
  next unless current
63
74
  unique_id = _unique_id
64
75
  allocations = nil
65
- captured_backtrace = nil
66
76
  begin
67
77
  alloc_results = Thread.current[THREAD_LOCAL_ALLOC_RESULTS_KEY]
68
78
  allocations = alloc_results && alloc_results.delete(unique_id)
69
-
70
- # Get the captured backtrace from when the query started
71
- backtrace_map = Thread.current[THREAD_LOCAL_BACKTRACE_KEY]
72
- captured_backtrace = backtrace_map && backtrace_map.delete(unique_id)
73
79
  rescue
74
80
  end
75
81
 
76
82
  duration_ms = ((finished - started) * 1000.0).round(2)
77
83
  original_sql = data[:sql]
78
84
 
85
+ # Only capture a backtrace for queries we actually care about tracing
86
+ # (slow). This skips the ~O(stack-depth) allocation on the 99% of queries
87
+ # that are fast. An N+1 of 100 x 1ms queries no longer eats a thousand
88
+ # frame allocations for traces nobody will read.
89
+ threshold = begin
90
+ DeadBro.configuration.slow_query_threshold_ms
91
+ rescue
92
+ 500
93
+ end
94
+ captured_trace = (duration_ms >= threshold.to_f) ? capture_app_backtrace : []
95
+
79
96
  query_info = {
80
97
  sql: sanitize_sql(original_sql),
81
98
  name: data[:name],
82
99
  duration_ms: duration_ms,
83
100
  cached: data[:cached] || false,
84
101
  connection_id: data[:connection_id],
85
- trace: safe_query_trace(data, captured_backtrace),
102
+ trace: captured_trace,
86
103
  allocations: allocations
87
104
  }
88
105
 
@@ -115,7 +132,7 @@ module DeadBro
115
132
  # Wait for any pending EXPLAIN ANALYZE queries to complete (with timeout)
116
133
  # This must happen BEFORE we get the queries array reference to ensure
117
134
  # all explain_plan fields are populated
118
- wait_for_pending_explains(5.0) # 5 second timeout
135
+ wait_for_pending_explains(EXPLAIN_WAIT_TIMEOUT_SECONDS)
119
136
 
120
137
  stack = Thread.current[THREAD_LOCAL_KEY]
121
138
  queries = (stack.is_a?(Array) && stack.any?) ? stack.pop : []
@@ -130,13 +147,20 @@ module DeadBro
130
147
  queries
131
148
  end
132
149
 
150
+ # Upper bound on pending EXPLAIN threads per request — stops a slow-query
151
+ # storm from spawning unbounded background threads.
152
+ MAX_PENDING_EXPLAINS = 20
153
+ # Overall wall-clock we're willing to block the request thread for pending
154
+ # EXPLAINs. If the plan isn't ready by then, skip it rather than stall the request.
155
+ EXPLAIN_WAIT_TIMEOUT_SECONDS = 5.0
156
+
133
157
  def self.wait_for_pending_explains(timeout_seconds)
134
158
  pending = Thread.current[THREAD_LOCAL_EXPLAIN_PENDING_KEY]
135
159
  return unless pending && !pending.empty?
136
160
 
137
- start_time = Time.now
161
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
138
162
  pending.each do |thread|
139
- remaining_time = timeout_seconds - (Time.now - start_time)
163
+ remaining_time = timeout_seconds - (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time)
140
164
  break if remaining_time <= 0
141
165
 
142
166
  begin
@@ -150,17 +174,26 @@ module DeadBro
150
174
  def self.sanitize_sql(sql)
151
175
  return sql unless sql.is_a?(String)
152
176
 
153
- # Remove sensitive data patterns
154
- sql = sql.gsub(/\b(password|token|secret|key|ssn|credit_card)\s*=\s*['"][^'"]*['"]/i, '\1 = ?')
155
- sql = sql.gsub(/\b(password|token|secret|key|ssn|credit_card)\s*=\s*[^'",\s)]+/i, '\1 = ?')
177
+ # Cap length first — most "expensive" queries from the app's perspective
178
+ # are big UPDATE/INSERT with long literal blobs; don't burn regex time on
179
+ # those when we're going to truncate anyway.
180
+ sql = sql[0..SANITIZE_MAX_LENGTH] + "..." if sql.length > SANITIZE_MAX_LENGTH
181
+
182
+ # Only scan for sensitive KV pairs if one of the keywords is actually
183
+ # present — saves two regex passes on the vast majority of queries.
184
+ if sql.match?(SANITIZE_SKIP_SENSITIVE_WHEN_NO_KEYWORDS)
185
+ sql = sql.gsub(SENSITIVE_KV_QUOTED_RE, '\1 = ?')
186
+ sql = sql.gsub(SENSITIVE_KV_BARE_RE, '\1 = ?')
187
+ end
156
188
 
157
- # Remove specific values in WHERE clauses that might be sensitive
158
- sql = sql.gsub(/WHERE\s+[^=]+=\s*['"][^'"]*['"]/i) do |match|
159
- match.gsub(/=\s*['"][^'"]*['"]/, "= ?")
189
+ # Same short-circuit for WHERE rewrite.
190
+ if sql.match?(SANITIZE_SKIP_WHERE_WHEN_NO_KEYWORD)
191
+ sql = sql.gsub(WHERE_EQ_QUOTED_RE) do |match|
192
+ match.gsub(WHERE_EQ_QUOTED_INNER_RE, "= ?")
193
+ end
160
194
  end
161
195
 
162
- # Limit query length to prevent huge payloads
163
- (sql.length > 1000) ? sql[0..1000] + "..." : sql
196
+ sql
164
197
  end
165
198
 
166
199
  def self.should_explain_query?(duration_ms, sql)
@@ -185,64 +218,47 @@ module DeadBro
185
218
  return unless defined?(ActiveRecord)
186
219
  return unless ActiveRecord::Base.respond_to?(:connection)
187
220
 
221
+ # Cap pending EXPLAINs per request. A slow-query storm that would have
222
+ # spawned 200 threads and starved the AR pool now drops excess plans
223
+ # instead of cascading into a timeout.
224
+ pending = Thread.current[THREAD_LOCAL_EXPLAIN_PENDING_KEY] ||= []
225
+ if pending.length >= MAX_PENDING_EXPLAINS
226
+ query_info[:explain_plan] = nil
227
+ return
228
+ end
229
+
188
230
  # Capture the main thread reference to append logs to the correct thread
189
231
  main_thread = Thread.current
190
232
 
191
- # Run EXPLAIN in a background thread to avoid blocking the main request
233
+ # Run EXPLAIN in a background thread to avoid blocking the main request.
234
+ # We use `with_connection` so the connection returns to the pool even if
235
+ # the thread is killed or the block raises — the previous manual
236
+ # checkout/checkin could leak a connection under pathological paths.
192
237
  explain_thread = Thread.new do
193
- connection = nil
194
238
  begin
195
- # Use a separate connection to avoid interfering with the main query
196
- connection = if ActiveRecord::Base.connection_pool.respond_to?(:checkout)
197
- ActiveRecord::Base.connection_pool.checkout
198
- else
199
- ActiveRecord::Base.connection
200
- end
201
-
202
- # Interpolate binds if present to ensure EXPLAIN works with placeholders
203
- final_sql = interpolate_sql_with_binds(sql, binds, connection)
204
-
205
- # Build EXPLAIN query based on database adapter
206
- explain_sql = build_explain_query(final_sql, connection)
239
+ ActiveRecord::Base.connection_pool.with_connection do |connection|
240
+ final_sql = interpolate_sql_with_binds(sql, binds, connection)
241
+ explain_sql = build_explain_query(final_sql, connection)
207
242
 
208
- # Execute the EXPLAIN query
209
- # For PostgreSQL, use select_all which returns ActiveRecord::Result
210
- # For other databases, use execute
211
- adapter_name = connection.adapter_name.downcase
212
- result = if adapter_name == "postgresql" || adapter_name == "postgis"
213
- # PostgreSQL: select_all returns ActiveRecord::Result with rows
214
- connection.select_all(explain_sql)
215
- else
216
- # Other databases: use execute
217
- connection.execute(explain_sql)
218
- end
219
-
220
- # Format the result based on database adapter
221
- explain_plan = format_explain_result(result, connection)
243
+ adapter_name = connection.adapter_name.downcase
244
+ result = if adapter_name == "postgresql" || adapter_name == "postgis"
245
+ connection.select_all(explain_sql)
246
+ else
247
+ connection.execute(explain_sql)
248
+ end
222
249
 
223
- # Update the query_info with the explain plan
224
- # This updates the hash that's already in the queries array
225
- query_info[:explain_plan] = if explain_plan && !explain_plan.to_s.strip.empty?
226
- explain_plan
250
+ explain_plan = format_explain_result(result, connection)
251
+ query_info[:explain_plan] = if explain_plan && !explain_plan.to_s.strip.empty?
252
+ explain_plan
253
+ end
227
254
  end
228
255
  rescue => e
229
- # Silently fail - don't let EXPLAIN break the application
256
+ # Silently fail — don't let EXPLAIN break the application.
230
257
  append_log_to_thread(main_thread, :debug, "Failed to capture EXPLAIN ANALYZE: #{e.message}")
231
258
  query_info[:explain_plan] = nil
232
- ensure
233
- # Return connection to pool if we checked it out
234
- if connection && ActiveRecord::Base.connection_pool.respond_to?(:checkin)
235
- begin
236
- ActiveRecord::Base.connection_pool.checkin(connection)
237
- rescue
238
- nil
239
- end
240
- end
241
259
  end
242
260
  end
243
261
 
244
- # Track the thread so we can wait for it when stopping request tracking
245
- pending = Thread.current[THREAD_LOCAL_EXPLAIN_PENDING_KEY] ||= []
246
262
  pending << explain_thread
247
263
  rescue => e
248
264
  # Use DeadBro.logger here since we're still in the main thread
@@ -419,6 +435,27 @@ module DeadBro
419
435
  result.to_s
420
436
  end
421
437
 
438
# Maximum number of frames returned by capture_app_backtrace.
APP_BACKTRACE_MAX_FRAMES = 25
# A path segment whose name contains a sensitive-looking word; the whole
# segment is replaced in rendered frames.
APP_BACKTRACE_SENSITIVE_RE = /\/[^\/]*(password|secret|key|token)[^\/]*\//i
# `app` as a whole directory segment. A plain substring test for "app/" also
# matches directories that merely end in "app" (e.g. "/srv/webapp/lib/").
APP_BACKTRACE_APP_DIR_RE = %r{(?:\A|/)app/}

# Builds an application-only backtrace for the current call site.
#
# Inspects up to 100 caller frames, keeps those whose path has an `app`
# directory segment and no "/vendor/" segment, renders each as
# "path:lineno:in `label'" with sensitive-looking path segments replaced by
# "[FILTERED]", and stops after APP_BACKTRACE_MAX_FRAMES frames.
#
# @return [Array<String>] rendered frames, nearest caller first; an empty
#   array if nothing matches or anything raises.
def self.capture_app_backtrace
  frames = []
  (caller_locations(1, 100) || []).each do |loc|
    path = loc.path.to_s
    next unless path.match?(APP_BACKTRACE_APP_DIR_RE)
    next if path.include?("/vendor/")

    frames << "#{path}:#{loc.lineno}:in `#{loc.label}'".gsub(APP_BACKTRACE_SENSITIVE_RE, "/[FILTERED]/")
    break if frames.length >= APP_BACKTRACE_MAX_FRAMES
  end
  frames
rescue
  # Tracing is best effort; never let it break the instrumented query.
  []
end
458
+
422
459
  def self.safe_query_trace(data, captured_backtrace = nil)
423
460
  return [] unless data.is_a?(Hash)
424
461
 
@@ -520,15 +557,10 @@ module DeadBro
520
557
  def start(name, id, payload)
521
558
  map = (Thread.current[DeadBro::SqlSubscriber::THREAD_LOCAL_ALLOC_START_KEY] ||= {})
522
559
  map[id] = GC.stat[:total_allocated_objects] if defined?(GC) && GC.respond_to?(:stat)
523
-
524
- # Capture the backtrace at query start time (before notification system processes it)
525
- # This gives us the actual call stack where the SQL was executed
526
- backtrace_map = (Thread.current[DeadBro::SqlSubscriber::THREAD_LOCAL_BACKTRACE_KEY] ||= {})
527
- captured_backtrace = Thread.current.backtrace
528
- if captured_backtrace && captured_backtrace.is_a?(Array)
529
- # Skip the first few frames (our listener code) to get to the actual query execution
530
- backtrace_map[id] = captured_backtrace[5..-1] || captured_backtrace
531
- end
560
+ # Backtraces used to be captured here for every SQL event, which was
561
+ # dominating CPU on N+1-heavy requests (100s of full Thread#backtrace
562
+ # allocations). The main subscriber now captures a trimmed backtrace
563
+ # lazily and only when a query exceeds slow_query_threshold_ms.
532
564
  rescue
533
565
  end
534
566
 
@@ -40,6 +40,11 @@ module DeadBro
40
40
  DeadBro::MemoryTrackingSubscriber.start_request_tracking
41
41
  end
42
42
 
43
+ # Start Elasticsearch tracking for this request
44
+ if defined?(DeadBro::ElasticsearchSubscriber)
45
+ DeadBro::ElasticsearchSubscriber.start_request_tracking
46
+ end
47
+
43
48
  # Start outgoing HTTP accumulation for this request
44
49
  Thread.current[:dead_bro_http_events] = []
45
50
 
@@ -74,7 +79,8 @@ module DeadBro
74
79
  Thread.current[:dead_bro_lightweight_memory] = nil
75
80
  end
76
81
 
77
- # Clean up HTTP events and tracking start time
82
+ # Clean up HTTP events, ES events, and tracking start time
83
+ Thread.current[:dead_bro_elasticsearch_events] = nil
78
84
  Thread.current[:dead_bro_http_events] = nil
79
85
  Thread.current[DeadBro::TRACKING_START_TIME_KEY] = nil
80
86
  end
@@ -12,6 +12,7 @@ module DeadBro
12
12
  # can detect when tracking has been re-enabled, then skip all tracking.
13
13
  unless DeadBro.configuration.enabled
14
14
  client.post_heartbeat if DeadBro.configuration.heartbeat_due?
15
+ drain_request_tracking
15
16
  next
16
17
  end
17
18
 
@@ -21,9 +22,23 @@ module DeadBro
21
22
  controller_name = notification[:controller].to_s
22
23
  action_name = notification[:action].to_s
23
24
  begin
24
- next if DeadBro.configuration.excluded_controller?(controller_name, action_name)
25
- next unless DeadBro.configuration.exclusive_controller?(controller_name, action_name)
25
+ if DeadBro.configuration.excluded_controller?(controller_name, action_name)
26
+ drain_request_tracking
27
+ next
28
+ end
29
+ unless DeadBro.configuration.exclusive_controller?(controller_name, action_name)
30
+ drain_request_tracking
31
+ next
32
+ end
26
33
  rescue
34
+ drain_request_tracking
35
+ next
36
+ end
37
+
38
+ has_error = data[:exception] || data[:exception_object]
39
+ # Errors always ship regardless of sampling (this is what the docs promise).
40
+ unless has_error || DeadBro.configuration.should_sample?
41
+ drain_request_tracking
27
42
  next
28
43
  end
29
44
 
@@ -31,9 +46,10 @@ module DeadBro
31
46
  # Stop SQL tracking and get collected queries (this was started by the request)
32
47
  sql_queries = DeadBro::SqlSubscriber.stop_request_tracking
33
48
 
34
- # Stop cache and redis tracking
49
+ # Stop cache, redis, and elasticsearch tracking
35
50
  cache_events = defined?(DeadBro::CacheSubscriber) ? DeadBro::CacheSubscriber.stop_request_tracking : []
36
51
  redis_events = defined?(DeadBro::RedisSubscriber) ? DeadBro::RedisSubscriber.stop_request_tracking : []
52
+ elasticsearch_events = defined?(DeadBro::ElasticsearchSubscriber) ? DeadBro::ElasticsearchSubscriber.stop_request_tracking : []
37
53
 
38
54
  # Stop view rendering tracking and get collected view events
39
55
  view_events = DeadBro::ViewRenderingSubscriber.stop_request_tracking
@@ -108,7 +124,7 @@ module DeadBro
108
124
  }
109
125
 
110
126
  event_name = (exception_class || exception_obj&.class&.name || "exception").to_s
111
- client.post_metric(event_name: event_name, payload: error_payload)
127
+ client.post_metric(event_name: event_name, payload: error_payload, force: true)
112
128
  rescue
113
129
  ensure
114
130
  next
@@ -137,6 +153,7 @@ module DeadBro
137
153
  http_outgoing: Thread.current[:dead_bro_http_events] || [],
138
154
  cache_events: cache_events,
139
155
  redis_events: redis_events,
156
+ elasticsearch_events: elasticsearch_events,
140
157
  cache_hits: cache_hits(data),
141
158
  cache_misses: cache_misses(data),
142
159
  view_events: view_events,
@@ -149,6 +166,24 @@ module DeadBro
149
166
  end
150
167
  end
151
168
 
169
# Releases per-subscriber thread-local state when no payload will be built
# (disabled / excluded / sampled out). Without this, a subsequent request
# reusing the same thread would see stale queries/events.
#
# Each subscriber is drained independently: a failure in one must not skip
# the others, otherwise the state this method exists to clear is left behind.
# Never raises; draining must never raise from the notifications callback.
#
# @return [nil]
def self.drain_request_tracking
  drains = [
    -> { DeadBro::SqlSubscriber.stop_request_tracking if defined?(DeadBro::SqlSubscriber) },
    -> { DeadBro::CacheSubscriber.stop_request_tracking if defined?(DeadBro::CacheSubscriber) },
    -> { DeadBro::RedisSubscriber.stop_request_tracking if defined?(DeadBro::RedisSubscriber) },
    -> { DeadBro::ElasticsearchSubscriber.stop_request_tracking if defined?(DeadBro::ElasticsearchSubscriber) },
    -> { DeadBro::ViewRenderingSubscriber.stop_request_tracking if defined?(DeadBro::ViewRenderingSubscriber) },
    -> { DeadBro::LightweightMemoryTracker.stop_request_tracking if defined?(DeadBro::LightweightMemoryTracker) },
    lambda do
      if DeadBro.configuration.allocation_tracking_enabled && defined?(DeadBro::MemoryTrackingSubscriber)
        DeadBro::MemoryTrackingSubscriber.stop_request_tracking
      end
    end
  ]

  drains.each do |drain|
    begin
      drain.call
    rescue
      # Best effort: keep draining the remaining subscribers.
    end
  end

  # Always reset the outgoing-HTTP buffer, even if a subscriber above failed.
  Thread.current[:dead_bro_http_events] = nil
rescue
  # Best effort: draining must never raise from the notifications callback.
  nil
end
186
+
152
187
  def self.safe_path(data)
153
188
  path = data[:path] || (data[:request] && data[:request].path)
154
189
  path.to_s
@@ -261,17 +296,7 @@ module DeadBro
261
296
  end
262
297
 
263
298
  def self.memory_usage_mb
264
- if defined?(GC) && GC.respond_to?(:stat)
265
- # Get memory usage in MB
266
- memory_kb = begin
267
- `ps -o rss= -p #{Process.pid}`.to_i
268
- rescue
269
- 0
270
- end
271
- (memory_kb / 1024.0).round(2)
272
- else
273
- 0
274
- end
299
+ DeadBro::MemoryHelpers.rss_mb
275
300
  rescue
276
301
  0
277
302
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DeadBro
4
- VERSION = "0.2.8"
4
+ VERSION = "0.2.10"
5
5
  end