dead_bro 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +126 -0
- data/lib/dead_bro/cache_subscriber.rb +3 -3
- data/lib/dead_bro/client.rb +6 -6
- data/lib/dead_bro/collectors/database.rb +126 -0
- data/lib/dead_bro/collectors/filesystem.rb +94 -0
- data/lib/dead_bro/collectors/jobs.rb +403 -0
- data/lib/dead_bro/collectors/network.rb +252 -0
- data/lib/dead_bro/collectors/process_info.rb +178 -0
- data/lib/dead_bro/collectors/sample_store.rb +108 -0
- data/lib/dead_bro/collectors/system.rb +206 -0
- data/lib/dead_bro/collectors.rb +14 -0
- data/lib/dead_bro/configuration.rb +21 -17
- data/lib/dead_bro/error_middleware.rb +1 -11
- data/lib/dead_bro/http_instrumentation.rb +3 -3
- data/lib/dead_bro/job_sql_tracking_middleware.rb +2 -2
- data/lib/dead_bro/job_subscriber.rb +2 -12
- data/lib/dead_bro/monitor.rb +89 -0
- data/lib/dead_bro/railtie.rb +5 -6
- data/lib/dead_bro/redis_subscriber.rb +3 -3
- data/lib/dead_bro/sql_subscriber.rb +41 -39
- data/lib/dead_bro/sql_tracking_middleware.rb +1 -1
- data/lib/dead_bro/subscriber.rb +1 -9
- data/lib/dead_bro/version.rb +1 -1
- data/lib/dead_bro/view_rendering_subscriber.rb +3 -3
- data/lib/dead_bro.rb +11 -8
- metadata +10 -2
- data/lib/dead_bro/job_queue_monitor.rb +0 -395
|
@@ -1,395 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module DeadBro
|
|
4
|
-
class JobQueueMonitor
|
|
5
|
-
# Builds a monitor in the stopped state; no thread is created until #start.
#
# @param client [Object] object later sent +post_job_stats+ by the
#   reporting thread (defaults to DeadBro.client)
def initialize(client: DeadBro.client)
  @client, @thread, @running = client, nil, false
end
|
|
10
|
-
|
|
11
|
-
# Starts the background thread that periodically reports job queue stats.
#
# Nothing is started when the monitor is already running, or when either
# DeadBro.configuration.job_queue_monitoring_enabled or
# DeadBro.configuration.enabled is falsy.
#
# Each cycle calls collect_queue_stats and, when it returns a value, hands
# it to @client.post_job_stats. Errors raised during a cycle are logged via
# log_error and do not end the loop.
#
# @return [Thread, nil] the reporting thread, or nil when nothing was started
def start
  return if @running
  return unless DeadBro.configuration.job_queue_monitoring_enabled
  return unless DeadBro.configuration.enabled

  @running = true
  @thread = Thread.new do
    Thread.current.abort_on_exception = false
    loop do
      break unless @running

      begin
        stats = collect_queue_stats
        @client.post_job_stats(stats) if stats
      rescue => e
        log_error("Error collecting job queue stats: #{e.message}")
      end

      # Wait 120 seconds (2 minutes) between reports. The wait is taken in
      # short slices so that clearing @running ends the thread promptly;
      # a single sleep(120) could not be interrupted by the flag and would
      # outlive the 5 second join in #stop.
      wake_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 120
      sleep(0.25) while @running && Process.clock_gettime(Process::CLOCK_MONOTONIC) < wake_at
    end
  end

  @thread
end
|
|
36
|
-
|
|
37
|
-
# Stops the background reporting thread.
#
# Clears the running flag, wakes the thread in case it is sleeping between
# reports, then waits up to 5 seconds for it to finish. Without the wake-up
# a thread parked in a long sleep would not observe the cleared flag before
# the join timed out.
#
# Safe to call when the monitor was never started or has already stopped.
#
# @return [nil]
def stop
  @running = false

  worker = @thread
  if worker
    begin
      worker.wakeup
    rescue ThreadError
      # Thread#wakeup raises for a thread that has already finished;
      # there is nothing left to wake in that case.
    end
    worker.join(5) # Wait up to 5 seconds for thread to finish
  end

  @thread = nil
end
|
|
42
|
-
|
|
43
|
-
private
|
|
44
|
-
|
|
45
|
-
# Builds one stats report for the detected job queue backend.
#
# The report carries a UTC ISO 8601 timestamp, the detected queue system,
# Rails.env and the backend specific stats under :queues.
#
# @return [Hash, nil] the report, or nil when the queue system is not one
#   of :sidekiq, :solid_queue, :delayed_job or :good_job
def collect_queue_stats
  report = {
    timestamp: Time.now.utc.iso8601,
    queue_system: detect_queue_system,
    environment: Rails.env,
    queues: {}
  }

  collector = {
    sidekiq: :collect_sidekiq_stats,
    solid_queue: :collect_solid_queue_stats,
    delayed_job: :collect_delayed_job_stats,
    good_job: :collect_good_job_stats
  }[report[:queue_system]]

  # Unknown queue system, don't send stats
  return nil unless collector

  report[:queues] = send(collector)
  report
end
|
|
68
|
-
|
|
69
|
-
# Identifies the job backend by checking which constant is defined.
#
# Checks run in a fixed order (Sidekiq, SolidQueue, Delayed::Job, GoodJob),
# so the first defined one wins when several are loaded.
#
# @return [Symbol] :sidekiq, :solid_queue, :delayed_job, :good_job or :unknown
def detect_queue_system
  if defined?(Sidekiq)
    :sidekiq
  elsif defined?(SolidQueue)
    :solid_queue
  elsif defined?(Delayed::Job)
    :delayed_job
  elsif defined?(GoodJob)
    :good_job
  else
    :unknown
  end
end
|
|
76
|
-
|
|
77
|
-
# Collects Sidekiq statistics: per-queue sizes, busy workers, and the sizes
# of the scheduled, retry, dead and process sets.
#
# Every section is best-effort. A Sidekiq API class that cannot be resolved
# is skipped, and any other failure is logged via log_error while the
# remaining sections still run.
#
# @return [Hash] always contains :total_queued, :total_busy and :queues
#   (queue name => { queued:, busy:, scheduled:, retries: }); contains
#   :total_scheduled, :total_retries, :total_dead and :processes only when
#   the corresponding section succeeded. Returns {} when Sidekiq is not
#   defined.
def collect_sidekiq_stats
  return {} unless defined?(Sidekiq)

  stats = { total_queued: 0, total_busy: 0, queues: {} }

  begin
    add_sidekiq_queue_sizes(stats)
    add_sidekiq_busy_workers(stats)
    record_sidekiq_set_size(stats, :total_scheduled, :ScheduledSet, "scheduled jobs")
    record_sidekiq_set_size(stats, :total_retries, :RetrySet, "retries")
    record_sidekiq_set_size(stats, :total_dead, :DeadSet, "dead jobs")
    record_sidekiq_set_size(stats, :processes, :ProcessSet, "processes")
  rescue => e
    # Safety net: the helpers rescue their own errors, so this only fires
    # if error handling itself fails.
    log_error("Error collecting Sidekiq stats: #{e.message}")
    log_error("Backtrace: #{Array(e.backtrace).first(5).join("\n")}")
  end

  stats
end

# Adds one entry per Sidekiq queue and accumulates :total_queued.
def add_sidekiq_queue_sizes(stats)
  # const_get will trigger autoload if needed
  queue_class = Sidekiq.const_get(:Queue)
  return unless queue_class.respond_to?(:all)

  queue_class.all.each do |queue|
    queue_name = queue.name
    size = queue.size
    stats[:queues][queue_name] = { queued: size, busy: 0, scheduled: 0, retries: 0 }
    stats[:total_queued] += size
  end
rescue NoMethodError => e
  # NoMethodError is a NameError subclass; rescue it first so an API
  # mismatch is not misreported as a missing class.
  log_error("Error accessing Sidekiq::Queue: #{e.message}")
rescue NameError => e
  log_error("Sidekiq::Queue not available: #{e.message}")
rescue => e
  log_error("Error accessing Sidekiq::Queue: #{e.message}")
end

# Counts busy workers per queue using Sidekiq::Workers.
def add_sidekiq_busy_workers(stats)
  Sidekiq.const_get(:Workers).new.each do |_process_id, _thread_id, work|
    next unless work

    count_sidekiq_busy_worker(stats, work["queue"] || "default")
  end
rescue NoMethodError => e
  # Rescued before NameError so a failure while reading a work entry is
  # logged instead of being mistaken for "Workers class not available",
  # which would run the fallback on top of partially counted workers.
  log_error("Error getting Sidekiq workers: #{e.message}")
rescue NameError
  # Workers class not available, try fallback
  add_sidekiq_busy_workers_fallback(stats)
rescue => e
  log_error("Error getting Sidekiq workers: #{e.message}")
end

# Fallback for older Sidekiq versions: counts busy workers via Sidekiq.workers.
def add_sidekiq_busy_workers_fallback(stats)
  return unless Sidekiq.respond_to?(:workers)

  workers = Sidekiq.workers
  return unless workers.respond_to?(:each)

  workers.each do |worker|
    queue_name = worker.respond_to?(:queue) ? worker.queue : "default"
    count_sidekiq_busy_worker(stats, queue_name)
  end
rescue => e
  log_error("Error getting Sidekiq workers (fallback): #{e.message}")
end

# Registers one busy worker against the given queue and the overall total.
def count_sidekiq_busy_worker(stats, queue_name)
  stats[:queues][queue_name] ||= { queued: 0, busy: 0, scheduled: 0, retries: 0 }
  stats[:queues][queue_name][:busy] += 1
  stats[:total_busy] += 1
end

# Stores the size of the Sidekiq set class named class_name under stats[key].
def record_sidekiq_set_size(stats, key, class_name, label)
  stats[key] = Sidekiq.const_get(class_name).new.size
rescue NoMethodError => e
  # See add_sidekiq_queue_sizes: keep API errors visible.
  log_error("Error getting Sidekiq #{label}: #{e.message}")
rescue NameError
  # Set class not available, skip
rescue => e
  log_error("Error getting Sidekiq #{label}: #{e.message}")
end
|
|
192
|
-
|
|
193
|
-
# Collects Solid Queue statistics with SQL queries against its tables.
#
# Queries run only when ActiveRecord is connected and the solid_queue_jobs
# table exists. Any error is logged via log_error and the stats gathered up
# to that point are returned.
#
# Busy jobs are read from solid_queue_claimed_executions and failed jobs
# from solid_queue_failed_executions when those tables exist; the earlier
# claimed_at / solid_queue_failed_jobs lookups are kept as fallbacks.
# NOTE(review): table names follow Solid Queue's published schema -
# confirm against the installed version.
#
# @return [Hash] always contains :total_queued, :total_busy and :queues
#   (queue name => { queued:, busy:, scheduled:, retries: }); contains
#   :total_scheduled and :total_failed when those queries ran. Returns {}
#   when SolidQueue is not defined.
def collect_solid_queue_stats
  return {} unless defined?(SolidQueue)

  stats = { total_queued: 0, total_busy: 0, queues: {} }

  begin
    # Solid Queue uses ActiveJob and stores jobs in a database table
    if defined?(ActiveRecord) && ActiveRecord::Base.connected? && ActiveRecord::Base.connection.table_exists?("solid_queue_jobs")
      connection = ActiveRecord::Base.connection
      rows_for = ->(sql) { parse_query_result(connection.execute(sql)) }
      count_for = lambda do |sql|
        row = rows_for.call(sql).first
        (row&.dig("count") || row&.dig(:count) || 0).to_i
      end

      # Get queued jobs grouped by queue
      rows_for.call(
        "SELECT queue_name, COUNT(*) as count FROM solid_queue_jobs WHERE finished_at IS NULL GROUP BY queue_name"
      ).each do |row|
        queue_name = (row["queue_name"] || row[:queue_name] || "default").to_s
        count = (row["count"] || row[:count] || 0).to_i
        stats[:queues][queue_name] = { queued: count, busy: 0, scheduled: 0, retries: 0 }
        stats[:total_queued] += count
      end

      # Get busy jobs (claimed but not finished)
      busy_sql =
        if connection.table_exists?("solid_queue_claimed_executions")
          "SELECT j.queue_name AS queue_name, COUNT(*) AS count FROM solid_queue_claimed_executions c " \
            "INNER JOIN solid_queue_jobs j ON j.id = c.job_id GROUP BY j.queue_name"
        else
          "SELECT queue_name, COUNT(*) as count FROM solid_queue_jobs WHERE finished_at IS NULL AND claimed_at IS NOT NULL GROUP BY queue_name"
        end

      rows_for.call(busy_sql).each do |row|
        queue_name = (row["queue_name"] || row[:queue_name] || "default").to_s
        count = (row["count"] || row[:count] || 0).to_i
        stats[:queues][queue_name] ||= { queued: 0, busy: 0, scheduled: 0, retries: 0 }
        stats[:queues][queue_name][:busy] = count
        stats[:total_busy] += count
      end

      # Get scheduled jobs. CURRENT_TIMESTAMP is used instead of NOW()
      # because NOW() is not available on SQLite.
      stats[:total_scheduled] = count_for.call(
        "SELECT COUNT(*) as count FROM solid_queue_jobs WHERE scheduled_at > CURRENT_TIMESTAMP"
      )

      # Get failed jobs from whichever failure table is present. The name
      # comes from this fixed list, so interpolating it is safe.
      failed_table = %w[solid_queue_failed_executions solid_queue_failed_jobs].find do |table|
        connection.table_exists?(table)
      end
      stats[:total_failed] = count_for.call("SELECT COUNT(*) as count FROM #{failed_table}") if failed_table
    end
  rescue => e
    log_error("Error collecting Solid Queue stats: #{e.message}")
  end

  stats
end
|
|
257
|
-
|
|
258
|
-
# Collects Delayed Job statistics from the delayed_jobs table.
#
# Queries run only when ActiveRecord is connected and the delayed_jobs
# table exists. All jobs are reported under the "default" queue. Any error
# is logged via log_error and the stats gathered so far are returned.
#
# The queued/failed conditions use failed_at unless the table really has a
# max_attempts column: referencing a missing column makes every query raise
# and leaves the stats at zero.
# NOTE(review): the stock delayed_jobs migration has failed_at and no
# max_attempts column - confirm against the application's schema.
#
# @return [Hash] always contains :total_queued, :total_busy and :queues;
#   contains :total_failed when the queries ran. Returns {} when
#   Delayed::Job is not defined.
def collect_delayed_job_stats
  return {} unless defined?(Delayed::Job)

  stats = { total_queued: 0, total_busy: 0, queues: {} }

  begin
    # Delayed Job uses a single table
    if defined?(ActiveRecord) && ActiveRecord::Base.connected? && ActiveRecord::Base.connection.table_exists?("delayed_jobs")
      has_max_attempts = Delayed::Job.respond_to?(:column_names) &&
        Delayed::Job.column_names.include?("max_attempts")

      queued_condition =
        if has_max_attempts
          "locked_at IS NULL AND attempts < max_attempts"
        else
          "locked_at IS NULL AND failed_at IS NULL"
        end
      failed_condition = has_max_attempts ? "attempts >= max_attempts" : "failed_at IS NOT NULL"

      # Get queued jobs
      queued = Delayed::Job.where(queued_condition).count
      stats[:total_queued] = queued
      stats[:queues]["default"] = { queued: queued, busy: 0, scheduled: 0, retries: 0 }

      # Get busy jobs (locked)
      busy = Delayed::Job.where("locked_at IS NOT NULL AND locked_by IS NOT NULL").count
      stats[:total_busy] = busy
      stats[:queues]["default"][:busy] = busy

      # Get failed jobs
      stats[:total_failed] = Delayed::Job.where(failed_condition).count
    end
  rescue => e
    log_error("Error collecting Delayed Job stats: #{e.message}")
  end

  stats
end
|
|
295
|
-
|
|
296
|
-
# Collects GoodJob statistics with SQL queries against the good_jobs table.
#
# Queries run only when ActiveRecord is connected and the good_jobs table
# exists. Any error is logged via log_error and whatever was gathered up to
# that point is returned.
#
# @return [Hash] always contains :total_queued, :total_busy and :queues
#   (queue name => { queued:, busy:, scheduled:, retries: }); contains
#   :total_scheduled and :total_failed when those queries ran. Returns {}
#   when GoodJob is not defined.
def collect_good_job_stats
  return {} unless defined?(GoodJob)

  totals = { total_queued: 0, total_busy: 0, queues: {} }

  begin
    # Good Job uses ActiveJob and stores jobs in a database table
    table_ready = defined?(ActiveRecord) &&
      ActiveRecord::Base.connected? &&
      ActiveRecord::Base.connection.table_exists?("good_jobs")

    if table_ready
      fetch_rows = ->(sql) { parse_query_result(ActiveRecord::Base.connection.execute(sql)) }
      name_and_count = lambda do |row|
        [(row["queue_name"] || row[:queue_name] || "default").to_s, (row["count"] || row[:count] || 0).to_i]
      end
      single_count = lambda do |sql|
        first_row = fetch_rows.call(sql).first
        (first_row&.dig("count") || first_row&.dig(:count) || 0).to_i
      end

      # Queued jobs, grouped by queue
      fetch_rows.call(
        "SELECT queue_name, COUNT(*) as count FROM good_jobs WHERE finished_at IS NULL GROUP BY queue_name"
      ).each do |row|
        name, amount = name_and_count.call(row)
        totals[:queues][name] = { queued: amount, busy: 0, scheduled: 0, retries: 0 }
        totals[:total_queued] += amount
      end

      # Busy jobs (running)
      fetch_rows.call(
        "SELECT queue_name, COUNT(*) as count FROM good_jobs WHERE finished_at IS NULL AND performed_at IS NOT NULL GROUP BY queue_name"
      ).each do |row|
        name, amount = name_and_count.call(row)
        entry = (totals[:queues][name] ||= { queued: 0, busy: 0, scheduled: 0, retries: 0 })
        entry[:busy] = amount
        totals[:total_busy] += amount
      end

      # Scheduled jobs
      totals[:total_scheduled] = single_count.call(
        "SELECT COUNT(*) as count FROM good_jobs WHERE scheduled_at > NOW()"
      )

      # Failed jobs
      totals[:total_failed] = single_count.call(
        "SELECT COUNT(*) as count FROM good_jobs WHERE finished_at IS NOT NULL AND error IS NOT NULL"
      )
    end
  rescue => e
    log_error("Error collecting Good Job stats: #{e.message}")
  end

  totals
end
|
|
358
|
-
|
|
359
|
-
# Normalizes a raw query result into an array of rows.
#
# * Anything that does not respond to +each+ yields [].
# * A result responding to +values+ (for example a PostgreSQL PG::Result)
#   is converted to hashes keyed by every column name as both String and
#   Symbol.
# * An Array is returned unchanged.
# * Any other enumerable is converted with +to_a+; when all of its rows are
#   positional Arrays and the result exposes column names, the rows are
#   converted to hashes as well so callers can look values up by name.
#
# Column names are read from +fields+ or, failing that, +column_names+.
# Errors are logged via log_error and produce [].
#
# @param result [Object] value returned by the database adapter
# @return [Array] normalized rows
def parse_query_result(result)
  return [] unless result.respond_to?(:each)

  columns =
    if result.respond_to?(:fields)
      Array(result.fields)
    elsif result.respond_to?(:column_names)
      Array(result.column_names)
    else
      []
    end

  row_to_hash = lambda do |row|
    columns.each_with_index.each_with_object({}) do |(col, idx), hash|
      hash[col.to_s] = row[idx]
      hash[col.to_sym] = row[idx]
    end
  end

  if result.respond_to?(:values)
    # Convert to array of hashes
    result.values.map { |row| row_to_hash.call(row) }
  elsif result.is_a?(Array)
    # Already an array
    result
  else
    rows = result.to_a
    positional = !columns.empty? && !rows.empty? && rows.all? { |row| row.is_a?(Array) }
    positional ? rows.map { |row| row_to_hash.call(row) } : rows
  end
rescue => e
  log_error("Error parsing query result: #{e.message}")
  []
end
|
|
386
|
-
|
|
387
|
-
# Reports an error message prefixed with "[DeadBro::JobQueueMonitor]".
#
# Uses Rails.logger when Rails is defined and has a logger; otherwise the
# message is written to $stderr.
#
# @param message [String] text to report
def log_error(message)
  rails_logger = (Rails.logger if defined?(Rails) && Rails.respond_to?(:logger))

  if rails_logger
    rails_logger.error("[DeadBro::JobQueueMonitor] #{message}")
  else
    $stderr.puts("[DeadBro::JobQueueMonitor] #{message}")
  end
end
|
|
394
|
-
end
|
|
395
|
-
end
|