dead_bro 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +126 -0
- data/lib/dead_bro/cache_subscriber.rb +3 -3
- data/lib/dead_bro/client.rb +6 -6
- data/lib/dead_bro/collectors/database.rb +126 -0
- data/lib/dead_bro/collectors/filesystem.rb +94 -0
- data/lib/dead_bro/collectors/jobs.rb +403 -0
- data/lib/dead_bro/collectors/network.rb +252 -0
- data/lib/dead_bro/collectors/process_info.rb +178 -0
- data/lib/dead_bro/collectors/sample_store.rb +108 -0
- data/lib/dead_bro/collectors/system.rb +206 -0
- data/lib/dead_bro/collectors.rb +14 -0
- data/lib/dead_bro/configuration.rb +21 -17
- data/lib/dead_bro/error_middleware.rb +1 -11
- data/lib/dead_bro/http_instrumentation.rb +3 -3
- data/lib/dead_bro/job_sql_tracking_middleware.rb +2 -2
- data/lib/dead_bro/job_subscriber.rb +2 -12
- data/lib/dead_bro/monitor.rb +89 -0
- data/lib/dead_bro/railtie.rb +5 -6
- data/lib/dead_bro/redis_subscriber.rb +3 -3
- data/lib/dead_bro/sql_subscriber.rb +41 -39
- data/lib/dead_bro/sql_tracking_middleware.rb +1 -1
- data/lib/dead_bro/subscriber.rb +1 -9
- data/lib/dead_bro/version.rb +1 -1
- data/lib/dead_bro/view_rendering_subscriber.rb +3 -3
- data/lib/dead_bro.rb +11 -8
- metadata +10 -2
- data/lib/dead_bro/job_queue_monitor.rb +0 -395
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7f4e8376defc769119dd530481b5742580e2c296c09c065bd9f1e6f6d6d687b2
|
|
4
|
+
data.tar.gz: ca5caacda90af41400df74709567c70001aac71f83a174b3da3976dc5d673d3a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 241f825cf7696926f57fbc30ef59a0fc2e235173f4b6ad72ce4d6eecab393b77c037bd6deab20d780fe275289cac01955562bfb62a70e331e5294b4b607ea305
|
|
7
|
+
data.tar.gz: '0988d794e49eb4a825ff7936e00cd41ad98b73b67d45126b1ff537eef47ed1f17562c080f82091ae455a12e261a92ead0aea1a0bbed1ade7468f5635693e42b6'
|
data/README.md
CHANGED
|
@@ -262,6 +262,132 @@ DeadBro automatically tracks ActiveJob background jobs when ActiveJob is availab
|
|
|
262
262
|
- `message` - Exception message (for failed jobs)
|
|
263
263
|
- `backtrace` - Exception backtrace (for failed jobs)
|
|
264
264
|
|
|
265
|
+
## Control Plane Metrics (Queues, DB, Process, System)
|
|
266
|
+
|
|
267
|
+
DeadBro includes a lightweight **control plane metrics** job that runs periodically (by default once per minute via `DeadBro::JobQueueMonitor`) and sends a single JSON payload summarizing:
|
|
268
|
+
|
|
269
|
+
- **Sidekiq / job queues**: global stats (`processed`, `failed`, `enqueued`, `scheduled_size`, `retry_size`, `dead_size`, `workers_size`, `processes_size`), and per-queue entries with `name`, `size`, and `latency_s`.
|
|
270
|
+
- **Database (best effort)**: connection pool stats and a simple `ping_ms` latency when `ActiveRecord` is available and connected.
|
|
271
|
+
- **Process / Rails**: `pid`, `hostname`, uptime, Ruby/Rails versions, environment, GC stats, RSS (`rss_bytes`), thread and file descriptor counts (on Linux).
|
|
272
|
+
- **System (Linux best effort)**: CPU percentage over the last interval (normalised 0–100), memory used/total/available, plus filesystem and network summaries.
|
|
273
|
+
|
|
274
|
+
Everything is **best effort** and designed to be **safe and low overhead**:
|
|
275
|
+
|
|
276
|
+
- Collection never raises; failures are reported as `{error_class, error_message}` under the respective section key.
|
|
277
|
+
- No sensitive data is sent (no job arguments, env vars, CLI args, or full SQL text in these control-plane metrics).
|
|
278
|
+
- CPU and network *rates* require two samples; on the first run you may see `nil` for `cpu_pct` or network `*_bytes_per_s` fields until a second sample is available.
|
|
279
|
+
|
|
280
|
+
### Configuration
|
|
281
|
+
|
|
282
|
+
You can enable or disable individual collectors and tune basic options via the standard `DeadBro.configure` block:
|
|
283
|
+
|
|
284
|
+
```ruby
|
|
285
|
+
DeadBro.configure do |config|
|
|
286
|
+
# Enable the periodic job queue monitor (disabled by default)
|
|
287
|
+
config.job_queue_monitoring_enabled = true
|
|
288
|
+
|
|
289
|
+
# Enable best-effort collectors (all default to false)
|
|
290
|
+
config.enable_db_stats = true # ActiveRecord pool + ping latency
|
|
291
|
+
config.enable_process_stats = true # pid, hostname, RSS, GC, threads, fds
|
|
292
|
+
config.enable_system_stats = true # CPU%, memory, disk, network
|
|
293
|
+
|
|
294
|
+
# Filesystem paths to report disk usage for (default: ["/"])
|
|
295
|
+
config.disk_paths = ["/", "/var"]
|
|
296
|
+
|
|
297
|
+
# Network interfaces to ignore when computing rx/tx stats
|
|
298
|
+
config.interfaces_ignore = %w[lo docker0]
|
|
299
|
+
end
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Example Payload Shape
|
|
303
|
+
|
|
304
|
+
The control plane job sends a single JSON payload roughly shaped like:
|
|
305
|
+
|
|
306
|
+
```json
|
|
307
|
+
{
|
|
308
|
+
"ts": "2025-01-01T12:00:00Z",
|
|
309
|
+
"app_name": "MyApp",
|
|
310
|
+
"env": "production",
|
|
311
|
+
"host": "app-1",
|
|
312
|
+
"pid": 12345,
|
|
313
|
+
"versions": {
|
|
314
|
+
"ruby": "3.1.2",
|
|
315
|
+
"rails": "7.1.0",
|
|
316
|
+
"sidekiq": "7.3.0"
|
|
317
|
+
},
|
|
318
|
+
"queue_system": "sidekiq",
|
|
319
|
+
"sidekiq": {
|
|
320
|
+
"processed": 1000,
|
|
321
|
+
"failed": 5,
|
|
322
|
+
"enqueued": 42,
|
|
323
|
+
"scheduled_size": 3,
|
|
324
|
+
"retry_size": 2,
|
|
325
|
+
"dead_size": 1,
|
|
326
|
+
"workers_size": 4,
|
|
327
|
+
"processes_size": 2,
|
|
328
|
+
"memory_rss_bytes": 123456789,
|
|
329
|
+
"queues": [
|
|
330
|
+
{"name": "default", "size": 10, "latency_s": 0.5}
|
|
331
|
+
]
|
|
332
|
+
},
|
|
333
|
+
"db": {
|
|
334
|
+
"available": true,
|
|
335
|
+
"pool": {
|
|
336
|
+
"size": 5,
|
|
337
|
+
"connections": 3,
|
|
338
|
+
"busy": 1,
|
|
339
|
+
"num_waiting": 0
|
|
340
|
+
},
|
|
341
|
+
"ping_ms": 2.1
|
|
342
|
+
},
|
|
343
|
+
"process": {
|
|
344
|
+
"pid": 12345,
|
|
345
|
+
"hostname": "app-1",
|
|
346
|
+
"uptime_s": 3600.5,
|
|
347
|
+
"rss_bytes": 123456789,
|
|
348
|
+
"thread_count": 20,
|
|
349
|
+
"fd_count": 128,
|
|
350
|
+
"gc": {
|
|
351
|
+
"heap_live_slots": 123456,
|
|
352
|
+
"heap_free_slots": 12345,
|
|
353
|
+
"total_allocated_objects": 1234567,
|
|
354
|
+
"major_gc_count": 10,
|
|
355
|
+
"minor_gc_count": 50
|
|
356
|
+
}
|
|
357
|
+
},
|
|
358
|
+
"system": {
|
|
359
|
+
"cpu_pct": 12.3,
|
|
360
|
+
"mem_used_bytes": 987654321,
|
|
361
|
+
"mem_total_bytes": 2147483648,
|
|
362
|
+
"mem_available_bytes": 1153433600,
|
|
363
|
+
"disk": {
|
|
364
|
+
"paths": [
|
|
365
|
+
{
|
|
366
|
+
"path": "/",
|
|
367
|
+
"disk_total_bytes": 107374182400,
|
|
368
|
+
"disk_free_bytes": 53687091200,
|
|
369
|
+
"disk_available_bytes": 53687091200
|
|
370
|
+
}
|
|
371
|
+
]
|
|
372
|
+
},
|
|
373
|
+
"net": {
|
|
374
|
+
"available": true,
|
|
375
|
+
"interfaces": [
|
|
376
|
+
{
|
|
377
|
+
"name": "eth0",
|
|
378
|
+
"rx_bytes": 123456,
|
|
379
|
+
"tx_bytes": 654321,
|
|
380
|
+
"rx_bytes_per_s": 1000.0,
|
|
381
|
+
"tx_bytes_per_s": 500.0
|
|
382
|
+
}
|
|
383
|
+
]
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
Not all fields will be present in all environments; unsupported or unavailable metrics may be `null` or omitted, and any hard failures are captured in `error_class` / `error_message` fields per section.
|
|
390
|
+
|
|
265
391
|
|
|
266
392
|
## Development
|
|
267
393
|
|
|
@@ -49,17 +49,17 @@ module DeadBro
|
|
|
49
49
|
def self.should_continue_tracking?
|
|
50
50
|
events = Thread.current[THREAD_LOCAL_KEY]
|
|
51
51
|
return false unless events
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
# Check count limit
|
|
54
54
|
return false if events.length >= MAX_TRACKED_EVENTS
|
|
55
|
-
|
|
55
|
+
|
|
56
56
|
# Check time limit
|
|
57
57
|
start_time = Thread.current[DeadBro::TRACKING_START_TIME_KEY]
|
|
58
58
|
if start_time
|
|
59
59
|
elapsed_seconds = Time.now - start_time
|
|
60
60
|
return false if elapsed_seconds >= DeadBro::MAX_TRACKING_DURATION_SECONDS
|
|
61
61
|
end
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
true
|
|
64
64
|
end
|
|
65
65
|
|
data/lib/dead_bro/client.rb
CHANGED
|
@@ -18,7 +18,7 @@ module DeadBro
|
|
|
18
18
|
|
|
19
19
|
# Check sampling rate - skip if not selected for sampling
|
|
20
20
|
return unless @configuration.should_sample?
|
|
21
|
-
|
|
21
|
+
|
|
22
22
|
# Check circuit breaker before making request
|
|
23
23
|
if @circuit_breaker && @configuration.circuit_breaker_enabled
|
|
24
24
|
if @circuit_breaker.state == :open
|
|
@@ -37,7 +37,7 @@ module DeadBro
|
|
|
37
37
|
nil
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
def
|
|
40
|
+
def post_monitor_stats(payload)
|
|
41
41
|
return if @configuration.api_key.nil?
|
|
42
42
|
return unless @configuration.enabled
|
|
43
43
|
return unless @configuration.job_queue_monitoring_enabled
|
|
@@ -55,7 +55,7 @@ module DeadBro
|
|
|
55
55
|
end
|
|
56
56
|
|
|
57
57
|
# Make the HTTP request (async) to jobs endpoint
|
|
58
|
-
|
|
58
|
+
make_monitor_request(payload, @configuration.api_key)
|
|
59
59
|
|
|
60
60
|
nil
|
|
61
61
|
end
|
|
@@ -122,10 +122,10 @@ module DeadBro
|
|
|
122
122
|
nil
|
|
123
123
|
end
|
|
124
124
|
|
|
125
|
-
def
|
|
125
|
+
def make_monitor_request(payload, api_key)
|
|
126
126
|
use_staging = ENV["USE_STAGING_ENDPOINT"] && !ENV["USE_STAGING_ENDPOINT"].empty?
|
|
127
|
-
production_url = use_staging ? "https://deadbro.aberatii.com/apm/v1/
|
|
128
|
-
endpoint_url = @configuration.ruby_dev ? "http://localhost:3100/apm/v1/
|
|
127
|
+
production_url = use_staging ? "https://deadbro.aberatii.com/apm/v1/monitor" : "https://www.deadbro.com/apm/v1/monitor"
|
|
128
|
+
endpoint_url = @configuration.ruby_dev ? "http://localhost:3100/apm/v1/monitor" : production_url
|
|
129
129
|
uri = URI.parse(endpoint_url)
|
|
130
130
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
131
131
|
http.use_ssl = (uri.scheme == "https")
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module DeadBro
|
|
5
|
+
module Collectors
|
|
6
|
+
# Database collector provides lightweight, best-effort information
|
|
7
|
+
# about the current ActiveRecord connection pool and a simple ping
|
|
8
|
+
# latency measurement.
|
|
9
|
+
module Database
|
|
10
|
+
module_function
|
|
11
|
+
|
|
12
|
+
def collect
|
|
13
|
+
return {disabled: true} unless db_enabled?
|
|
14
|
+
return {available: false} unless defined?(::ActiveRecord)
|
|
15
|
+
|
|
16
|
+
base = ::ActiveRecord::Base
|
|
17
|
+
return {available: false} unless base.respond_to?(:connection_pool) && base.connection_pool
|
|
18
|
+
|
|
19
|
+
pool = safe_connection_pool(base)
|
|
20
|
+
|
|
21
|
+
{
|
|
22
|
+
available: true,
|
|
23
|
+
pool: pool_stats(pool),
|
|
24
|
+
ping_ms: ping_ms(base)
|
|
25
|
+
}
|
|
26
|
+
rescue => e
|
|
27
|
+
{
|
|
28
|
+
error_class: e.class.name,
|
|
29
|
+
error_message: e.message.to_s[0, 500]
|
|
30
|
+
}
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def db_enabled?
|
|
34
|
+
DeadBro.configuration.respond_to?(:enable_db_stats) &&
|
|
35
|
+
DeadBro.configuration.enable_db_stats
|
|
36
|
+
rescue
|
|
37
|
+
false
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def safe_connection_pool(base)
|
|
41
|
+
if base.respond_to?(:connection_pool)
|
|
42
|
+
base.connection_pool
|
|
43
|
+
elsif base.respond_to?(:connection_handler)
|
|
44
|
+
begin
|
|
45
|
+
base.connection_handler.retrieve_connection_pool(base)
|
|
46
|
+
rescue
|
|
47
|
+
nil
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
rescue
|
|
51
|
+
nil
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def pool_stats(pool)
|
|
55
|
+
return {} unless pool
|
|
56
|
+
|
|
57
|
+
{
|
|
58
|
+
size: begin
|
|
59
|
+
safe_integer(pool.size)
|
|
60
|
+
rescue
|
|
61
|
+
nil
|
|
62
|
+
end,
|
|
63
|
+
connections: begin
|
|
64
|
+
safe_integer(pool.connections.size)
|
|
65
|
+
rescue
|
|
66
|
+
nil
|
|
67
|
+
end,
|
|
68
|
+
busy: begin
|
|
69
|
+
safe_integer(pool.respond_to?(:busy) ? pool.busy : nil)
|
|
70
|
+
rescue
|
|
71
|
+
nil
|
|
72
|
+
end,
|
|
73
|
+
dead: begin
|
|
74
|
+
safe_integer(pool.respond_to?(:dead) ? pool.dead : nil)
|
|
75
|
+
rescue
|
|
76
|
+
nil
|
|
77
|
+
end,
|
|
78
|
+
num_waiting: begin
|
|
79
|
+
safe_integer(pool.respond_to?(:num_waiting) ? pool.num_waiting : nil)
|
|
80
|
+
rescue
|
|
81
|
+
nil
|
|
82
|
+
end,
|
|
83
|
+
automatic_reconnect: pool.respond_to?(:automatic_reconnect) ? !!pool.automatic_reconnect : nil
|
|
84
|
+
}
|
|
85
|
+
rescue
|
|
86
|
+
{}
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def ping_ms(base)
|
|
90
|
+
started = current_time
|
|
91
|
+
base.connection_pool.with_connection do |conn|
|
|
92
|
+
sql = case conn.adapter_name.to_s.downcase
|
|
93
|
+
when /mysql/
|
|
94
|
+
"SELECT 1"
|
|
95
|
+
when /sqlite/
|
|
96
|
+
"SELECT 1"
|
|
97
|
+
else
|
|
98
|
+
"SELECT 1"
|
|
99
|
+
end
|
|
100
|
+
conn.select_value(sql)
|
|
101
|
+
end
|
|
102
|
+
elapsed_ms(started)
|
|
103
|
+
rescue
|
|
104
|
+
nil
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def current_time
|
|
108
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
109
|
+
rescue
|
|
110
|
+
Time.now.to_f
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def elapsed_ms(started)
|
|
114
|
+
((current_time - started) * 1000.0).round(2)
|
|
115
|
+
rescue
|
|
116
|
+
nil
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def safe_integer(value)
|
|
120
|
+
Integer(value)
|
|
121
|
+
rescue
|
|
122
|
+
nil
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "rbconfig"
|
|
5
|
+
|
|
6
|
+
module DeadBro
|
|
7
|
+
module Collectors
|
|
8
|
+
# Filesystem collector exposes disk usage information using a best-effort
|
|
9
|
+
# approach. It prefers Ruby or Sys::Filesystem APIs when available and
|
|
10
|
+
# falls back to parsing `df` output.
|
|
11
|
+
module Filesystem
|
|
12
|
+
module_function
|
|
13
|
+
|
|
14
|
+
def collect
|
|
15
|
+
paths = disk_paths
|
|
16
|
+
return {paths: []} if paths.nil? || paths.empty?
|
|
17
|
+
|
|
18
|
+
{
|
|
19
|
+
paths: paths.map { |path| stats_for_path(path) }.compact
|
|
20
|
+
}
|
|
21
|
+
rescue => e
|
|
22
|
+
{
|
|
23
|
+
error_class: e.class.name,
|
|
24
|
+
error_message: e.message.to_s[0, 500]
|
|
25
|
+
}
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def disk_paths
|
|
29
|
+
if DeadBro.configuration.respond_to?(:disk_paths)
|
|
30
|
+
DeadBro.configuration.disk_paths || ["/"]
|
|
31
|
+
else
|
|
32
|
+
["/"]
|
|
33
|
+
end
|
|
34
|
+
rescue
|
|
35
|
+
["/"]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def stats_for_path(path)
|
|
39
|
+
if defined?(Sys::Filesystem)
|
|
40
|
+
sys_filesystem_stats(path)
|
|
41
|
+
else
|
|
42
|
+
df_stats(path)
|
|
43
|
+
end
|
|
44
|
+
rescue
|
|
45
|
+
nil
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def sys_filesystem_stats(path)
|
|
49
|
+
stat = Sys::Filesystem.stat(path)
|
|
50
|
+
{
|
|
51
|
+
path: path,
|
|
52
|
+
disk_total_bytes: stat.blocks * stat.block_size,
|
|
53
|
+
disk_free_bytes: stat.blocks_available * stat.block_size,
|
|
54
|
+
disk_available_bytes: stat.blocks_available * stat.block_size
|
|
55
|
+
}
|
|
56
|
+
rescue
|
|
57
|
+
nil
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def df_stats(path)
|
|
61
|
+
# Use POSIX df when available. Output format varies slightly by platform,
|
|
62
|
+
# but we only depend on total and available in blocks.
|
|
63
|
+
output = `df -k #{Shellwords.escape(path)} 2>/dev/null`
|
|
64
|
+
lines = output.to_s.split("\n")
|
|
65
|
+
return nil if lines.size < 2
|
|
66
|
+
|
|
67
|
+
lines[0]
|
|
68
|
+
data = lines[1]
|
|
69
|
+
parts = data.split
|
|
70
|
+
# POSIX df: Filesystem 1K-blocks Used Available Use% Mounted on
|
|
71
|
+
total_kb = begin
|
|
72
|
+
Integer(parts[1])
|
|
73
|
+
rescue
|
|
74
|
+
nil
|
|
75
|
+
end
|
|
76
|
+
avail_kb = begin
|
|
77
|
+
Integer(parts[3])
|
|
78
|
+
rescue
|
|
79
|
+
nil
|
|
80
|
+
end
|
|
81
|
+
return nil unless total_kb && avail_kb
|
|
82
|
+
|
|
83
|
+
{
|
|
84
|
+
path: path,
|
|
85
|
+
disk_total_bytes: total_kb * 1024,
|
|
86
|
+
disk_free_bytes: avail_kb * 1024,
|
|
87
|
+
disk_available_bytes: avail_kb * 1024
|
|
88
|
+
}
|
|
89
|
+
rescue
|
|
90
|
+
nil
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|