solid_queue_web 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +57 -3
- data/app/controllers/solid_queue_web/dashboard_controller.rb +2 -0
- data/app/controllers/solid_queue_web/jobs_controller.rb +6 -2
- data/app/controllers/solid_queue_web/queues_controller.rb +5 -0
- data/app/services/solid_queue_web/slow_job_alert.rb +70 -0
- data/app/services/solid_queue_web/stale_process_alert.rb +68 -0
- data/app/views/solid_queue_web/jobs/index.html.erb +4 -0
- data/app/views/solid_queue_web/queues/index.html.erb +3 -3
- data/lib/solid_queue_web/version.rb +1 -1
- data/lib/solid_queue_web.rb +10 -1
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c0d9757682265e639856df718934858d99390e8ca06ff14cb575b117db105932
|
|
4
|
+
data.tar.gz: a6c0ca7e495f76467cde1aabb0f54674c1439a5489913b4fe9d95837df3acd03
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 41ff634f54cb990c14c57870835f62e091a36130c26e0ecdca0a9fbe07f6ed70b9f128a41d22b59177f03cd21497c3bda9fc17e3044e3fcb843a6031ad9ece3d
|
|
7
|
+
data.tar.gz: 15ed48fde349d58b1f2778cd9b0cf5a8e502ccd74f610a23368665f938fe36c3d66041400eeb96b3aa196d03a5dd56c7becfbd6b743b055cba60ee01e4cb6c3b
|
data/README.md
CHANGED
|
@@ -53,7 +53,8 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch
|
|
|
53
53
|
- **Dashboard quick actions** — "Retry All Failed" and "Discard All Blocked" cards appear on the dashboard only when the respective count is non-zero; one-click bulk operations with confirm dialogs, keeping the dashboard clean when everything is healthy
|
|
54
54
|
- **CSV export** — "Export CSV" button on the jobs, failed jobs, and history pages downloads all records matching the current filters; columns are tailored per view
|
|
55
55
|
- **Slow job detection** — when `slow_job_threshold` is configured, claimed jobs running longer than the threshold are flagged with an orange row, a "slow" badge, and a "Running For" duration column on the Running tab; a "Slow Jobs" warning card appears on the dashboard with a link to the Running tab
|
|
56
|
-
- **
|
|
56
|
+
- **Job wait time** — the Running tab shows a "Wait Time" column with how long each job waited in the queue from enqueue to pickup; also exported as `wait_time_seconds` in the claimed-status CSV
|
|
57
|
+
- **Webhook alerts** — set `alert_webhook_url` and `alert_failure_threshold` to receive a POST request whenever the failed job count meets or exceeds the threshold; set `alert_queue_thresholds` for per-queue depth alerts; set `alert_slow_job_count_threshold` (requires `slow_job_threshold`) for slow-job count alerts; set `alert_stale_process_threshold` for stale-worker alerts; all fire asynchronously with a configurable cooldown (default 1 h) to prevent repeated alerts
|
|
57
58
|
- **Performance analytics** — per-job-class statistics at `/jobs/performance` showing run count, average, p50, p95, p99, standard deviation, min, and max duration; sorted by p95 descending so the slowest classes surface first; high std dev surfaces inconsistent jobs worth investigating; period filter scopes to 1h / 24h / 7d or all time; each class name links to the filtered History view
|
|
58
59
|
- **Failed job trend chart** — a "Failures — Last 12 Hours" bar chart on the dashboard shows failures per hour over the last 12 hours; bars are red, making failure spikes visible before clicking into the failed jobs list
|
|
59
60
|
- **Error frequency report** — `GET /jobs/failed_jobs/errors` groups all failed jobs by error class and message prefix, shows a count per group, and surfaces a sample backtrace in an expandable row; sorted by count descending so the most common errors appear first; accessible via the "Error Summary" button on the Failed Jobs page
|
|
@@ -106,8 +107,10 @@ SolidQueueWeb.configure do |config|
|
|
|
106
107
|
config.slow_job_threshold = 5.minutes # flag claimed jobs running longer than this (default: nil = disabled)
|
|
107
108
|
config.alert_webhook_url = "https://hooks.example.com/solid-queue" # POST target — string or array (default: nil = disabled)
|
|
108
109
|
config.alert_failure_threshold = 10 # fire when failed count >= this (default: nil = disabled)
|
|
109
|
-
config.alert_queue_thresholds
|
|
110
|
-
config.
|
|
110
|
+
config.alert_queue_thresholds = { "critical" => 50, "default" => 200 } # fire when queue depth >= threshold (default: {})
|
|
111
|
+
config.alert_slow_job_count_threshold = 5 # fire when slow job count >= this (default: nil = disabled)
|
|
112
|
+
config.alert_stale_process_threshold = 1 # fire when stale process count >= this (default: nil = disabled)
|
|
113
|
+
config.alert_webhook_cooldown = 1800 # seconds between repeated alerts per alert type (default: 3600)
|
|
111
114
|
config.connects_to = { reading: :reading, writing: :writing } # read replica (default: nil)
|
|
112
115
|
config.time_zone = "America/New_York" # display timezone for all timestamps (default: nil = UTC)
|
|
113
116
|
end
|
|
@@ -182,6 +185,57 @@ The same `alert_webhook_url` endpoint(s) receive the payload, with a distinct ev
|
|
|
182
185
|
|
|
183
186
|
Cooldown is tracked independently per queue, so a persistently deep "critical" queue does not suppress alerts for "default". The shared `alert_webhook_cooldown` setting applies to each queue separately.
|
|
184
187
|
|
|
188
|
+
## Slow job alerts
|
|
189
|
+
|
|
190
|
+
Set `alert_slow_job_count_threshold` to fire a webhook when the number of currently-running slow jobs meets or exceeds a count. This requires `slow_job_threshold` to also be configured — it defines what "slow" means.
|
|
191
|
+
|
|
192
|
+
```ruby
|
|
193
|
+
SolidQueueWeb.configure do |config|
|
|
194
|
+
config.slow_job_threshold = 5.minutes # a job is "slow" if it has been claimed longer than this
|
|
195
|
+
config.alert_slow_job_count_threshold = 3 # fire when >= 3 jobs are slow
|
|
196
|
+
config.alert_webhook_url = "https://hooks.example.com/solid-queue"
|
|
197
|
+
config.alert_webhook_cooldown = 1800 # don't re-fire for 30 minutes (default: 3600)
|
|
198
|
+
end
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
The same `alert_webhook_url` endpoint(s) receive the payload with a distinct event type:
|
|
202
|
+
|
|
203
|
+
```json
|
|
204
|
+
{
|
|
205
|
+
"event": "slow_job_threshold_exceeded",
|
|
206
|
+
"slow_job_count": 5,
|
|
207
|
+
"threshold": 3,
|
|
208
|
+
"fired_at": "2026-05-28T08:00:00Z"
|
|
209
|
+
}
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
The threshold is checked on every dashboard page load; while the condition persists, the alert fires at most once per cooldown window.
|
|
213
|
+
|
|
214
|
+
## Stale process alerts
|
|
215
|
+
|
|
216
|
+
Set `alert_stale_process_threshold` to fire a webhook when the number of stale processes meets or exceeds a count. A process is considered stale when its `last_heartbeat_at` has not been updated within `SolidQueue.process_alive_threshold` (default 5 minutes). A stale worker process means jobs in its queues may have silently stopped processing.
|
|
217
|
+
|
|
218
|
+
```ruby
|
|
219
|
+
SolidQueueWeb.configure do |config|
|
|
220
|
+
config.alert_stale_process_threshold = 1 # fire when any process goes stale
|
|
221
|
+
config.alert_webhook_url = "https://hooks.example.com/solid-queue"
|
|
222
|
+
config.alert_webhook_cooldown = 1800 # don't re-fire for 30 minutes (default: 3600)
|
|
223
|
+
end
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
The same `alert_webhook_url` endpoint(s) receive the payload with a distinct event type:
|
|
227
|
+
|
|
228
|
+
```json
|
|
229
|
+
{
|
|
230
|
+
"event": "stale_process_detected",
|
|
231
|
+
"stale_process_count": 2,
|
|
232
|
+
"threshold": 1,
|
|
233
|
+
"fired_at": "2026-05-28T08:00:00Z"
|
|
234
|
+
}
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
The threshold is checked on every dashboard page load; while the condition persists, the alert fires at most once per cooldown window.
|
|
238
|
+
|
|
185
239
|
## Metrics endpoint
|
|
186
240
|
|
|
187
241
|
`GET /jobs/metrics.json` returns a machine-readable JSON document suitable for Prometheus scraping, uptime monitors, or external dashboards. No configuration is required — the endpoint is available as soon as the engine is mounted.
|
|
@@ -82,10 +82,14 @@ module SolidQueueWeb
|
|
|
82
82
|
|
|
83
83
|
# Renders the given executions as a CSV string, one row per execution.
#
# Columns: id, class_name, queue_name, status, priority, enqueued_at
# (the job's created_at in ISO 8601). When @status is "claimed" an extra
# wait_time_seconds column holds the whole seconds between the job's
# created_at and the execution's created_at.
#
# @param scope [#each] executions; each must respond to #job and #created_at
# @return [String] CSV document including the header row
def jobs_csv(scope)
  claimed = @status == "claimed"
  header_row = %w[id class_name queue_name status priority enqueued_at]
  header_row += ["wait_time_seconds"] if claimed

  CSV.generate(headers: true) do |out|
    out << header_row
    scope.each do |execution|
      record = execution.job
      waited = claimed ? [(execution.created_at - record.created_at).to_i] : []
      out << [record.id, record.class_name, record.queue_name, @status, record.priority,
              record.created_at.iso8601, *waited]
    end
  end
end
|
|
@@ -7,6 +7,11 @@ module SolidQueueWeb
|
|
|
7
7
|
@failed_24h = stats.failed_24h
|
|
8
8
|
@oldest_ready = stats.oldest_ready
|
|
9
9
|
@failure_sparklines = stats.failure_sparklines
|
|
10
|
+
@queue_sizes = SolidQueue::ReadyExecution
|
|
11
|
+
.joins(:job)
|
|
12
|
+
.group("solid_queue_jobs.queue_name")
|
|
13
|
+
.count
|
|
14
|
+
@paused_queue_names = SolidQueue::Pause.pluck(:queue_name).to_set
|
|
10
15
|
end
|
|
11
16
|
end
|
|
12
17
|
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "json"
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module SolidQueueWeb
  # Fires a webhook when the number of slow claimed jobs reaches
  # SolidQueueWeb.alert_slow_job_count_threshold.
  #
  # A claimed execution counts as slow when its created_at is at least
  # SolidQueueWeb.slow_job_threshold in the past. The time of the last alert
  # is held in a class-level instance variable guarded by MUTEX, so the
  # cooldown is tracked in memory within the current Ruby process.
  class SlowJobAlert
    MUTEX = Mutex.new

    class << self
      # Counts slow claimed executions and, when the threshold is met and the
      # cooldown has elapsed, POSTs the alert to every configured webhook URL
      # from a background thread so the caller does not wait on network I/O.
      #
      # @return [Thread, nil] the delivery thread when an alert was fired;
      #   nil when the alert is not configured, the count is below the
      #   threshold, or the cooldown is still running
      def call
        return unless configured?

        slow_count = SolidQueue::ClaimedExecution
          .where("created_at <= ?", SolidQueueWeb.slow_job_threshold.ago)
          .count

        return if slow_count < SolidQueueWeb.alert_slow_job_count_threshold
        return unless should_fire?

        urls = webhook_urls
        Thread.new { urls.each { |url| post(url, slow_count) } }
      end

      # Clears the cooldown so the next qualifying call fires immediately.
      def reset!
        MUTEX.synchronize { @last_fired_at = nil }
      end

      private

      # True when both thresholds are set and at least one webhook URL exists.
      def configured?
        SolidQueueWeb.slow_job_threshold.present? &&
          SolidQueueWeb.alert_slow_job_count_threshold.present? &&
          webhook_urls.any?
      end

      # Normalises alert_webhook_url (string, array or nil) to a list of
      # non-blank URL strings.
      def webhook_urls
        Array(SolidQueueWeb.alert_webhook_url).flatten.compact.select(&:present?)
      end

      # Returns true and stamps the firing time when no alert was sent within
      # the last alert_webhook_cooldown seconds; false otherwise. The stamp is
      # taken before delivery, so a failed POST still starts the cooldown.
      def should_fire?
        MUTEX.synchronize do
          cooldown = SolidQueueWeb.alert_webhook_cooldown
          return false if @last_fired_at && Time.current - @last_fired_at < cooldown

          @last_fired_at = Time.current
          true
        end
      end

      # POSTs the JSON alert payload to a single webhook URL. Any error is
      # logged and swallowed so one bad endpoint cannot break the others.
      def post(url_string, slow_count)
        uri = URI.parse(url_string)
        payload = JSON.generate(
          event: "slow_job_threshold_exceeded",
          slow_job_count: slow_count,
          threshold: SolidQueueWeb.alert_slow_job_count_threshold,
          fired_at: Time.current.iso8601
        )
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.open_timeout = 5
        http.read_timeout = 10
        # request_uri is the path plus "?query" ("/" for an empty path), so
        # webhook URLs that carry a token in the query string keep it.
        request = Net::HTTP::Post.new(uri.request_uri, "Content-Type" => "application/json")
        request.body = payload
        http.request(request)
      rescue => e
        Rails.logger.error("[SolidQueueWeb] Slow job alert webhook failed: #{e.message}")
      end
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "json"
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module SolidQueueWeb
  # Fires a webhook when the number of stale Solid Queue processes reaches
  # SolidQueueWeb.alert_stale_process_threshold.
  #
  # A process row counts as stale when its last_heartbeat_at is older than
  # SolidQueue.process_alive_threshold. The time of the last alert is held in
  # a class-level instance variable guarded by MUTEX, so the cooldown is
  # tracked in memory within the current Ruby process.
  class StaleProcessAlert
    MUTEX = Mutex.new

    class << self
      # Counts stale processes and, when the threshold is met and the cooldown
      # has elapsed, POSTs the alert to every configured webhook URL from a
      # background thread so the caller does not wait on network I/O.
      #
      # @return [Thread, nil] the delivery thread when an alert was fired;
      #   nil when the alert is not configured, the count is below the
      #   threshold, or the cooldown is still running
      def call
        return unless configured?

        stale_count = SolidQueue::Process
          .where("last_heartbeat_at < ?", SolidQueue.process_alive_threshold.ago)
          .count

        return if stale_count < SolidQueueWeb.alert_stale_process_threshold
        return unless should_fire?

        urls = webhook_urls
        Thread.new { urls.each { |url| post(url, stale_count) } }
      end

      # Clears the cooldown so the next qualifying call fires immediately.
      def reset!
        MUTEX.synchronize { @last_fired_at = nil }
      end

      private

      # True when the threshold is set and at least one webhook URL exists.
      def configured?
        SolidQueueWeb.alert_stale_process_threshold.present? && webhook_urls.any?
      end

      # Normalises alert_webhook_url (string, array or nil) to a list of
      # non-blank URL strings.
      def webhook_urls
        Array(SolidQueueWeb.alert_webhook_url).flatten.compact.select(&:present?)
      end

      # Returns true and stamps the firing time when no alert was sent within
      # the last alert_webhook_cooldown seconds; false otherwise. The stamp is
      # taken before delivery, so a failed POST still starts the cooldown.
      def should_fire?
        MUTEX.synchronize do
          cooldown = SolidQueueWeb.alert_webhook_cooldown
          return false if @last_fired_at && Time.current - @last_fired_at < cooldown

          @last_fired_at = Time.current
          true
        end
      end

      # POSTs the JSON alert payload to a single webhook URL. Any error is
      # logged and swallowed so one bad endpoint cannot break the others.
      def post(url_string, stale_count)
        uri = URI.parse(url_string)
        payload = JSON.generate(
          event: "stale_process_detected",
          stale_process_count: stale_count,
          threshold: SolidQueueWeb.alert_stale_process_threshold,
          fired_at: Time.current.iso8601
        )
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.open_timeout = 5
        http.read_timeout = 10
        # request_uri is the path plus "?query" ("/" for an empty path), so
        # webhook URLs that carry a token in the query string keep it.
        request = Net::HTTP::Post.new(uri.request_uri, "Content-Type" => "application/json")
        request.body = payload
        http.request(request)
      rescue => e
        Rails.logger.error("[SolidQueueWeb] Stale process alert webhook failed: #{e.message}")
      end
    end
  end
end
|
|
@@ -164,6 +164,7 @@
|
|
|
164
164
|
<%= sort_header_th("Enqueued At", "created_at", sort_url, current_sort: @sort, current_dir: @direction) %>
|
|
165
165
|
<% if @status == "claimed" %>
|
|
166
166
|
<th scope="col">Running For</th>
|
|
167
|
+
<th scope="col">Wait Time</th>
|
|
167
168
|
<% end %>
|
|
168
169
|
</tr>
|
|
169
170
|
</thead>
|
|
@@ -192,6 +193,9 @@
|
|
|
192
193
|
<td class="sqd-mono<%= slow ? " sqd-slow-duration" : "" %>">
|
|
193
194
|
<%= time_ago_in_words(execution.created_at) %>
|
|
194
195
|
</td>
|
|
196
|
+
<td class="sqd-mono">
|
|
197
|
+
<%= format_duration(execution.created_at - job.created_at) %>
|
|
198
|
+
</td>
|
|
195
199
|
<% end %>
|
|
196
200
|
</tr>
|
|
197
201
|
<% end %>
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
<% @queues.each do |queue| %>
|
|
22
22
|
<tr>
|
|
23
23
|
<td class="sqd-mono"><%= queue.name %></td>
|
|
24
|
-
<td><%= queue.
|
|
24
|
+
<td><%= @queue_sizes[queue.name] || 0 %></td>
|
|
25
25
|
<td>
|
|
26
26
|
<% if (oldest = @oldest_ready[queue.name]) %>
|
|
27
27
|
<% age = Time.current - oldest %>
|
|
@@ -52,14 +52,14 @@
|
|
|
52
52
|
<% end %>
|
|
53
53
|
</td>
|
|
54
54
|
<td>
|
|
55
|
-
<% if queue.
|
|
55
|
+
<% if @paused_queue_names.include?(queue.name) %>
|
|
56
56
|
<span class="sqd-badge sqd-badge--paused">Paused</span>
|
|
57
57
|
<% else %>
|
|
58
58
|
<span class="sqd-badge sqd-badge--running">Running</span>
|
|
59
59
|
<% end %>
|
|
60
60
|
</td>
|
|
61
61
|
<td class="sqd-row-actions">
|
|
62
|
-
<% if queue.
|
|
62
|
+
<% if @paused_queue_names.include?(queue.name) %>
|
|
63
63
|
<%= button_to "Resume", queue_pause_path(queue.name), method: :delete,
|
|
64
64
|
class: "sqd-btn sqd-btn--primary sqd-btn--sm" %>
|
|
65
65
|
<% else %>
|
data/lib/solid_queue_web.rb
CHANGED
|
@@ -6,7 +6,8 @@ module SolidQueueWeb
|
|
|
6
6
|
class << self
|
|
7
7
|
attr_writer :page_size, :dashboard_refresh_interval, :default_refresh_interval, :search_results_limit,
|
|
8
8
|
:slow_job_threshold, :alert_webhook_url, :alert_failure_threshold, :alert_webhook_cooldown,
|
|
9
|
-
:alert_queue_thresholds, :
|
|
9
|
+
:alert_queue_thresholds, :alert_slow_job_count_threshold, :alert_stale_process_threshold,
|
|
10
|
+
:connects_to, :time_zone
|
|
10
11
|
|
|
11
12
|
def page_size
|
|
12
13
|
@page_size || 25
|
|
@@ -44,6 +45,14 @@ module SolidQueueWeb
|
|
|
44
45
|
@alert_queue_thresholds || {}
|
|
45
46
|
end
|
|
46
47
|
|
|
48
|
+
# Returns the value assigned through the alert_slow_job_count_threshold
# writer, or nil when none was set (no default is applied here).
def alert_slow_job_count_threshold
|
|
49
|
+
@alert_slow_job_count_threshold
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Returns the value assigned through the alert_stale_process_threshold
# writer, or nil when none was set (no default is applied here).
def alert_stale_process_threshold
|
|
53
|
+
@alert_stale_process_threshold
|
|
54
|
+
end
|
|
55
|
+
|
|
47
56
|
# Returns the value assigned through the connects_to writer, or nil when
# none was set (no default is applied here).
def connects_to
|
|
48
57
|
@connects_to
|
|
49
58
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: solid_queue_web
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.
|
|
4
|
+
version: 1.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chuck Smith
|
|
@@ -159,6 +159,8 @@ files:
|
|
|
159
159
|
- app/services/solid_queue_web/metrics_payload.rb
|
|
160
160
|
- app/services/solid_queue_web/queue_depth_alert.rb
|
|
161
161
|
- app/services/solid_queue_web/queue_stats.rb
|
|
162
|
+
- app/services/solid_queue_web/slow_job_alert.rb
|
|
163
|
+
- app/services/solid_queue_web/stale_process_alert.rb
|
|
162
164
|
- app/views/layouts/solid_queue_web/application.html.erb
|
|
163
165
|
- app/views/solid_queue_web/dashboard/index.html.erb
|
|
164
166
|
- app/views/solid_queue_web/failed_jobs/errors/index.html.erb
|