roundhouse_ui 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 73ff2f1bc8db03a06168ee39d18b1de518e1a373860e1e0faafdbd08370d5eab
4
- data.tar.gz: e1bb6668de35f02e64c369d016a21af989984f604a6009efba39160bd05b5b6a
3
+ metadata.gz: 5064ee74f5aea444f5762d2f466f0b4c05d9c7af6c44a48045e6703d48b72587
4
+ data.tar.gz: 3b5a08f1769a5bd5ee66bdbe5ccd586842373b45530b95cbc4ff6340fe17796e
5
5
  SHA512:
6
- metadata.gz: a27aa6ed4d0a91982e1997a6dcb6492a734edbfcbe3c4cf1c914366b4019b8ba49e922c7bfe59a45da662493d12b196c1f5bcd5ac47f9486248458877f42bb7f
7
- data.tar.gz: faa3b64f3da583b3e361b431b8e6fa7b89b4c077a07cc981d54dfd2b20f248144c2798a1497aae4220de86fe385aca7f335b15d6b63c9d9648b22aa05f4acde5
6
+ metadata.gz: 6d3cf1e161baec06cb4b899d9f97065fb216341716719756c062a140a811bf51997748f9bcc322dc78aaa8ec36d56311decc511b70eea8d53a07a57a5b317b8c
7
+ data.tar.gz: 15a01ddaa5fa81c37c67516c369d1dac2b795b011e5dad1927b3b70aeed47b365a4938765e78f9eae910da0a39e36fe174d6170ba0226d1eeab404a700f4c218
@@ -5,6 +5,11 @@ module RoundhouseUi
5
5
  def show
6
6
  @stats = Sidekiq::Stats.new
7
7
  @queues = Sidekiq::Queue.all
8
+ @metrics = Metrics.new(stats: @stats)
9
+ @health = Health.new(stats: @stats, queues: @queues, metrics: @metrics)
10
+ # Highest-signal slices for the overview, from data we already read.
11
+ @top_errors = ErrorGroups.new(limit: 200).call.groups.first(5)
12
+ @problem_queues = @queues.select { |q| q.latency > 5 }.sort_by { |q| -q.latency }.first(5)
8
13
  end
9
14
 
10
15
  # Polled by the dashboard for live counts (same approach Sidekiq Web uses —
@@ -3,59 +3,11 @@ module RoundhouseUi
3
3
  # (job class + error class) — so one bad deploy reads as a single issue with
4
4
  # a count, not five thousand identical rows. The aggregation Sidekiq Web lacks.
5
5
  class ErrorsController < ApplicationController
6
- SCAN_LIMIT = 1_000 # cap entries scanned per pass; shown honestly in the view
7
-
8
6
  def index
9
7
  @query = params[:q].to_s.strip
10
- @scan_limit = SCAN_LIMIT
11
- @groups, @scanned, @truncated = aggregate
12
- end
13
-
14
- private
15
-
16
- def aggregate
17
- groups = {}
18
- scanned = 0
19
- truncated = false
20
-
21
- sources.each do |source, set|
22
- set.each do |entry|
23
- scanned += 1
24
- if scanned > SCAN_LIMIT
25
- truncated = true
26
- break
27
- end
28
- record(groups, source, entry)
29
- end
30
- break if truncated
31
- end
32
-
33
- list = groups.values.sort_by { |g| -g[:count] }
34
- list = list.select { |g| "#{g[:klass]} #{g[:error]}".downcase.include?(@query.downcase) } if @query.present?
35
- [ list, scanned, truncated ]
36
- end
37
-
38
- # Sidekiq's native sets, plus the sidekiq-failures `failed` set when opted in
39
- # and that gem is loaded. Its FailureSet is a Sidekiq::JobSet, so it iterates
40
- # exactly like the others — no special-casing in the aggregation above.
41
- def sources
42
- sets = { "retry" => Sidekiq::RetrySet.new, "dead" => Sidekiq::DeadSet.new }
43
- if RoundhouseUi.show_sidekiq_failures && defined?(Sidekiq::Failures::FailureSet)
44
- sets["failed"] = Sidekiq::Failures::FailureSet.new
45
- end
46
- sets
47
- end
48
-
49
- def record(groups, source, entry)
50
- error = entry.item["error_class"] || "UnknownError"
51
- group = (groups["#{entry.klass}|#{error}"] ||= {
52
- klass: entry.klass, error: error, count: 0, last_at: nil, queues: [], sources: []
53
- })
54
- group[:count] += 1
55
- group[:queues] |= [ entry.queue ]
56
- group[:sources] |= [ source ]
57
- at = entry.at
58
- group[:last_at] = at if at && (group[:last_at].nil? || at > group[:last_at])
8
+ @scan_limit = ErrorGroups::DEFAULT_SCAN_LIMIT
9
+ result = ErrorGroups.new(query: @query).call
10
+ @groups, @scanned, @truncated = result.groups, result.scanned, result.truncated
59
11
  end
60
12
  end
61
13
  end
@@ -117,12 +117,52 @@
117
117
  .rh-card .d.up { color:var(--good); } .rh-card .d.bad { color:var(--crit); }
118
118
  .rh-card .pill { display:inline-flex; align-items:center; gap:6px; font-size:13px; font-weight:600; padding:4px 10px; border-radius:7px; margin-bottom:9px; background:rgba(226,165,63,.14); color:var(--warn); }
119
119
  .rh-card .pill.ok { background:rgba(68,197,140,.14); color:var(--good); }
120
- .rh-chart-wrap { background:var(--panel); border:1px solid var(--line-soft); border-radius:12px; padding:18px 20px 10px; margin-bottom:24px; }
120
+ .rh-chart-wrap { background:var(--panel); border:1px solid var(--line-soft); border-radius:12px; padding:18px 20px 10px; margin-bottom:24px; position:relative; }
121
+ #rh-chart-tip { position:absolute; top:34px; transform:translateX(-50%); pointer-events:none; background:var(--panel-3); border:1px solid var(--line); color:var(--text); font:11px var(--mono); padding:2px 7px; border-radius:6px; white-space:nowrap; }
122
+ #rh-chart-tip[hidden] { display:none; }
121
123
  .rh-chart-wrap .top { display:flex; align-items:baseline; gap:12px; margin-bottom:6px; }
122
124
  .rh-chart-wrap h3 { font-size:13px; font-weight:600; margin:0; }
123
125
  .rh-chart-wrap .now { margin-left:auto; font-family:var(--mono); font-size:13px; color:var(--accent); }
124
126
  canvas#rh-chart { display:block; width:100%; height:90px; }
125
127
  .rh-cb { width:15px; height:15px; }
128
+
129
+ /* composite health banner */
130
+ .rh-health { background:var(--panel); border:1px solid var(--line-soft); border-left:3px solid var(--faint); border-radius:12px; margin-bottom:18px; }
131
+ .rh-health-ok { border-left-color:var(--good); }
132
+ .rh-health-warn { border-left-color:var(--warn); }
133
+ .rh-health-crit { border-left-color:var(--crit); }
134
+ .rh-health summary { display:flex; align-items:center; gap:12px; padding:14px 16px; cursor:pointer; list-style:none; }
135
+ .rh-health summary::-webkit-details-marker { display:none; }
136
+ .rh-health-dot { width:9px; height:9px; border-radius:50%; flex:none; background:var(--faint); }
137
+ .rh-health-ok .rh-health-dot { background:var(--good); }
138
+ .rh-health-warn .rh-health-dot { background:var(--warn); }
139
+ .rh-health-crit .rh-health-dot { background:var(--crit); }
140
+ .rh-health-verdict { font-weight:650; font-size:15px; }
141
+ .rh-health-ok .rh-health-verdict { color:var(--good); }
142
+ .rh-health-warn .rh-health-verdict { color:var(--warn); }
143
+ .rh-health-crit .rh-health-verdict { color:var(--crit); }
144
+ .rh-health-reason { color:var(--muted); font-size:13px; }
145
+ .rh-health-cta { margin-left:auto; font:11px var(--mono); color:var(--faint); }
146
+ .rh-health[open] .rh-health-cta { color:var(--accent); }
147
+ .rh-health-signals { padding:2px 16px 14px; display:flex; flex-direction:column; gap:8px; border-top:1px solid var(--line-soft); margin-top:2px; padding-top:12px; }
148
+ .rh-sig { display:flex; align-items:center; gap:10px; font-size:13px; }
149
+ .rh-sig b { min-width:170px; font-weight:600; }
150
+
151
+ /* insight panels */
152
+ .rh-insights { display:grid; gap:14px; margin-bottom:24px; }
153
+ @media(min-width:760px){ .rh-insights { grid-template-columns:1fr 1fr; } }
154
+ .rh-insight { padding:6px 0; }
155
+ .rh-insight-h { display:flex; align-items:baseline; justify-content:space-between; gap:10px; font-size:13px; font-weight:600; padding:13px 16px 10px; }
156
+ .rh-insight-row { display:flex; align-items:center; gap:12px; padding:10px 16px; border-top:1px solid var(--line-soft); }
157
+ .rh-insight-row .rh-insight-main { flex:1; min-width:0; font-size:13px; }
158
+ .rh-insight-row .rh-pill { font-variant-numeric:tabular-nums; }
159
+ .rh-insight .rh-empty { padding:18px 16px; }
160
+
161
+ /* collapsible disclosure (large args, full backtrace) */
162
+ .rh-disclose summary { cursor:pointer; font:12px var(--mono); color:var(--accent); padding:6px 0; list-style:none; }
163
+ .rh-disclose summary::-webkit-details-marker { display:none; }
164
+ .rh-disclose summary::before { content:"▸ "; }
165
+ .rh-disclose[open] summary::before { content:"▾ "; }
126
166
  .rh-field { margin-bottom:16px; max-width:640px; }
127
167
  .rh-field label { display:block; font-size:12px; color:var(--muted); margin-bottom:6px; }
128
168
  .rh-field input, .rh-field textarea { width:100%; background:var(--panel); border:1px solid var(--line); border-radius:9px; padding:9px 12px; color:var(--text); font:13px var(--mono); }
@@ -160,6 +200,7 @@
160
200
  var started = false, lastProcessed = null, lastFailed = null, lastBacklog = null, lastT = null;
161
201
  var POLL_MS = <%= (RoundhouseUi.poll_interval.to_f * 1000).round %>;
162
202
  var samples = [], buckets = [], bucketStart = null; // samples = current bucket; buckets = finalized per-interval averages
203
+ var lastPts = []; // points last drawn, for the hover tooltip to read
163
204
  function setText(id, t) { var el = document.getElementById(id); if (el) el.textContent = t; }
164
205
  function humanizeEta(s) {
165
206
  if (s < 60) return "~" + Math.round(s) + "s";
@@ -179,12 +220,28 @@
179
220
  // finalized buckets + the in-progress bucket as a provisional last point,
180
221
  // so the chart shows data immediately instead of waiting a full interval.
181
222
  var pts = buckets.slice(-60); if (samples.length) pts = pts.concat([ avg(samples) ]);
182
- var n = pts.length; if (n < 2) return;
183
- var max = Math.max.apply(null, pts) * 1.25 || 1;
223
+ lastPts = pts;
224
+ var n = pts.length; if (n < 2) { setText("rh-chart-peak", ""); return; }
225
+ var peak = Math.max.apply(null, pts);
226
+ var max = peak * 1.25 || 1;
184
227
  var x = function (i) { return i / (n - 1) * w; }, y = function (v) { return h - pad - v / max * (h - pad * 2); };
185
228
  var g = ctx.createLinearGradient(0, 0, 0, h); g.addColorStop(0, "rgba(110,139,255,.30)"); g.addColorStop(1, "rgba(110,139,255,0)");
186
229
  ctx.beginPath(); ctx.moveTo(0, h); pts.forEach(function (v, i) { ctx.lineTo(x(i), y(v)); }); ctx.lineTo(w, h); ctx.closePath(); ctx.fillStyle = g; ctx.fill();
187
230
  ctx.beginPath(); pts.forEach(function (v, i) { i ? ctx.lineTo(x(i), y(v)) : ctx.moveTo(x(i), y(v)); }); ctx.strokeStyle = "#6E8BFF"; ctx.lineWidth = 2; ctx.lineJoin = "round"; ctx.stroke();
231
+ // emphasized endpoint (small marker; canvas is vertically scaled, so a rect reads cleaner than an arc)
232
+ ctx.fillStyle = "#6E8BFF"; ctx.fillRect(x(n - 1) - 3, y(pts[n - 1]) - 3, 6, 6);
233
+ setText("rh-chart-peak", "peak " + Math.round(peak) + "/s");
234
+ }
235
+ // Hover tooltip: map the cursor's fractional x to the nearest drawn point.
236
// Hover tooltip handler for the throughput canvas. Maps the cursor's
// fractional x position to the nearest point last drawn (lastPts), shows that
// value in #rh-chart-tip, and moves the tooltip under the cursor.
function chartHover(e) {
  var cv = document.getElementById("rh-chart"), tip = document.getElementById("rh-chart-tip");
  if (!cv || !tip || lastPts.length < 2) return;
  var rect = cv.getBoundingClientRect();
  if (!rect.width) return; // zero-width canvas: avoid dividing by zero and showing "NaN/s"
  // Cursor x within the canvas, clamped to its bounds.
  var cx = Math.min(rect.width, Math.max(0, e.clientX - rect.left));
  var idx = Math.round(cx / rect.width * (lastPts.length - 1));
  tip.textContent = Math.round(lastPts[idx]) + "/s";
  // The tooltip is positioned against .rh-chart-wrap (position:relative), not
  // the canvas, so add the canvas's own offset inside the wrapper; without it
  // the tooltip sits left of the cursor by the wrapper's horizontal padding.
  tip.style.left = (cv.offsetLeft + cx) + "px";
  tip.hidden = false;
}
189
246
  function apply(d) {
190
247
  Object.keys(d).forEach(function (k) {
@@ -231,6 +288,11 @@
231
288
  .catch(function () {});
232
289
  }
233
290
  function startOnce() { if (started) return; started = true; poll(); setInterval(poll, POLL_MS); }
291
// Attaches the hover handlers to the throughput canvas, if the page has one.
// Handlers are assigned as properties, so calling this again (it runs on both
// turbo:load and DOMContentLoaded) replaces them instead of stacking duplicates.
function wireChart() {
  var canvas = document.getElementById("rh-chart");
  if (!canvas) return;
  canvas.onmousemove = chartHover;
  canvas.onmouseleave = function () {
    var tip = document.getElementById("rh-chart-tip");
    if (tip) tip.hidden = true;
  };
}
234
296
  function syncTheme() { var b = document.getElementById("rh-theme"); if (b) b.textContent = document.documentElement.getAttribute("data-theme") === "light" ? "☀" : "☾"; }
235
297
  function syncWidth() { var b = document.getElementById("rh-width"); if (b) b.classList.toggle("is-on", document.documentElement.getAttribute("data-width") === "full"); }
236
298
 
@@ -342,8 +404,8 @@
342
404
  else if (e.key === "Enter") { e.preventDefault(); if (palFiltered[palSel]) palRun(palFiltered[palSel]); }
343
405
  });
344
406
 
345
- document.addEventListener("turbo:load", function () { startOnce(); syncTheme(); syncWidth(); setActiveNav(); restoreChartInterval(); draw(); });
346
- document.addEventListener("DOMContentLoaded", function () { startOnce(); syncTheme(); syncWidth(); setActiveNav(); restoreChartInterval(); });
407
+ document.addEventListener("turbo:load", function () { startOnce(); syncTheme(); syncWidth(); setActiveNav(); restoreChartInterval(); wireChart(); draw(); });
408
+ document.addEventListener("DOMContentLoaded", function () { startOnce(); syncTheme(); syncWidth(); setActiveNav(); restoreChartInterval(); wireChart(); });
347
409
  document.addEventListener("visibilitychange", function () { if (!document.hidden) poll(); });
348
410
  })();
349
411
  </script>
@@ -1,49 +1,83 @@
1
1
  <% content_for :title, "Dashboard" %>
2
2
  <% content_for :crumb, Rails.env %>
3
3
 
4
- <% stuck = @queues.select { |q| q.latency > 60 } %>
5
-
6
- <% if stuck.any? %>
7
- <div class="rh-alerts">
8
- <% stuck.each do |q| %>
9
- <div class="rh-alert <%= "warn" if q.latency <= 600 %>">
10
- <span class="msg">Queue <b><%= q.name %></b> is <%= q.latency > 600 ? "stuck" : "over budget" %> — oldest job <%= distance_of_time_in_words(0, q.latency) %>, <%= number_with_delimiter q.size %> waiting</span>
11
- <%= link_to "Manage →", queues_path, class: "rh-btn" %>
4
+ <% verdict = { ok: "Healthy", warn: "Degraded", crit: "Critical" }[@health.status] %>
5
+ <details class="rh-health rh-health-<%= @health.status %>"<%= " open".html_safe unless @health.healthy? %>>
6
+ <summary>
7
+ <span class="rh-health-dot"></span>
8
+ <span class="rh-health-verdict"><%= verdict %></span>
9
+ <span class="rh-health-reason"><%= @health.reason %></span>
10
+ <span class="rh-health-cta">why ▾</span>
11
+ </summary>
12
+ <div class="rh-health-signals">
13
+ <% @health.signals.each do |s| %>
14
+ <div class="rh-sig">
15
+ <span class="rh-st rh-st-<%= s.status %>"><%= s.status %></span>
16
+ <b><%= s.label %></b>
17
+ <span class="rh-sub"><%= s.detail %></span>
12
18
  </div>
13
19
  <% end %>
14
20
  </div>
15
- <% end %>
21
+ </details>
16
22
 
17
23
  <div class="rh-cards">
18
- <div class="rh-card">
19
- <% if stuck.any? %>
20
- <span class="pill">⚠ Degraded</span>
21
- <div class="k"><b style="color:var(--warn)"><%= stuck.first.name %></b> queue over budget</div>
22
- <% else %>
23
- <span class="pill ok">✓ Healthy</span>
24
- <div class="k">all queues within budget</div>
25
- <% end %>
26
- </div>
27
24
  <div class="rh-card">
28
25
  <div class="k">Processed</div>
29
26
  <div class="v num" data-stat="processed"><%= number_with_delimiter @stats.processed %></div>
30
- <div class="d"><span class="num" data-stat="rate">—</span> / min</div>
27
+ <div class="d"><span class="num" data-stat="rate">—</span> / min · <%= ((1 - @metrics.failure_ratio) * 100).round(1) %>% ok</div>
31
28
  </div>
32
29
  <div class="rh-card">
33
30
  <div class="k">Failed · total</div>
34
31
  <div class="v num" data-stat="failed"><%= number_with_delimiter @stats.failed %></div>
35
- <div class="d bad"><span class="num" data-stat="dead"><%= number_with_delimiter @stats.dead_size %></span> dead</div>
32
+ <div class="d bad"><span class="num" data-stat="dead"><%= number_with_delimiter @stats.dead_size %></span> dead · <%= (@metrics.failure_ratio * 100).round(1) %>% rate</div>
36
33
  </div>
37
34
  <div class="rh-card">
38
35
  <div class="k">Busy threads</div>
39
36
  <div class="v num" data-stat="busy"><%= @stats.workers_size %></div>
40
- <div class="d">enqueued <span class="num" data-stat="enqueued"><%= number_with_delimiter @stats.enqueued %></span></div>
37
+ <div class="d"><%= @metrics.utilization ? "#{(@metrics.utilization * 100).round}% of #{@metrics.concurrency} busy" : "no workers reporting" %></div>
38
+ </div>
39
+ <div class="rh-card">
40
+ <div class="k">Backlog</div>
41
+ <div class="v num"><%= number_with_delimiter @metrics.backlog %></div>
42
+ <div class="d">enqueued <span class="num" data-stat="enqueued"><%= number_with_delimiter @stats.enqueued %></span> · scheduled <span class="num" data-stat="scheduled"><%= number_with_delimiter @stats.scheduled_size %></span></div>
41
43
  </div>
42
44
  </div>
43
45
 
44
46
  <div class="rh-chart-wrap">
45
- <div class="top"><h3>Throughput</h3><span class="rh-sub">jobs / sec · live</span><span class="now"><span id="rh-chart-now">—</span>/s</span><select id="rh-chart-interval" aria-label="Throughput interval" style="margin-left:12px;background:var(--panel-2);color:var(--muted);border:1px solid var(--line);border-radius:7px;padding:3px 6px;font:12px var(--sans)"><option value="10">per 10s</option><option value="30" selected>per 30s</option><option value="60">per 1m</option><option value="300">per 5m</option></select></div>
47
+ <div class="top"><h3>Throughput</h3><span class="rh-sub">jobs / sec · live</span><span class="rh-sub" id="rh-chart-peak" style="margin-left:10px"></span><span class="now"><span id="rh-chart-now">—</span>/s</span><select id="rh-chart-interval" aria-label="Throughput interval" style="margin-left:12px;background:var(--panel-2);color:var(--muted);border:1px solid var(--line);border-radius:7px;padding:3px 6px;font:12px var(--sans)"><option value="10">per 10s</option><option value="30" selected>per 30s</option><option value="60">per 1m</option><option value="300">per 5m</option></select></div>
46
48
  <canvas id="rh-chart" width="1100" height="180"></canvas>
49
+ <div id="rh-chart-tip" hidden></div>
50
+ </div>
51
+
52
+ <div class="rh-insights">
53
+ <div class="rh-panel rh-insight">
54
+ <div class="rh-insight-h">Top failing job classes <%= link_to "all errors →", errors_path, class: "rh-sub" %></div>
55
+ <% if @top_errors.empty? %>
56
+ <div class="rh-empty">No failing jobs 🎉</div>
57
+ <% else %>
58
+ <% @top_errors.each do |g| %>
59
+ <div class="rh-insight-row">
60
+ <div class="rh-insight-main"><%= link_to g[:klass], errors_path(q: g[:klass]), class: "rh-joblink" %> <%= error_trace_link(klass: g[:klass], error: g[:error]) %><br><span class="rh-err"><%= g[:error] %></span></div>
61
+ <span class="rh-pill"><%= number_with_delimiter g[:count] %></span>
62
+ </div>
63
+ <% end %>
64
+ <% end %>
65
+ </div>
66
+
67
+ <div class="rh-panel rh-insight">
68
+ <div class="rh-insight-h">Problem queues <%= link_to "all queues →", queues_path, class: "rh-sub" %></div>
69
+ <% if @problem_queues.empty? %>
70
+ <div class="rh-empty">All queues fresh — nothing backing up.</div>
71
+ <% else %>
72
+ <% @problem_queues.each do |q| %>
73
+ <% label, css = queue_state(q.latency) %>
74
+ <div class="rh-insight-row">
75
+ <div class="rh-insight-main"><%= link_to q.name, queues_path, class: "rh-joblink rh-mono" %><br><span class="rh-sub"><%= number_with_delimiter q.size %> waiting</span></div>
76
+ <span class="rh-st <%= css %>"><%= q.latency < 60 ? "#{q.latency.round(1)}s" : distance_of_time_in_words(0, q.latency) %></span>
77
+ </div>
78
+ <% end %>
79
+ <% end %>
80
+ </div>
47
81
  </div>
48
82
 
49
83
  <h2 class="rh-h2">Queues <span class="hint">live · click a queue to manage</span></h2>
@@ -9,7 +9,15 @@
9
9
  </div>
10
10
 
11
11
  <div class="rh-sec">Arguments<% if RoundhouseUi.redact_args.present? %> <span class="rh-sub">— sensitive keys masked</span><% end %></div>
12
- <pre class="rh-pre"><%= JSON.pretty_generate(RoundhouseUi::Redaction.apply(item["args"] || [])) %></pre>
12
+ <% args_json = JSON.pretty_generate(RoundhouseUi::Redaction.apply(item["args"] || [])) %>
13
+ <% if args_json.lines.size > 30 %>
14
+ <details class="rh-disclose">
15
+ <summary><%= pluralize(args_json.lines.size, "line") %> — expand arguments</summary>
16
+ <pre class="rh-pre"><%= args_json %></pre>
17
+ </details>
18
+ <% else %>
19
+ <pre class="rh-pre"><%= args_json %></pre>
20
+ <% end %>
13
21
 
14
22
  <% if item["error_class"].present? %>
15
23
  <div class="rh-sec">Error</div>
@@ -17,7 +25,10 @@
17
25
  <% if item["error_message"].present? %><p class="rh-sub" style="margin:6px 0 0"><%= item["error_message"] %></p><% end %>
18
26
  <% bt = item["error_backtrace"] %>
19
27
  <% if bt.is_a?(Array) && bt.any? %>
20
- <pre class="rh-pre"><%= bt.first(20).join("\n") %></pre>
28
+ <details class="rh-disclose"<%= " open".html_safe if bt.size <= 20 %>>
29
+ <summary><%= pluralize(bt.size, "line") %> backtrace</summary>
30
+ <pre class="rh-pre"><%= bt.join("\n") %></pre>
31
+ </details>
21
32
  <% end %>
22
33
  <% end %>
23
34
 
@@ -0,0 +1,65 @@
1
+ require "sidekiq/api"
2
+
3
module RoundhouseUi
  # Groups failing jobs across the retry + dead sets (and the sidekiq-failures
  # `failed` set, when opted in) by a fingerprint of job class + error class —
  # so one bad deploy reads as a single issue with a count, not thousands of
  # identical rows. Used by the Errors page and the dashboard's "top failing"
  # panel, so the aggregation lives here rather than in a controller.
  class ErrorGroups
    DEFAULT_SCAN_LIMIT = 1_000 # cap entries scanned per pass; surfaced honestly

    # Outcome of one aggregation pass.
    #   groups    — Array of group Hashes (:klass, :error, :count, :last_at,
    #               :queues, :sources), largest count first, filtered by query
    #   scanned   — number of entries aggregated (never more than the limit)
    #   truncated — true when the limit was reached with entries still unread
    Result = Struct.new(:groups, :scanned, :truncated, keyword_init: true)

    # query — optional case-insensitive substring matched against
    #         "<job class> <error class>"; blank means no filtering
    # limit — maximum number of entries to aggregate across all sources
    def initialize(query: nil, limit: DEFAULT_SCAN_LIMIT)
      @query = query.to_s.strip
      @limit = limit
    end

    # Scans the sources in order until the limit is reached and returns a Result.
    def call
      groups = {}
      scanned = 0
      truncated = false

      sources.each do |source, set|
        set.each do |entry|
          # Test the cap before counting, so `scanned` reports only entries that
          # were really aggregated rather than limit + 1 on a truncated pass.
          if scanned >= @limit
            truncated = true
            break
          end
          scanned += 1
          record(groups, source, entry)
        end
        break if truncated
      end

      list = groups.values.sort_by { |g| -g[:count] }
      Result.new(groups: filter(list), scanned: scanned, truncated: truncated)
    end

    private

    # Keeps only the groups matching the query. @query is already stripped, so
    # an emptiness check is enough and needs nothing beyond core Ruby.
    def filter(list)
      return list if @query.empty?

      needle = @query.downcase
      list.select { |g| "#{g[:klass]} #{g[:error]}".downcase.include?(needle) }
    end

    # Sidekiq's native sets, plus the sidekiq-failures `failed` set when opted in
    # and loaded. Its FailureSet is a Sidekiq::JobSet, so it iterates like the rest.
    def sources
      sets = { "retry" => Sidekiq::RetrySet.new, "dead" => Sidekiq::DeadSet.new }
      if RoundhouseUi.show_sidekiq_failures && defined?(Sidekiq::Failures::FailureSet)
        sets["failed"] = Sidekiq::Failures::FailureSet.new
      end
      sets
    end

    # Folds one entry into its (job class, error class) group: bumps the count,
    # remembers which queues and sources it was seen in, and keeps the most
    # recent `at` value.
    def record(groups, source, entry)
      error = entry.item["error_class"] || "UnknownError"
      group = (groups["#{entry.klass}|#{error}"] ||= {
        klass: entry.klass, error: error, count: 0, last_at: nil, queues: [], sources: []
      })
      group[:count] += 1
      group[:queues] |= [ entry.queue ]
      group[:sources] |= [ source ]
      at = entry.at
      group[:last_at] = at if at && (group[:last_at].nil? || at > group[:last_at])
    end
  end
end
@@ -0,0 +1,65 @@
1
module RoundhouseUi
  # A composite health verdict for the dashboard. Instead of a static green dot,
  # it rolls up the signals an on-call engineer actually checks — error rate,
  # queue latency, worker utilization — into one status + a human reason, and
  # exposes the sub-signals so the banner can explain *why*.
  class Health
    # One sub-signal: a machine key, a display label, a status
    # (:ok / :warn / :crit) and a human-readable detail line.
    Signal = Struct.new(:key, :label, :status, :detail, keyword_init: true)

    # Severity ordering used to pick the worst signal.
    RANK = { ok: 0, warn: 1, crit: 2 }.freeze

    # Thresholds. Error rate and utilization are ratios (0.0-1.0) compared with
    # >=; latency is in seconds and compared with a strict >.
    ERROR_RATE_WARN = 0.02
    ERROR_RATE_CRIT = 0.10
    LATENCY_WARN_SECONDS = 60
    LATENCY_CRIT_SECONDS = 600
    UTILIZATION_WARN = 0.85
    UTILIZATION_CRIT = 1.0

    # stats   — accepted and stored, but not read by any signal in this class
    # queues  — enumerable of objects responding to #name and #latency
    # metrics — object responding to #failure_ratio and #utilization
    def initialize(stats:, queues:, metrics:)
      @stats = stats
      @queues = queues
      @metrics = metrics
    end

    # The sub-signals, computed once and memoized. The utilization signal is
    # omitted when utilization is unknown.
    def signals
      @signals ||= [ error_rate_signal, latency_signal, utilization_signal ].compact
    end

    # Worst sub-signal wins.
    def status
      worst = worst_signal
      worst ? worst.status : :ok
    end

    # Detail line of the worst signal, or a nominal message when all are :ok.
    def reason
      worst = worst_signal
      return "all signals nominal" if worst.nil? || worst.status == :ok

      worst.detail
    end

    def healthy?
      status == :ok
    end

    private

    # First signal with the highest severity, or nil when there are none.
    def worst_signal
      signals.max_by { |s| RANK[s.status] }
    end

    def error_rate_signal
      ratio = @metrics.failure_ratio
      status = if ratio >= ERROR_RATE_CRIT then :crit elsif ratio >= ERROR_RATE_WARN then :warn else :ok end
      Signal.new(key: "error_rate", label: "Error rate (lifetime)", status: status,
                 detail: "#{(ratio * 100).round(1)}% of processed jobs have failed")
    end

    def latency_signal
      # Read every queue's latency exactly once and reuse that reading for the
      # status and the detail text. Re-reading the chosen queue could return a
      # different value than the one that selected it (and costs another lookup
      # when the queues are Sidekiq-backed).
      worst, lat = @queues.map { |q| [ q, q.latency ] }.max_by { |_, seconds| seconds }
      return Signal.new(key: "latency", label: "Queue latency", status: :ok, detail: "no active queues") if worst.nil?

      status = if lat > LATENCY_CRIT_SECONDS then :crit elsif lat > LATENCY_WARN_SECONDS then :warn else :ok end
      detail = status == :ok ? "all queues fresh (< 60s)" : "#{worst.name}: oldest job #{lat.round}s"
      Signal.new(key: "latency", label: "Queue latency", status: status, detail: detail)
    end

    def utilization_signal
      util = @metrics.utilization
      return nil if util.nil? # no processes reporting in — can't judge

      status = if util >= UTILIZATION_CRIT then :crit elsif util >= UTILIZATION_WARN then :warn else :ok end
      Signal.new(key: "utilization", label: "Worker utilization", status: status,
                 detail: "#{(util * 100).round}% of worker threads busy")
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RoundhouseUi
2
- VERSION = "0.6.0"
2
+ VERSION = "0.7.0"
3
3
  end
data/lib/roundhouse_ui.rb CHANGED
@@ -10,6 +10,8 @@ require "roundhouse_ui/redaction"
10
10
  require "roundhouse_ui/cancellation"
11
11
  require "roundhouse_ui/cancel_middleware"
12
12
  require "roundhouse_ui/metrics"
13
+ require "roundhouse_ui/error_groups"
14
+ require "roundhouse_ui/health"
13
15
 
14
16
  # Brand name is "Roundhouse"; the gem and Ruby namespace are RoundhouseUi
15
17
  # (matching the published gem name `roundhouse_ui`).
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: roundhouse_ui
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - R.J. Robinson
@@ -97,7 +97,9 @@ files:
97
97
  - lib/roundhouse_ui/cancel_middleware.rb
98
98
  - lib/roundhouse_ui/cancellation.rb
99
99
  - lib/roundhouse_ui/engine.rb
100
+ - lib/roundhouse_ui/error_groups.rb
100
101
  - lib/roundhouse_ui/fetch.rb
102
+ - lib/roundhouse_ui/health.rb
101
103
  - lib/roundhouse_ui/metrics.rb
102
104
  - lib/roundhouse_ui/observability.rb
103
105
  - lib/roundhouse_ui/pause.rb