solid_queue_autoscaler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +189 -0
- data/LICENSE.txt +21 -0
- data/README.md +553 -0
- data/lib/generators/solid_queue_autoscaler/dashboard_generator.rb +54 -0
- data/lib/generators/solid_queue_autoscaler/install_generator.rb +21 -0
- data/lib/generators/solid_queue_autoscaler/migration_generator.rb +29 -0
- data/lib/generators/solid_queue_autoscaler/templates/README +41 -0
- data/lib/generators/solid_queue_autoscaler/templates/create_solid_queue_autoscaler_events.rb.erb +24 -0
- data/lib/generators/solid_queue_autoscaler/templates/create_solid_queue_autoscaler_state.rb.erb +15 -0
- data/lib/generators/solid_queue_autoscaler/templates/initializer.rb +58 -0
- data/lib/solid_queue_autoscaler/adapters/base.rb +102 -0
- data/lib/solid_queue_autoscaler/adapters/heroku.rb +93 -0
- data/lib/solid_queue_autoscaler/adapters/kubernetes.rb +158 -0
- data/lib/solid_queue_autoscaler/adapters.rb +57 -0
- data/lib/solid_queue_autoscaler/advisory_lock.rb +71 -0
- data/lib/solid_queue_autoscaler/autoscale_job.rb +71 -0
- data/lib/solid_queue_autoscaler/configuration.rb +269 -0
- data/lib/solid_queue_autoscaler/cooldown_tracker.rb +153 -0
- data/lib/solid_queue_autoscaler/dashboard/engine.rb +136 -0
- data/lib/solid_queue_autoscaler/dashboard/views/layouts/solid_queue_heroku_autoscaler/dashboard/application.html.erb +206 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/dashboard/index.html.erb +138 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/events/index.html.erb +102 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/workers/index.html.erb +106 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/workers/show.html.erb +209 -0
- data/lib/solid_queue_autoscaler/dashboard.rb +99 -0
- data/lib/solid_queue_autoscaler/decision_engine.rb +228 -0
- data/lib/solid_queue_autoscaler/errors.rb +44 -0
- data/lib/solid_queue_autoscaler/metrics.rb +172 -0
- data/lib/solid_queue_autoscaler/railtie.rb +179 -0
- data/lib/solid_queue_autoscaler/scale_event.rb +292 -0
- data/lib/solid_queue_autoscaler/scaler.rb +294 -0
- data/lib/solid_queue_autoscaler/version.rb +5 -0
- data/lib/solid_queue_autoscaler.rb +108 -0
- metadata +179 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
<%#
  Detail page for a single autoscaled worker.

  Reads:
    @worker - Hash with :name, :enabled, :dry_run, :current_workers,
              :min_workers, :max_workers, :process_type, :scaling_strategy,
              :queues, plus nested :metrics, :thresholds and :cooldowns hashes.
              NOTE(review): these keys match what Dashboard.worker_status
              returns; presumably the controller builds it from there - confirm.
    @events - collection of scale events; each responds to created_at, action,
              dry_run, scaled?, from_workers, to_workers, queue_depth,
              latency_seconds and reason.

  Calls the events_available? helper and the workers_path, scale_worker_path
  and events_path route helpers, all defined outside this template.
%>
<div class="d-flex justify-between align-center mb-3">
  <h2>
    Worker: <%= @worker[:name] %>
    <% if @worker[:enabled] %>
      <span class="badge badge-success">Active</span>
    <% else %>
      <span class="badge badge-neutral">Disabled</span>
    <% end %>
    <% if @worker[:dry_run] %>
      <span class="badge badge-warning">Dry Run</span>
    <% end %>
  </h2>
  <div class="d-flex gap-2">
    <%= link_to '← Back', workers_path, class: 'btn btn-secondary btn-sm' %>
    <%= button_to 'Scale Now', scale_worker_path(@worker[:name]), method: :post, class: 'btn btn-primary btn-sm', disabled: !@worker[:enabled] %>
  </div>
</div>

<div class="grid grid-2 mb-3">
  <!-- Current Status -->
  <div class="card">
    <h3>Current Status</h3>

    <div class="d-flex justify-between align-center mb-2">
      <div>
        <strong class="text-info" style="font-size: 2rem;"><%= @worker[:current_workers] %></strong>
        <span class="text-muted">/ <%= @worker[:max_workers] %> workers</span>
      </div>
      <div class="text-muted">
        Min: <%= @worker[:min_workers] %> | Max: <%= @worker[:max_workers] %>
      </div>
    </div>

    <div class="progress-bar">
      <%# A zero or missing max would make the division yield Infinity/NaN (Float#round then raises), so fall back to an empty bar; the result is clamped to the 0..100 range so the fill never overflows its track. %>
      <% max_workers = @worker[:max_workers].to_i %>
      <% pct = max_workers.positive? ? (@worker[:current_workers].to_f / max_workers * 100).round.clamp(0, 100) : 0 %>
      <div class="fill" style="width: <%= pct %>%; background: <%= pct > 80 ? 'var(--warning)' : 'var(--success)' %>;"></div>
    </div>

    <div class="grid grid-2 mt-2">
      <div>
        <div class="text-muted">Process Type</div>
        <code><%= @worker[:process_type] %></code>
      </div>
      <div>
        <div class="text-muted">Strategy</div>
        <code><%= @worker[:scaling_strategy] %></code>
      </div>
      <div>
        <div class="text-muted">Queues</div>
        <code><%= Array(@worker[:queues]).join(', ') %></code>
      </div>
    </div>
  </div>

  <!-- Metrics -->
  <div class="card">
    <h3>Current Metrics</h3>

    <div class="grid grid-2">
      <div class="mb-2">
        <div class="text-muted">Queue Depth</div>
        <strong class="<%= @worker[:metrics][:queue_depth] > @worker[:thresholds][:scale_up_queue_depth] ? 'text-danger' : '' %>" style="font-size: 1.5rem;">
          <%= @worker[:metrics][:queue_depth] %>
        </strong>
        <div class="text-muted" style="font-size: 0.8rem;">
          ↑ threshold: <%= @worker[:thresholds][:scale_up_queue_depth] %> | ↓ threshold: <%= @worker[:thresholds][:scale_down_queue_depth] %>
        </div>
      </div>

      <div class="mb-2">
        <div class="text-muted">Oldest Job Age</div>
        <strong class="<%= @worker[:metrics][:latency_seconds] > @worker[:thresholds][:scale_up_latency] ? 'text-danger' : '' %>" style="font-size: 1.5rem;">
          <%= @worker[:metrics][:latency_seconds].round %>s
        </strong>
        <div class="text-muted" style="font-size: 0.8rem;">
          ↑ threshold: <%= @worker[:thresholds][:scale_up_latency] %>s | ↓ threshold: <%= @worker[:thresholds][:scale_down_latency] %>s
        </div>
      </div>

      <div>
        <div class="text-muted">Jobs/Minute</div>
        <strong style="font-size: 1.5rem;"><%= @worker[:metrics][:jobs_per_minute] %></strong>
      </div>

      <div>
        <div class="text-muted">Claimed Jobs</div>
        <strong style="font-size: 1.5rem;"><%= @worker[:metrics][:claimed_jobs] %></strong>
      </div>

      <div>
        <div class="text-muted">Failed Jobs</div>
        <strong class="<%= @worker[:metrics][:failed_jobs] > 0 ? 'text-danger' : '' %>" style="font-size: 1.5rem;">
          <%= @worker[:metrics][:failed_jobs] %>
        </strong>
      </div>

      <div>
        <div class="text-muted">Active Workers</div>
        <strong style="font-size: 1.5rem;"><%= @worker[:metrics][:active_workers] %></strong>
      </div>
    </div>
  </div>
</div>

<!-- Cooldowns -->
<div class="card mb-3">
  <h3>Cooldown Status</h3>

  <div class="grid grid-2">
    <div>
      <div class="text-muted mb-1">Scale Up Cooldown</div>
      <% if @worker[:cooldowns][:scale_up_remaining] > 0 %>
        <span class="badge badge-info">
          <%= @worker[:cooldowns][:scale_up_remaining] %>s remaining
        </span>
      <% else %>
        <span class="text-success">Ready</span>
      <% end %>
      <% if @worker[:cooldowns][:last_scale_up] %>
        <div class="text-muted" style="font-size: 0.8rem;">
          Last: <%= @worker[:cooldowns][:last_scale_up].strftime('%Y-%m-%d %H:%M:%S') %>
        </div>
      <% end %>
    </div>

    <div>
      <div class="text-muted mb-1">Scale Down Cooldown</div>
      <% if @worker[:cooldowns][:scale_down_remaining] > 0 %>
        <span class="badge badge-warning">
          <%= @worker[:cooldowns][:scale_down_remaining] %>s remaining
        </span>
      <% else %>
        <span class="text-success">Ready</span>
      <% end %>
      <% if @worker[:cooldowns][:last_scale_down] %>
        <div class="text-muted" style="font-size: 0.8rem;">
          Last: <%= @worker[:cooldowns][:last_scale_down].strftime('%Y-%m-%d %H:%M:%S') %>
        </div>
      <% end %>
    </div>
  </div>
</div>

<!-- Recent Events -->
<div class="card">
  <div class="d-flex justify-between align-center mb-2">
    <h3>Recent Events</h3>
    <%= link_to 'View All', events_path(worker: @worker[:name]), class: 'btn btn-secondary btn-sm' %>
  </div>

  <% if !events_available? %>
    <p class="text-muted">
      Events table not found. Run:
      <code>rails generate solid_queue_autoscaler:dashboard</code>
    </p>
  <% elsif @events.empty? %>
    <p class="text-muted">No scale events recorded for this worker.</p>
  <% else %>
    <table>
      <thead>
        <tr>
          <th>Time</th>
          <th>Action</th>
          <th>Workers</th>
          <th>Queue Depth</th>
          <th>Latency</th>
          <th>Reason</th>
        </tr>
      </thead>
      <tbody>
        <% @events.each do |event| %>
          <tr>
            <td>
              <div><%= event.created_at.strftime('%Y-%m-%d') %></div>
              <div class="time-ago"><%= event.created_at.strftime('%H:%M:%S') %></div>
            </td>
            <td>
              <%# Keep case/when tags on their own lines: any literal text emitted between them would be a Ruby syntax error. %>
              <% case event.action %>
              <% when 'scale_up' %>
                <span class="badge badge-success">↑ Scale Up</span>
              <% when 'scale_down' %>
                <span class="badge badge-warning">↓ Scale Down</span>
              <% when 'no_change' %>
                <span class="badge badge-neutral">— No Change</span>
              <% when 'skipped' %>
                <span class="badge badge-info">⏭ Skipped</span>
              <% when 'error' %>
                <span class="badge badge-danger">✕ Error</span>
              <% end %>
              <% if event.dry_run %>
                <span class="badge badge-neutral">DRY</span>
              <% end %>
            </td>
            <td>
              <% if event.scaled? %>
                <strong><%= event.from_workers %></strong> → <strong><%= event.to_workers %></strong>
              <% else %>
                <%= event.from_workers %>
              <% end %>
            </td>
            <td><%= event.queue_depth %></td>
            <td><%= event.latency_seconds.round %>s</td>
            <td class="text-muted"><%= event.reason %></td>
          </tr>
        <% end %>
      </tbody>
    </table>
  <% end %>
</div>
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'dashboard/engine' if defined?(Rails::Engine)
|
|
4
|
+
|
|
5
|
+
module SolidQueueAutoscaler
  # Dashboard module provides a web UI for monitoring the autoscaler.
  # Integrates with Mission Control Solid Queue when available.
  module Dashboard
    class << self
      # Returns current autoscaler status for all workers.
      #
      # Falls back to the single :default worker when no workers are registered.
      #
      # @return [Hash] Worker name => status hash (see #worker_status)
      def status
        names = SolidQueueAutoscaler.registered_workers
        names = [:default] if names.empty?

        names.each_with_object({}) do |name, result|
          result[name] = worker_status(name)
        end
      end

      # Returns status for a specific worker.
      #
      # Metrics and the current worker count are fetched best-effort: when
      # either lookup raises, the corresponding values are reported as 0
      # instead of propagating the error.
      #
      # @param name [Symbol] Worker name
      # @return [Hash] Status information with :metrics, :cooldowns and
      #   :thresholds sub-hashes
      def worker_status(name)
        config = SolidQueueAutoscaler.config(name)
        metrics = safe_metrics(name)
        tracker = CooldownTracker.new(config: config, key: name.to_s)

        {
          name: name,
          enabled: config.enabled?,
          dry_run: config.dry_run?,
          current_workers: safe_current_workers(name),
          min_workers: config.min_workers,
          max_workers: config.max_workers,
          queues: config.queues || ['all'],
          process_type: config.process_type,
          scaling_strategy: config.scaling_strategy,
          metrics: {
            queue_depth: metrics&.queue_depth || 0,
            latency_seconds: metrics&.oldest_job_age_seconds || 0,
            jobs_per_minute: metrics&.jobs_per_minute || 0,
            claimed_jobs: metrics&.claimed_jobs || 0,
            failed_jobs: metrics&.failed_jobs || 0,
            active_workers: metrics&.active_workers || 0
          },
          cooldowns: {
            scale_up_remaining: tracker.scale_up_cooldown_remaining.round,
            scale_down_remaining: tracker.scale_down_cooldown_remaining.round,
            last_scale_up: tracker.last_scale_up_at,
            last_scale_down: tracker.last_scale_down_at
          },
          thresholds: {
            scale_up_queue_depth: config.scale_up_queue_depth,
            scale_up_latency: config.scale_up_latency_seconds,
            scale_down_queue_depth: config.scale_down_queue_depth,
            scale_down_latency: config.scale_down_latency_seconds
          }
        }
      end

      # Returns recent scale events.
      #
      # @param limit [Integer] Maximum events to return
      # @param worker_name [String, nil] Filter by worker
      # @return [Array<ScaleEvent>] Recent events
      def recent_events(limit: 50, worker_name: nil)
        ScaleEvent.recent(limit: limit, worker_name: worker_name)
      end

      # Returns event statistics.
      #
      # @param since [Time, nil] Start time; defaults to 24 hours ago when
      #   omitted or nil
      # @param worker_name [String, nil] Filter by worker
      # @return [Hash] Statistics
      def event_stats(since: nil, worker_name: nil)
        ScaleEvent.stats(since: since || default_stats_since, worker_name: worker_name)
      end

      # Checks if the events table is available.
      #
      # @return [Boolean] True if events can be recorded
      def events_table_available?
        ScaleEvent.table_exists?
      end

      private

      # Start of the default statistics window (24 hours ago).
      #
      # Uses ActiveSupport's duration helper when it is loaded and a plain
      # Time otherwise, so the default does not raise NoMethodError when the
      # numeric extensions are unavailable.
      def default_stats_since
        return 24.hours.ago if 24.respond_to?(:hours)

        Time.now - (24 * 60 * 60)
      end

      # Best-effort metrics lookup; nil when collection fails for any reason.
      def safe_metrics(name)
        SolidQueueAutoscaler.metrics(name)
      rescue StandardError
        nil
      end

      # Best-effort worker count; 0 when the lookup fails for any reason.
      def safe_current_workers(name)
        SolidQueueAutoscaler.current_workers(name)
      rescue StandardError
        0
      end
    end
  end
end
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidQueueAutoscaler
  # Turns a metrics snapshot and the current worker count into a scaling
  # Decision, using the thresholds, limits and strategy from the configuration.
  #
  # Scale-up is considered first: it triggers when queue depth OR oldest-job
  # latency reaches its scale-up threshold (and the worker count is below
  # max_workers). Scale-down triggers when depth AND latency are both at or
  # below their scale-down thresholds, or the queue is idle (and the worker
  # count is above min_workers).
  class DecisionEngine
    # Result of one evaluation: the action (:scale_up, :scale_down or
    # :no_change), the worker counts before and after, and a readable reason.
    Decision = Struct.new(:action, :from, :to, :reason, keyword_init: true) do
      def scale_up?
        action == :scale_up
      end

      def scale_down?
        action == :scale_down
      end

      def no_change?
        action == :no_change
      end

      # Signed change in worker count (negative when scaling down).
      def delta
        to - from
      end
    end

    # @param config [Object, nil] configuration to use; falls back to
    #   SolidQueueAutoscaler.config when nil
    def initialize(config: nil)
      @config = config || SolidQueueAutoscaler.config
    end

    # Evaluates the metrics and returns what should happen to the worker count.
    #
    # @param metrics [#queue_depth, #oldest_job_age_seconds, #idle?] snapshot
    # @param current_workers [Integer] number of workers currently running
    # @return [Decision]
    def decide(metrics:, current_workers:)
      return no_change_decision(current_workers, 'Autoscaler is disabled') unless @config.enabled?

      if should_scale_up?(metrics, current_workers)
        scale_up_decision(metrics, current_workers)
      elsif should_scale_down?(metrics, current_workers)
        scale_down_decision(metrics, current_workers)
      else
        no_change_decision(current_workers, determine_no_change_reason(metrics, current_workers))
      end
    end

    private

    def queue_depth_high?(metrics)
      metrics.queue_depth >= @config.scale_up_queue_depth
    end

    def latency_high?(metrics)
      metrics.oldest_job_age_seconds >= @config.scale_up_latency_seconds
    end

    def queue_depth_low?(metrics)
      metrics.queue_depth <= @config.scale_down_queue_depth
    end

    def latency_low?(metrics)
      metrics.oldest_job_age_seconds <= @config.scale_down_latency_seconds
    end

    # True when the metrics alone call for more workers (ignores max_workers).
    def scale_up_signal?(metrics)
      queue_depth_high?(metrics) || latency_high?(metrics)
    end

    # True when the metrics alone call for fewer workers (ignores min_workers).
    def scale_down_signal?(metrics)
      (queue_depth_low?(metrics) && latency_low?(metrics)) || metrics.idle?
    end

    def should_scale_up?(metrics, current_workers)
      return false if current_workers >= @config.max_workers

      scale_up_signal?(metrics)
    end

    def should_scale_down?(metrics, current_workers)
      return false if current_workers <= @config.min_workers

      scale_down_signal?(metrics)
    end

    def scale_up_decision(metrics, current_workers)
      target = calculate_scale_up_target(metrics, current_workers)

      Decision.new(
        action: :scale_up,
        from: current_workers,
        to: target,
        reason: build_scale_up_reason(metrics, current_workers, target)
      )
    end

    def scale_down_decision(metrics, current_workers)
      target = calculate_scale_down_target(metrics, current_workers)

      Decision.new(
        action: :scale_down,
        from: current_workers,
        to: target,
        reason: build_scale_down_reason(metrics, current_workers, target)
      )
    end

    # Strategy-specific scale-up target, capped at max_workers.
    def calculate_scale_up_target(metrics, current_workers)
      raw_target = case @config.scaling_strategy
                   when :proportional
                     calculate_proportional_scale_up_target(metrics, current_workers)
                   when :step_function
                     calculate_step_function_target(metrics, current_workers)
                   else # :fixed
                     current_workers + @config.scale_up_increment
                   end

      [raw_target, @config.max_workers].min
    end

    # Strategy-specific scale-down target, floored at min_workers.
    def calculate_scale_down_target(metrics, current_workers)
      raw_target = case @config.scaling_strategy
                   when :proportional
                     calculate_proportional_scale_down_target(metrics, current_workers)
                   when :step_function
                     calculate_step_function_target(metrics, current_workers)
                   else # :fixed
                     current_workers - @config.scale_down_decrement
                   end

      [raw_target, @config.min_workers].max
    end

    def calculate_proportional_scale_up_target(metrics, current_workers)
      # Calculate workers needed based on queue depth
      jobs_over_threshold = [metrics.queue_depth - @config.scale_up_queue_depth, 0].max
      workers_for_depth = (jobs_over_threshold.to_f / @config.scale_up_jobs_per_worker).ceil

      # Calculate workers needed based on latency.
      # Convert to Float first: with Integer operands `/` truncates, which
      # would make the following ceil a no-op and under-provision workers.
      latency_over_threshold = [metrics.oldest_job_age_seconds - @config.scale_up_latency_seconds, 0].max
      workers_for_latency = (latency_over_threshold.to_f / @config.scale_up_latency_per_worker).ceil

      # Take the higher of the two calculations
      additional_workers = [workers_for_depth, workers_for_latency].max

      # Always add at least scale_up_increment if we're scaling up
      additional_workers = [@config.scale_up_increment, additional_workers].max

      current_workers + additional_workers
    end

    def calculate_proportional_scale_down_target(metrics, current_workers)
      # If idle, scale down aggressively
      return @config.min_workers if metrics.idle?

      # Calculate how much capacity we have based on queue depth
      jobs_under_capacity = [@config.scale_down_queue_depth - metrics.queue_depth, 0].max
      workers_to_remove = (jobs_under_capacity.to_f / @config.scale_down_jobs_per_worker).floor

      # Ensure we remove at least scale_down_decrement if we're scaling down
      workers_to_remove = [@config.scale_down_decrement, workers_to_remove].max

      current_workers - workers_to_remove
    end

    def calculate_step_function_target(metrics, current_workers)
      # Step function uses fixed thresholds (future implementation)
      # For now, fall back to fixed strategy
      if should_scale_up?(metrics, current_workers)
        current_workers + @config.scale_up_increment
      else
        current_workers - @config.scale_down_decrement
      end
    end

    def no_change_decision(current_workers, reason)
      Decision.new(
        action: :no_change,
        from: current_workers,
        to: current_workers,
        reason: reason
      )
    end

    # Describes which scale-up thresholds were reached; for the proportional
    # strategy the number of workers added is appended.
    def build_scale_up_reason(metrics, current_workers = nil, target = nil)
      reasons = []

      if queue_depth_high?(metrics)
        reasons << "queue_depth=#{metrics.queue_depth} >= #{@config.scale_up_queue_depth}"
      end

      if latency_high?(metrics)
        reasons << "latency=#{metrics.oldest_job_age_seconds.round}s >= #{@config.scale_up_latency_seconds}s"
      end

      base_reason = reasons.join(', ')

      if @config.scaling_strategy == :proportional && current_workers && target
        delta = target - current_workers
        "#{base_reason} [proportional: +#{delta} workers]"
      else
        base_reason
      end
    end

    # Describes why we are scaling down (idle queue, or which scale-down
    # thresholds were met); for the proportional strategy the number of
    # workers removed is appended.
    def build_scale_down_reason(metrics, current_workers = nil, target = nil)
      if metrics.idle?
        base_reason = 'queue is idle (no pending or claimed jobs)'
      else
        reasons = []

        if queue_depth_low?(metrics)
          reasons << "queue_depth=#{metrics.queue_depth} <= #{@config.scale_down_queue_depth}"
        end

        if latency_low?(metrics)
          reasons << "latency=#{metrics.oldest_job_age_seconds.round}s <= #{@config.scale_down_latency_seconds}s"
        end

        base_reason = reasons.join(', ')
      end

      if @config.scaling_strategy == :proportional && current_workers && target
        delta = current_workers - target
        "#{base_reason} [proportional: -#{delta} workers]"
      else
        base_reason
      end
    end

    # Explains a :no_change outcome: blocked by the max or min worker limit,
    # or simply nothing to do.
    def determine_no_change_reason(metrics, current_workers)
      if current_workers >= @config.max_workers && scale_up_signal?(metrics)
        "at max_workers (#{@config.max_workers})"
      elsif current_workers <= @config.min_workers && scale_down_signal?(metrics)
        "at min_workers (#{@config.min_workers})"
      else
        "metrics within normal range (depth=#{metrics.queue_depth}, latency=#{metrics.oldest_job_age_seconds.round}s)"
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidQueueAutoscaler
  # Root of the autoscaler's exception hierarchy; every error class declared
  # in this module descends from it.
  class Error < StandardError
  end

  # Signals an invalid configuration.
  class ConfigurationError < Error
  end

  # Signals that the advisory lock could not be acquired.
  class LockError < Error
  end

  # Signals a failed Heroku API call. Optionally carries the HTTP status code
  # and the response body supplied by the raiser.
  class HerokuAPIError < Error
    attr_reader :status_code, :response_body

    # @param message [String] error message
    # @param status_code [Object, nil] stored as-is and exposed via #status_code
    # @param response_body [Object, nil] stored as-is and exposed via #response_body
    def initialize(message, status_code: nil, response_body: nil)
      @status_code = status_code
      @response_body = response_body
      super(message)
    end
  end

  # Signals a failed Kubernetes API call. Optionally carries the underlying
  # exception supplied by the raiser.
  class KubernetesAPIError < Error
    attr_reader :original_error

    # @param message [String] error message
    # @param original_error [Object, nil] stored as-is and exposed via #original_error
    def initialize(message, original_error: nil)
      @original_error = original_error
      super(message)
    end
  end

  # Error type for metrics failures.
  # NOTE(review): the raise sites are outside this file - confirm exact usage.
  class MetricsError < Error
  end

  # Signals that a cooldown is still running. The message embeds the
  # remaining time rounded to whole seconds; the unrounded value stays
  # available through #remaining_seconds.
  class CooldownActiveError < Error
    attr_reader :remaining_seconds

    # @param remaining_seconds [Numeric] time left on the cooldown
    def initialize(remaining_seconds)
      super("Cooldown active, #{remaining_seconds.round}s remaining")
      @remaining_seconds = remaining_seconds
    end
  end
end
|