solid_queue_autoscaler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +189 -0
- data/LICENSE.txt +21 -0
- data/README.md +553 -0
- data/lib/generators/solid_queue_autoscaler/dashboard_generator.rb +54 -0
- data/lib/generators/solid_queue_autoscaler/install_generator.rb +21 -0
- data/lib/generators/solid_queue_autoscaler/migration_generator.rb +29 -0
- data/lib/generators/solid_queue_autoscaler/templates/README +41 -0
- data/lib/generators/solid_queue_autoscaler/templates/create_solid_queue_autoscaler_events.rb.erb +24 -0
- data/lib/generators/solid_queue_autoscaler/templates/create_solid_queue_autoscaler_state.rb.erb +15 -0
- data/lib/generators/solid_queue_autoscaler/templates/initializer.rb +58 -0
- data/lib/solid_queue_autoscaler/adapters/base.rb +102 -0
- data/lib/solid_queue_autoscaler/adapters/heroku.rb +93 -0
- data/lib/solid_queue_autoscaler/adapters/kubernetes.rb +158 -0
- data/lib/solid_queue_autoscaler/adapters.rb +57 -0
- data/lib/solid_queue_autoscaler/advisory_lock.rb +71 -0
- data/lib/solid_queue_autoscaler/autoscale_job.rb +71 -0
- data/lib/solid_queue_autoscaler/configuration.rb +269 -0
- data/lib/solid_queue_autoscaler/cooldown_tracker.rb +153 -0
- data/lib/solid_queue_autoscaler/dashboard/engine.rb +136 -0
- data/lib/solid_queue_autoscaler/dashboard/views/layouts/solid_queue_heroku_autoscaler/dashboard/application.html.erb +206 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/dashboard/index.html.erb +138 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/events/index.html.erb +102 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/workers/index.html.erb +106 -0
- data/lib/solid_queue_autoscaler/dashboard/views/solid_queue_heroku_autoscaler/dashboard/workers/show.html.erb +209 -0
- data/lib/solid_queue_autoscaler/dashboard.rb +99 -0
- data/lib/solid_queue_autoscaler/decision_engine.rb +228 -0
- data/lib/solid_queue_autoscaler/errors.rb +44 -0
- data/lib/solid_queue_autoscaler/metrics.rb +172 -0
- data/lib/solid_queue_autoscaler/railtie.rb +179 -0
- data/lib/solid_queue_autoscaler/scale_event.rb +292 -0
- data/lib/solid_queue_autoscaler/scaler.rb +294 -0
- data/lib/solid_queue_autoscaler/version.rb +5 -0
- data/lib/solid_queue_autoscaler.rb +108 -0
- metadata +179 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidQueueAutoscaler
  # Reads queue statistics by running raw SQL through the connection exposed
  # by the autoscaler configuration. Table names are derived from the
  # configured table prefix.
  class Metrics
    # Value object holding every statistic gathered by a single #collect call.
    Result = Struct.new(
      :queue_depth,
      :oldest_job_age_seconds,
      :jobs_per_minute,
      :claimed_jobs,
      :failed_jobs,
      :blocked_jobs,
      :active_workers,
      :queues_breakdown,
      :collected_at,
      keyword_init: true
    ) do
      # @return [Boolean] true when no job is ready and no job is claimed
      def idle?
        [queue_depth, claimed_jobs].all?(&:zero?)
      end

      # Latency is reported as the age of the oldest ready job.
      # @return [Float]
      def latency_seconds
        oldest_job_age_seconds
      end

      # @return [Hash{Symbol => Object}] every member keyed by its name
      def to_h
        each_pair.to_h
      end
    end

    # @param config [Object, nil] configuration to use; falls back to
    #   SolidQueueAutoscaler.config when nil
    def initialize(config: nil)
      @config = config || SolidQueueAutoscaler.config
    end

    # Runs every metric query in turn and wraps the values in a Result.
    # @return [Result]
    def collect
      snapshot = {
        queue_depth: queue_depth,
        oldest_job_age_seconds: oldest_job_age_seconds,
        jobs_per_minute: jobs_per_minute,
        claimed_jobs: claimed_jobs_count,
        failed_jobs: failed_jobs_count,
        blocked_jobs: blocked_jobs_count,
        active_workers: active_workers_count,
        queues_breakdown: queues_breakdown
      }

      Result.new(**snapshot, collected_at: Time.current)
    end

    # @return [Integer] number of ready executions, restricted to the
    #   configured queues when any are set
    def queue_depth
      count_rows(<<~SQL)
        SELECT COUNT(*) FROM #{ready_executions_table}
        WHERE 1=1
        #{queue_filter_clause}
      SQL
    end

    # @return [Float] seconds since the oldest ready execution was created;
    #   0.0 when the query yields NULL (nil.to_f)
    def oldest_job_age_seconds
      age = connection.select_value(<<~SQL)
        SELECT EXTRACT(EPOCH FROM (NOW() - MIN(created_at)))
        FROM #{ready_executions_table}
        WHERE 1=1
        #{queue_filter_clause}
      SQL
      age.to_f
    end

    # @return [Integer] jobs whose finished_at falls within the last minute
    def jobs_per_minute
      count_rows(<<~SQL)
        SELECT COUNT(*)
        FROM #{jobs_table}
        WHERE finished_at IS NOT NULL
        AND finished_at > NOW() - INTERVAL '1 minute'
        #{queue_filter_clause('queue_name')}
      SQL
    end

    # @return [Integer] rows in the claimed executions table (not queue-filtered)
    def claimed_jobs_count
      count_rows(<<~SQL)
        SELECT COUNT(*) FROM #{claimed_executions_table}
      SQL
    end

    # @return [Integer] rows in the failed executions table (not queue-filtered)
    def failed_jobs_count
      count_rows(<<~SQL)
        SELECT COUNT(*) FROM #{failed_executions_table}
      SQL
    end

    # @return [Integer] rows in the blocked executions table (not queue-filtered)
    def blocked_jobs_count
      count_rows(<<~SQL)
        SELECT COUNT(*) FROM #{blocked_executions_table}
      SQL
    end

    # @return [Integer] processes of kind 'Worker' with a heartbeat in the
    #   last five minutes
    def active_workers_count
      count_rows(<<~SQL)
        SELECT COUNT(*)
        FROM #{processes_table}
        WHERE kind = 'Worker'
        AND last_heartbeat_at > NOW() - INTERVAL '5 minutes'
      SQL
    end

    # @return [Hash{String => Integer}] ready-execution count per queue name,
    #   in the order returned by the query (largest first)
    def queues_breakdown
      rows = connection.select_all(<<~SQL).to_a
        SELECT queue_name, COUNT(*) as count
        FROM #{ready_executions_table}
        GROUP BY queue_name
        ORDER BY count DESC
      SQL

      rows.each_with_object({}) do |row, per_queue|
        per_queue[row['queue_name']] = row['count'].to_i
      end
    end

    private

    def connection
      @config.connection
    end

    # Executes a single-value query and coerces the value to an Integer.
    def count_rows(sql)
      connection.select_value(sql).to_i
    end

    # Builds the "AND <column> IN (...)" fragment for the configured queues,
    # or an empty string when no queues are configured.
    def queue_filter_clause(column_name = 'queue_name')
      queue_names = @config.queues
      return '' unless queue_names&.any?

      in_list = queue_names.map { |queue| connection.quote(queue) }.join(', ')
      "AND #{column_name} IN (#{in_list})"
    end

    # Table name helpers using configurable prefix
    def table_prefix
      @config.table_prefix
    end

    # Prepends the configured prefix to a bare table name.
    def prefixed(name)
      "#{table_prefix}#{name}"
    end

    def ready_executions_table
      prefixed('ready_executions')
    end

    def jobs_table
      prefixed('jobs')
    end

    def claimed_executions_table
      prefixed('claimed_executions')
    end

    def failed_executions_table
      prefixed('failed_executions')
    end

    def blocked_executions_table
      prefixed('blocked_executions')
    end

    def processes_table
      prefixed('processes')
    end
  end
end
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidQueueAutoscaler
  # Rails integration for the autoscaler. Registers a placeholder initializer
  # and the `solid_queue_autoscaler:*` rake tasks. Most tasks pick their target
  # worker from the WORKER environment variable.
  class Railtie < Rails::Railtie
    initializer 'solid_queue_autoscaler.configure' do
      # Configuration happens via initializer, nothing to do here
    end

    rake_tasks do
      namespace :solid_queue_autoscaler do
        desc 'Run the autoscaler once for a specific worker (default: :default). Use WORKER=name'
        task scale: :environment do
          worker_name = (ENV['WORKER'] || 'default').to_sym
          result = SolidQueueAutoscaler.scale!(worker_name)
          print_scale_result(result, worker_name)
          # Mirror scale_all: report a failed run through the exit status so
          # schedulers and scripts can detect it.
          exit 1 unless result.success?
        end

        desc 'Run the autoscaler for all configured workers'
        task scale_all: :environment do
          results = SolidQueueAutoscaler.scale_all!
          if results.empty?
            puts 'No workers configured'
            exit 1
          end
          results.each do |worker_name, result|
            print_scale_result(result, worker_name)
          end
          exit 1 if results.values.any? { |r| !r.success? }
        end

        desc 'List all configured workers'
        task workers: :environment do
          workers = SolidQueueAutoscaler.registered_workers
          if workers.empty?
            puts 'No workers configured'
          else
            puts "Configured Workers (#{workers.size}):"
            workers.each do |name|
              config = SolidQueueAutoscaler.config(name)
              queues = config.queues&.join(', ') || 'all'
              puts " #{name}:"
              puts " Process Type: #{config.process_type}"
              puts " Queues: #{queues}"
              puts " Workers: #{config.min_workers}-#{config.max_workers}"
            end
          end
        end

        desc 'Show current queue metrics for a worker. Use WORKER=name'
        task metrics: :environment do
          worker_name = (ENV['WORKER'] || 'default').to_sym
          metrics = SolidQueueAutoscaler.metrics(worker_name)
          config = SolidQueueAutoscaler.config(worker_name)
          puts "Queue Metrics#{" [#{worker_name}]" unless worker_name == :default}:"
          puts " Queues Filter: #{config.queues&.join(', ') || 'all'}"
          puts " Queue Depth: #{metrics.queue_depth}"
          puts " Oldest Job Age: #{metrics.oldest_job_age_seconds.round}s"
          puts " Jobs/Minute: #{metrics.jobs_per_minute}"
          puts " Claimed Jobs: #{metrics.claimed_jobs}"
          puts " Failed Jobs: #{metrics.failed_jobs}"
          puts " Blocked Jobs: #{metrics.blocked_jobs}"
          puts " Active Workers: #{metrics.active_workers}"
          puts " Queues Breakdown: #{metrics.queues_breakdown}"
        end

        desc 'Show current worker formation. Use WORKER=name'
        task formation: :environment do
          worker_name = (ENV['WORKER'] || 'default').to_sym
          workers = SolidQueueAutoscaler.current_workers(worker_name)
          config = SolidQueueAutoscaler.config(worker_name)
          puts "Current Formation#{" [#{worker_name}]" unless worker_name == :default}:"
          puts " Process Type: #{config.process_type}"
          puts " Workers: #{workers}"
          puts " Min: #{config.min_workers}"
          puts " Max: #{config.max_workers}"
          puts " Queues: #{config.queues&.join(', ') || 'all'}"
        end

        desc 'Show cooldown state for a worker. Use WORKER=name'
        task cooldown: :environment do
          worker_name = (ENV['WORKER'] || 'default').to_sym
          config = SolidQueueAutoscaler.config(worker_name)
          tracker = SolidQueueAutoscaler::CooldownTracker.new(config: config, key: worker_name.to_s)

          # Look each value up once so the printed lines cannot disagree with
          # the branch taken below.
          table_exists = tracker.table_exists?

          puts "Cooldown State#{" [#{worker_name}]" unless worker_name == :default}:"
          puts " Table Exists: #{table_exists}"

          if table_exists
            state = tracker.state
            scale_up_active = tracker.cooldown_active_for_scale_up?
            scale_down_active = tracker.cooldown_active_for_scale_down?

            puts " Last Scale Up: #{state[:last_scale_up_at] || 'never'}"
            puts " Last Scale Down: #{state[:last_scale_down_at] || 'never'}"
            puts " Scale Up Cooldown Active: #{scale_up_active}"
            puts " Scale Down Cooldown Active: #{scale_down_active}"

            if scale_up_active
              puts " Scale Up Cooldown Remaining: #{tracker.scale_up_cooldown_remaining.round}s"
            end
            if scale_down_active
              puts " Scale Down Cooldown Remaining: #{tracker.scale_down_cooldown_remaining.round}s"
            end
          else
            puts ' (Using in-memory cooldowns - run migration for persistence)'
            scale_up = SolidQueueAutoscaler::Scaler.last_scale_up_at(worker_name)
            scale_down = SolidQueueAutoscaler::Scaler.last_scale_down_at(worker_name)
            puts " In-Memory Scale Up: #{scale_up || 'never'}"
            puts " In-Memory Scale Down: #{scale_down || 'never'}"
          end
        end

        desc 'Reset cooldown state for a worker (or all if WORKER=all). Use WORKER=name'
        task reset_cooldown: :environment do
          worker_name = ENV.fetch('WORKER', nil)&.to_sym

          # An unset WORKER is treated the same as WORKER=all.
          if worker_name == :all || worker_name.nil?
            # Reset all workers
            SolidQueueAutoscaler.registered_workers.each do |name|
              config = SolidQueueAutoscaler.config(name)
              tracker = SolidQueueAutoscaler::CooldownTracker.new(config: config, key: name.to_s)
              tracker.reset! if tracker.table_exists?
            end
            SolidQueueAutoscaler::Scaler.reset_cooldowns!
            puts 'All cooldown states reset'
          else
            config = SolidQueueAutoscaler.config(worker_name)
            tracker = SolidQueueAutoscaler::CooldownTracker.new(config: config, key: worker_name.to_s)
            tracker.reset! if tracker.table_exists?
            SolidQueueAutoscaler::Scaler.reset_cooldowns!(worker_name)
            puts "Cooldown state reset for #{worker_name}"
          end
        end

        desc 'Show recent scale events. Use LIMIT=n and WORKER=name'
        task events: :environment do
          worker_name = ENV.fetch('WORKER', nil)
          # A blank or non-numeric LIMIT used to coerce to 0 and list nothing;
          # fall back to the default instead.
          limit = Integer(ENV.fetch('LIMIT', '20'), 10, exception: false)
          limit = 20 unless limit&.positive?

          events = SolidQueueAutoscaler::ScaleEvent.recent(limit: limit, worker_name: worker_name)

          if events.empty?
            puts 'No events found'
            puts '(Make sure to run: rails generate solid_queue_autoscaler:dashboard)'
          else
            puts "Recent Scale Events#{" for #{worker_name}" if worker_name} (#{events.size}):"
            puts '-' * 100
            events.each do |event|
              # Columns may come back nil from the database; never crash the listing.
              action = event.action.to_s.ljust(10)
              workers = "#{event.from_workers}->#{event.to_workers}".ljust(8)
              dry_run = event.dry_run ? ' [DRY RUN]' : ''
              time = (event.created_at&.strftime('%Y-%m-%d %H:%M:%S') || '?').ljust(19)
              puts "#{time} | #{event.worker_name.to_s.ljust(15)} | #{action} | #{workers} | #{event.reason}#{dry_run}"
            end
          end
        end

        desc 'Cleanup old scale events. Use KEEP_DAYS=n (default: 30)'
        task cleanup_events: :environment do
          raw_keep_days = ENV.fetch('KEEP_DAYS', '30')
          keep_days = Integer(raw_keep_days, 10, exception: false)

          # String#to_i turned a blank or mistyped KEEP_DAYS into 0, i.e. a
          # retention of zero days. Refuse to run rather than guess.
          if keep_days.nil? || keep_days.negative?
            abort "Invalid KEEP_DAYS=#{raw_keep_days.inspect}: expected a non-negative integer"
          end

          SolidQueueAutoscaler::ScaleEvent.cleanup!(keep_days: keep_days)
          puts "Cleaned up events older than #{keep_days} days"
        end

        # Prints a one-line summary of a scale result, prefixed with the worker
        # name unless it is the default worker.
        #
        # @param result [Object] responds to success?, scaled?, skipped?,
        #   skipped_reason, decision and error
        # @param worker_name [Symbol] worker the result belongs to
        def print_scale_result(result, worker_name)
          prefix = worker_name == :default ? '' : "[#{worker_name}] "
          if result.success?
            if result.scaled?
              puts "#{prefix}Scaled #{result.decision.from} -> #{result.decision.to} workers"
            elsif result.skipped?
              puts "#{prefix}Skipped: #{result.skipped_reason}"
            else
              puts "#{prefix}No change needed: #{result.decision&.reason}"
            end
          else
            puts "#{prefix}Error: #{result.error}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidQueueAutoscaler
  # Lightweight model for recording autoscaler events.
  # Does not inherit from ActiveRecord to avoid requiring it as a dependency.
  # Uses raw SQL for compatibility with any database connection.
  #
  # Every class-level query is best effort: when the events table is missing
  # or a statement fails, the method returns an empty/default value instead of
  # raising, and the failure is logged when a Rails logger is available.
  class ScaleEvent
    TABLE_NAME = 'solid_queue_autoscaler_events'

    ACTIONS = %w[scale_up scale_down no_change skipped error].freeze

    # Column list shared by every query that loads full event rows.
    SELECT_COLUMNS = 'id, worker_name, action, from_workers, to_workers, reason, ' \
                     'queue_depth, latency_seconds, metrics_json, dry_run, created_at'
    private_constant :SELECT_COLUMNS

    attr_reader :id, :worker_name, :action, :from_workers, :to_workers,
                :reason, :queue_depth, :latency_seconds, :metrics_json,
                :dry_run, :created_at

    # @param attrs [Hash{Symbol => Object}] event attributes keyed by reader name
    def initialize(attrs = {})
      @id = attrs[:id]
      @worker_name = attrs[:worker_name]
      @action = attrs[:action]
      @from_workers = attrs[:from_workers]
      @to_workers = attrs[:to_workers]
      @reason = attrs[:reason]
      @queue_depth = attrs[:queue_depth]
      @latency_seconds = attrs[:latency_seconds]
      @metrics_json = attrs[:metrics_json]
      @dry_run = attrs[:dry_run]
      @created_at = attrs[:created_at]
    end

    # @return [Boolean] true for scale_up and scale_down events
    def scaled?
      %w[scale_up scale_down].include?(action)
    end

    # @return [Boolean]
    def scale_up?
      action == 'scale_up'
    end

    # @return [Boolean]
    def scale_down?
      action == 'scale_down'
    end

    # Parses the stored metrics JSON.
    # @return [Hash, nil] symbol-keyed metrics, or nil when absent or unparsable
    def metrics
      return nil unless metrics_json

      JSON.parse(metrics_json, symbolize_names: true)
    rescue JSON::ParserError
      nil
    end

    class << self
      # Creates a new scale event record.
      # @param attrs [Hash] Event attributes
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [ScaleEvent, nil] The created event, or nil when the table is
      #   missing or the insert fails
      def create!(attrs, connection: nil)
        conn = connection || default_connection
        return nil unless table_exists?(conn)

        now = Time.current
        # NOTE(review): RETURNING is not available on every database; on one
        # without it the insert fails and is reported through the rescue below.
        sql = <<~SQL
          INSERT INTO #{TABLE_NAME}
          (worker_name, action, from_workers, to_workers, reason,
          queue_depth, latency_seconds, metrics_json, dry_run, created_at)
          VALUES
          (#{conn.quote(attrs[:worker_name])},
          #{conn.quote(attrs[:action])},
          #{conn.quote(attrs[:from_workers])},
          #{conn.quote(attrs[:to_workers])},
          #{conn.quote(attrs[:reason])},
          #{conn.quote(attrs[:queue_depth])},
          #{conn.quote(attrs[:latency_seconds])},
          #{conn.quote(attrs[:metrics_json])},
          #{conn.quote(attrs[:dry_run])},
          #{conn.quote(now)})
          RETURNING id
        SQL

        result = conn.execute(sql)
        id = result.first&.fetch('id', nil)

        new(attrs.merge(id: id, created_at: now))
      rescue StandardError => e
        # Log but don't fail if event recording fails
        log_failure('record event', e)
        nil
      end

      # Finds recent events.
      # @param limit [Integer] Maximum number of events to return
      # @param worker_name [String, nil] Filter by worker name
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Array<ScaleEvent>] Array of events, newest first
      def recent(limit: 50, worker_name: nil, connection: nil)
        conn = connection || default_connection
        return [] unless table_exists?(conn)

        filter = worker_name ? "WHERE worker_name = #{conn.quote(worker_name)}" : ''
        fetch_events(conn, filter, limit)
      rescue StandardError => e
        log_failure('load recent events', e)
        []
      end

      # Finds events by action type.
      # @param action [String] Action type (scale_up, scale_down, etc.)
      # @param limit [Integer] Maximum number of events
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Array<ScaleEvent>] Array of events, newest first
      def by_action(action, limit: 50, connection: nil)
        conn = connection || default_connection
        return [] unless table_exists?(conn)

        fetch_events(conn, "WHERE action = #{conn.quote(action)}", limit)
      rescue StandardError => e
        log_failure('load events by action', e)
        []
      end

      # Gets event statistics for a time period.
      # @param since [Time] Start time for statistics
      # @param worker_name [String, nil] Filter by worker name
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Hash] Statistics hash (see default_stats for the keys)
      def stats(since: 24.hours.ago, worker_name: nil, connection: nil)
        conn = connection || default_connection
        return default_stats unless table_exists?(conn)

        worker_filter = worker_name ? "AND worker_name = #{conn.quote(worker_name)}" : ''

        sql = <<~SQL
          SELECT
          action,
          COUNT(*) as count,
          AVG(queue_depth) as avg_queue_depth,
          AVG(latency_seconds) as avg_latency
          FROM #{TABLE_NAME}
          WHERE created_at >= #{conn.quote(since)}
          #{worker_filter}
          GROUP BY action
        SQL

        build_stats(conn.select_all(sql).to_a)
      rescue StandardError => e
        log_failure('compute event stats', e)
        default_stats
      end

      # Cleans up old events.
      # @param keep_days [Integer] Number of days to keep
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Integer] Number of deleted records (0 when the table is
      #   missing or the delete fails)
      def cleanup!(keep_days: 30, connection: nil)
        conn = connection || default_connection
        return 0 unless table_exists?(conn)

        cutoff = Time.current - keep_days.days

        sql = <<~SQL
          DELETE FROM #{TABLE_NAME}
          WHERE created_at < #{conn.quote(cutoff)}
        SQL

        # The adapter's #delete returns the affected-row count itself. Reading
        # cmd_tuples off the execute result only works for PG::Result; elsewhere
        # it raised and the rescue reported 0 even though rows were deleted.
        conn.delete(sql)
      rescue StandardError => e
        log_failure('clean up events', e)
        0
      end

      # Checks if the events table exists.
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Boolean] True if table exists; false when the check itself fails
      def table_exists?(connection = nil)
        conn = connection || default_connection
        conn.table_exists?(TABLE_NAME)
      rescue StandardError
        false
      end

      # Counts events in a time period.
      # @param since [Time] Start time
      # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
      # @return [Integer] Event count
      def count(since: nil, connection: nil)
        conn = connection || default_connection
        return 0 unless table_exists?(conn)

        time_filter = since ? "WHERE created_at >= #{conn.quote(since)}" : ''

        sql = "SELECT COUNT(*) FROM #{TABLE_NAME} #{time_filter}"
        conn.select_value(sql).to_i
      rescue StandardError => e
        log_failure('count events', e)
        0
      end

      private

      def default_connection
        ActiveRecord::Base.connection
      end

      # Loads full event rows, newest first, optionally narrowed by a
      # ready-made WHERE clause (values inside it must already be quoted).
      def fetch_events(conn, where_clause, limit)
        sql = <<~SQL
          SELECT #{SELECT_COLUMNS}
          FROM #{TABLE_NAME}
          #{where_clause}
          ORDER BY created_at DESC
          LIMIT #{limit.to_i}
        SQL

        conn.select_all(sql).map { |row| from_row(row) }
      end

      # Writes a warning to the Rails logger when one is available; a no-op
      # outside Rails or when no logger is configured.
      def log_failure(context, error)
        return unless defined?(Rails) && Rails.respond_to?(:logger)

        Rails.logger&.warn("[Autoscaler] Failed to #{context}: #{error.message}")
      end

      # Builds an event from a string-keyed result row, coercing numeric,
      # boolean and time columns.
      def from_row(row)
        new(
          id: row['id'],
          worker_name: row['worker_name'],
          action: row['action'],
          from_workers: row['from_workers'].to_i,
          to_workers: row['to_workers'].to_i,
          reason: row['reason'],
          queue_depth: row['queue_depth'].to_i,
          latency_seconds: row['latency_seconds'].to_f,
          metrics_json: row['metrics_json'],
          dry_run: parse_boolean(row['dry_run']),
          created_at: parse_time(row['created_at'])
        )
      end

      # Accepts the boolean spellings listed below; anything else is false.
      def parse_boolean(value)
        case value
        when true, 't', 'true', '1', 1
          true
        else
          false
        end
      end

      # Returns a Time for Time/DateTime/String input and passes any other
      # value (including nil) through unchanged.
      def parse_time(value)
        case value
        when Time, DateTime
          value.to_time
        when String
          Time.parse(value)
        else
          value
        end
      end

      def default_stats
        {
          total: 0,
          scale_up_count: 0,
          scale_down_count: 0,
          no_change_count: 0,
          skipped_count: 0,
          error_count: 0,
          avg_queue_depth: 0,
          avg_latency: 0
        }
      end

      # Folds the per-action rows into one stats hash. Overall averages are
      # weighted by each action's event count.
      def build_stats(results)
        stats = default_stats
        depth_sum = 0.0
        latency_sum = 0.0
        depth_weight = 0
        latency_weight = 0

        results.each do |row|
          count = row['count'].to_i

          stats[:total] += count
          stats[:"#{row['action']}_count"] = count

          # Only groups that actually report an average take part in it;
          # dividing by the grand total let groups with a NULL average
          # drag the result toward zero.
          if row['avg_queue_depth']
            depth_sum += row['avg_queue_depth'].to_f * count
            depth_weight += count
          end

          if row['avg_latency']
            latency_sum += row['avg_latency'].to_f * count
            latency_weight += count
          end
        end

        stats[:avg_queue_depth] = depth_sum / depth_weight if depth_weight.positive?
        stats[:avg_latency] = latency_sum / latency_weight if latency_weight.positive?

        stats
      end
    end
  end
end
|