solid_queue_autoscaler 1.0.10 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/lib/solid_queue_autoscaler/adapters/base.rb +30 -0
- data/lib/solid_queue_autoscaler/adapters/heroku.rb +26 -3
- data/lib/solid_queue_autoscaler/adapters/kubernetes.rb +33 -6
- data/lib/solid_queue_autoscaler/advisory_lock.rb +20 -0
- data/lib/solid_queue_autoscaler/autoscale_job.rb +10 -8
- data/lib/solid_queue_autoscaler/configuration.rb +15 -0
- data/lib/solid_queue_autoscaler/cooldown_tracker.rb +50 -9
- data/lib/solid_queue_autoscaler/dashboard.rb +10 -4
- data/lib/solid_queue_autoscaler/metrics.rb +10 -7
- data/lib/solid_queue_autoscaler/railtie.rb +5 -0
- data/lib/solid_queue_autoscaler/scale_event.rb +26 -10
- data/lib/solid_queue_autoscaler/scaler.rb +20 -11
- data/lib/solid_queue_autoscaler/version.rb +1 -1
- data/lib/solid_queue_autoscaler.rb +27 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bf4f38fa3806f153c03715b02554d1a82837da595fb9dadf4fb8063d6f52b3c8
|
|
4
|
+
data.tar.gz: 3737a7de81ab147dbd8e38fc87a2f601c21ea4d895758418f4bd484b0483ea12
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1ef6933dfe8a7936ba524ebd2fc94f46b20b6545b2e008e6d5e2c5c4753f54f866b82ef84eb75b39aa9d49e69ca476bae9be36cfdfff9cd39f97a380627e38f0
|
|
7
|
+
data.tar.gz: 14b04452165bac891d292dfdcb4dd7fb11993d6b5c772c43e658e2492437bcf7af2b9004f616b1caf9b73e606bb018855cd9f36be7b7a0909f336be59fc34952
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [1.0.12] - 2025-01-17
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- **Fixed AutoscaleJob being enqueued to "default" queue** - Added `queue_as :autoscaler` to the job class
|
|
14
|
+
- The issue was that SolidQueue recurring jobs capture the queue name during initialization, BEFORE Rails `after_initialize` hooks run
|
|
15
|
+
- Without a static `queue_as` in the class, jobs defaulted to the "default" queue
|
|
16
|
+
- The `apply_job_settings!` method can still override this via configuration, but the default must be set in the class for SolidQueue recurring to work correctly
|
|
17
|
+
|
|
18
|
+
## [1.0.11] - 2025-01-17
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
|
|
22
|
+
#### Critical Fixes
|
|
23
|
+
- **Thread safety** - Fixed race condition in mutex initialization (`scaler.rb`). Changed from lazy `@cooldown_mutex ||= Mutex.new` to thread-safe class constant `COOLDOWN_MUTEX`
|
|
24
|
+
- **SQL injection prevention** - Added regex validation for `table_prefix` configuration to only allow `[a-z0-9_]+` pattern
|
|
25
|
+
- **PgBouncer documentation** - Added prominent warning in `advisory_lock.rb` about incompatibility with PgBouncer transaction pooling mode
|
|
26
|
+
|
|
27
|
+
#### High Priority Fixes
|
|
28
|
+
- **CooldownTracker caching** - Added 5-minute TTL for `table_exists?` cache and `reset_table_exists_cache!` method for manual invalidation
|
|
29
|
+
- **ScaleEvent naming** - Renamed `create!` to `create` (non-bang) since it catches exceptions and returns nil. Added `create!` as deprecated alias for backward compatibility
|
|
30
|
+
- **Decision struct mutation** - Fixed mutation of Decision struct when clamping target workers. Now creates a new Decision instead of modifying the existing one
|
|
31
|
+
- **ZeroDivisionError prevention** - Added validation that `scale_up_jobs_per_worker`, `scale_up_latency_per_worker`, and `scale_down_jobs_per_worker` must be > 0 when using proportional scaling
|
|
32
|
+
|
|
33
|
+
#### Medium Priority Fixes
|
|
34
|
+
- **Retry logic for adapters** - Added exponential backoff retry for transient network errors in both Heroku and Kubernetes adapters (up to 3 attempts by default, using a 1s/2s/4s backoff schedule, so the default waits are 1s and then 2s)
|
|
35
|
+
- **Time parsing** - Fixed timezone handling in `cooldown_tracker.rb` to properly handle Time, DateTime, and String values
|
|
36
|
+
- **Dashboard query optimization** - Batched cooldown state retrieval in `worker_status` to reduce database queries
|
|
37
|
+
- **Metrics nil handling** - `oldest_job_age_seconds` now returns `0.0` instead of `nil` when no jobs exist
|
|
38
|
+
- **Kubernetes timeout** - Added 30-second timeout configuration to kubeclient API calls
|
|
39
|
+
|
|
40
|
+
#### Low Priority Fixes
|
|
41
|
+
- **Safe logger calls** - Added safe navigation (`logger&.warn`) throughout to prevent nil errors
|
|
42
|
+
- **SQL table quoting** - Now uses `connection.quote_table_name()` for all table name interpolations
|
|
43
|
+
- **Rails.logger nil check** - Added proper nil check before using `Rails.logger` in `scale_event.rb`
|
|
44
|
+
|
|
10
45
|
## [1.0.10] - 2025-01-17
|
|
11
46
|
|
|
12
47
|
### Fixed
|
|
@@ -32,6 +32,10 @@ module SolidQueueAutoscaler
|
|
|
32
32
|
# end
|
|
33
33
|
# end
|
|
34
34
|
class Base
|
|
35
|
+
# Default retry configuration for transient network errors
|
|
36
|
+
DEFAULT_MAX_RETRIES = 3
|
|
37
|
+
DEFAULT_RETRY_DELAYS = [1, 2, 4].freeze # Exponential backoff in seconds
|
|
38
|
+
|
|
35
39
|
# @param config [Configuration] the autoscaler configuration
|
|
36
40
|
def initialize(config:)
|
|
37
41
|
@config = config
|
|
@@ -97,6 +101,32 @@ module SolidQueueAutoscaler
|
|
|
97
101
|
def log_dry_run(message)
|
|
98
102
|
logger.info("[DRY RUN] #{message}")
|
|
99
103
|
end
|
|
104
|
+
|
|
105
|
+
# Executes a block with retry logic for transient errors.
|
|
106
|
+
# Uses exponential backoff with configurable delays.
|
|
107
|
+
#
|
|
108
|
+
# @param error_classes [Array<Class>] Exception classes that should trigger a retry
|
|
109
|
+
# @param max_retries [Integer] Maximum number of total attempts, including the initial call (default: 3)
|
|
110
|
+
# @param delays [Array<Integer>] Delay in seconds before each retry; the last entry is reused if there are more retries than entries (default: [1, 2, 4])
|
|
111
|
+
# @param retryable_check [Proc, nil] Optional proc to determine if a specific error should be retried
|
|
112
|
+
# @yield The block to execute with retry logic
|
|
113
|
+
# @return [Object] The result of the block
|
|
114
|
+
def with_retry(error_classes, max_retries: DEFAULT_MAX_RETRIES, delays: DEFAULT_RETRY_DELAYS, retryable_check: nil)
|
|
115
|
+
attempts = 0
|
|
116
|
+
begin
|
|
117
|
+
attempts += 1
|
|
118
|
+
yield
|
|
119
|
+
rescue *error_classes => e
|
|
120
|
+
should_retry = retryable_check ? retryable_check.call(e) : true
|
|
121
|
+
if attempts < max_retries && should_retry
|
|
122
|
+
delay = delays[attempts - 1] || delays.last
|
|
123
|
+
logger&.warn("[Autoscaler] #{name} API error (attempt #{attempts}/#{max_retries}), retrying in #{delay}s: #{e.message}")
|
|
124
|
+
sleep(delay)
|
|
125
|
+
retry
|
|
126
|
+
end
|
|
127
|
+
raise
|
|
128
|
+
end
|
|
129
|
+
end
|
|
100
130
|
end
|
|
101
131
|
end
|
|
102
132
|
end
|
|
@@ -20,9 +20,18 @@ module SolidQueueAutoscaler
|
|
|
20
20
|
# config.process_type = 'worker'
|
|
21
21
|
# end
|
|
22
22
|
class Heroku < Base
|
|
23
|
+
# Errors that are safe to retry (transient network issues)
|
|
24
|
+
RETRYABLE_ERRORS = [
|
|
25
|
+
Excon::Error::Timeout,
|
|
26
|
+
Excon::Error::Socket,
|
|
27
|
+
Excon::Error::HTTPStatus
|
|
28
|
+
].freeze
|
|
29
|
+
|
|
23
30
|
def current_workers
|
|
24
|
-
|
|
25
|
-
|
|
31
|
+
with_retry(RETRYABLE_ERRORS, retryable_check: method(:retryable_error?)) do
|
|
32
|
+
formation = client.formation.info(app_name, process_type)
|
|
33
|
+
formation['quantity']
|
|
34
|
+
end
|
|
26
35
|
rescue Excon::Error => e
|
|
27
36
|
raise HerokuAPIError.new(
|
|
28
37
|
"Failed to get formation info: #{e.message}",
|
|
@@ -37,7 +46,9 @@ module SolidQueueAutoscaler
|
|
|
37
46
|
return quantity
|
|
38
47
|
end
|
|
39
48
|
|
|
40
|
-
|
|
49
|
+
with_retry(RETRYABLE_ERRORS, retryable_check: method(:retryable_error?)) do
|
|
50
|
+
client.formation.update(app_name, process_type, { quantity: quantity })
|
|
51
|
+
end
|
|
41
52
|
quantity
|
|
42
53
|
rescue Excon::Error => e
|
|
43
54
|
raise HerokuAPIError.new(
|
|
@@ -73,6 +84,18 @@ module SolidQueueAutoscaler
|
|
|
73
84
|
|
|
74
85
|
private
|
|
75
86
|
|
|
87
|
+
# Determines if an error should be retried.
|
|
88
|
+
# Retries timeouts and 5xx errors, but not 4xx client errors.
|
|
89
|
+
def retryable_error?(error)
|
|
90
|
+
return true unless error.respond_to?(:response) && error.response
|
|
91
|
+
|
|
92
|
+
status = error.response.status
|
|
93
|
+
return true if status.nil?
|
|
94
|
+
|
|
95
|
+
# Retry server errors (5xx), not client errors (4xx)
|
|
96
|
+
status >= 500 || status == 429 # Also retry rate limiting
|
|
97
|
+
end
|
|
98
|
+
|
|
76
99
|
def client
|
|
77
100
|
@client ||= PlatformAPI.connect_oauth(api_key)
|
|
78
101
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'net/http'
|
|
4
|
+
|
|
3
5
|
module SolidQueueAutoscaler
|
|
4
6
|
module Adapters
|
|
5
7
|
# Kubernetes adapter for scaling Deployment replicas.
|
|
@@ -30,9 +32,24 @@ module SolidQueueAutoscaler
|
|
|
30
32
|
# Kubernetes API path for apps/v1 group
|
|
31
33
|
APPS_API_VERSION = 'apis/apps/v1'
|
|
32
34
|
|
|
35
|
+
# Default timeout for Kubernetes API calls (seconds)
|
|
36
|
+
DEFAULT_TIMEOUT = 30
|
|
37
|
+
|
|
38
|
+
# Errors that are safe to retry (transient network issues)
|
|
39
|
+
RETRYABLE_ERRORS = [
|
|
40
|
+
Errno::ECONNREFUSED,
|
|
41
|
+
Errno::ETIMEDOUT,
|
|
42
|
+
Errno::ECONNRESET,
|
|
43
|
+
Net::OpenTimeout,
|
|
44
|
+
Net::ReadTimeout,
|
|
45
|
+
SocketError
|
|
46
|
+
].freeze
|
|
47
|
+
|
|
33
48
|
def current_workers
|
|
34
|
-
|
|
35
|
-
|
|
49
|
+
with_retry(RETRYABLE_ERRORS) do
|
|
50
|
+
deployment = apps_client.get_deployment(deployment_name, namespace)
|
|
51
|
+
deployment.spec.replicas
|
|
52
|
+
end
|
|
36
53
|
rescue StandardError => e
|
|
37
54
|
raise KubernetesAPIError.new("Failed to get deployment info: #{e.message}", original_error: e)
|
|
38
55
|
end
|
|
@@ -43,8 +60,10 @@ module SolidQueueAutoscaler
|
|
|
43
60
|
return quantity
|
|
44
61
|
end
|
|
45
62
|
|
|
46
|
-
|
|
47
|
-
|
|
63
|
+
with_retry(RETRYABLE_ERRORS) do
|
|
64
|
+
patch_body = { spec: { replicas: quantity } }
|
|
65
|
+
apps_client.patch_deployment(deployment_name, patch_body, namespace)
|
|
66
|
+
end
|
|
48
67
|
quantity
|
|
49
68
|
rescue StandardError => e
|
|
50
69
|
raise KubernetesAPIError.new("Failed to scale deployment #{deployment_name} to #{quantity}: #{e.message}",
|
|
@@ -95,7 +114,11 @@ module SolidQueueAutoscaler
|
|
|
95
114
|
api_endpoint,
|
|
96
115
|
'v1',
|
|
97
116
|
auth_options: auth_options,
|
|
98
|
-
ssl_options: ssl_options
|
|
117
|
+
ssl_options: ssl_options,
|
|
118
|
+
timeouts: {
|
|
119
|
+
open: DEFAULT_TIMEOUT,
|
|
120
|
+
read: DEFAULT_TIMEOUT
|
|
121
|
+
}
|
|
99
122
|
)
|
|
100
123
|
end
|
|
101
124
|
|
|
@@ -112,7 +135,11 @@ module SolidQueueAutoscaler
|
|
|
112
135
|
api_endpoint,
|
|
113
136
|
'v1',
|
|
114
137
|
ssl_options: context.ssl_options,
|
|
115
|
-
auth_options: context.auth_options
|
|
138
|
+
auth_options: context.auth_options,
|
|
139
|
+
timeouts: {
|
|
140
|
+
open: DEFAULT_TIMEOUT,
|
|
141
|
+
read: DEFAULT_TIMEOUT
|
|
142
|
+
}
|
|
116
143
|
)
|
|
117
144
|
end
|
|
118
145
|
|
|
@@ -3,6 +3,26 @@
|
|
|
3
3
|
require 'zlib'
|
|
4
4
|
|
|
5
5
|
module SolidQueueAutoscaler
|
|
6
|
+
# PostgreSQL advisory lock wrapper for singleton enforcement.
|
|
7
|
+
#
|
|
8
|
+
# IMPORTANT: PgBouncer Compatibility Warning
|
|
9
|
+
# ==========================================
|
|
10
|
+
# PostgreSQL advisory locks are connection-scoped (session-level locks).
|
|
11
|
+
# If you're using PgBouncer in transaction pooling mode, advisory locks
|
|
12
|
+
# will NOT work correctly because:
|
|
13
|
+
# 1. Each query may run on a different backend connection
|
|
14
|
+
# 2. The lock acquired on one connection won't be visible on another
|
|
15
|
+
# 3. The lock may be "released" when returned to the pool
|
|
16
|
+
#
|
|
17
|
+
# Solutions:
|
|
18
|
+
# - Use PgBouncer in session pooling mode for the queue database
|
|
19
|
+
# - Use a direct connection (bypass PgBouncer) for the autoscaler
|
|
20
|
+
# - Disable advisory locks and use external coordination (Redis, etc.)
|
|
21
|
+
# - Set config.persist_cooldowns = false and rely on a single process
|
|
22
|
+
#
|
|
23
|
+
# If you're seeing multiple autoscalers running simultaneously or
|
|
24
|
+
# lock acquisition always failing, PgBouncer is likely the cause.
|
|
25
|
+
#
|
|
6
26
|
class AdvisoryLock
|
|
7
27
|
attr_reader :lock_key, :timeout
|
|
8
28
|
|
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
module SolidQueueAutoscaler
|
|
4
4
|
class AutoscaleJob < ActiveJob::Base
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# returns a Proc that isn't evaluated by recurring jobs, causing jobs to
|
|
9
|
-
# go to 'default' queue instead.
|
|
5
|
+
# Default queue - this MUST be set here (not dynamically) because SolidQueue
|
|
6
|
+
# recurring jobs capture the queue name during initialization, BEFORE
|
|
7
|
+
# Rails after_initialize hooks run.
|
|
10
8
|
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
9
|
+
# The apply_job_settings! method can override this after Rails initializers
|
|
10
|
+
# run, but the default must be set here for SolidQueue recurring to work.
|
|
11
|
+
#
|
|
12
|
+
# You can customize the queue via:
|
|
13
|
+
# config.job_queue = :my_queue
|
|
14
|
+
#
|
|
15
|
+
# For SolidQueue recurring.yml, you can also set queue: directly in the YAML.
|
|
14
16
|
queue_as :autoscaler
|
|
15
17
|
|
|
16
18
|
discard_on ConfigurationError
|
|
@@ -175,6 +175,21 @@ module SolidQueueAutoscaler
|
|
|
175
175
|
errors << 'table_prefix cannot be nil or empty'
|
|
176
176
|
elsif !table_prefix.to_s.end_with?('_')
|
|
177
177
|
errors << 'table_prefix must end with an underscore'
|
|
178
|
+
elsif !table_prefix.to_s.match?(/\A[a-z0-9_]+\z/)
|
|
179
|
+
errors << 'table_prefix must contain only lowercase letters, numbers, and underscores'
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Validate proportional scaling settings to prevent ZeroDivisionError
|
|
183
|
+
if scaling_strategy == :proportional
|
|
184
|
+
if scale_up_jobs_per_worker.nil? || scale_up_jobs_per_worker <= 0
|
|
185
|
+
errors << 'scale_up_jobs_per_worker must be > 0 for proportional scaling'
|
|
186
|
+
end
|
|
187
|
+
if scale_up_latency_per_worker.nil? || scale_up_latency_per_worker <= 0
|
|
188
|
+
errors << 'scale_up_latency_per_worker must be > 0 for proportional scaling'
|
|
189
|
+
end
|
|
190
|
+
if scale_down_jobs_per_worker.nil? || scale_down_jobs_per_worker <= 0
|
|
191
|
+
errors << 'scale_down_jobs_per_worker must be > 0 for proportional scaling'
|
|
192
|
+
end
|
|
178
193
|
end
|
|
179
194
|
|
|
180
195
|
unless VALID_SCALING_STRATEGIES.include?(scaling_strategy)
|
|
@@ -13,16 +13,24 @@ module SolidQueueAutoscaler
|
|
|
13
13
|
@config = config || SolidQueueAutoscaler.config
|
|
14
14
|
@key = key
|
|
15
15
|
@table_exists = nil
|
|
16
|
+
@table_exists_checked_at = nil
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# Resets the cached table_exists? result.
|
|
20
|
+
# Call this after running migrations to re-check table existence.
|
|
21
|
+
def reset_table_exists_cache!
|
|
22
|
+
@table_exists = nil
|
|
23
|
+
@table_exists_checked_at = nil
|
|
16
24
|
end
|
|
17
25
|
|
|
18
26
|
def last_scale_up_at
|
|
19
27
|
return nil unless table_exists?
|
|
20
28
|
|
|
21
29
|
result = connection.select_value(<<~SQL)
|
|
22
|
-
SELECT last_scale_up_at FROM #{
|
|
30
|
+
SELECT last_scale_up_at FROM #{quoted_table_name}
|
|
23
31
|
WHERE key = #{connection.quote(key)}
|
|
24
32
|
SQL
|
|
25
|
-
|
|
33
|
+
parse_time_result(result)
|
|
26
34
|
rescue ArgumentError
|
|
27
35
|
nil
|
|
28
36
|
end
|
|
@@ -31,10 +39,10 @@ module SolidQueueAutoscaler
|
|
|
31
39
|
return nil unless table_exists?
|
|
32
40
|
|
|
33
41
|
result = connection.select_value(<<~SQL)
|
|
34
|
-
SELECT last_scale_down_at FROM #{
|
|
42
|
+
SELECT last_scale_down_at FROM #{quoted_table_name}
|
|
35
43
|
WHERE key = #{connection.quote(key)}
|
|
36
44
|
SQL
|
|
37
|
-
|
|
45
|
+
parse_time_result(result)
|
|
38
46
|
rescue ArgumentError
|
|
39
47
|
nil
|
|
40
48
|
end
|
|
@@ -57,7 +65,7 @@ module SolidQueueAutoscaler
|
|
|
57
65
|
return false unless table_exists?
|
|
58
66
|
|
|
59
67
|
connection.execute(<<~SQL)
|
|
60
|
-
DELETE FROM #{
|
|
68
|
+
DELETE FROM #{quoted_table_name} WHERE key = #{connection.quote(key)}
|
|
61
69
|
SQL
|
|
62
70
|
true
|
|
63
71
|
end
|
|
@@ -92,12 +100,23 @@ module SolidQueueAutoscaler
|
|
|
92
100
|
[remaining, 0].max
|
|
93
101
|
end
|
|
94
102
|
|
|
103
|
+
# Cache TTL for table existence check (5 minutes)
|
|
104
|
+
TABLE_EXISTS_CACHE_TTL = 300
|
|
105
|
+
|
|
95
106
|
def table_exists?
|
|
96
|
-
|
|
107
|
+
# Return cached result if still valid
|
|
108
|
+
if !@table_exists.nil? && @table_exists_checked_at
|
|
109
|
+
cache_age = Time.now - @table_exists_checked_at
|
|
110
|
+
return @table_exists if cache_age < TABLE_EXISTS_CACHE_TTL
|
|
111
|
+
end
|
|
97
112
|
|
|
98
113
|
@table_exists = connection.table_exists?(TABLE_NAME)
|
|
114
|
+
@table_exists_checked_at = Time.now
|
|
115
|
+
@table_exists
|
|
99
116
|
rescue StandardError
|
|
100
117
|
@table_exists = false
|
|
118
|
+
@table_exists_checked_at = Time.now
|
|
119
|
+
@table_exists
|
|
101
120
|
end
|
|
102
121
|
|
|
103
122
|
def state
|
|
@@ -105,7 +124,7 @@ module SolidQueueAutoscaler
|
|
|
105
124
|
|
|
106
125
|
row = connection.select_one(<<~SQL)
|
|
107
126
|
SELECT last_scale_up_at, last_scale_down_at, updated_at
|
|
108
|
-
FROM #{
|
|
127
|
+
FROM #{quoted_table_name}
|
|
109
128
|
WHERE key = #{connection.quote(key)}
|
|
110
129
|
SQL
|
|
111
130
|
|
|
@@ -124,6 +143,28 @@ module SolidQueueAutoscaler
|
|
|
124
143
|
@config.connection
|
|
125
144
|
end
|
|
126
145
|
|
|
146
|
+
def quoted_table_name
|
|
147
|
+
connection.quote_table_name(TABLE_NAME)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Safely parses a time value from the database.
|
|
151
|
+
# Handles Time, DateTime, String, and nil values.
|
|
152
|
+
def parse_time_result(value)
|
|
153
|
+
return nil if value.nil?
|
|
154
|
+
|
|
155
|
+
case value
|
|
156
|
+
when Time, DateTime
|
|
157
|
+
value.to_time
|
|
158
|
+
when String
|
|
159
|
+
Time.parse(value)
|
|
160
|
+
else
|
|
161
|
+
# Try to convert to time if possible
|
|
162
|
+
value.respond_to?(:to_time) ? value.to_time : Time.parse(value.to_s)
|
|
163
|
+
end
|
|
164
|
+
rescue ArgumentError, TypeError
|
|
165
|
+
nil
|
|
166
|
+
end
|
|
167
|
+
|
|
127
168
|
def upsert_state(last_scale_up_at: nil, last_scale_down_at: nil)
|
|
128
169
|
now = Time.current
|
|
129
170
|
quoted_key = connection.quote(key)
|
|
@@ -132,7 +173,7 @@ module SolidQueueAutoscaler
|
|
|
132
173
|
if last_scale_up_at
|
|
133
174
|
quoted_time = connection.quote(last_scale_up_at)
|
|
134
175
|
connection.execute(<<~SQL)
|
|
135
|
-
INSERT INTO #{
|
|
176
|
+
INSERT INTO #{quoted_table_name} (key, last_scale_up_at, created_at, updated_at)
|
|
136
177
|
VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
|
|
137
178
|
ON CONFLICT (key) DO UPDATE SET
|
|
138
179
|
last_scale_up_at = EXCLUDED.last_scale_up_at,
|
|
@@ -141,7 +182,7 @@ module SolidQueueAutoscaler
|
|
|
141
182
|
elsif last_scale_down_at
|
|
142
183
|
quoted_time = connection.quote(last_scale_down_at)
|
|
143
184
|
connection.execute(<<~SQL)
|
|
144
|
-
INSERT INTO #{
|
|
185
|
+
INSERT INTO #{quoted_table_name} (key, last_scale_down_at, created_at, updated_at)
|
|
145
186
|
VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
|
|
146
187
|
ON CONFLICT (key) DO UPDATE SET
|
|
147
188
|
last_scale_down_at = EXCLUDED.last_scale_down_at,
|
|
@@ -11,12 +11,15 @@ module SolidQueueAutoscaler
|
|
|
11
11
|
workers = SolidQueueAutoscaler.registered_workers
|
|
12
12
|
workers = [:default] if workers.empty?
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
# Batch collect metrics once per worker to reduce DB queries
|
|
15
|
+
workers.each_with_object({}) do |name, result|
|
|
16
|
+
result[name] = worker_status(name)
|
|
16
17
|
end
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# Returns status for a specific worker
|
|
21
|
+
# Note: Each call makes several DB queries. For multiple workers,
|
|
22
|
+
# consider caching or using status() which can batch some queries.
|
|
20
23
|
# @param name [Symbol] Worker name
|
|
21
24
|
# @return [Hash] Status information
|
|
22
25
|
def worker_status(name)
|
|
@@ -24,6 +27,9 @@ module SolidQueueAutoscaler
|
|
|
24
27
|
metrics = safe_metrics(name)
|
|
25
28
|
tracker = CooldownTracker.new(config: config, key: name.to_s)
|
|
26
29
|
|
|
30
|
+
# Batch cooldown state retrieval into one DB call
|
|
31
|
+
cooldown_state = tracker.state
|
|
32
|
+
|
|
27
33
|
{
|
|
28
34
|
name: name,
|
|
29
35
|
enabled: config.enabled?,
|
|
@@ -45,8 +51,8 @@ module SolidQueueAutoscaler
|
|
|
45
51
|
cooldowns: {
|
|
46
52
|
scale_up_remaining: tracker.scale_up_cooldown_remaining.round,
|
|
47
53
|
scale_down_remaining: tracker.scale_down_cooldown_remaining.round,
|
|
48
|
-
last_scale_up:
|
|
49
|
-
last_scale_down:
|
|
54
|
+
last_scale_up: cooldown_state[:last_scale_up_at],
|
|
55
|
+
last_scale_down: cooldown_state[:last_scale_down_at]
|
|
50
56
|
},
|
|
51
57
|
thresholds: {
|
|
52
58
|
scale_up_queue_depth: config.scale_up_queue_depth,
|
|
@@ -72,7 +72,9 @@ module SolidQueueAutoscaler
|
|
|
72
72
|
#{queue_filter_clause}
|
|
73
73
|
SQL
|
|
74
74
|
result = connection.select_value(sql)
|
|
75
|
-
result.to_f
|
|
75
|
+
# Explicitly return 0.0 when no jobs exist (result is nil). Note that nil.to_f already evaluates to 0.0, so the value is unchanged.
|
|
76
|
+
# This makes the return value more predictable and avoids nil-related issues
|
|
77
|
+
result.nil? ? 0.0 : result.to_f
|
|
76
78
|
end
|
|
77
79
|
|
|
78
80
|
def jobs_per_minute
|
|
@@ -141,32 +143,33 @@ module SolidQueueAutoscaler
|
|
|
141
143
|
end
|
|
142
144
|
|
|
143
145
|
# Table name helpers using configurable prefix
|
|
146
|
+
# Uses quote_table_name for SQL safety
|
|
144
147
|
def table_prefix
|
|
145
148
|
@config.table_prefix
|
|
146
149
|
end
|
|
147
150
|
|
|
148
151
|
def ready_executions_table
|
|
149
|
-
"#{table_prefix}ready_executions"
|
|
152
|
+
connection.quote_table_name("#{table_prefix}ready_executions")
|
|
150
153
|
end
|
|
151
154
|
|
|
152
155
|
def jobs_table
|
|
153
|
-
"#{table_prefix}jobs"
|
|
156
|
+
connection.quote_table_name("#{table_prefix}jobs")
|
|
154
157
|
end
|
|
155
158
|
|
|
156
159
|
def claimed_executions_table
|
|
157
|
-
"#{table_prefix}claimed_executions"
|
|
160
|
+
connection.quote_table_name("#{table_prefix}claimed_executions")
|
|
158
161
|
end
|
|
159
162
|
|
|
160
163
|
def failed_executions_table
|
|
161
|
-
"#{table_prefix}failed_executions"
|
|
164
|
+
connection.quote_table_name("#{table_prefix}failed_executions")
|
|
162
165
|
end
|
|
163
166
|
|
|
164
167
|
def blocked_executions_table
|
|
165
|
-
"#{table_prefix}blocked_executions"
|
|
168
|
+
connection.quote_table_name("#{table_prefix}blocked_executions")
|
|
166
169
|
end
|
|
167
170
|
|
|
168
171
|
def processes_table
|
|
169
|
-
"#{table_prefix}processes"
|
|
172
|
+
connection.quote_table_name("#{table_prefix}processes")
|
|
170
173
|
end
|
|
171
174
|
end
|
|
172
175
|
end
|
|
@@ -11,6 +11,11 @@ module SolidQueueAutoscaler
|
|
|
11
11
|
# Configuration happens via initializer, nothing to do here
|
|
12
12
|
end
|
|
13
13
|
|
|
14
|
+
# After all initializers have run, apply job settings from configuration
|
|
15
|
+
config.after_initialize do
|
|
16
|
+
SolidQueueAutoscaler.apply_job_settings!
|
|
17
|
+
end
|
|
18
|
+
|
|
14
19
|
rake_tasks do
|
|
15
20
|
namespace :solid_queue_autoscaler do
|
|
16
21
|
desc 'Run the autoscaler once for a specific worker (default: :default). Use WORKER=name'
|
|
@@ -167,16 +167,18 @@ module SolidQueueAutoscaler
|
|
|
167
167
|
end
|
|
168
168
|
|
|
169
169
|
# Creates a new scale event record.
|
|
170
|
+
# Returns nil if the table doesn't exist or on error (does not raise).
|
|
170
171
|
# @param attrs [Hash] Event attributes
|
|
171
172
|
# @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
|
|
172
|
-
# @return [ScaleEvent] The created event
|
|
173
|
-
def create
|
|
173
|
+
# @return [ScaleEvent, nil] The created event, or nil on failure
|
|
174
|
+
def create(attrs, connection: nil)
|
|
174
175
|
conn = connection || default_connection
|
|
175
176
|
return nil unless table_exists?(conn)
|
|
176
177
|
|
|
177
178
|
now = Time.current
|
|
179
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
178
180
|
sql = <<~SQL
|
|
179
|
-
INSERT INTO #{
|
|
181
|
+
INSERT INTO #{quoted_table}
|
|
180
182
|
(worker_name, action, from_workers, to_workers, reason,
|
|
181
183
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at)
|
|
182
184
|
VALUES
|
|
@@ -199,10 +201,18 @@ module SolidQueueAutoscaler
|
|
|
199
201
|
new(attrs.merge(id: id, created_at: now))
|
|
200
202
|
rescue StandardError => e
|
|
201
203
|
# Log but don't fail if event recording fails
|
|
202
|
-
Rails.
|
|
204
|
+
if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
205
|
+
Rails.logger.warn("[Autoscaler] Failed to record event: #{e.message}")
|
|
206
|
+
end
|
|
203
207
|
nil
|
|
204
208
|
end
|
|
205
209
|
|
|
210
|
+
# Alias for backward compatibility
|
|
211
|
+
# @deprecated Use {#create} instead
|
|
212
|
+
def create!(attrs, connection: nil)
|
|
213
|
+
create(attrs, connection: connection)
|
|
214
|
+
end
|
|
215
|
+
|
|
206
216
|
# Finds recent events.
|
|
207
217
|
# @param limit [Integer] Maximum number of events to return
|
|
208
218
|
# @param worker_name [String, nil] Filter by worker name
|
|
@@ -213,11 +223,12 @@ module SolidQueueAutoscaler
|
|
|
213
223
|
return [] unless table_exists?(conn)
|
|
214
224
|
|
|
215
225
|
filter = worker_name ? "WHERE worker_name = #{conn.quote(worker_name)}" : ''
|
|
226
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
216
227
|
|
|
217
228
|
sql = <<~SQL
|
|
218
229
|
SELECT id, worker_name, action, from_workers, to_workers, reason,
|
|
219
230
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at
|
|
220
|
-
FROM #{
|
|
231
|
+
FROM #{quoted_table}
|
|
221
232
|
#{filter}
|
|
222
233
|
ORDER BY created_at DESC
|
|
223
234
|
LIMIT #{limit.to_i}
|
|
@@ -237,10 +248,11 @@ module SolidQueueAutoscaler
|
|
|
237
248
|
conn = connection || default_connection
|
|
238
249
|
return [] unless table_exists?(conn)
|
|
239
250
|
|
|
251
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
240
252
|
sql = <<~SQL
|
|
241
253
|
SELECT id, worker_name, action, from_workers, to_workers, reason,
|
|
242
254
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at
|
|
243
|
-
FROM #{
|
|
255
|
+
FROM #{quoted_table}
|
|
244
256
|
WHERE action = #{conn.quote(action)}
|
|
245
257
|
ORDER BY created_at DESC
|
|
246
258
|
LIMIT #{limit.to_i}
|
|
@@ -261,6 +273,7 @@ module SolidQueueAutoscaler
|
|
|
261
273
|
return default_stats unless table_exists?(conn)
|
|
262
274
|
|
|
263
275
|
worker_filter = worker_name ? "AND worker_name = #{conn.quote(worker_name)}" : ''
|
|
276
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
264
277
|
|
|
265
278
|
sql = <<~SQL
|
|
266
279
|
SELECT
|
|
@@ -268,7 +281,7 @@ module SolidQueueAutoscaler
|
|
|
268
281
|
COUNT(*) as count,
|
|
269
282
|
AVG(queue_depth) as avg_queue_depth,
|
|
270
283
|
AVG(latency_seconds) as avg_latency
|
|
271
|
-
FROM #{
|
|
284
|
+
FROM #{quoted_table}
|
|
272
285
|
WHERE created_at >= #{conn.quote(since)}
|
|
273
286
|
#{worker_filter}
|
|
274
287
|
GROUP BY action
|
|
@@ -289,9 +302,10 @@ module SolidQueueAutoscaler
|
|
|
289
302
|
return 0 unless table_exists?(conn)
|
|
290
303
|
|
|
291
304
|
cutoff = Time.current - keep_days.days
|
|
305
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
292
306
|
|
|
293
307
|
sql = <<~SQL
|
|
294
|
-
DELETE FROM #{
|
|
308
|
+
DELETE FROM #{quoted_table}
|
|
295
309
|
WHERE created_at < #{conn.quote(cutoff)}
|
|
296
310
|
SQL
|
|
297
311
|
|
|
@@ -320,8 +334,9 @@ module SolidQueueAutoscaler
|
|
|
320
334
|
return 0 unless table_exists?(conn)
|
|
321
335
|
|
|
322
336
|
time_filter = since ? "WHERE created_at >= #{conn.quote(since)}" : ''
|
|
337
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
323
338
|
|
|
324
|
-
sql = "SELECT COUNT(*) FROM #{
|
|
339
|
+
sql = "SELECT COUNT(*) FROM #{quoted_table} #{time_filter}"
|
|
325
340
|
conn.select_value(sql).to_i
|
|
326
341
|
rescue StandardError
|
|
327
342
|
0
|
|
@@ -367,7 +382,8 @@ module SolidQueueAutoscaler
|
|
|
367
382
|
result[:recent_events] = count(since: 24.hours.ago, connection: conn)
|
|
368
383
|
|
|
369
384
|
# Get last event time
|
|
370
|
-
|
|
385
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
386
|
+
sql = "SELECT MAX(created_at) FROM #{quoted_table}"
|
|
371
387
|
last_at = conn.select_value(sql)
|
|
372
388
|
result[:last_event_at] = last_at ? parse_time(last_at) : nil
|
|
373
389
|
rescue StandardError => e
|
|
@@ -25,9 +25,13 @@ module SolidQueueAutoscaler
|
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
# Per-configuration cooldown tracking for multi-worker support
|
|
28
|
+
# Thread-safe mutex for cooldown tracking - defined as constant to avoid
|
|
29
|
+
# race condition where lazy initialization could create multiple mutexes
|
|
30
|
+
COOLDOWN_MUTEX = Mutex.new
|
|
31
|
+
|
|
28
32
|
class << self
|
|
29
33
|
def cooldown_mutex
|
|
30
|
-
|
|
34
|
+
COOLDOWN_MUTEX
|
|
31
35
|
end
|
|
32
36
|
|
|
33
37
|
def cooldowns
|
|
@@ -158,12 +162,17 @@ module SolidQueueAutoscaler
|
|
|
158
162
|
target = decision.to.clamp(@config.min_workers, @config.max_workers)
|
|
159
163
|
|
|
160
164
|
if target != decision.to
|
|
161
|
-
logger
|
|
165
|
+
logger&.warn(
|
|
162
166
|
"[Autoscaler] Clamping target from #{decision.to} to #{target} " \
|
|
163
167
|
"(limits: #{@config.min_workers}-#{@config.max_workers})"
|
|
164
168
|
)
|
|
165
|
-
#
|
|
166
|
-
decision
|
|
169
|
+
# Create a new decision with the clamped target instead of mutating
|
|
170
|
+
decision = DecisionEngine::Decision.new(
|
|
171
|
+
action: decision.action,
|
|
172
|
+
from: decision.from,
|
|
173
|
+
to: target,
|
|
174
|
+
reason: decision.reason
|
|
175
|
+
)
|
|
167
176
|
end
|
|
168
177
|
|
|
169
178
|
@adapter.scale(target)
|
|
@@ -250,7 +259,7 @@ module SolidQueueAutoscaler
|
|
|
250
259
|
|
|
251
260
|
def log_decision(decision, metrics)
|
|
252
261
|
worker_label = @config.name == :default ? '' : "[#{@config.name}] "
|
|
253
|
-
logger
|
|
262
|
+
logger&.info(
|
|
254
263
|
"[Autoscaler] #{worker_label}Evaluated: action=#{decision.action} " \
|
|
255
264
|
"workers=#{decision.from}->#{decision.to} " \
|
|
256
265
|
"queue_depth=#{metrics.queue_depth} " \
|
|
@@ -262,7 +271,7 @@ module SolidQueueAutoscaler
|
|
|
262
271
|
def log_scale_action(decision)
|
|
263
272
|
prefix = @config.dry_run? ? '[DRY RUN] ' : ''
|
|
264
273
|
worker_label = @config.name == :default ? '' : "[#{@config.name}] "
|
|
265
|
-
logger
|
|
274
|
+
logger&.info(
|
|
266
275
|
"#{prefix}[Autoscaler] #{worker_label}Scaling #{decision.action}: " \
|
|
267
276
|
"#{decision.from} -> #{decision.to} workers (#{decision.reason})"
|
|
268
277
|
)
|
|
@@ -281,7 +290,7 @@ module SolidQueueAutoscaler
|
|
|
281
290
|
end
|
|
282
291
|
|
|
283
292
|
def skipped_result(reason, decision: nil, metrics: nil)
|
|
284
|
-
logger
|
|
293
|
+
logger&.debug("[Autoscaler] Skipped: #{reason}")
|
|
285
294
|
|
|
286
295
|
# Record skipped events
|
|
287
296
|
record_skipped_event(reason, decision, metrics)
|
|
@@ -296,7 +305,7 @@ module SolidQueueAutoscaler
|
|
|
296
305
|
end
|
|
297
306
|
|
|
298
307
|
def error_result(error)
|
|
299
|
-
logger
|
|
308
|
+
logger&.error("[Autoscaler] Error: #{error.class}: #{error.message}")
|
|
300
309
|
|
|
301
310
|
# Record error events
|
|
302
311
|
record_error_event(error)
|
|
@@ -315,7 +324,7 @@ module SolidQueueAutoscaler
|
|
|
315
324
|
def record_scale_event(decision, metrics)
|
|
316
325
|
return unless @config.record_events?
|
|
317
326
|
|
|
318
|
-
ScaleEvent.create
|
|
327
|
+
ScaleEvent.create(
|
|
319
328
|
{
|
|
320
329
|
worker_name: @config.name.to_s,
|
|
321
330
|
action: decision.action.to_s,
|
|
@@ -334,7 +343,7 @@ module SolidQueueAutoscaler
|
|
|
334
343
|
def record_skipped_event(reason, decision, metrics)
|
|
335
344
|
return unless @config.record_events?
|
|
336
345
|
|
|
337
|
-
ScaleEvent.create
|
|
346
|
+
ScaleEvent.create(
|
|
338
347
|
{
|
|
339
348
|
worker_name: @config.name.to_s,
|
|
340
349
|
action: 'skipped',
|
|
@@ -353,7 +362,7 @@ module SolidQueueAutoscaler
|
|
|
353
362
|
def record_error_event(error)
|
|
354
363
|
return unless @config.record_events?
|
|
355
364
|
|
|
356
|
-
ScaleEvent.create
|
|
365
|
+
ScaleEvent.create(
|
|
357
366
|
{
|
|
358
367
|
worker_name: @config.name.to_s,
|
|
359
368
|
action: 'error',
|
|
@@ -100,6 +100,33 @@ module SolidQueueAutoscaler
|
|
|
100
100
|
end
|
|
101
101
|
end
|
|
102
102
|
|
|
103
|
+
# Apply job settings (queue, priority) from configuration to AutoscaleJob.
|
|
104
|
+
# Called automatically after Rails initializers run via the railtie.
|
|
105
|
+
# Uses the first configured worker's job_queue/job_priority settings.
|
|
106
|
+
def apply_job_settings!
|
|
107
|
+
return unless defined?(AutoscaleJob)
|
|
108
|
+
return if configurations.empty?
|
|
109
|
+
|
|
110
|
+
# Use the first configured worker's settings
|
|
111
|
+
first_config = configurations.values.first
|
|
112
|
+
job_queue = first_config&.job_queue || :autoscaler
|
|
113
|
+
job_priority = first_config&.job_priority
|
|
114
|
+
|
|
115
|
+
# Set the queue_name class attribute directly (not via queue_as block)
|
|
116
|
+
# This ensures SolidQueue recurring jobs pick up the correct queue
|
|
117
|
+
# Convert to string since ActiveJob internally uses strings for queue names
|
|
118
|
+
AutoscaleJob.queue_name = job_queue.to_s
|
|
119
|
+
|
|
120
|
+
# Set priority if configured
|
|
121
|
+
if job_priority && AutoscaleJob.respond_to?(:priority=)
|
|
122
|
+
AutoscaleJob.priority = job_priority
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
first_config&.logger&.debug(
|
|
126
|
+
"[SolidQueueAutoscaler] AutoscaleJob configured: queue=#{job_queue}, priority=#{job_priority || 'default'}"
|
|
127
|
+
)
|
|
128
|
+
end
|
|
129
|
+
|
|
103
130
|
# Verify the installation is complete and working.
|
|
104
131
|
# Prints a human-friendly report (when verbose: true) and returns a VerificationResult.
|
|
105
132
|
#
|