solid_queue_autoscaler 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/lib/solid_queue_autoscaler/adapters/heroku.rb +48 -3
- data/lib/solid_queue_autoscaler/adapters/kubernetes.rb +44 -6
- data/lib/solid_queue_autoscaler/advisory_lock.rb +20 -0
- data/lib/solid_queue_autoscaler/configuration.rb +15 -0
- data/lib/solid_queue_autoscaler/cooldown_tracker.rb +50 -9
- data/lib/solid_queue_autoscaler/dashboard.rb +10 -4
- data/lib/solid_queue_autoscaler/metrics.rb +10 -7
- data/lib/solid_queue_autoscaler/scale_event.rb +26 -10
- data/lib/solid_queue_autoscaler/scaler.rb +20 -11
- data/lib/solid_queue_autoscaler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cf794daeb74474c136c8aec706793bf74617dcb610abf42df89ddb4fefd99274
|
|
4
|
+
data.tar.gz: 96ec9ad6993871c7773ff524c5d71ff3e919ff89aacba5587d5dee63aa277f5d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0b8dd105d028035aee534300ee1d91af9193faefe4e27c8ba3d72c98b3c401cdc52a04894bdeef0b14ebdd7a42f120e563bb378509f2a9ac046a18554c7079d0
|
|
7
|
+
data.tar.gz: 9980ac5a53affb82b264cd9e7c9bacb388d26a5ecc0116fdd1e22ce527c66ec63bc51d5fe47a492df1941df1b4d0052cfed721e7e4b62566a5c70ab885852f9c
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [1.0.11] - 2025-01-17
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
#### Critical Fixes
|
|
15
|
+
- **Thread safety** - Fixed race condition in mutex initialization (`scaler.rb`). Changed from lazy `@cooldown_mutex ||= Mutex.new` to thread-safe class constant `COOLDOWN_MUTEX`
|
|
16
|
+
- **SQL injection prevention** - Added regex validation for `table_prefix` configuration to only allow `[a-z0-9_]+` pattern
|
|
17
|
+
- **PgBouncer documentation** - Added prominent warning in `advisory_lock.rb` about incompatibility with PgBouncer transaction pooling mode
|
|
18
|
+
|
|
19
|
+
#### High Priority Fixes
|
|
20
|
+
- **CooldownTracker caching** - Added 5-minute TTL for `table_exists?` cache and `reset_table_exists_cache!` method for manual invalidation
|
|
21
|
+
- **ScaleEvent naming** - Renamed `create!` to `create` (non-bang) since it catches exceptions and returns nil. Added `create!` as deprecated alias for backward compatibility
|
|
22
|
+
- **Decision struct mutation** - Fixed mutation of Decision struct when clamping target workers. Now creates a new Decision instead of modifying the existing one
|
|
23
|
+
- **ZeroDivisionError prevention** - Added validation that `scale_up_jobs_per_worker`, `scale_up_latency_per_worker`, and `scale_down_jobs_per_worker` must be > 0 when using proportional scaling
|
|
24
|
+
|
|
25
|
+
#### Medium Priority Fixes
|
|
26
|
+
- **Retry logic for adapters** - Added exponential backoff retry (up to 3 attempts, sleeping 1s and then 2s between them; the configured 4s delay is only reached if `MAX_RETRIES` is raised) for transient network errors in both Heroku and Kubernetes adapters
|
|
27
|
+
- **Time parsing** - Fixed timezone handling in `cooldown_tracker.rb` to properly handle Time, DateTime, and String values
|
|
28
|
+
- **Dashboard query optimization** - Batched cooldown state retrieval in `worker_status` to reduce database queries
|
|
29
|
+
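- **Metrics nil handling** - `oldest_job_age_seconds` now returns `0.0` through an explicit nil check when no jobs exist (the previous `result.to_f` already produced `0.0` for `nil`, so the return value is unchanged)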
|
|
30
|
+
- **Kubernetes timeout** - Added 30-second timeout configuration to kubeclient API calls
|
|
31
|
+
|
|
32
|
+
#### Low Priority Fixes
|
|
33
|
+
- **Safe logger calls** - Added safe navigation (`logger&.warn`) throughout to prevent nil errors
|
|
34
|
+
- **SQL table quoting** - Now uses `connection.quote_table_name()` for all table name interpolations
|
|
35
|
+
- **Rails.logger nil check** - Added proper nil check before using `Rails.logger` in `scale_event.rb`
|
|
36
|
+
|
|
10
37
|
## [1.0.10] - 2025-01-17
|
|
11
38
|
|
|
12
39
|
### Fixed
|
|
@@ -20,9 +20,22 @@ module SolidQueueAutoscaler
|
|
|
20
20
|
# config.process_type = 'worker'
|
|
21
21
|
# end
|
|
22
22
|
class Heroku < Base
|
|
23
|
+
# Retry configuration for transient network errors
|
|
24
|
+
MAX_RETRIES = 3
|
|
25
|
+
RETRY_DELAYS = [1, 2, 4].freeze # Exponential backoff in seconds
|
|
26
|
+
|
|
27
|
+
# Errors that are safe to retry (transient network issues)
|
|
28
|
+
RETRYABLE_ERRORS = [
|
|
29
|
+
Excon::Error::Timeout,
|
|
30
|
+
Excon::Error::Socket,
|
|
31
|
+
Excon::Error::HTTPStatus
|
|
32
|
+
].freeze
|
|
33
|
+
|
|
23
34
|
def current_workers
|
|
24
|
-
|
|
25
|
-
|
|
35
|
+
with_retry do
|
|
36
|
+
formation = client.formation.info(app_name, process_type)
|
|
37
|
+
formation['quantity']
|
|
38
|
+
end
|
|
26
39
|
rescue Excon::Error => e
|
|
27
40
|
raise HerokuAPIError.new(
|
|
28
41
|
"Failed to get formation info: #{e.message}",
|
|
@@ -37,7 +50,9 @@ module SolidQueueAutoscaler
|
|
|
37
50
|
return quantity
|
|
38
51
|
end
|
|
39
52
|
|
|
40
|
-
|
|
53
|
+
with_retry do
|
|
54
|
+
client.formation.update(app_name, process_type, { quantity: quantity })
|
|
55
|
+
end
|
|
41
56
|
quantity
|
|
42
57
|
rescue Excon::Error => e
|
|
43
58
|
raise HerokuAPIError.new(
|
|
@@ -73,6 +88,36 @@ module SolidQueueAutoscaler
|
|
|
73
88
|
|
|
74
89
|
private
|
|
75
90
|
|
|
91
|
+
# Executes a block with retry logic for transient network errors.
|
|
92
|
+
# Uses exponential backoff from RETRY_DELAYS; with MAX_RETRIES = 3 only the 1s and 2s delays are slept (the third failure re-raises).
|
|
93
|
+
def with_retry
|
|
94
|
+
attempts = 0
|
|
95
|
+
begin
|
|
96
|
+
attempts += 1
|
|
97
|
+
yield
|
|
98
|
+
rescue *RETRYABLE_ERRORS => e
|
|
99
|
+
if attempts < MAX_RETRIES && retryable_error?(e)
|
|
100
|
+
delay = RETRY_DELAYS[attempts - 1] || RETRY_DELAYS.last
|
|
101
|
+
logger&.warn("[Autoscaler] Heroku API error (attempt #{attempts}/#{MAX_RETRIES}), retrying in #{delay}s: #{e.message}")
|
|
102
|
+
sleep(delay)
|
|
103
|
+
retry
|
|
104
|
+
end
|
|
105
|
+
raise
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Determines if an error should be retried.
|
|
110
|
+
# Retries timeouts, 5xx errors, and 429 rate limiting, but not other 4xx client errors.
|
|
111
|
+
def retryable_error?(error)
|
|
112
|
+
return true unless error.respond_to?(:response) && error.response
|
|
113
|
+
|
|
114
|
+
status = error.response.status
|
|
115
|
+
return true if status.nil?
|
|
116
|
+
|
|
117
|
+
# Retry server errors (5xx), not client errors (4xx)
|
|
118
|
+
status >= 500 || status == 429 # Also retry rate limiting
|
|
119
|
+
end
|
|
120
|
+
|
|
76
121
|
def client
|
|
77
122
|
@client ||= PlatformAPI.connect_oauth(api_key)
|
|
78
123
|
end
|
|
@@ -30,9 +30,18 @@ module SolidQueueAutoscaler
|
|
|
30
30
|
# Kubernetes API path for apps/v1 group
|
|
31
31
|
APPS_API_VERSION = 'apis/apps/v1'
|
|
32
32
|
|
|
33
|
+
# Retry configuration for transient network errors
|
|
34
|
+
MAX_RETRIES = 3
|
|
35
|
+
RETRY_DELAYS = [1, 2, 4].freeze # Exponential backoff in seconds
|
|
36
|
+
|
|
37
|
+
# Default timeout for Kubernetes API calls (seconds)
|
|
38
|
+
DEFAULT_TIMEOUT = 30
|
|
39
|
+
|
|
33
40
|
def current_workers
|
|
34
|
-
|
|
35
|
-
|
|
41
|
+
with_retry do
|
|
42
|
+
deployment = apps_client.get_deployment(deployment_name, namespace)
|
|
43
|
+
deployment.spec.replicas
|
|
44
|
+
end
|
|
36
45
|
rescue StandardError => e
|
|
37
46
|
raise KubernetesAPIError.new("Failed to get deployment info: #{e.message}", original_error: e)
|
|
38
47
|
end
|
|
@@ -43,8 +52,10 @@ module SolidQueueAutoscaler
|
|
|
43
52
|
return quantity
|
|
44
53
|
end
|
|
45
54
|
|
|
46
|
-
|
|
47
|
-
|
|
55
|
+
with_retry do
|
|
56
|
+
patch_body = { spec: { replicas: quantity } }
|
|
57
|
+
apps_client.patch_deployment(deployment_name, patch_body, namespace)
|
|
58
|
+
end
|
|
48
59
|
quantity
|
|
49
60
|
rescue StandardError => e
|
|
50
61
|
raise KubernetesAPIError.new("Failed to scale deployment #{deployment_name} to #{quantity}: #{e.message}",
|
|
@@ -64,6 +75,25 @@ module SolidQueueAutoscaler
|
|
|
64
75
|
|
|
65
76
|
private
|
|
66
77
|
|
|
78
|
+
# Executes a block with retry logic for transient network errors.
|
|
79
|
+
# Uses exponential backoff from RETRY_DELAYS; with MAX_RETRIES = 3 only the 1s and 2s delays are slept (the third failure re-raises).
|
|
80
|
+
def with_retry
|
|
81
|
+
attempts = 0
|
|
82
|
+
begin
|
|
83
|
+
attempts += 1
|
|
84
|
+
yield
|
|
85
|
+
rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::ECONNRESET,
|
|
86
|
+
Net::OpenTimeout, Net::ReadTimeout, SocketError => e
|
|
87
|
+
if attempts < MAX_RETRIES
|
|
88
|
+
delay = RETRY_DELAYS[attempts - 1] || RETRY_DELAYS.last
|
|
89
|
+
logger&.warn("[Autoscaler] Kubernetes API error (attempt #{attempts}/#{MAX_RETRIES}), retrying in #{delay}s: #{e.message}")
|
|
90
|
+
sleep(delay)
|
|
91
|
+
retry
|
|
92
|
+
end
|
|
93
|
+
raise
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
67
97
|
def apps_client
|
|
68
98
|
@apps_client ||= build_apps_client
|
|
69
99
|
end
|
|
@@ -95,7 +125,11 @@ module SolidQueueAutoscaler
|
|
|
95
125
|
api_endpoint,
|
|
96
126
|
'v1',
|
|
97
127
|
auth_options: auth_options,
|
|
98
|
-
ssl_options: ssl_options
|
|
128
|
+
ssl_options: ssl_options,
|
|
129
|
+
timeouts: {
|
|
130
|
+
open: DEFAULT_TIMEOUT,
|
|
131
|
+
read: DEFAULT_TIMEOUT
|
|
132
|
+
}
|
|
99
133
|
)
|
|
100
134
|
end
|
|
101
135
|
|
|
@@ -112,7 +146,11 @@ module SolidQueueAutoscaler
|
|
|
112
146
|
api_endpoint,
|
|
113
147
|
'v1',
|
|
114
148
|
ssl_options: context.ssl_options,
|
|
115
|
-
auth_options: context.auth_options
|
|
149
|
+
auth_options: context.auth_options,
|
|
150
|
+
timeouts: {
|
|
151
|
+
open: DEFAULT_TIMEOUT,
|
|
152
|
+
read: DEFAULT_TIMEOUT
|
|
153
|
+
}
|
|
116
154
|
)
|
|
117
155
|
end
|
|
118
156
|
|
|
@@ -3,6 +3,26 @@
|
|
|
3
3
|
require 'zlib'
|
|
4
4
|
|
|
5
5
|
module SolidQueueAutoscaler
|
|
6
|
+
# PostgreSQL advisory lock wrapper for singleton enforcement.
|
|
7
|
+
#
|
|
8
|
+
# IMPORTANT: PgBouncer Compatibility Warning
|
|
9
|
+
# ==========================================
|
|
10
|
+
# PostgreSQL advisory locks are connection-scoped (session-level locks).
|
|
11
|
+
# If you're using PgBouncer in transaction pooling mode, advisory locks
|
|
12
|
+
# will NOT work correctly because:
|
|
13
|
+
# 1. Each query may run on a different backend connection
|
|
14
|
+
# 2. The lock acquired on one connection won't be visible on another
|
|
15
|
+
# 3. The lock may be "released" when returned to the pool
|
|
16
|
+
#
|
|
17
|
+
# Solutions:
|
|
18
|
+
# - Use PgBouncer in session pooling mode for the queue database
|
|
19
|
+
# - Use a direct connection (bypass PgBouncer) for the autoscaler
|
|
20
|
+
# - Disable advisory locks and use external coordination (Redis, etc.)
|
|
21
|
+
# - Set config.persist_cooldowns = false and rely on a single process
|
|
22
|
+
#
|
|
23
|
+
# If you're seeing multiple autoscalers running simultaneously or
|
|
24
|
+
# lock acquisition always failing, PgBouncer is likely the cause.
|
|
25
|
+
#
|
|
6
26
|
class AdvisoryLock
|
|
7
27
|
attr_reader :lock_key, :timeout
|
|
8
28
|
|
|
@@ -175,6 +175,21 @@ module SolidQueueAutoscaler
|
|
|
175
175
|
errors << 'table_prefix cannot be nil or empty'
|
|
176
176
|
elsif !table_prefix.to_s.end_with?('_')
|
|
177
177
|
errors << 'table_prefix must end with an underscore'
|
|
178
|
+
elsif !table_prefix.to_s.match?(/\A[a-z0-9_]+\z/)
|
|
179
|
+
errors << 'table_prefix must contain only lowercase letters, numbers, and underscores'
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Validate proportional scaling settings to prevent ZeroDivisionError
|
|
183
|
+
if scaling_strategy == :proportional
|
|
184
|
+
if scale_up_jobs_per_worker.nil? || scale_up_jobs_per_worker <= 0
|
|
185
|
+
errors << 'scale_up_jobs_per_worker must be > 0 for proportional scaling'
|
|
186
|
+
end
|
|
187
|
+
if scale_up_latency_per_worker.nil? || scale_up_latency_per_worker <= 0
|
|
188
|
+
errors << 'scale_up_latency_per_worker must be > 0 for proportional scaling'
|
|
189
|
+
end
|
|
190
|
+
if scale_down_jobs_per_worker.nil? || scale_down_jobs_per_worker <= 0
|
|
191
|
+
errors << 'scale_down_jobs_per_worker must be > 0 for proportional scaling'
|
|
192
|
+
end
|
|
178
193
|
end
|
|
179
194
|
|
|
180
195
|
unless VALID_SCALING_STRATEGIES.include?(scaling_strategy)
|
|
@@ -13,16 +13,24 @@ module SolidQueueAutoscaler
|
|
|
13
13
|
@config = config || SolidQueueAutoscaler.config
|
|
14
14
|
@key = key
|
|
15
15
|
@table_exists = nil
|
|
16
|
+
@table_exists_checked_at = nil
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# Resets the cached table_exists? result.
|
|
20
|
+
# Call this after running migrations to re-check table existence.
|
|
21
|
+
def reset_table_exists_cache!
|
|
22
|
+
@table_exists = nil
|
|
23
|
+
@table_exists_checked_at = nil
|
|
16
24
|
end
|
|
17
25
|
|
|
18
26
|
def last_scale_up_at
|
|
19
27
|
return nil unless table_exists?
|
|
20
28
|
|
|
21
29
|
result = connection.select_value(<<~SQL)
|
|
22
|
-
SELECT last_scale_up_at FROM #{
|
|
30
|
+
SELECT last_scale_up_at FROM #{quoted_table_name}
|
|
23
31
|
WHERE key = #{connection.quote(key)}
|
|
24
32
|
SQL
|
|
25
|
-
|
|
33
|
+
parse_time_result(result)
|
|
26
34
|
rescue ArgumentError
|
|
27
35
|
nil
|
|
28
36
|
end
|
|
@@ -31,10 +39,10 @@ module SolidQueueAutoscaler
|
|
|
31
39
|
return nil unless table_exists?
|
|
32
40
|
|
|
33
41
|
result = connection.select_value(<<~SQL)
|
|
34
|
-
SELECT last_scale_down_at FROM #{
|
|
42
|
+
SELECT last_scale_down_at FROM #{quoted_table_name}
|
|
35
43
|
WHERE key = #{connection.quote(key)}
|
|
36
44
|
SQL
|
|
37
|
-
|
|
45
|
+
parse_time_result(result)
|
|
38
46
|
rescue ArgumentError
|
|
39
47
|
nil
|
|
40
48
|
end
|
|
@@ -57,7 +65,7 @@ module SolidQueueAutoscaler
|
|
|
57
65
|
return false unless table_exists?
|
|
58
66
|
|
|
59
67
|
connection.execute(<<~SQL)
|
|
60
|
-
DELETE FROM #{
|
|
68
|
+
DELETE FROM #{quoted_table_name} WHERE key = #{connection.quote(key)}
|
|
61
69
|
SQL
|
|
62
70
|
true
|
|
63
71
|
end
|
|
@@ -92,12 +100,23 @@ module SolidQueueAutoscaler
|
|
|
92
100
|
[remaining, 0].max
|
|
93
101
|
end
|
|
94
102
|
|
|
103
|
+
# Cache TTL for table existence check (5 minutes)
|
|
104
|
+
TABLE_EXISTS_CACHE_TTL = 300
|
|
105
|
+
|
|
95
106
|
def table_exists?
|
|
96
|
-
|
|
107
|
+
# Return cached result if still valid
|
|
108
|
+
if !@table_exists.nil? && @table_exists_checked_at
|
|
109
|
+
cache_age = Time.now - @table_exists_checked_at
|
|
110
|
+
return @table_exists if cache_age < TABLE_EXISTS_CACHE_TTL
|
|
111
|
+
end
|
|
97
112
|
|
|
98
113
|
@table_exists = connection.table_exists?(TABLE_NAME)
|
|
114
|
+
@table_exists_checked_at = Time.now
|
|
115
|
+
@table_exists
|
|
99
116
|
rescue StandardError
|
|
100
117
|
@table_exists = false
|
|
118
|
+
@table_exists_checked_at = Time.now
|
|
119
|
+
@table_exists
|
|
101
120
|
end
|
|
102
121
|
|
|
103
122
|
def state
|
|
@@ -105,7 +124,7 @@ module SolidQueueAutoscaler
|
|
|
105
124
|
|
|
106
125
|
row = connection.select_one(<<~SQL)
|
|
107
126
|
SELECT last_scale_up_at, last_scale_down_at, updated_at
|
|
108
|
-
FROM #{
|
|
127
|
+
FROM #{quoted_table_name}
|
|
109
128
|
WHERE key = #{connection.quote(key)}
|
|
110
129
|
SQL
|
|
111
130
|
|
|
@@ -124,6 +143,28 @@ module SolidQueueAutoscaler
|
|
|
124
143
|
@config.connection
|
|
125
144
|
end
|
|
126
145
|
|
|
146
|
+
def quoted_table_name
|
|
147
|
+
connection.quote_table_name(TABLE_NAME)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Safely parses a time value from the database.
|
|
151
|
+
# Handles Time, DateTime, String, and nil values.
|
|
152
|
+
def parse_time_result(value)
|
|
153
|
+
return nil if value.nil?
|
|
154
|
+
|
|
155
|
+
case value
|
|
156
|
+
when Time, DateTime
|
|
157
|
+
value.to_time
|
|
158
|
+
when String
|
|
159
|
+
Time.parse(value)
|
|
160
|
+
else
|
|
161
|
+
# Try to convert to time if possible
|
|
162
|
+
value.respond_to?(:to_time) ? value.to_time : Time.parse(value.to_s)
|
|
163
|
+
end
|
|
164
|
+
rescue ArgumentError, TypeError
|
|
165
|
+
nil
|
|
166
|
+
end
|
|
167
|
+
|
|
127
168
|
def upsert_state(last_scale_up_at: nil, last_scale_down_at: nil)
|
|
128
169
|
now = Time.current
|
|
129
170
|
quoted_key = connection.quote(key)
|
|
@@ -132,7 +173,7 @@ module SolidQueueAutoscaler
|
|
|
132
173
|
if last_scale_up_at
|
|
133
174
|
quoted_time = connection.quote(last_scale_up_at)
|
|
134
175
|
connection.execute(<<~SQL)
|
|
135
|
-
INSERT INTO #{
|
|
176
|
+
INSERT INTO #{quoted_table_name} (key, last_scale_up_at, created_at, updated_at)
|
|
136
177
|
VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
|
|
137
178
|
ON CONFLICT (key) DO UPDATE SET
|
|
138
179
|
last_scale_up_at = EXCLUDED.last_scale_up_at,
|
|
@@ -141,7 +182,7 @@ module SolidQueueAutoscaler
|
|
|
141
182
|
elsif last_scale_down_at
|
|
142
183
|
quoted_time = connection.quote(last_scale_down_at)
|
|
143
184
|
connection.execute(<<~SQL)
|
|
144
|
-
INSERT INTO #{
|
|
185
|
+
INSERT INTO #{quoted_table_name} (key, last_scale_down_at, created_at, updated_at)
|
|
145
186
|
VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
|
|
146
187
|
ON CONFLICT (key) DO UPDATE SET
|
|
147
188
|
last_scale_down_at = EXCLUDED.last_scale_down_at,
|
|
@@ -11,12 +11,15 @@ module SolidQueueAutoscaler
|
|
|
11
11
|
workers = SolidQueueAutoscaler.registered_workers
|
|
12
12
|
workers = [:default] if workers.empty?
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
# Build a { worker_name => status } hash by calling worker_status once per worker
|
|
15
|
+
workers.each_with_object({}) do |name, result|
|
|
16
|
+
result[name] = worker_status(name)
|
|
16
17
|
end
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# Returns status for a specific worker
|
|
21
|
+
# Note: Each call makes several DB queries. For multiple workers,
|
|
22
|
+
# consider caching or using status() which can batch some queries.
|
|
20
23
|
# @param name [Symbol] Worker name
|
|
21
24
|
# @return [Hash] Status information
|
|
22
25
|
def worker_status(name)
|
|
@@ -24,6 +27,9 @@ module SolidQueueAutoscaler
|
|
|
24
27
|
metrics = safe_metrics(name)
|
|
25
28
|
tracker = CooldownTracker.new(config: config, key: name.to_s)
|
|
26
29
|
|
|
30
|
+
# Batch cooldown state retrieval into one DB call
|
|
31
|
+
cooldown_state = tracker.state
|
|
32
|
+
|
|
27
33
|
{
|
|
28
34
|
name: name,
|
|
29
35
|
enabled: config.enabled?,
|
|
@@ -45,8 +51,8 @@ module SolidQueueAutoscaler
|
|
|
45
51
|
cooldowns: {
|
|
46
52
|
scale_up_remaining: tracker.scale_up_cooldown_remaining.round,
|
|
47
53
|
scale_down_remaining: tracker.scale_down_cooldown_remaining.round,
|
|
48
|
-
last_scale_up:
|
|
49
|
-
last_scale_down:
|
|
54
|
+
last_scale_up: cooldown_state[:last_scale_up_at],
|
|
55
|
+
last_scale_down: cooldown_state[:last_scale_down_at]
|
|
50
56
|
},
|
|
51
57
|
thresholds: {
|
|
52
58
|
scale_up_queue_depth: config.scale_up_queue_depth,
|
|
@@ -72,7 +72,9 @@ module SolidQueueAutoscaler
|
|
|
72
72
|
#{queue_filter_clause}
|
|
73
73
|
SQL
|
|
74
74
|
result = connection.select_value(sql)
|
|
75
|
-
result.to_f
|
|
75
|
+
# Return 0.0 explicitly when no jobs exist (result is nil); nil.to_f would also return 0.0.
|
|
76
|
+
# This makes the return value more predictable and avoids nil-related issues
|
|
77
|
+
result.nil? ? 0.0 : result.to_f
|
|
76
78
|
end
|
|
77
79
|
|
|
78
80
|
def jobs_per_minute
|
|
@@ -141,32 +143,33 @@ module SolidQueueAutoscaler
|
|
|
141
143
|
end
|
|
142
144
|
|
|
143
145
|
# Table name helpers using configurable prefix
|
|
146
|
+
# Uses quote_table_name for SQL safety
|
|
144
147
|
def table_prefix
|
|
145
148
|
@config.table_prefix
|
|
146
149
|
end
|
|
147
150
|
|
|
148
151
|
def ready_executions_table
|
|
149
|
-
"#{table_prefix}ready_executions"
|
|
152
|
+
connection.quote_table_name("#{table_prefix}ready_executions")
|
|
150
153
|
end
|
|
151
154
|
|
|
152
155
|
def jobs_table
|
|
153
|
-
"#{table_prefix}jobs"
|
|
156
|
+
connection.quote_table_name("#{table_prefix}jobs")
|
|
154
157
|
end
|
|
155
158
|
|
|
156
159
|
def claimed_executions_table
|
|
157
|
-
"#{table_prefix}claimed_executions"
|
|
160
|
+
connection.quote_table_name("#{table_prefix}claimed_executions")
|
|
158
161
|
end
|
|
159
162
|
|
|
160
163
|
def failed_executions_table
|
|
161
|
-
"#{table_prefix}failed_executions"
|
|
164
|
+
connection.quote_table_name("#{table_prefix}failed_executions")
|
|
162
165
|
end
|
|
163
166
|
|
|
164
167
|
def blocked_executions_table
|
|
165
|
-
"#{table_prefix}blocked_executions"
|
|
168
|
+
connection.quote_table_name("#{table_prefix}blocked_executions")
|
|
166
169
|
end
|
|
167
170
|
|
|
168
171
|
def processes_table
|
|
169
|
-
"#{table_prefix}processes"
|
|
172
|
+
connection.quote_table_name("#{table_prefix}processes")
|
|
170
173
|
end
|
|
171
174
|
end
|
|
172
175
|
end
|
|
@@ -167,16 +167,18 @@ module SolidQueueAutoscaler
|
|
|
167
167
|
end
|
|
168
168
|
|
|
169
169
|
# Creates a new scale event record.
|
|
170
|
+
# Returns nil if the table doesn't exist or on error (does not raise).
|
|
170
171
|
# @param attrs [Hash] Event attributes
|
|
171
172
|
# @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
|
|
172
|
-
# @return [ScaleEvent] The created event
|
|
173
|
-
def create
|
|
173
|
+
# @return [ScaleEvent, nil] The created event, or nil on failure
|
|
174
|
+
def create(attrs, connection: nil)
|
|
174
175
|
conn = connection || default_connection
|
|
175
176
|
return nil unless table_exists?(conn)
|
|
176
177
|
|
|
177
178
|
now = Time.current
|
|
179
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
178
180
|
sql = <<~SQL
|
|
179
|
-
INSERT INTO #{
|
|
181
|
+
INSERT INTO #{quoted_table}
|
|
180
182
|
(worker_name, action, from_workers, to_workers, reason,
|
|
181
183
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at)
|
|
182
184
|
VALUES
|
|
@@ -199,10 +201,18 @@ module SolidQueueAutoscaler
|
|
|
199
201
|
new(attrs.merge(id: id, created_at: now))
|
|
200
202
|
rescue StandardError => e
|
|
201
203
|
# Log but don't fail if event recording fails
|
|
202
|
-
Rails.
|
|
204
|
+
if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
205
|
+
Rails.logger.warn("[Autoscaler] Failed to record event: #{e.message}")
|
|
206
|
+
end
|
|
203
207
|
nil
|
|
204
208
|
end
|
|
205
209
|
|
|
210
|
+
# Alias for backward compatibility
|
|
211
|
+
# @deprecated Use {.create} instead
|
|
212
|
+
def create!(attrs, connection: nil)
|
|
213
|
+
create(attrs, connection: connection)
|
|
214
|
+
end
|
|
215
|
+
|
|
206
216
|
# Finds recent events.
|
|
207
217
|
# @param limit [Integer] Maximum number of events to return
|
|
208
218
|
# @param worker_name [String, nil] Filter by worker name
|
|
@@ -213,11 +223,12 @@ module SolidQueueAutoscaler
|
|
|
213
223
|
return [] unless table_exists?(conn)
|
|
214
224
|
|
|
215
225
|
filter = worker_name ? "WHERE worker_name = #{conn.quote(worker_name)}" : ''
|
|
226
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
216
227
|
|
|
217
228
|
sql = <<~SQL
|
|
218
229
|
SELECT id, worker_name, action, from_workers, to_workers, reason,
|
|
219
230
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at
|
|
220
|
-
FROM #{
|
|
231
|
+
FROM #{quoted_table}
|
|
221
232
|
#{filter}
|
|
222
233
|
ORDER BY created_at DESC
|
|
223
234
|
LIMIT #{limit.to_i}
|
|
@@ -237,10 +248,11 @@ module SolidQueueAutoscaler
|
|
|
237
248
|
conn = connection || default_connection
|
|
238
249
|
return [] unless table_exists?(conn)
|
|
239
250
|
|
|
251
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
240
252
|
sql = <<~SQL
|
|
241
253
|
SELECT id, worker_name, action, from_workers, to_workers, reason,
|
|
242
254
|
queue_depth, latency_seconds, metrics_json, dry_run, created_at
|
|
243
|
-
FROM #{
|
|
255
|
+
FROM #{quoted_table}
|
|
244
256
|
WHERE action = #{conn.quote(action)}
|
|
245
257
|
ORDER BY created_at DESC
|
|
246
258
|
LIMIT #{limit.to_i}
|
|
@@ -261,6 +273,7 @@ module SolidQueueAutoscaler
|
|
|
261
273
|
return default_stats unless table_exists?(conn)
|
|
262
274
|
|
|
263
275
|
worker_filter = worker_name ? "AND worker_name = #{conn.quote(worker_name)}" : ''
|
|
276
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
264
277
|
|
|
265
278
|
sql = <<~SQL
|
|
266
279
|
SELECT
|
|
@@ -268,7 +281,7 @@ module SolidQueueAutoscaler
|
|
|
268
281
|
COUNT(*) as count,
|
|
269
282
|
AVG(queue_depth) as avg_queue_depth,
|
|
270
283
|
AVG(latency_seconds) as avg_latency
|
|
271
|
-
FROM #{
|
|
284
|
+
FROM #{quoted_table}
|
|
272
285
|
WHERE created_at >= #{conn.quote(since)}
|
|
273
286
|
#{worker_filter}
|
|
274
287
|
GROUP BY action
|
|
@@ -289,9 +302,10 @@ module SolidQueueAutoscaler
|
|
|
289
302
|
return 0 unless table_exists?(conn)
|
|
290
303
|
|
|
291
304
|
cutoff = Time.current - keep_days.days
|
|
305
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
292
306
|
|
|
293
307
|
sql = <<~SQL
|
|
294
|
-
DELETE FROM #{
|
|
308
|
+
DELETE FROM #{quoted_table}
|
|
295
309
|
WHERE created_at < #{conn.quote(cutoff)}
|
|
296
310
|
SQL
|
|
297
311
|
|
|
@@ -320,8 +334,9 @@ module SolidQueueAutoscaler
|
|
|
320
334
|
return 0 unless table_exists?(conn)
|
|
321
335
|
|
|
322
336
|
time_filter = since ? "WHERE created_at >= #{conn.quote(since)}" : ''
|
|
337
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
323
338
|
|
|
324
|
-
sql = "SELECT COUNT(*) FROM #{
|
|
339
|
+
sql = "SELECT COUNT(*) FROM #{quoted_table} #{time_filter}"
|
|
325
340
|
conn.select_value(sql).to_i
|
|
326
341
|
rescue StandardError
|
|
327
342
|
0
|
|
@@ -367,7 +382,8 @@ module SolidQueueAutoscaler
|
|
|
367
382
|
result[:recent_events] = count(since: 24.hours.ago, connection: conn)
|
|
368
383
|
|
|
369
384
|
# Get last event time
|
|
370
|
-
|
|
385
|
+
quoted_table = conn.quote_table_name(TABLE_NAME)
|
|
386
|
+
sql = "SELECT MAX(created_at) FROM #{quoted_table}"
|
|
371
387
|
last_at = conn.select_value(sql)
|
|
372
388
|
result[:last_event_at] = last_at ? parse_time(last_at) : nil
|
|
373
389
|
rescue StandardError => e
|
|
@@ -25,9 +25,13 @@ module SolidQueueAutoscaler
|
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
# Per-configuration cooldown tracking for multi-worker support
|
|
28
|
+
# Thread-safe mutex for cooldown tracking - defined as constant to avoid
|
|
29
|
+
# race condition where lazy initialization could create multiple mutexes
|
|
30
|
+
COOLDOWN_MUTEX = Mutex.new
|
|
31
|
+
|
|
28
32
|
class << self
|
|
29
33
|
def cooldown_mutex
|
|
30
|
-
|
|
34
|
+
COOLDOWN_MUTEX
|
|
31
35
|
end
|
|
32
36
|
|
|
33
37
|
def cooldowns
|
|
@@ -158,12 +162,17 @@ module SolidQueueAutoscaler
|
|
|
158
162
|
target = decision.to.clamp(@config.min_workers, @config.max_workers)
|
|
159
163
|
|
|
160
164
|
if target != decision.to
|
|
161
|
-
logger
|
|
165
|
+
logger&.warn(
|
|
162
166
|
"[Autoscaler] Clamping target from #{decision.to} to #{target} " \
|
|
163
167
|
"(limits: #{@config.min_workers}-#{@config.max_workers})"
|
|
164
168
|
)
|
|
165
|
-
#
|
|
166
|
-
decision
|
|
169
|
+
# Create a new decision with the clamped target instead of mutating
|
|
170
|
+
decision = DecisionEngine::Decision.new(
|
|
171
|
+
action: decision.action,
|
|
172
|
+
from: decision.from,
|
|
173
|
+
to: target,
|
|
174
|
+
reason: decision.reason
|
|
175
|
+
)
|
|
167
176
|
end
|
|
168
177
|
|
|
169
178
|
@adapter.scale(target)
|
|
@@ -250,7 +259,7 @@ module SolidQueueAutoscaler
|
|
|
250
259
|
|
|
251
260
|
def log_decision(decision, metrics)
|
|
252
261
|
worker_label = @config.name == :default ? '' : "[#{@config.name}] "
|
|
253
|
-
logger
|
|
262
|
+
logger&.info(
|
|
254
263
|
"[Autoscaler] #{worker_label}Evaluated: action=#{decision.action} " \
|
|
255
264
|
"workers=#{decision.from}->#{decision.to} " \
|
|
256
265
|
"queue_depth=#{metrics.queue_depth} " \
|
|
@@ -262,7 +271,7 @@ module SolidQueueAutoscaler
|
|
|
262
271
|
def log_scale_action(decision)
|
|
263
272
|
prefix = @config.dry_run? ? '[DRY RUN] ' : ''
|
|
264
273
|
worker_label = @config.name == :default ? '' : "[#{@config.name}] "
|
|
265
|
-
logger
|
|
274
|
+
logger&.info(
|
|
266
275
|
"#{prefix}[Autoscaler] #{worker_label}Scaling #{decision.action}: " \
|
|
267
276
|
"#{decision.from} -> #{decision.to} workers (#{decision.reason})"
|
|
268
277
|
)
|
|
@@ -281,7 +290,7 @@ module SolidQueueAutoscaler
|
|
|
281
290
|
end
|
|
282
291
|
|
|
283
292
|
def skipped_result(reason, decision: nil, metrics: nil)
|
|
284
|
-
logger
|
|
293
|
+
logger&.debug("[Autoscaler] Skipped: #{reason}")
|
|
285
294
|
|
|
286
295
|
# Record skipped events
|
|
287
296
|
record_skipped_event(reason, decision, metrics)
|
|
@@ -296,7 +305,7 @@ module SolidQueueAutoscaler
|
|
|
296
305
|
end
|
|
297
306
|
|
|
298
307
|
def error_result(error)
|
|
299
|
-
logger
|
|
308
|
+
logger&.error("[Autoscaler] Error: #{error.class}: #{error.message}")
|
|
300
309
|
|
|
301
310
|
# Record error events
|
|
302
311
|
record_error_event(error)
|
|
@@ -315,7 +324,7 @@ module SolidQueueAutoscaler
|
|
|
315
324
|
def record_scale_event(decision, metrics)
|
|
316
325
|
return unless @config.record_events?
|
|
317
326
|
|
|
318
|
-
ScaleEvent.create
|
|
327
|
+
ScaleEvent.create(
|
|
319
328
|
{
|
|
320
329
|
worker_name: @config.name.to_s,
|
|
321
330
|
action: decision.action.to_s,
|
|
@@ -334,7 +343,7 @@ module SolidQueueAutoscaler
|
|
|
334
343
|
def record_skipped_event(reason, decision, metrics)
|
|
335
344
|
return unless @config.record_events?
|
|
336
345
|
|
|
337
|
-
ScaleEvent.create
|
|
346
|
+
ScaleEvent.create(
|
|
338
347
|
{
|
|
339
348
|
worker_name: @config.name.to_s,
|
|
340
349
|
action: 'skipped',
|
|
@@ -353,7 +362,7 @@ module SolidQueueAutoscaler
|
|
|
353
362
|
def record_error_event(error)
|
|
354
363
|
return unless @config.record_events?
|
|
355
364
|
|
|
356
|
-
ScaleEvent.create
|
|
365
|
+
ScaleEvent.create(
|
|
357
366
|
{
|
|
358
367
|
worker_name: @config.name.to_s,
|
|
359
368
|
action: 'error',
|