solid_queue_autoscaler 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 39531d30eeaa5c53e6c5f4fb01f106b586ebe9b0243be5d7e6ae7748d591c52f
- data.tar.gz: f8a4035f055d66f9ce04d6450653c86cc9ee6cd632145db26e0b8c2c2085a731
+ metadata.gz: cf794daeb74474c136c8aec706793bf74617dcb610abf42df89ddb4fefd99274
+ data.tar.gz: 96ec9ad6993871c7773ff524c5d71ff3e919ff89aacba5587d5dee63aa277f5d
  SHA512:
- metadata.gz: df8dca7f8a6e75ee7feea851c3120af1fe522e273e7b937d1df02988bc8297dc72094ea272095db490c80ae4d617bb47f15e5adbee26bd36706373f0c020c7cb
- data.tar.gz: aa902d3745ab5992f5698bb130078170217ed389511aba8f6669704fafe1e04a7ab8df7ff2391104af75951d4ce68941f3cf15df0bf3c1c2cae05e1d4d26648e
+ metadata.gz: 0b8dd105d028035aee534300ee1d91af9193faefe4e27c8ba3d72c98b3c401cdc52a04894bdeef0b14ebdd7a42f120e563bb378509f2a9ac046a18554c7079d0
+ data.tar.gz: 9980ac5a53affb82b264cd9e7c9bacb388d26a5ecc0116fdd1e22ce527c66ec63bc51d5fe47a492df1941df1b4d0052cfed721e7e4b62566a5c70ab885852f9c
data/CHANGELOG.md CHANGED
@@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
  ## [Unreleased]
 
+ ## [1.0.11] - 2025-01-17
+
+ ### Fixed
+
+ #### Critical Fixes
+ - **Thread safety** - Fixed race condition in mutex initialization (`scaler.rb`). Changed from lazy `@cooldown_mutex ||= Mutex.new` to thread-safe class constant `COOLDOWN_MUTEX`
+ - **SQL injection prevention** - Added regex validation for `table_prefix` configuration to only allow `[a-z0-9_]+` pattern
+ - **PgBouncer documentation** - Added prominent warning in `advisory_lock.rb` about incompatibility with PgBouncer transaction pooling mode
+
+ #### High Priority Fixes
+ - **CooldownTracker caching** - Added 5-minute TTL for `table_exists?` cache and `reset_table_exists_cache!` method for manual invalidation
+ - **ScaleEvent naming** - Renamed `create!` to `create` (non-bang) since it catches exceptions and returns nil. Added `create!` as deprecated alias for backward compatibility
+ - **Decision struct mutation** - Fixed mutation of Decision struct when clamping target workers. Now creates a new Decision instead of modifying the existing one
+ - **ZeroDivisionError prevention** - Added validation that `scale_up_jobs_per_worker`, `scale_up_latency_per_worker`, and `scale_down_jobs_per_worker` must be > 0 when using proportional scaling
+
+ #### Medium Priority Fixes
+ - **Retry logic for adapters** - Added exponential backoff retry (3 attempts with 1s/2s/4s delays) for transient network errors in both Heroku and Kubernetes adapters
+ - **Time parsing** - Fixed timezone handling in `cooldown_tracker.rb` to properly handle Time, DateTime, and String values
+ - **Dashboard query optimization** - Batched cooldown state retrieval in `worker_status` to reduce database queries
+ - **Metrics nil handling** - `oldest_job_age_seconds` now returns `0.0` instead of `nil` when no jobs exist
+ - **Kubernetes timeout** - Added 30-second timeout configuration to kubeclient API calls
+
+ #### Low Priority Fixes
+ - **Safe logger calls** - Added safe navigation (`logger&.warn`) throughout to prevent nil errors
+ - **SQL table quoting** - Now uses `connection.quote_table_name()` for all table name interpolations
+ - **Rails.logger nil check** - Added proper nil check before using `Rails.logger` in `scale_event.rb`
+
  ## [1.0.10] - 2025-01-17
 
  ### Fixed
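
The thread-safety fix listed above under Critical Fixes is easiest to see with a small standalone sketch. The snippet below is illustrative rather than the gem's code: it contrasts the lazy `||=` pattern that 1.0.10 used with the constant-based pattern that 1.0.11 adopts in the `scaler.rb` hunk further down.

```ruby
# Illustrative sketch (not from the gem): why `@mutex ||= Mutex.new` is racy.
#
# `||=` expands to "read, test, then write". Two threads can both see
# @cooldown_mutex as nil, each build its own Mutex, and then synchronize on
# different objects - so the critical section is not actually exclusive.
class LazyMutexExample
  def self.cooldown_mutex
    @cooldown_mutex ||= Mutex.new # check-then-set is not atomic
  end
end

# A constant is assigned once, while the class body is loaded and before any
# worker thread calls in, so every caller shares the same Mutex.
class ConstantMutexExample
  COOLDOWN_MUTEX = Mutex.new

  def self.cooldown_mutex
    COOLDOWN_MUTEX
  end
end
```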
@@ -20,9 +20,22 @@ module SolidQueueAutoscaler
  # config.process_type = 'worker'
  # end
  class Heroku < Base
+ # Retry configuration for transient network errors
+ MAX_RETRIES = 3
+ RETRY_DELAYS = [1, 2, 4].freeze # Exponential backoff in seconds
+
+ # Errors that are safe to retry (transient network issues)
+ RETRYABLE_ERRORS = [
+ Excon::Error::Timeout,
+ Excon::Error::Socket,
+ Excon::Error::HTTPStatus
+ ].freeze
+
  def current_workers
- formation = client.formation.info(app_name, process_type)
- formation['quantity']
+ with_retry do
+ formation = client.formation.info(app_name, process_type)
+ formation['quantity']
+ end
  rescue Excon::Error => e
  raise HerokuAPIError.new(
  "Failed to get formation info: #{e.message}",
@@ -37,7 +50,9 @@ module SolidQueueAutoscaler
  return quantity
  end
 
- client.formation.update(app_name, process_type, { quantity: quantity })
+ with_retry do
+ client.formation.update(app_name, process_type, { quantity: quantity })
+ end
  quantity
  rescue Excon::Error => e
  raise HerokuAPIError.new(
@@ -73,6 +88,36 @@ module SolidQueueAutoscaler
 
  private
 
+ # Executes a block with retry logic for transient network errors.
+ # Uses exponential backoff: 1s, 2s, 4s delays between retries.
+ def with_retry
+ attempts = 0
+ begin
+ attempts += 1
+ yield
+ rescue *RETRYABLE_ERRORS => e
+ if attempts < MAX_RETRIES && retryable_error?(e)
+ delay = RETRY_DELAYS[attempts - 1] || RETRY_DELAYS.last
+ logger&.warn("[Autoscaler] Heroku API error (attempt #{attempts}/#{MAX_RETRIES}), retrying in #{delay}s: #{e.message}")
+ sleep(delay)
+ retry
+ end
+ raise
+ end
+ end
+
+ # Determines if an error should be retried.
+ # Retries timeouts and 5xx errors, but not 4xx client errors.
+ def retryable_error?(error)
+ return true unless error.respond_to?(:response) && error.response
+
+ status = error.response.status
+ return true if status.nil?
+
+ # Retry server errors (5xx), not client errors (4xx)
+ status >= 500 || status == 429 # Also retry rate limiting
+ end
+
  def client
  @client ||= PlatformAPI.connect_oauth(api_key)
  end
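
The `with_retry` helper added above is private to the adapter, so the standalone sketch below reproduces the same backoff behaviour in a form that can be exercised in isolation. It is not the gem's code, and it rescues `StandardError` where the Heroku adapter rescues only the Excon classes listed in `RETRYABLE_ERRORS`.

```ruby
# Standalone sketch of the retry pattern above (not the gem's code): retry a
# block up to 3 times, sleeping 1s then 2s between attempts.
MAX_RETRIES  = 3
RETRY_DELAYS = [1, 2, 4].freeze

def with_retry
  attempts = 0
  begin
    attempts += 1
    yield
  rescue StandardError => e # the adapter narrows this to specific error classes
    if attempts < MAX_RETRIES
      delay = RETRY_DELAYS[attempts - 1] || RETRY_DELAYS.last
      warn "attempt #{attempts}/#{MAX_RETRIES} failed (#{e.message}), retrying in #{delay}s"
      sleep(delay)
      retry
    end
    raise
  end
end

with_retry { raise 'flaky API' } rescue puts 'gave up after 3 attempts'
```

With `MAX_RETRIES = 3` only the 1s and 2s delays are ever slept; the 4s entry in `RETRY_DELAYS` is a fallback that only comes into play if the retry count is raised.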
@@ -30,9 +30,18 @@ module SolidQueueAutoscaler
  # Kubernetes API path for apps/v1 group
  APPS_API_VERSION = 'apis/apps/v1'
 
+ # Retry configuration for transient network errors
+ MAX_RETRIES = 3
+ RETRY_DELAYS = [1, 2, 4].freeze # Exponential backoff in seconds
+
+ # Default timeout for Kubernetes API calls (seconds)
+ DEFAULT_TIMEOUT = 30
+
  def current_workers
- deployment = apps_client.get_deployment(deployment_name, namespace)
- deployment.spec.replicas
+ with_retry do
+ deployment = apps_client.get_deployment(deployment_name, namespace)
+ deployment.spec.replicas
+ end
  rescue StandardError => e
  raise KubernetesAPIError.new("Failed to get deployment info: #{e.message}", original_error: e)
  end
@@ -43,8 +52,10 @@ module SolidQueueAutoscaler
  return quantity
  end
 
- patch_body = { spec: { replicas: quantity } }
- apps_client.patch_deployment(deployment_name, patch_body, namespace)
+ with_retry do
+ patch_body = { spec: { replicas: quantity } }
+ apps_client.patch_deployment(deployment_name, patch_body, namespace)
+ end
  quantity
  rescue StandardError => e
  raise KubernetesAPIError.new("Failed to scale deployment #{deployment_name} to #{quantity}: #{e.message}",
@@ -64,6 +75,25 @@ module SolidQueueAutoscaler
 
  private
 
+ # Executes a block with retry logic for transient network errors.
+ # Uses exponential backoff: 1s, 2s, 4s delays between retries.
+ def with_retry
+ attempts = 0
+ begin
+ attempts += 1
+ yield
+ rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::ECONNRESET,
+ Net::OpenTimeout, Net::ReadTimeout, SocketError => e
+ if attempts < MAX_RETRIES
+ delay = RETRY_DELAYS[attempts - 1] || RETRY_DELAYS.last
+ logger&.warn("[Autoscaler] Kubernetes API error (attempt #{attempts}/#{MAX_RETRIES}), retrying in #{delay}s: #{e.message}")
+ sleep(delay)
+ retry
+ end
+ raise
+ end
+ end
+
  def apps_client
  @apps_client ||= build_apps_client
  end
@@ -95,7 +125,11 @@ module SolidQueueAutoscaler
  api_endpoint,
  'v1',
  auth_options: auth_options,
- ssl_options: ssl_options
+ ssl_options: ssl_options,
+ timeouts: {
+ open: DEFAULT_TIMEOUT,
+ read: DEFAULT_TIMEOUT
+ }
  )
  end
 
@@ -112,7 +146,11 @@ module SolidQueueAutoscaler
  api_endpoint,
  'v1',
  ssl_options: context.ssl_options,
- auth_options: context.auth_options
+ auth_options: context.auth_options,
+ timeouts: {
+ open: DEFAULT_TIMEOUT,
+ read: DEFAULT_TIMEOUT
+ }
  )
  end
 
@@ -3,6 +3,26 @@
  require 'zlib'
 
  module SolidQueueAutoscaler
+ # PostgreSQL advisory lock wrapper for singleton enforcement.
+ #
+ # IMPORTANT: PgBouncer Compatibility Warning
+ # ==========================================
+ # PostgreSQL advisory locks are connection-scoped (session-level locks).
+ # If you're using PgBouncer in transaction pooling mode, advisory locks
+ # will NOT work correctly because:
+ # 1. Each query may run on a different backend connection
+ # 2. The lock acquired on one connection won't be visible on another
+ # 3. The lock may be "released" when returned to the pool
+ #
+ # Solutions:
+ # - Use PgBouncer in session pooling mode for the queue database
+ # - Use a direct connection (bypass PgBouncer) for the autoscaler
+ # - Disable advisory locks and use external coordination (Redis, etc.)
+ # - Set config.persist_cooldowns = false and rely on a single process
+ #
+ # If you're seeing multiple autoscalers running simultaneously or
+ # lock acquisition always failing, PgBouncer is likely the cause.
+ #
  class AdvisoryLock
  attr_reader :lock_key, :timeout
 
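
The warning above is about PostgreSQL session semantics rather than anything gem-specific, and a few lines of raw SQL show the failure mode. The sketch below is illustrative only: the integer `key` is hypothetical (the gem derives its own key from `lock_key`), and treating `select_value` as returning a Ruby boolean assumes the modern Rails PostgreSQL adapter.

```ruby
# Illustrative sketch of the failure mode described above (not the gem's code).
# pg_try_advisory_lock is session-level: the lock belongs to the database
# connection that acquired it and is released by pg_advisory_unlock on that
# same connection (or when the connection closes).
conn = ActiveRecord::Base.connection

key = 42 # hypothetical integer key; the gem derives its own from lock_key

# Returns true/false on the Rails PostgreSQL adapter.
if conn.select_value("SELECT pg_try_advisory_lock(#{key})")
  begin
    # ... singleton work: only one connection holds the lock ...
  ensure
    conn.select_value("SELECT pg_advisory_unlock(#{key})")
  end
end

# Behind PgBouncer in transaction pooling mode, the lock acquisition and the
# work that follows may run on different backend connections, so the lock no
# longer guards anything - exactly the situation the comment above warns about.
```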
@@ -175,6 +175,21 @@ module SolidQueueAutoscaler
  errors << 'table_prefix cannot be nil or empty'
  elsif !table_prefix.to_s.end_with?('_')
  errors << 'table_prefix must end with an underscore'
+ elsif !table_prefix.to_s.match?(/\A[a-z0-9_]+\z/)
+ errors << 'table_prefix must contain only lowercase letters, numbers, and underscores'
+ end
+
+ # Validate proportional scaling settings to prevent ZeroDivisionError
+ if scaling_strategy == :proportional
+ if scale_up_jobs_per_worker.nil? || scale_up_jobs_per_worker <= 0
+ errors << 'scale_up_jobs_per_worker must be > 0 for proportional scaling'
+ end
+ if scale_up_latency_per_worker.nil? || scale_up_latency_per_worker <= 0
+ errors << 'scale_up_latency_per_worker must be > 0 for proportional scaling'
+ end
+ if scale_down_jobs_per_worker.nil? || scale_down_jobs_per_worker <= 0
+ errors << 'scale_down_jobs_per_worker must be > 0 for proportional scaling'
+ end
  end
 
  unless VALID_SCALING_STRATEGIES.include?(scaling_strategy)
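
For reference, a configuration that passes the new validations might look like the sketch below. The `SolidQueueAutoscaler.configure` entry point is an assumption inferred from the commented `config.process_type` example in the Heroku adapter hunk; the option names and the > 0 requirement come from the validation code above, while the numeric values are made up.

```ruby
# Hypothetical configuration sketch. The configure block is assumed; the
# validated options below come from the hunk above, and the numbers are
# illustrative only.
SolidQueueAutoscaler.configure do |config|
  # Must end with '_' and match /\A[a-z0-9_]+\z/, otherwise validation fails.
  config.table_prefix = 'solid_queue_'

  # With :proportional scaling all three per-worker ratios must be > 0,
  # since they are used as divisors when computing the target worker count.
  config.scaling_strategy = :proportional
  config.scale_up_jobs_per_worker = 50
  config.scale_up_latency_per_worker = 30
  config.scale_down_jobs_per_worker = 10
end
```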
@@ -13,16 +13,24 @@ module SolidQueueAutoscaler
  @config = config || SolidQueueAutoscaler.config
  @key = key
  @table_exists = nil
+ @table_exists_checked_at = nil
+ end
+
+ # Resets the cached table_exists? result.
+ # Call this after running migrations to re-check table existence.
+ def reset_table_exists_cache!
+ @table_exists = nil
+ @table_exists_checked_at = nil
  end
 
  def last_scale_up_at
  return nil unless table_exists?
 
  result = connection.select_value(<<~SQL)
- SELECT last_scale_up_at FROM #{TABLE_NAME}
+ SELECT last_scale_up_at FROM #{quoted_table_name}
  WHERE key = #{connection.quote(key)}
  SQL
- result ? Time.parse(result.to_s) : nil
+ parse_time_result(result)
  rescue ArgumentError
  nil
  end
@@ -31,10 +39,10 @@ module SolidQueueAutoscaler
  return nil unless table_exists?
 
  result = connection.select_value(<<~SQL)
- SELECT last_scale_down_at FROM #{TABLE_NAME}
+ SELECT last_scale_down_at FROM #{quoted_table_name}
  WHERE key = #{connection.quote(key)}
  SQL
- result ? Time.parse(result.to_s) : nil
+ parse_time_result(result)
  rescue ArgumentError
  nil
  end
@@ -57,7 +65,7 @@ module SolidQueueAutoscaler
  return false unless table_exists?
 
  connection.execute(<<~SQL)
- DELETE FROM #{TABLE_NAME} WHERE key = #{connection.quote(key)}
+ DELETE FROM #{quoted_table_name} WHERE key = #{connection.quote(key)}
  SQL
  true
  end
@@ -92,12 +100,23 @@ module SolidQueueAutoscaler
  [remaining, 0].max
  end
 
+ # Cache TTL for table existence check (5 minutes)
+ TABLE_EXISTS_CACHE_TTL = 300
+
  def table_exists?
- return @table_exists unless @table_exists.nil?
+ # Return cached result if still valid
+ if !@table_exists.nil? && @table_exists_checked_at
+ cache_age = Time.now - @table_exists_checked_at
+ return @table_exists if cache_age < TABLE_EXISTS_CACHE_TTL
+ end
 
  @table_exists = connection.table_exists?(TABLE_NAME)
+ @table_exists_checked_at = Time.now
+ @table_exists
  rescue StandardError
  @table_exists = false
+ @table_exists_checked_at = Time.now
+ @table_exists
  end
 
  def state
@@ -105,7 +124,7 @@ module SolidQueueAutoscaler
 
  row = connection.select_one(<<~SQL)
  SELECT last_scale_up_at, last_scale_down_at, updated_at
- FROM #{TABLE_NAME}
+ FROM #{quoted_table_name}
  WHERE key = #{connection.quote(key)}
  SQL
 
@@ -124,6 +143,28 @@ module SolidQueueAutoscaler
  @config.connection
  end
 
+ def quoted_table_name
+ connection.quote_table_name(TABLE_NAME)
+ end
+
+ # Safely parses a time value from the database.
+ # Handles Time, DateTime, String, and nil values.
+ def parse_time_result(value)
+ return nil if value.nil?
+
+ case value
+ when Time, DateTime
+ value.to_time
+ when String
+ Time.parse(value)
+ else
+ # Try to convert to time if possible
+ value.respond_to?(:to_time) ? value.to_time : Time.parse(value.to_s)
+ end
+ rescue ArgumentError, TypeError
+ nil
+ end
+
  def upsert_state(last_scale_up_at: nil, last_scale_down_at: nil)
  now = Time.current
  quoted_key = connection.quote(key)
@@ -132,7 +173,7 @@ module SolidQueueAutoscaler
  if last_scale_up_at
  quoted_time = connection.quote(last_scale_up_at)
  connection.execute(<<~SQL)
- INSERT INTO #{TABLE_NAME} (key, last_scale_up_at, created_at, updated_at)
+ INSERT INTO #{quoted_table_name} (key, last_scale_up_at, created_at, updated_at)
  VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
  ON CONFLICT (key) DO UPDATE SET
  last_scale_up_at = EXCLUDED.last_scale_up_at,
@@ -141,7 +182,7 @@ module SolidQueueAutoscaler
  elsif last_scale_down_at
  quoted_time = connection.quote(last_scale_down_at)
  connection.execute(<<~SQL)
- INSERT INTO #{TABLE_NAME} (key, last_scale_down_at, created_at, updated_at)
+ INSERT INTO #{quoted_table_name} (key, last_scale_down_at, created_at, updated_at)
  VALUES (#{quoted_key}, #{quoted_time}, #{quoted_now}, #{quoted_now})
  ON CONFLICT (key) DO UPDATE SET
  last_scale_down_at = EXCLUDED.last_scale_down_at,
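
One consequence of the new TTL is that a table created after the first check can go unnoticed for up to five minutes, which is what `reset_table_exists_cache!` is for. A usage sketch, with the constructor arguments taken from the dashboard code later in this diff:

```ruby
# Usage sketch for the new cache-reset hook (illustrative, not from the gem).
tracker = SolidQueueAutoscaler::CooldownTracker.new(
  config: SolidQueueAutoscaler.config,
  key: 'default'
)

tracker.last_scale_up_at # caches the table_exists? answer for up to 300s

# After running the migration that creates the cooldown table, force a
# re-check instead of waiting out the 5-minute TTL:
tracker.reset_table_exists_cache!
tracker.last_scale_up_at # the next call re-checks table existence
```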
@@ -11,12 +11,15 @@ module SolidQueueAutoscaler
  workers = SolidQueueAutoscaler.registered_workers
  workers = [:default] if workers.empty?
 
- workers.each_with_object({}) do |name, status|
- status[name] = worker_status(name)
+ # Batch collect metrics once per worker to reduce DB queries
+ workers.each_with_object({}) do |name, result|
+ result[name] = worker_status(name)
  end
  end
 
  # Returns status for a specific worker
+ # Note: Each call makes several DB queries. For multiple workers,
+ # consider caching or using status() which can batch some queries.
  # @param name [Symbol] Worker name
  # @return [Hash] Status information
  def worker_status(name)
@@ -24,6 +27,9 @@ module SolidQueueAutoscaler
  metrics = safe_metrics(name)
  tracker = CooldownTracker.new(config: config, key: name.to_s)
 
+ # Batch cooldown state retrieval into one DB call
+ cooldown_state = tracker.state
+
  {
  name: name,
  enabled: config.enabled?,
@@ -45,8 +51,8 @@ module SolidQueueAutoscaler
  cooldowns: {
  scale_up_remaining: tracker.scale_up_cooldown_remaining.round,
  scale_down_remaining: tracker.scale_down_cooldown_remaining.round,
- last_scale_up: tracker.last_scale_up_at,
- last_scale_down: tracker.last_scale_down_at
+ last_scale_up: cooldown_state[:last_scale_up_at],
+ last_scale_down: cooldown_state[:last_scale_down_at]
  },
  thresholds: {
  scale_up_queue_depth: config.scale_up_queue_depth,
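
The batching above replaces two per-column lookups with a single `state` call. Based on the SELECT in the CooldownTracker hunk earlier, `state` returns one row as a symbol-keyed Hash, so the dashboard reads both timestamps from a single query. A sketch of the before/after access pattern (illustrative, not gem documentation):

```ruby
# Sketch of the before/after query pattern per worker (illustrative only).
tracker = SolidQueueAutoscaler::CooldownTracker.new(
  config: SolidQueueAutoscaler.config,
  key: 'default'
)

# 1.0.10: two separate SELECTs per worker
last_up   = tracker.last_scale_up_at
last_down = tracker.last_scale_down_at

# 1.0.11: one SELECT returning both columns (plus updated_at)
cooldown_state = tracker.state
cooldown_state[:last_scale_up_at]
cooldown_state[:last_scale_down_at]
```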
@@ -72,7 +72,9 @@ module SolidQueueAutoscaler
  #{queue_filter_clause}
  SQL
  result = connection.select_value(sql)
- result.to_f
+ # Return 0 if no jobs exist (result is nil) instead of nil.to_f which returns 0.0
+ # This makes the return value more predictable and avoids nil-related issues
+ result.nil? ? 0.0 : result.to_f
  end
  def jobs_per_minute
 
@@ -141,32 +143,33 @@ module SolidQueueAutoscaler
  end
 
  # Table name helpers using configurable prefix
+ # Uses quote_table_name for SQL safety
  def table_prefix
  @config.table_prefix
  end
 
  def ready_executions_table
- "#{table_prefix}ready_executions"
+ connection.quote_table_name("#{table_prefix}ready_executions")
  end
 
  def jobs_table
- "#{table_prefix}jobs"
+ connection.quote_table_name("#{table_prefix}jobs")
  end
 
  def claimed_executions_table
- "#{table_prefix}claimed_executions"
+ connection.quote_table_name("#{table_prefix}claimed_executions")
  end
 
  def failed_executions_table
- "#{table_prefix}failed_executions"
+ connection.quote_table_name("#{table_prefix}failed_executions")
  end
 
  def blocked_executions_table
- "#{table_prefix}blocked_executions"
+ connection.quote_table_name("#{table_prefix}blocked_executions")
  end
 
  def processes_table
- "#{table_prefix}processes"
+ connection.quote_table_name("#{table_prefix}processes")
  end
  end
  end
  end
@@ -167,16 +167,18 @@ module SolidQueueAutoscaler
  end
 
  # Creates a new scale event record.
+ # Returns nil if the table doesn't exist or on error (does not raise).
  # @param attrs [Hash] Event attributes
  # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] Database connection
- # @return [ScaleEvent] The created event
- def create!(attrs, connection: nil)
+ # @return [ScaleEvent, nil] The created event, or nil on failure
+ def create(attrs, connection: nil)
  conn = connection || default_connection
  return nil unless table_exists?(conn)
 
  now = Time.current
+ quoted_table = conn.quote_table_name(TABLE_NAME)
  sql = <<~SQL
- INSERT INTO #{TABLE_NAME}
+ INSERT INTO #{quoted_table}
  (worker_name, action, from_workers, to_workers, reason,
  queue_depth, latency_seconds, metrics_json, dry_run, created_at)
  VALUES
@@ -199,10 +201,18 @@ module SolidQueueAutoscaler
  new(attrs.merge(id: id, created_at: now))
  rescue StandardError => e
  # Log but don't fail if event recording fails
- Rails.logger.warn("[Autoscaler] Failed to record event: #{e.message}") if defined?(Rails)
+ if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
+ Rails.logger.warn("[Autoscaler] Failed to record event: #{e.message}")
+ end
  nil
  end
 
+ # Alias for backward compatibility
+ # @deprecated Use {#create} instead
+ def create!(attrs, connection: nil)
+ create(attrs, connection: connection)
+ end
+
  # Finds recent events.
  # @param limit [Integer] Maximum number of events to return
  # @param worker_name [String, nil] Filter by worker name
@@ -213,11 +223,12 @@ module SolidQueueAutoscaler
  return [] unless table_exists?(conn)
 
  filter = worker_name ? "WHERE worker_name = #{conn.quote(worker_name)}" : ''
+ quoted_table = conn.quote_table_name(TABLE_NAME)
 
  sql = <<~SQL
  SELECT id, worker_name, action, from_workers, to_workers, reason,
  queue_depth, latency_seconds, metrics_json, dry_run, created_at
- FROM #{TABLE_NAME}
+ FROM #{quoted_table}
  #{filter}
  ORDER BY created_at DESC
  LIMIT #{limit.to_i}
@@ -237,10 +248,11 @@ module SolidQueueAutoscaler
  conn = connection || default_connection
  return [] unless table_exists?(conn)
 
+ quoted_table = conn.quote_table_name(TABLE_NAME)
  sql = <<~SQL
  SELECT id, worker_name, action, from_workers, to_workers, reason,
  queue_depth, latency_seconds, metrics_json, dry_run, created_at
- FROM #{TABLE_NAME}
+ FROM #{quoted_table}
  WHERE action = #{conn.quote(action)}
  ORDER BY created_at DESC
  LIMIT #{limit.to_i}
@@ -261,6 +273,7 @@ module SolidQueueAutoscaler
  return default_stats unless table_exists?(conn)
 
  worker_filter = worker_name ? "AND worker_name = #{conn.quote(worker_name)}" : ''
+ quoted_table = conn.quote_table_name(TABLE_NAME)
 
  sql = <<~SQL
  SELECT
@@ -268,7 +281,7 @@ module SolidQueueAutoscaler
  COUNT(*) as count,
  AVG(queue_depth) as avg_queue_depth,
  AVG(latency_seconds) as avg_latency
- FROM #{TABLE_NAME}
+ FROM #{quoted_table}
  WHERE created_at >= #{conn.quote(since)}
  #{worker_filter}
  GROUP BY action
@@ -289,9 +302,10 @@ module SolidQueueAutoscaler
  return 0 unless table_exists?(conn)
 
  cutoff = Time.current - keep_days.days
+ quoted_table = conn.quote_table_name(TABLE_NAME)
 
  sql = <<~SQL
- DELETE FROM #{TABLE_NAME}
+ DELETE FROM #{quoted_table}
  WHERE created_at < #{conn.quote(cutoff)}
  SQL
 
@@ -320,8 +334,9 @@ module SolidQueueAutoscaler
  return 0 unless table_exists?(conn)
 
  time_filter = since ? "WHERE created_at >= #{conn.quote(since)}" : ''
+ quoted_table = conn.quote_table_name(TABLE_NAME)
 
- sql = "SELECT COUNT(*) FROM #{TABLE_NAME} #{time_filter}"
+ sql = "SELECT COUNT(*) FROM #{quoted_table} #{time_filter}"
  conn.select_value(sql).to_i
  rescue StandardError
  0
@@ -367,7 +382,8 @@ module SolidQueueAutoscaler
  result[:recent_events] = count(since: 24.hours.ago, connection: conn)
 
  # Get last event time
- sql = "SELECT MAX(created_at) FROM #{TABLE_NAME}"
+ quoted_table = conn.quote_table_name(TABLE_NAME)
+ sql = "SELECT MAX(created_at) FROM #{quoted_table}"
  last_at = conn.select_value(sql)
  result[:last_event_at] = last_at ? parse_time(last_at) : nil
  rescue StandardError => e
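
The rename above keeps the calling convention unchanged for existing users: both `create` and the deprecated `create!` swallow errors and return nil. A sketch of how a caller sees this, with attribute keys taken from the INSERT column list above and hypothetical values:

```ruby
# Illustrative sketch of the create/create! calling convention (not gem docs).
# attrs is a positional Hash, so the braces matter.
event = SolidQueueAutoscaler::ScaleEvent.create(
  {
    worker_name: 'default',
    action: 'scale_up',
    from_workers: 2,
    to_workers: 4,
    reason: 'queue depth above threshold'
  }
)
# => a ScaleEvent, or nil if the table is missing or the INSERT failed

# Despite the bang, create! no longer raises on failure; it simply forwards
# to create and exists so pre-1.0.11 callers keep working.
SolidQueueAutoscaler::ScaleEvent.create!({ worker_name: 'default', action: 'error', reason: 'boom' })
```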
@@ -25,9 +25,13 @@ module SolidQueueAutoscaler
  end
 
  # Per-configuration cooldown tracking for multi-worker support
+ # Thread-safe mutex for cooldown tracking - defined as constant to avoid
+ # race condition where lazy initialization could create multiple mutexes
+ COOLDOWN_MUTEX = Mutex.new
+
  class << self
  def cooldown_mutex
- @cooldown_mutex ||= Mutex.new
+ COOLDOWN_MUTEX
  end
 
  def cooldowns
@@ -158,12 +162,17 @@ module SolidQueueAutoscaler
  target = decision.to.clamp(@config.min_workers, @config.max_workers)
 
  if target != decision.to
- logger.warn(
+ logger&.warn(
  "[Autoscaler] Clamping target from #{decision.to} to #{target} " \
  "(limits: #{@config.min_workers}-#{@config.max_workers})"
  )
- # Ensure decision reflects the clamped target for logging and events
- decision.to = target
+ # Create a new decision with the clamped target instead of mutating
+ decision = DecisionEngine::Decision.new(
+ action: decision.action,
+ from: decision.from,
+ to: target,
+ reason: decision.reason
+ )
  end
 
  @adapter.scale(target)
@@ -250,7 +259,7 @@ module SolidQueueAutoscaler
 
  def log_decision(decision, metrics)
  worker_label = @config.name == :default ? '' : "[#{@config.name}] "
- logger.info(
+ logger&.info(
  "[Autoscaler] #{worker_label}Evaluated: action=#{decision.action} " \
  "workers=#{decision.from}->#{decision.to} " \
  "queue_depth=#{metrics.queue_depth} " \
@@ -262,7 +271,7 @@ module SolidQueueAutoscaler
  def log_scale_action(decision)
  prefix = @config.dry_run? ? '[DRY RUN] ' : ''
  worker_label = @config.name == :default ? '' : "[#{@config.name}] "
- logger.info(
+ logger&.info(
  "#{prefix}[Autoscaler] #{worker_label}Scaling #{decision.action}: " \
  "#{decision.from} -> #{decision.to} workers (#{decision.reason})"
  )
@@ -281,7 +290,7 @@ module SolidQueueAutoscaler
  end
 
  def skipped_result(reason, decision: nil, metrics: nil)
- logger.debug("[Autoscaler] Skipped: #{reason}")
+ logger&.debug("[Autoscaler] Skipped: #{reason}")
 
  # Record skipped events
  record_skipped_event(reason, decision, metrics)
@@ -296,7 +305,7 @@ module SolidQueueAutoscaler
  end
 
  def error_result(error)
- logger.error("[Autoscaler] Error: #{error.class}: #{error.message}")
+ logger&.error("[Autoscaler] Error: #{error.class}: #{error.message}")
 
  # Record error events
  record_error_event(error)
@@ -315,7 +324,7 @@ module SolidQueueAutoscaler
  def record_scale_event(decision, metrics)
  return unless @config.record_events?
 
- ScaleEvent.create!(
+ ScaleEvent.create(
  {
  worker_name: @config.name.to_s,
  action: decision.action.to_s,
@@ -334,7 +343,7 @@ module SolidQueueAutoscaler
  def record_skipped_event(reason, decision, metrics)
  return unless @config.record_events?
 
- ScaleEvent.create!(
+ ScaleEvent.create(
  {
  worker_name: @config.name.to_s,
  action: 'skipped',
@@ -353,7 +362,7 @@ module SolidQueueAutoscaler
  def record_error_event(error)
  return unless @config.record_events?
 
- ScaleEvent.create!(
+ ScaleEvent.create(
  {
  worker_name: @config.name.to_s,
  action: 'error',
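
The clamping change above matters because a Struct member assignment is visible through every reference to that object, so mutating `decision.to` would also rewrite the decision already handed to logging and event recording. A minimal illustration with a stand-in Struct (the member names mirror the ones used above; this is not the gem's `DecisionEngine::Decision`):

```ruby
# Minimal sketch of the aliasing problem fixed above (stand-in Struct,
# not the gem's DecisionEngine::Decision).
Decision = Struct.new(:action, :from, :to, :reason, keyword_init: true)

decision = Decision.new(action: :scale_up, from: 2, to: 50, reason: 'backlog')
recorded = decision # e.g. a reference kept for event recording / logging

# 1.0.10 behaviour: mutate in place - every reference sees the clamped value,
# so the recorded event silently loses the originally requested target.
decision.to = 10
recorded.to # => 10

# 1.0.11 behaviour: build a new Decision with the clamped target and leave
# the original object untouched.
original = Decision.new(action: :scale_up, from: 2, to: 50, reason: 'backlog')
clamped  = Decision.new(action: original.action, from: original.from, to: 10, reason: original.reason)
original.to # => 50
clamped.to  # => 10
```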
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module SolidQueueAutoscaler
- VERSION = '1.0.10'
+ VERSION = '1.0.11'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: solid_queue_autoscaler
  version: !ruby/object:Gem::Version
- version: 1.0.10
+ version: 1.0.11
  platform: ruby
  authors:
  - reillyse