vmpooler 3.8.1 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/vmpooler/adaptive_timeout.rb +130 -0
- data/lib/vmpooler/circuit_breaker.rb +189 -0
- data/lib/vmpooler/generic_connection_pool.rb +28 -0
- data/lib/vmpooler/metrics/promstats.rb +36 -0
- data/lib/vmpooler/pool_manager.rb +78 -0
- data/lib/vmpooler/providers/base.rb +89 -0
- data/lib/vmpooler/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 82a54b52698f8e24f5677370ab21664e97ec9412812a47447ac8d4619c5a23c4
|
|
4
|
+
data.tar.gz: 98a148c9febf6d9dae630b382b5ffc0a6da5bff7d1bdca113c9da79d9ab022da
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4dba5aa2774558fc34d405f06b3307471671f4074da2e058b5c670eb35099472a1779436fb84c83f270b652413cdbcb97121782f3184e232b7871034ad6e0cfb
|
|
7
|
+
data.tar.gz: 47ba9d7f37f75ebfe5f48f5d99afb4beda77d0d11351173f77fb7f88026710ea72e76d1b1cb5512e2b86851b955e443b6471c0e68b7ca58630b767b47e4edfc0
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Vmpooler
  # Adaptive timeout that adjusts based on observed connection performance
  # to optimize between responsiveness and reliability.
  #
  # Tracks recent connection durations and adjusts timeout to p95 + buffer,
  # reducing timeout on failures to fail faster during outages.
  class AdaptiveTimeout
    attr_reader :current_timeout

    # Initialize adaptive timeout
    #
    # @param name [String] Name for logging (e.g., "vsphere_connections")
    # @param logger [Object] Logger instance
    # @param metrics [Object] Metrics instance
    # @param min [Integer] Minimum timeout in seconds
    # @param max [Integer] Maximum timeout in seconds
    # @param initial [Integer] Initial timeout in seconds
    # @param max_samples [Integer] Number of recent samples to track
    def initialize(name:, logger:, metrics:, min: 5, max: 60, initial: 30, max_samples: 100)
      @name = name
      @logger = logger
      @metrics = metrics
      @min_timeout = min
      @max_timeout = max
      # Remember the configured starting point so #reset can restore it.
      @initial_timeout = initial
      @current_timeout = initial
      @recent_durations = []
      @max_samples = max_samples
      @mutex = Mutex.new
    end

    # Get current timeout value (thread-safe)
    # @return [Integer] Current timeout in seconds
    def timeout
      @mutex.synchronize { @current_timeout }
    end

    # Record a successful operation duration
    # @param duration [Float] Duration in seconds
    def record_success(duration)
      @mutex.synchronize do
        @recent_durations << duration
        @recent_durations.shift if @recent_durations.size > @max_samples

        # Only adjust once we have enough samples for the p95 to be meaningful
        adjust_timeout if @recent_durations.size >= 10
      end
    end

    # Record a failure (timeout or error)
    # Reduces current timeout to fail faster on subsequent attempts
    def record_failure
      @mutex.synchronize do
        old_timeout = @current_timeout
        # Reduce timeout by 20% on failure, but don't go below minimum
        @current_timeout = [(@current_timeout * 0.8).round, @min_timeout].max

        if old_timeout != @current_timeout
          @logger.log('d', "[*] [adaptive_timeout] '#{@name}' reduced timeout #{old_timeout}s → #{@current_timeout}s after failure")
          @metrics.gauge("adaptive_timeout.current.#{@name}", @current_timeout)
        end
      end
    end

    # Reset to initial timeout (useful after recovery)
    def reset
      @mutex.synchronize do
        @recent_durations.clear
        old_timeout = @current_timeout
        # Bug fix: restore the configured initial timeout (clamped to the
        # min/max bounds) instead of a hard-coded 30s, which silently
        # ignored a custom `initial:` value passed to the constructor.
        @current_timeout = @initial_timeout.clamp(@min_timeout, @max_timeout)

        @logger.log('d', "[*] [adaptive_timeout] '#{@name}' reset timeout #{old_timeout}s → #{@current_timeout}s")
        @metrics.gauge("adaptive_timeout.current.#{@name}", @current_timeout)
      end
    end

    # Get statistics about recent durations
    # @return [Hash] Statistics including min, max, avg, p50, p95, p99 and
    #   the current timeout; `{ samples: 0 }` when no samples are recorded
    def stats
      @mutex.synchronize do
        return { samples: 0 } if @recent_durations.empty?

        sorted = @recent_durations.sort
        {
          samples: sorted.size,
          min: sorted.first.round(2),
          max: sorted.last.round(2),
          avg: (sorted.sum / sorted.size.to_f).round(2),
          p50: percentile(sorted, 0.50).round(2),
          p95: percentile(sorted, 0.95).round(2),
          p99: percentile(sorted, 0.99).round(2),
          current_timeout: @current_timeout
        }
      end
    end

    private

    # Recompute @current_timeout from the p95 of recent samples.
    # Caller must already hold @mutex.
    def adjust_timeout
      return if @recent_durations.empty?

      sorted = @recent_durations.sort
      p95_duration = percentile(sorted, 0.95)

      # Set timeout to p95 + 50% buffer, bounded by min/max
      new_timeout = (p95_duration * 1.5).round.clamp(@min_timeout, @max_timeout)

      # Only adjust if change is significant (> 5 seconds) to avoid churn
      return unless (new_timeout - @current_timeout).abs > 5

      old_timeout = @current_timeout
      @current_timeout = new_timeout

      @logger.log('d', "[*] [adaptive_timeout] '#{@name}' adjusted timeout #{old_timeout}s → #{@current_timeout}s (p95: #{p95_duration.round(2)}s)")
      @metrics.gauge("adaptive_timeout.current.#{@name}", @current_timeout)
      @metrics.gauge("adaptive_timeout.p95.#{@name}", p95_duration)
    end

    # Nearest-rank percentile of a pre-sorted array.
    # @param sorted_array [Array<Numeric>] samples in ascending order
    # @param percentile [Float] fraction between 0 and 1
    def percentile(sorted_array, percentile)
      return 0 if sorted_array.empty?

      index = (sorted_array.size * percentile).ceil - 1
      sorted_array[index.clamp(0, sorted_array.size - 1)]
    end
  end
end
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Vmpooler
  # Circuit breaker pattern implementation to prevent cascading failures
  # when a provider becomes unresponsive or experiences repeated failures.
  #
  # States:
  # - CLOSED: Normal operation, requests flow through
  # - OPEN: Provider is failing, reject requests immediately (fail fast)
  # - HALF_OPEN: Testing if provider has recovered with limited requests
  class CircuitBreaker
    STATES = %i[closed open half_open].freeze

    class CircuitOpenError < StandardError; end

    attr_reader :state, :failure_count, :success_count

    # Build a circuit breaker.
    #
    # @param name [String] Name for logging/metrics (e.g., "vsphere_provider")
    # @param logger [Object] Logger instance
    # @param metrics [Object] Metrics instance
    # @param failure_threshold [Integer] Number of failures before opening circuit
    # @param timeout [Integer] Seconds to wait in open state before testing (half-open)
    # @param half_open_attempts [Integer] Number of successful test requests needed to close
    def initialize(name:, logger:, metrics:, failure_threshold: 5, timeout: 30, half_open_attempts: 3)
      @name = name
      @logger = logger
      @metrics = metrics
      @failure_threshold = failure_threshold
      @timeout = timeout
      @half_open_attempts = half_open_attempts

      @state = :closed
      @failure_count = 0
      @success_count = 0
      @last_failure_time = nil
      @mutex = Mutex.new
    end

    # Run a block under circuit breaker protection.
    #
    # NOTE: check_state is deliberately outside the rescue so a
    # CircuitOpenError raised there is never counted as a block failure.
    #
    # @yield Block to execute if circuit allows
    # @return Result of the block
    # @raise CircuitOpenError if circuit is open and timeout hasn't elapsed
    def call
      check_state

      begin
        outcome = yield
      rescue StandardError => e
        on_failure(e)
        raise
      end

      on_success
      outcome
    end

    # Whether the circuit currently admits requests.
    # @return [Boolean] true if closed, half-open, or open-but-expired
    def allow_request?
      @mutex.synchronize do
        next true if @state == :closed || @state == :half_open

        should_attempt_reset?
      end
    end

    # Snapshot of the circuit breaker's internal state.
    # @return [Hash] Status information
    def status
      @mutex.synchronize do
        {
          name: @name,
          state: @state,
          failure_count: @failure_count,
          success_count: @success_count,
          last_failure_time: @last_failure_time,
          next_retry_time: next_retry_time
        }
      end
    end

    private

    # Raise when the circuit is open and the cool-down has not elapsed;
    # otherwise flip an expired open circuit to half-open for testing.
    def check_state
      @mutex.synchronize do
        next unless @state == :open

        if should_attempt_reset?
          transition_to_half_open
        else
          time_remaining = (@timeout - (Time.now - @last_failure_time)).round(1)
          raise CircuitOpenError, "Circuit breaker '#{@name}' is open (#{@failure_count} failures, retry in #{time_remaining}s)"
        end
      end
    end

    # True once the configured cool-down has elapsed since the last failure.
    def should_attempt_reset?
      return false unless @last_failure_time

      Time.now - @last_failure_time >= @timeout
    end

    # Earliest time a retry will be admitted, or nil when not open.
    def next_retry_time
      (@last_failure_time + @timeout) if @last_failure_time && @state == :open
    end

    # Bookkeeping after a successful protected call.
    def on_success
      @mutex.synchronize do
        if @state == :half_open
          @success_count += 1
          @failure_count = 0
          @logger.log('d', "[+] [circuit_breaker] '#{@name}' successful test request (#{@success_count}/#{@half_open_attempts})")

          transition_to_closed if @success_count >= @half_open_attempts
        elsif @state == :open
          # Should not happen, but reset if we somehow get a success
          transition_to_closed
        elsif @failure_count > 0
          # Closed state: a success wipes any accumulated failures
          @failure_count = 0
        end
      end
    end

    # Bookkeeping after a failed protected call.
    def on_failure(error)
      @mutex.synchronize do
        @failure_count += 1
        @last_failure_time = Time.now

        if @state == :closed
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' failure #{@failure_count}/#{@failure_threshold}: #{error.class}")
          transition_to_open if @failure_count >= @failure_threshold
        elsif @state == :half_open
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' failed during half-open test")
          transition_to_open
        else
          # Already open, just log
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' additional failure while open")
        end
      end
    end

    def transition_to_open
      @state = :open
      @success_count = 0
      @logger.log('s', "[!] [circuit_breaker] '#{@name}' OPENED after #{@failure_count} failures (will retry in #{@timeout}s)")
      @metrics.increment("circuit_breaker.opened.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 1) # 1 = open
    end

    def transition_to_half_open
      @state = :half_open
      @success_count = 0
      @failure_count = 0
      @logger.log('s', "[*] [circuit_breaker] '#{@name}' HALF-OPEN, testing provider health")
      @metrics.increment("circuit_breaker.half_open.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 0.5) # 0.5 = half-open
    end

    def transition_to_closed
      @state = :closed
      @failure_count = 0
      @success_count = 0
      @logger.log('s', "[+] [circuit_breaker] '#{@name}' CLOSED, provider recovered")
      @metrics.increment("circuit_breaker.closed.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 0) # 0 = closed
    end
  end
end
|
|
@@ -34,6 +34,34 @@ module Vmpooler
|
|
|
34
34
|
end
|
|
35
35
|
end
|
|
36
36
|
end
|
|
37
|
+
|
|
38
|
+
# Get connection pool health status
# @return [Hash] Health status including utilization and queue depth
def health_status
  in_use_count = @size - @available.length
  # Not every pool implementation exposes a waiters queue; treat absence as 0.
  queued = @queue.respond_to?(:length) ? @queue.length : 0

  {
    size: @size,
    available: @available.length,
    in_use: in_use_count,
    utilization: (in_use_count.to_f / @size * 100).round(2),
    waiting_threads: queued,
    state: determine_health_state
  }
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
# Classify pool pressure from utilization percentage and waiter count.
# @return [Symbol] :critical, :warning, or :healthy
def determine_health_state
  busy_pct = (@size - @available.length).to_f / @size * 100
  queued = @queue.respond_to?(:length) ? @queue.length : 0

  # Pool exhausted or many waiting threads
  return :critical if busy_pct >= 90 || queued > 5
  # Pool under stress
  return :warning if busy_pct >= 70 || queued > 2

  # Normal operation
  :healthy
end
|
|
37
65
|
end
|
|
38
66
|
end
|
|
39
67
|
end
|
|
@@ -353,6 +353,42 @@ module Vmpooler
|
|
|
353
353
|
torun: %i[manager],
|
|
354
354
|
docstring: 'vmpooler clone metrics',
|
|
355
355
|
param_labels: %i[poolname]
|
|
356
|
+
},
|
|
357
|
+
circuit_breaker: {
|
|
358
|
+
mtype: M_GAUGE,
|
|
359
|
+
torun: %i[manager],
|
|
360
|
+
docstring: 'Circuit breaker state and failure tracking',
|
|
361
|
+
param_labels: %i[metric_path]
|
|
362
|
+
},
|
|
363
|
+
connection_pool: {
|
|
364
|
+
mtype: M_GAUGE,
|
|
365
|
+
torun: %i[manager],
|
|
366
|
+
docstring: 'Connection pool health metrics',
|
|
367
|
+
param_labels: %i[metric_path]
|
|
368
|
+
},
|
|
369
|
+
adaptive_timeout: {
|
|
370
|
+
mtype: M_GAUGE,
|
|
371
|
+
torun: %i[manager],
|
|
372
|
+
docstring: 'Adaptive timeout statistics',
|
|
373
|
+
param_labels: %i[metric_path]
|
|
374
|
+
},
|
|
375
|
+
vmpooler_performance: {
|
|
376
|
+
mtype: M_GAUGE,
|
|
377
|
+
torun: %i[manager],
|
|
378
|
+
docstring: 'vmpooler performance metrics for pool operations',
|
|
379
|
+
param_labels: %i[metric_path]
|
|
380
|
+
},
|
|
381
|
+
vmpooler_dlq: {
|
|
382
|
+
mtype: M_COUNTER,
|
|
383
|
+
torun: %i[manager],
|
|
384
|
+
docstring: 'vmpooler dead letter queue metrics',
|
|
385
|
+
param_labels: %i[metric_path]
|
|
386
|
+
},
|
|
387
|
+
vmpooler_errors: {
|
|
388
|
+
mtype: M_COUNTER,
|
|
389
|
+
torun: %i[manager],
|
|
390
|
+
docstring: 'vmpooler error counters including permanent failures',
|
|
391
|
+
param_labels: %i[metric_path]
|
|
356
392
|
}
|
|
357
393
|
}
|
|
358
394
|
end
|
|
@@ -1287,6 +1287,63 @@ module Vmpooler
|
|
|
1287
1287
|
$metrics.gauge('vmpooler_health.status', status_value)
|
|
1288
1288
|
end
|
|
1289
1289
|
|
|
1290
|
+
# Monitor connection pool health across all providers
#
# For each provider that exposes a connection pool with #health_status,
# publish utilization gauges, log warning/critical states, and (when
# available) publish circuit-breaker and adaptive-timeout gauges.
# Per-provider errors are logged and never abort the sweep.
def monitor_connection_pools
  return unless $providers

  $providers.each do |provider_name, provider|
    next unless provider.respond_to?(:connection_pool)

    pool = provider.connection_pool
    next unless pool.respond_to?(:health_status)

    health = pool.health_status

    # Push metrics using metric_path pattern
    $metrics.gauge("connection_pool.#{provider_name}.available", health[:available])
    $metrics.gauge("connection_pool.#{provider_name}.in_use", health[:in_use])
    $metrics.gauge("connection_pool.#{provider_name}.utilization", health[:utilization])
    $metrics.gauge("connection_pool.#{provider_name}.waiting", health[:waiting_threads])

    # Log warnings for unhealthy states
    case health[:state]
    when :critical
      $logger.log('s', "[!] [connection_pool] '#{provider_name}' CRITICAL: #{health[:utilization]}% used, #{health[:waiting_threads]} waiting")
      $metrics.increment("connection_pool.#{provider_name}.critical")
    when :warning
      $logger.log('d', "[*] [connection_pool] '#{provider_name}' WARNING: #{health[:utilization]}% used, #{health[:waiting_threads]} waiting")
      $metrics.increment("connection_pool.#{provider_name}.warning")
    end

    # Check circuit breaker status
    if provider.respond_to?(:circuit_breaker) && provider.circuit_breaker
      cb_status = provider.circuit_breaker.status
      # Any unrecognized state is treated as open (1) to fail visibly.
      state_value = { closed: 0, half_open: 0.5, open: 1 }[cb_status[:state]] || 1
      $metrics.gauge("circuit_breaker.state.#{provider_name}", state_value)
      $metrics.gauge("circuit_breaker.failures.#{provider_name}", cb_status[:failure_count])
    end

    # Log adaptive timeout stats
    if provider.respond_to?(:adaptive_timeout) && provider.adaptive_timeout
      timeout_stats = provider.adaptive_timeout.stats
      if timeout_stats[:samples] > 0
        $metrics.gauge("adaptive_timeout.current.#{provider_name}", timeout_stats[:current_timeout])
        $metrics.gauge("adaptive_timeout.p95.#{provider_name}", timeout_stats[:p95])
      end
    end
  rescue StandardError => e
    $logger.log('d', "[!] [connection_pool_monitor] Failed to monitor '#{provider_name}': #{e}")
  end
end
|
|
1339
|
+
|
|
1340
|
+
# Whether the connection pool monitor thread should run.
# Reads config.connection_pool_monitor_enabled; absent means enabled.
def connection_pool_monitor_enabled?
  configured = ($config[:config] || {})['connection_pool_monitor_enabled']
  configured.nil? ? true : configured
end
|
|
1346
|
+
|
|
1290
1347
|
def create_vm_disk(pool_name, vm, disk_size, provider)
|
|
1291
1348
|
Thread.new do
|
|
1292
1349
|
begin
|
|
@@ -2525,6 +2582,27 @@ module Vmpooler
|
|
|
2525
2582
|
end
|
|
2526
2583
|
end
|
|
2527
2584
|
|
|
2585
|
+
# Connection pool monitoring thread
|
|
2586
|
+
if connection_pool_monitor_enabled?
|
|
2587
|
+
monitor_interval = ($config[:config] && $config[:config]['connection_pool_monitor_interval']) || 10 # default 10 seconds
|
|
2588
|
+
if !$threads['connection_pool_monitor']
|
|
2589
|
+
$threads['connection_pool_monitor'] = Thread.new do
|
|
2590
|
+
loop do
|
|
2591
|
+
monitor_connection_pools
|
|
2592
|
+
sleep(monitor_interval)
|
|
2593
|
+
end
|
|
2594
|
+
end
|
|
2595
|
+
elsif !$threads['connection_pool_monitor'].alive?
|
|
2596
|
+
$logger.log('d', '[!] [connection_pool_monitor] worker thread died, restarting')
|
|
2597
|
+
$threads['connection_pool_monitor'] = Thread.new do
|
|
2598
|
+
loop do
|
|
2599
|
+
monitor_connection_pools
|
|
2600
|
+
sleep(monitor_interval)
|
|
2601
|
+
end
|
|
2602
|
+
end
|
|
2603
|
+
end
|
|
2604
|
+
end
|
|
2605
|
+
|
|
2528
2606
|
sleep(loop_delay)
|
|
2529
2607
|
|
|
2530
2608
|
unless maxloop == 0
|
|
@@ -13,6 +13,10 @@ module Vmpooler
|
|
|
13
13
|
attr_reader :metrics
|
|
14
14
|
# Provider options passed in during initialization
|
|
15
15
|
attr_reader :provider_options
|
|
16
|
+
# Circuit breaker for provider resilience
|
|
17
|
+
attr_reader :circuit_breaker
|
|
18
|
+
# Adaptive timeout for connections
|
|
19
|
+
attr_reader :adaptive_timeout
|
|
16
20
|
|
|
17
21
|
def initialize(config, logger, metrics, redis_connection_pool, name, options)
|
|
18
22
|
@config = config
|
|
@@ -30,6 +34,11 @@ module Vmpooler
|
|
|
30
34
|
|
|
31
35
|
@provider_options = options
|
|
32
36
|
logger.log('s', "[!] Creating provider '#{name}'")
|
|
37
|
+
|
|
38
|
+
# Initialize circuit breaker if enabled
|
|
39
|
+
initialize_circuit_breaker if circuit_breaker_enabled?
|
|
40
|
+
# Initialize adaptive timeout if enabled
|
|
41
|
+
initialize_adaptive_timeout if adaptive_timeout_enabled?
|
|
33
42
|
end
|
|
34
43
|
|
|
35
44
|
# Helper Methods
|
|
@@ -271,6 +280,86 @@ module Vmpooler
|
|
|
271
280
|
logger.log('s', '[!] purge_unconfigured_folders was renamed to purge_unconfigured_resources, please update your provider implementation')
|
|
272
281
|
purge_unconfigured_resources(allowlist)
|
|
273
282
|
end
|
|
283
|
+
|
|
284
|
+
private
|
|
285
|
+
|
|
286
|
+
# Circuit breaker configuration and initialization
|
|
287
|
+
|
|
288
|
+
# Whether the circuit breaker should be created for this provider.
# Provider-level config wins over global config; absent means enabled.
def circuit_breaker_enabled?
  provider_cfg = provider_config || {}
  global_cfg = @config[:config] || {}

  # Check provider-specific setting first, then global
  setting = provider_cfg['circuit_breaker_enabled']
  setting = global_cfg['circuit_breaker_enabled'] if setting.nil?

  # Default to enabled when neither level sets it
  setting.nil? ? true : setting
end
|
|
299
|
+
|
|
300
|
+
# Build the provider's circuit breaker from config.
# Provider-specific settings take precedence over global ones; defaults are
# threshold 5, timeout 30s, 3 half-open probes.
def initialize_circuit_breaker
  require 'vmpooler/circuit_breaker'

  global_cfg = @config[:config] || {}
  provider_cfg = provider_config || {}

  # Get circuit breaker settings (provider-specific overrides global)
  fetch_setting = ->(key, default) { provider_cfg[key] || global_cfg[key] || default }

  failure_threshold = fetch_setting.call('circuit_breaker_failure_threshold', 5)
  timeout = fetch_setting.call('circuit_breaker_timeout', 30)
  half_open_attempts = fetch_setting.call('circuit_breaker_half_open_attempts', 3)

  @circuit_breaker = Vmpooler::CircuitBreaker.new(
    name: @provider_name,
    logger: @logger,
    metrics: @metrics,
    failure_threshold: failure_threshold.to_i,
    timeout: timeout.to_i,
    half_open_attempts: half_open_attempts.to_i
  )

  @logger.log('d', "[*] [#{@provider_name}] Circuit breaker initialized (threshold: #{failure_threshold}, timeout: #{timeout}s)")
end
|
|
325
|
+
|
|
326
|
+
# Whether the adaptive connection timeout should be created for this provider.
# Provider-level config wins over global config; absent means enabled.
def adaptive_timeout_enabled?
  provider_cfg = provider_config || {}
  global_cfg = @config[:config] || {}

  # Check provider-specific setting first, then global
  setting = provider_cfg['adaptive_timeout_enabled']
  setting = global_cfg['adaptive_timeout_enabled'] if setting.nil?

  # Default to enabled when neither level sets it
  setting.nil? ? true : setting
end
|
|
337
|
+
|
|
338
|
+
# Build the provider's adaptive connection timeout from config.
# Provider-specific settings take precedence over global ones; defaults are
# min 5s, max 60s, initial 30s.
def initialize_adaptive_timeout
  require 'vmpooler/adaptive_timeout'

  global_cfg = @config[:config] || {}
  provider_cfg = provider_config || {}

  # Get adaptive timeout settings (provider-specific overrides global)
  fetch_setting = ->(key, default) { provider_cfg[key] || global_cfg[key] || default }

  min = fetch_setting.call('connection_pool_timeout_min', 5)
  max = fetch_setting.call('connection_pool_timeout_max', 60)
  initial = fetch_setting.call('connection_pool_timeout_initial', 30)

  @adaptive_timeout = Vmpooler::AdaptiveTimeout.new(
    name: "#{@provider_name}_connections",
    logger: @logger,
    metrics: @metrics,
    min: min.to_i,
    max: max.to_i,
    initial: initial.to_i
  )

  @logger.log('d', "[*] [#{@provider_name}] Adaptive timeout initialized (min: #{min}s, max: #{max}s, initial: #{initial}s)")
end
|
|
274
363
|
end
|
|
275
364
|
end
|
|
276
365
|
end
|
data/lib/vmpooler/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: vmpooler
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.
|
|
4
|
+
version: 3.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Puppet
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2026-
|
|
10
|
+
date: 2026-03-17 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: concurrent-ruby
|
|
@@ -468,6 +468,7 @@ extra_rdoc_files: []
|
|
|
468
468
|
files:
|
|
469
469
|
- bin/vmpooler
|
|
470
470
|
- lib/vmpooler.rb
|
|
471
|
+
- lib/vmpooler/adaptive_timeout.rb
|
|
471
472
|
- lib/vmpooler/api.rb
|
|
472
473
|
- lib/vmpooler/api/dashboard.rb
|
|
473
474
|
- lib/vmpooler/api/healthcheck.rb
|
|
@@ -476,6 +477,7 @@ files:
|
|
|
476
477
|
- lib/vmpooler/api/rate_limiter.rb
|
|
477
478
|
- lib/vmpooler/api/request_logger.rb
|
|
478
479
|
- lib/vmpooler/api/v3.rb
|
|
480
|
+
- lib/vmpooler/circuit_breaker.rb
|
|
479
481
|
- lib/vmpooler/dashboard.rb
|
|
480
482
|
- lib/vmpooler/dns.rb
|
|
481
483
|
- lib/vmpooler/dns/base.rb
|