vmpooler 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,20 @@ module Vmpooler
9
9
  api_version = '3'
10
10
  api_prefix = "/api/v#{api_version}"
11
11
 
12
# In-memory cache for status endpoint responses, plus the lock that
# serializes every read and write of it.
# rubocop:disable Style/ClassVars
@@status_cache_mutex = Mutex.new
@@status_cache = {}
# rubocop:enable Style/ClassVars
STATUS_CACHE_TTL = 30 # seconds

# Empties the status cache while holding the cache lock (useful for testing).
def self.clear_status_cache
  @@status_cache_mutex.synchronize { @@status_cache.clear }
end
25
+
12
26
  helpers do
13
27
  include Vmpooler::API::Helpers
14
28
  end
@@ -464,6 +478,32 @@ module Vmpooler
464
478
  end
465
479
  end
466
480
 
481
+ # Cache helper methods for status endpoint
482
# Looks up a cached status response.
#
# @param cache_key [String] key the response was stored under
# @return [Object, nil] the stored data when an entry exists and is younger
#   than STATUS_CACHE_TTL seconds; nil otherwise
def get_cached_status(cache_key)
  @@status_cache_mutex.synchronize do
    entry = @@status_cache[cache_key]
    fresh = entry && (Time.now - entry[:timestamp]) < STATUS_CACHE_TTL
    fresh ? entry[:data] : nil
  end
end
492
+
493
# Stores a status response under cache_key, stamped with the current time.
#
# @param cache_key [String] key to store the response under
# @param data [Object] response to cache
def set_cached_status(cache_key, data)
  @@status_cache_mutex.synchronize do
    @@status_cache[cache_key] = { data: data, timestamp: Time.now }

    # Bound the cache: once it grows past 10 entries, drop the single
    # entry with the oldest timestamp.
    next unless @@status_cache.size > 10

    stale_key, = @@status_cache.min_by { |_key, entry| entry[:timestamp] }
    @@status_cache.delete(stale_key)
  end
end
506
+
467
507
  def sync_pool_templates
468
508
  tracer.in_span("Vmpooler::API::V3.#{__method__}") do
469
509
  pool_index = pool_index(pools)
@@ -646,6 +686,13 @@ module Vmpooler
646
686
  get "#{api_prefix}/status/?" do
647
687
  content_type :json
648
688
 
689
+ # Create cache key based on view parameters
690
+ cache_key = params[:view] ? "status_#{params[:view]}" : "status_all"
691
+
692
+ # Try to get cached response
693
+ cached_response = get_cached_status(cache_key)
694
+ return cached_response if cached_response
695
+
649
696
  if params[:view]
650
697
  views = params[:view].split(",")
651
698
  end
@@ -706,7 +753,12 @@ module Vmpooler
706
753
 
707
754
  result[:status][:uptime] = (Time.now - Vmpooler::API.settings.config[:uptime]).round(1) if Vmpooler::API.settings.config[:uptime]
708
755
 
709
- JSON.pretty_generate(Hash[result.sort_by { |k, _v| k }])
756
+ response = JSON.pretty_generate(Hash[result.sort_by { |k, _v| k }])
757
+
758
+ # Cache the response
759
+ set_cached_status(cache_key, response)
760
+
761
+ response
710
762
  end
711
763
 
712
764
  # request statistics for specific pools by passing parameter 'pool'
@@ -1085,9 +1137,29 @@ module Vmpooler
1085
1137
  result = { 'ok' => false }
1086
1138
  metrics.increment('http_requests_vm_total.post.vm.checkout')
1087
1139
 
1088
- payload = JSON.parse(request.body.read)
1140
+ # Validate and sanitize JSON body
1141
+ payload = sanitize_json_body(request.body.read)
1142
+ if validation_error?(payload)
1143
+ status 400
1144
+ return JSON.pretty_generate(payload)
1145
+ end
1089
1146
 
1090
- if payload
1147
+ # Validate each template and count
1148
+ payload.each do |template, count|
1149
+ validation = validate_pool_name(template)
1150
+ if validation_error?(validation)
1151
+ status 400
1152
+ return JSON.pretty_generate(validation)
1153
+ end
1154
+
1155
+ validated_count = validate_vm_count(count)
1156
+ if validation_error?(validated_count)
1157
+ status 400
1158
+ return JSON.pretty_generate(validated_count)
1159
+ end
1160
+ end
1161
+
1162
+ if payload && !payload.empty?
1091
1163
  invalid = invalid_templates(payload)
1092
1164
  if invalid.empty?
1093
1165
  result = atomically_allocate_vms(payload)
@@ -1206,6 +1278,7 @@ module Vmpooler
1206
1278
  result = { 'ok' => false }
1207
1279
  metrics.increment('http_requests_vm_total.get.vm.template')
1208
1280
 
1281
+ # Template can contain multiple pools separated by +, so validate after parsing
1209
1282
  payload = extract_templates_from_query_params(params[:template])
1210
1283
 
1211
1284
  if payload
@@ -1235,6 +1308,13 @@ module Vmpooler
1235
1308
  status 404
1236
1309
  result['ok'] = false
1237
1310
 
1311
+ # Validate hostname
1312
+ validation = validate_hostname(params[:hostname])
1313
+ if validation_error?(validation)
1314
+ status 400
1315
+ return JSON.pretty_generate(validation)
1316
+ end
1317
+
1238
1318
  params[:hostname] = hostname_shorten(params[:hostname])
1239
1319
 
1240
1320
  rdata = backend.hgetall("vmpooler__vm__#{params[:hostname]}")
@@ -1373,6 +1453,13 @@ module Vmpooler
1373
1453
  status 404
1374
1454
  result['ok'] = false
1375
1455
 
1456
+ # Validate hostname
1457
+ validation = validate_hostname(params[:hostname])
1458
+ if validation_error?(validation)
1459
+ status 400
1460
+ return JSON.pretty_generate(validation)
1461
+ end
1462
+
1376
1463
  params[:hostname] = hostname_shorten(params[:hostname])
1377
1464
 
1378
1465
  rdata = backend.hgetall("vmpooler__vm__#{params[:hostname]}")
@@ -1403,16 +1490,21 @@ module Vmpooler
1403
1490
 
1404
1491
  failure = []
1405
1492
 
1493
+ # Validate hostname
1494
+ validation = validate_hostname(params[:hostname])
1495
+ if validation_error?(validation)
1496
+ status 400
1497
+ return JSON.pretty_generate(validation)
1498
+ end
1499
+
1406
1500
  params[:hostname] = hostname_shorten(params[:hostname])
1407
1501
 
1408
1502
  if backend.exists?("vmpooler__vm__#{params[:hostname]}")
1409
- begin
1410
- jdata = JSON.parse(request.body.read)
1411
- rescue StandardError => e
1412
- span = OpenTelemetry::Trace.current_span
1413
- span.record_exception(e)
1414
- span.status = OpenTelemetry::Trace::Status.error(e.to_s)
1415
- halt 400, JSON.pretty_generate(result)
1503
+ # Validate and sanitize JSON body
1504
+ jdata = sanitize_json_body(request.body.read)
1505
+ if validation_error?(jdata)
1506
+ status 400
1507
+ return JSON.pretty_generate(jdata)
1416
1508
  end
1417
1509
 
1418
1510
  # Validate data payload
@@ -1421,6 +1513,13 @@ module Vmpooler
1421
1513
  when 'lifetime'
1422
1514
  need_token! if Vmpooler::API.settings.config[:auth]
1423
1515
 
1516
+ # Validate lifetime is a positive integer
1517
+ lifetime_int = arg.to_i
1518
+ if lifetime_int <= 0
1519
+ failure.push("Lifetime must be a positive integer (got #{arg})")
1520
+ next
1521
+ end
1522
+
1424
1523
  # in hours, defaults to one week
1425
1524
  max_lifetime_upper_limit = config['max_lifetime_upper_limit']
1426
1525
  if max_lifetime_upper_limit
@@ -1430,13 +1529,17 @@ module Vmpooler
1430
1529
  end
1431
1530
  end
1432
1531
 
1433
- # validate lifetime is within boundaries
1434
- unless arg.to_i > 0
1435
- failure.push("You provided a lifetime (#{arg}) but you must provide a positive number.")
1436
- end
1437
-
1438
1532
  when 'tags'
1439
1533
  failure.push("You provided tags (#{arg}) as something other than a hash.") unless arg.is_a?(Hash)
1534
+
1535
+ # Validate each tag key and value
1536
+ arg.each do |key, value|
1537
+ tag_validation = validate_tag(key, value)
1538
+ if validation_error?(tag_validation)
1539
+ failure.push(tag_validation['error'])
1540
+ end
1541
+ end
1542
+
1440
1543
  failure.push("You provided unsuppored tags (#{arg}).") if config['allowed_tags'] && !(arg.keys - config['allowed_tags']).empty?
1441
1544
  else
1442
1545
  failure.push("Unknown argument #{arg}.")
@@ -1478,9 +1581,23 @@ module Vmpooler
1478
1581
  status 404
1479
1582
  result = { 'ok' => false }
1480
1583
 
1584
+ # Validate hostname
1585
+ validation = validate_hostname(params[:hostname])
1586
+ if validation_error?(validation)
1587
+ status 400
1588
+ return JSON.pretty_generate(validation)
1589
+ end
1590
+
1591
+ # Validate disk size
1592
+ validated_size = validate_disk_size(params[:size])
1593
+ if validation_error?(validated_size)
1594
+ status 400
1595
+ return JSON.pretty_generate(validated_size)
1596
+ end
1597
+
1481
1598
  params[:hostname] = hostname_shorten(params[:hostname])
1482
1599
 
1483
- if ((params[:size].to_i > 0 )and (backend.exists?("vmpooler__vm__#{params[:hostname]}")))
1600
+ if backend.exists?("vmpooler__vm__#{params[:hostname]}")
1484
1601
  result[params[:hostname]] = {}
1485
1602
  result[params[:hostname]]['disk'] = "+#{params[:size]}gb"
1486
1603
 
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
module Vmpooler
  # Circuit breaker pattern implementation to prevent cascading failures
  # when a provider becomes unresponsive or experiences repeated failures.
  #
  # States:
  # - CLOSED: Normal operation, requests flow through
  # - OPEN: Provider is failing, reject requests immediately (fail fast)
  # - HALF_OPEN: Testing if provider has recovered. Every request is let
  #   through; enough successes close the circuit, one failure re-opens it.
  class CircuitBreaker
    STATES = %i[closed open half_open].freeze

    # Raised by #call when the circuit is open and the timeout has not elapsed.
    class CircuitOpenError < StandardError; end

    attr_reader :state, :failure_count, :success_count

    # Initialize a new circuit breaker
    #
    # @param name [String] Name for logging/metrics (e.g., "vsphere_provider")
    # @param logger [Object] Logger instance; must respond to log(level, message)
    # @param metrics [Object] Metrics instance; must respond to increment(name) and gauge(name, value)
    # @param failure_threshold [Integer] Number of failures before opening circuit
    # @param timeout [Numeric] Seconds to wait in open state before testing (half-open)
    # @param half_open_attempts [Integer] Number of successful test requests needed to close
    def initialize(name:, logger:, metrics:, failure_threshold: 5, timeout: 30, half_open_attempts: 3)
      @name = name
      @logger = logger
      @metrics = metrics
      @failure_threshold = failure_threshold
      @timeout = timeout
      @half_open_attempts = half_open_attempts

      @state = :closed
      @failure_count = 0
      @success_count = 0
      # Wall-clock time of the last failure; used only for reporting in #status.
      @last_failure_time = nil
      # Monotonic reading taken at the same moment; used for every elapsed-time
      # decision so that system clock adjustments cannot shorten or extend the
      # open window.
      @last_failure_monotonic = nil
      @mutex = Mutex.new
    end

    # Execute a block with circuit breaker protection
    #
    # @yield Block to execute if circuit allows
    # @return Result of the block
    # @raise [CircuitOpenError] if circuit is open and timeout hasn't elapsed
    # @raise [LocalJumpError] if no block is given
    def call
      # A missing block is a caller bug, not a provider failure: fail before
      # the protected section so it is never counted against the provider.
      raise LocalJumpError, 'no block given (yield)' unless block_given?

      check_state

      begin
        result = yield
        on_success
        result
      rescue StandardError => e
        on_failure(e)
        raise
      end
    end

    # Check if circuit allows requests. Does not change the state.
    #
    # @return [Boolean] true if circuit is closed or half-open, or if it is
    #   open and the timeout has elapsed
    def allow_request?
      @mutex.synchronize do
        case @state
        when :closed, :half_open
          true
        when :open
          should_attempt_reset?
        end
      end
    end

    # Get current circuit breaker status
    #
    # @return [Hash] name, state, failure_count, success_count,
    #   last_failure_time and next_retry_time (nil unless open)
    def status
      @mutex.synchronize do
        {
          name: @name,
          state: @state,
          failure_count: @failure_count,
          success_count: @success_count,
          last_failure_time: @last_failure_time,
          next_retry_time: next_retry_time
        }
      end
    end

    private

    # Gatekeeper for #call: moves an open circuit to half-open once the
    # timeout has elapsed, otherwise raises CircuitOpenError. Closed and
    # half-open circuits pass straight through.
    def check_state
      @mutex.synchronize do
        next unless @state == :open

        if should_attempt_reset?
          transition_to_half_open
        else
          time_remaining = (@timeout - seconds_since_last_failure).round(1)
          raise CircuitOpenError, "Circuit breaker '#{@name}' is open (#{@failure_count} failures, retry in #{time_remaining}s)"
        end
      end
    end

    # Current reading of the monotonic clock, in seconds.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Seconds elapsed since the last recorded failure. Callers must ensure a
    # failure has been recorded.
    def seconds_since_last_failure
      monotonic_now - @last_failure_monotonic
    end

    def should_attempt_reset?
      return false unless @last_failure_monotonic

      seconds_since_last_failure >= @timeout
    end

    # Wall-clock estimate of when the circuit will next admit a request.
    def next_retry_time
      return nil unless @last_failure_time && @state == :open

      @last_failure_time + @timeout
    end

    def on_success
      @mutex.synchronize do
        case @state
        when :closed
          # Reset failure count on success in closed state
          @failure_count = 0 if @failure_count > 0
        when :half_open
          @success_count += 1
          @failure_count = 0
          @logger.log('d', "[+] [circuit_breaker] '#{@name}' successful test request (#{@success_count}/#{@half_open_attempts})")

          transition_to_closed if @success_count >= @half_open_attempts
        when :open
          # Should not happen, but reset if we somehow get a success
          transition_to_closed
        end
      end
    end

    # Records a failure. The failure timestamp is refreshed in every state,
    # so a failure reported while already open restarts the timeout window.
    def on_failure(error)
      @mutex.synchronize do
        @failure_count += 1
        @last_failure_time = Time.now
        @last_failure_monotonic = monotonic_now

        case @state
        when :closed
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' failure #{@failure_count}/#{@failure_threshold}: #{error.class}")
          transition_to_open if @failure_count >= @failure_threshold
        when :half_open
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' failed during half-open test")
          transition_to_open
        when :open
          # Already open, just log
          @logger.log('d', "[!] [circuit_breaker] '#{@name}' additional failure while open")
        end
      end
    end

    def transition_to_open
      @state = :open
      @success_count = 0
      @logger.log('s', "[!] [circuit_breaker] '#{@name}' OPENED after #{@failure_count} failures (will retry in #{@timeout}s)")
      @metrics.increment("circuit_breaker.opened.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 1) # 1 = open
    end

    def transition_to_half_open
      @state = :half_open
      @success_count = 0
      @failure_count = 0
      @logger.log('s', "[*] [circuit_breaker] '#{@name}' HALF-OPEN, testing provider health")
      @metrics.increment("circuit_breaker.half_open.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 0.5) # 0.5 = half-open
    end

    def transition_to_closed
      @state = :closed
      @failure_count = 0
      @success_count = 0
      @logger.log('s', "[+] [circuit_breaker] '#{@name}' CLOSED, provider recovered")
      @metrics.increment("circuit_breaker.closed.#{@name}")
      @metrics.gauge("circuit_breaker.state.#{@name}", 0) # 0 = closed
    end
  end
end
@@ -34,6 +34,34 @@ module Vmpooler
34
34
  end
35
35
  end
36
36
  end
37
+
38
+ # Get connection pool health status
39
+ # @return [Hash] Health status including utilization and queue depth
40
def health_status
  # Sample the pool once so every field below is derived from the same
  # readings instead of re-reading counters that may change between reads.
  available = @available.length
  in_use = @size - available
  waiting = @queue.respond_to?(:length) ? @queue.length : 0
  # Guard a zero-sized pool: 0.0 / 0 is NaN, which is not valid JSON.
  utilization = @size.zero? ? 0.0 : in_use.to_f / @size * 100

  {
    size: @size,
    available: available,
    in_use: in_use,
    utilization: utilization.round(2),
    waiting_threads: waiting,
    state: determine_health_state(utilization: utilization, waiting: waiting)
  }
end

private

# Classifies pool pressure.
#
# @param utilization [Float, nil] percentage of connections in use; read
#   from the pool when omitted
# @param waiting [Integer, nil] number of waiting threads; read from the
#   pool when omitted
# @return [Symbol] :critical, :warning or :healthy
def determine_health_state(utilization: nil, waiting: nil)
  utilization ||= @size.zero? ? 0.0 : (@size - @available.length).to_f / @size * 100
  waiting ||= @queue.respond_to?(:length) ? @queue.length : 0

  if utilization >= 90 || waiting > 5
    :critical # Pool exhausted or many waiting threads
  elsif utilization >= 70 || waiting > 2
    :warning # Pool under stress
  else
    :healthy # Normal operation
  end
end
37
65
  end
38
66
  end
39
67
  end
@@ -329,6 +329,66 @@ module Vmpooler
329
329
  buckets: REDIS_CONNECT_BUCKETS,
330
330
  docstring: 'vmpooler redis connection wait time',
331
331
  param_labels: %i[type provider]
332
+ },
333
+ vmpooler_health: {
334
+ mtype: M_GAUGE,
335
+ torun: %i[manager],
336
+ docstring: 'vmpooler health check metrics',
337
+ param_labels: %i[metric_path]
338
+ },
339
+ vmpooler_purge: {
340
+ mtype: M_GAUGE,
341
+ torun: %i[manager],
342
+ docstring: 'vmpooler purge metrics',
343
+ param_labels: %i[metric_path]
344
+ },
345
+ vmpooler_destroy: {
346
+ mtype: M_GAUGE,
347
+ torun: %i[manager],
348
+ docstring: 'vmpooler destroy metrics',
349
+ param_labels: %i[poolname]
350
+ },
351
+ vmpooler_clone: {
352
+ mtype: M_GAUGE,
353
+ torun: %i[manager],
354
+ docstring: 'vmpooler clone metrics',
355
+ param_labels: %i[poolname]
356
+ },
357
+ circuit_breaker: {
358
+ mtype: M_GAUGE,
359
+ torun: %i[manager],
360
+ docstring: 'Circuit breaker state and failure tracking',
361
+ param_labels: %i[metric_path]
362
+ },
363
+ connection_pool: {
364
+ mtype: M_GAUGE,
365
+ torun: %i[manager],
366
+ docstring: 'Connection pool health metrics',
367
+ param_labels: %i[metric_path]
368
+ },
369
+ adaptive_timeout: {
370
+ mtype: M_GAUGE,
371
+ torun: %i[manager],
372
+ docstring: 'Adaptive timeout statistics',
373
+ param_labels: %i[metric_path]
374
+ },
375
+ vmpooler_performance: {
376
+ mtype: M_GAUGE,
377
+ torun: %i[manager],
378
+ docstring: 'vmpooler performance metrics for pool operations',
379
+ param_labels: %i[metric_path]
380
+ },
381
+ vmpooler_dlq: {
382
+ mtype: M_COUNTER,
383
+ torun: %i[manager],
384
+ docstring: 'vmpooler dead letter queue metrics',
385
+ param_labels: %i[metric_path]
386
+ },
387
+ vmpooler_errors: {
388
+ mtype: M_COUNTER,
389
+ torun: %i[manager],
390
+ docstring: 'vmpooler error counters including permanent failures',
391
+ param_labels: %i[metric_path]
332
392
  }
333
393
  }
334
394
  end