vmpooler 3.7.0 → 3.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/vmpooler/api/helpers.rb +31 -10
- data/lib/vmpooler/api/input_validator.rb +159 -0
- data/lib/vmpooler/api/rate_limiter.rb +116 -0
- data/lib/vmpooler/api/v3.rb +133 -16
- data/lib/vmpooler/metrics/promstats.rb +24 -0
- data/lib/vmpooler/pool_manager.rb +772 -11
- data/lib/vmpooler/version.rb +1 -1
- metadata +5 -7
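Every hunk below carries the `module Vmpooler` context and corresponds to the data/lib/vmpooler/pool_manager.rb entry above (+772 -11). The new pool-manager code reads a number of previously unused keys from `$config[:config]`. As a quick reference, here is a minimal, purely illustrative sketch of that section of the parsed configuration; the key names and the fallback defaults noted in the comments are taken from the hunks below, while the values themselves are only examples:

```ruby
# Illustrative only: an excerpt of the parsed vmpooler configuration ($config)
# showing the keys the new code reads. Key names and fallbacks come from the
# diff below; the values here are examples, not recommendations.
$config = {
  config: {
    'max_vm_retries'        => 3,       # retry budget for failed on-demand clones
    'dlq_enabled'           => true,    # turn on the dead-letter queue helpers
    'dlq_ttl'               => 168,     # hours to keep DLQ entries (default 7 days)
    'dlq_max_entries'       => 10_000,  # cap per DLQ sorted set
    'purge_enabled'         => true,    # turn on the stale-queue purge thread
    'purge_dry_run'         => false,   # log what would be purged without acting
    'purge_interval'        => 3600,    # seconds between purge cycles
    'max_pending_age'       => 7200,    # seconds before a pending VM counts as stale
    'max_ready_age'         => 86_400,  # seconds before a ready VM counts as stale
    'max_completed_age'     => 3600,    # seconds before a completed VM counts as stale
    'max_orphaned_age'      => 86_400,  # seconds before orphaned VM metadata is expired
    'health_check_enabled'  => true,    # turn on the queue health-check thread
    'health_check_interval' => 300,     # seconds between health checks
    'health_thresholds'     => { 'pending_queue_max' => 100 } # merged over built-in defaults
  }
}
```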
@@ -161,16 +161,80 @@ module Vmpooler
       request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
       pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
       open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
+      retry_count = redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count').to_i if request_id
+
+      # Move to DLQ before moving to completed queue
+      move_to_dlq(vm, pool, 'pending', 'Timeout',
+                  open_socket_error || 'VM timed out during pending phase',
+                  redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
+
+      clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
+      clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
       redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
+
       if request_id
         ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
         if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
-          #
-          redis.
+          # Check retry count and max retry limit before retrying
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+
+          $logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
+
+          # Determine if error is likely permanent (configuration issues)
+          permanent_error = permanent_error?(clone_error, clone_error_class)
+          $logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")
+
+          if retry_count < max_retries && !permanent_error
+            # Increment retry count and retry VM creation
+            redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
+            $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
+          else
+            # Max retries exceeded or permanent error, mark request as permanently failed
+            failure_reason = if permanent_error
+                               "Configuration error: #{clone_error}"
+                             else
+                               'Max retry attempts exceeded'
+                             end
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
+            $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
+            $metrics.increment("vmpooler_errors.permanently_failed.#{pool}")
+          end
         end
       end
-      $metrics.increment("
-      open_socket_error
+      $metrics.increment("vmpooler_errors.markedasfailed.#{pool}")
+      open_socket_error || clone_error
+    end
+
+    # Determine if an error is likely permanent (configuration issue) vs transient
+    def permanent_error?(error_message, error_class)
+      return false if error_message.nil? || error_class.nil?
+
+      permanent_error_patterns = [
+        /template.*not found/i,
+        /template.*does not exist/i,
+        /invalid.*path/i,
+        /folder.*not found/i,
+        /datastore.*not found/i,
+        /resource pool.*not found/i,
+        /permission.*denied/i,
+        /authentication.*failed/i,
+        /invalid.*credentials/i,
+        /configuration.*error/i
+      ]
+
+      permanent_error_classes = [
+        'ArgumentError',
+        'NoMethodError',
+        'NameError'
+      ]
+
+      # Check error message patterns
+      permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
+        # Check error class types
+        permanent_error_classes.include?(error_class)
     end
 
     def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
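The `permanent_error?` helper added in this hunk classifies a stored clone error as permanent (a configuration problem) or transient by matching the error message against fixed regular expressions and the error class against a short list. A small illustration of the resulting behaviour, assuming the method exactly as defined above (the example messages are invented):

```ruby
# Hypothetical inputs; the patterns and classes are those listed in permanent_error? above.
permanent_error?('Template win-2019-x86_64 not found', 'RuntimeError') # => true  (matches /template.*not found/i, request fails immediately)
permanent_error?('connection reset by peer', 'Errno::ECONNRESET')      # => false (treated as transient, eligible for retry)
permanent_error?(nil, nil)                                             # => false (missing error details are treated as transient)
```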
@@ -223,8 +287,16 @@ module Vmpooler
       return true if provider.vm_ready?(pool_name, vm_name, redis)
 
       raise("VM #{vm_name} is not ready")
-    rescue StandardError
+    rescue StandardError => e
       open_socket_error = redis.hget("vmpooler__vm__#{vm_name}", 'open_socket_error')
+      request_id = redis.hget("vmpooler__vm__#{vm_name}", 'request_id')
+      pool_alias = redis.hget("vmpooler__vm__#{vm_name}", 'pool_alias')
+
+      # Move to DLQ before moving to completed queue
+      move_to_dlq(vm_name, pool_name, 'ready', e.class.name,
+                  open_socket_error || 'VM became unreachable in ready queue',
+                  redis, request_id: request_id, pool_alias: pool_alias)
+
       move_vm_queue(pool_name, vm_name, 'ready', 'completed', redis, "removed from 'ready' queue. vm unreachable with error: #{open_socket_error}")
     end
 
@@ -357,6 +429,60 @@ module Vmpooler
       $logger.log('d', "[!] [#{pool}] '#{vm}' #{msg}") if msg
     end
 
+    # Dead-Letter Queue (DLQ) helper methods
+    def dlq_enabled?
+      $config[:config] && $config[:config]['dlq_enabled'] == true
+    end
+
+    def dlq_ttl
+      ($config[:config] && $config[:config]['dlq_ttl']) || 168 # default 7 days in hours
+    end
+
+    def dlq_max_entries
+      ($config[:config] && $config[:config]['dlq_max_entries']) || 10_000
+    end
+
+    def move_to_dlq(vm, pool, queue_type, error_class, error_message, redis, request_id: nil, pool_alias: nil, retry_count: 0, skip_metrics: false)
+      return unless dlq_enabled?
+
+      dlq_key = "vmpooler__dlq__#{queue_type}"
+      timestamp = Time.now.to_i
+
+      # Build DLQ entry
+      dlq_entry = {
+        'vm' => vm,
+        'pool' => pool,
+        'queue_from' => queue_type,
+        'error_class' => error_class.to_s,
+        'error_message' => error_message.to_s,
+        'failed_at' => Time.now.iso8601,
+        'retry_count' => retry_count,
+        'request_id' => request_id,
+        'pool_alias' => pool_alias
+      }.compact
+
+      # Use sorted set with timestamp as score for easy age-based queries and TTL
+      dlq_entry_json = dlq_entry.to_json
+      redis.zadd(dlq_key, timestamp, "#{vm}:#{timestamp}:#{dlq_entry_json}")
+
+      # Enforce max entries limit by removing oldest entries
+      current_size = redis.zcard(dlq_key)
+      if current_size > dlq_max_entries
+        remove_count = current_size - dlq_max_entries
+        redis.zremrangebyrank(dlq_key, 0, remove_count - 1)
+        $logger.log('d', "[!] [dlq] Trimmed #{remove_count} oldest entries from #{dlq_key}")
+      end
+
+      # Set expiration on the entire DLQ (will be refreshed on next write)
+      ttl_seconds = dlq_ttl * 3600
+      redis.expire(dlq_key, ttl_seconds)
+
+      $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics
+      $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}")
+    rescue StandardError => e
+      $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}")
+    end
+
     # Clone a VM
     def clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = nil)
       Thread.new do
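`move_to_dlq` stores each failure in a per-queue sorted set named `vmpooler__dlq__<queue>`, scored by the failure timestamp, with the member encoded as `<vm>:<timestamp>:<json>`. A rough inspection snippet, assuming a plain redis-rb connection to the same Redis instance (this is not code from the gem):

```ruby
require 'json'
require 'redis'

redis = Redis.new # assumes default connection settings

# List dead-lettered 'pending' VMs, oldest first, with their recorded errors.
redis.zrange('vmpooler__dlq__pending', 0, -1, with_scores: true).each do |member, score|
  vm, _ts, json = member.split(':', 3) # member format: "<vm>:<timestamp>:<json>"
  entry = JSON.parse(json)
  puts "#{Time.at(score.to_i)} #{vm} (#{entry['error_class']}): #{entry['error_message']}"
end
```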
@@ -366,7 +492,13 @@ module Vmpooler
           if request_id
             $logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
             @redis.with_metrics do |redis|
-
+              # Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
+              request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
+              if request_status != 'failed'
+                redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+              else
+                $logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
+              end
             end
           else
             $logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
@@ -419,10 +551,10 @@ module Vmpooler
         hostname_retries += 1
 
         if !hostname_available
-          $metrics.increment("
+          $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}")
           $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})")
         elsif !dns_available
-          $metrics.increment("
+          $metrics.increment("vmpooler_errors.staledns.#{pool_name}")
           $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS")
         end
       end
@@ -468,7 +600,7 @@ module Vmpooler
       provider.create_vm(pool_name, new_vmname)
       finish = format('%<time>.2f', time: Time.now - start)
       $logger.log('s', "[+] [#{pool_name}] '#{new_vmname}' cloned in #{finish} seconds")
-      $metrics.
+      $metrics.gauge("vmpooler_clone.#{pool_name}", finish)
 
       $logger.log('d', "[ ] [#{pool_name}] Obtaining IP for '#{new_vmname}'")
       ip_start = Time.now
@@ -489,14 +621,50 @@ module Vmpooler
 
       dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
       dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
-    rescue StandardError
+    rescue StandardError => e
+      # Store error details for retry decision making
       @redis.with_metrics do |redis|
+        # Get retry count before moving to DLQ
+        retry_count = 0
+        if request_id
+          ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
+          retry_count = ondemandrequest_hash['retry_count'].to_i if ondemandrequest_hash
+        end
+
+        # Move to DLQ before removing from pending queue
+        move_to_dlq(new_vmname, pool_name, 'clone', e.class.name, e.message,
+                    redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
+
         redis.pipelined do |pipeline|
           pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
           expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
           pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
         end
+
+        # Handle retry logic for on-demand requests
+        if request_id
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+          is_permanent = permanent_error?(e.message, e.class.name)
+
+          $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
+
+          if is_permanent || retry_count >= max_retries
+            reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
+            $logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
+          else
+            # Increment retry count and re-queue for retry
+            redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
+            $logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+          end
+        end
       end
+      $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
       raise
     ensure
       @redis.with_metrics do |redis|
@@ -546,7 +714,7 @@ module Vmpooler
 
         finish = format('%<time>.2f', time: Time.now - start)
         $logger.log('s', "[-] [#{pool}] '#{vm}' destroyed in #{finish} seconds")
-        $metrics.
+        $metrics.gauge("vmpooler_destroy.#{pool}", finish)
       end
     end
     dereference_mutex(vm)
@@ -582,6 +750,543 @@ module Vmpooler
       provider.purge_unconfigured_resources(allowlist)
     end
 
+    # Auto-purge stale queue entries
+    def purge_enabled?
+      $config[:config] && $config[:config]['purge_enabled'] == true
+    end
+
+    def purge_dry_run?
+      $config[:config] && $config[:config]['purge_dry_run'] == true
+    end
+
+    def max_pending_age
+      ($config[:config] && $config[:config]['max_pending_age']) || 7200 # default 2 hours in seconds
+    end
+
+    def max_ready_age
+      ($config[:config] && $config[:config]['max_ready_age']) || 86_400 # default 24 hours in seconds
+    end
+
+    def max_completed_age
+      ($config[:config] && $config[:config]['max_completed_age']) || 3600 # default 1 hour in seconds
+    end
+
+    def max_orphaned_age
+      ($config[:config] && $config[:config]['max_orphaned_age']) || 86_400 # default 24 hours in seconds
+    end
+
+    def purge_stale_queue_entries
+      return unless purge_enabled?
+
+      Thread.new do
+        begin
+          $logger.log('d', '[*] [purge] Starting stale queue entry purge cycle')
+          purge_start = Time.now
+
+          @redis.with_metrics do |redis|
+            total_purged = 0
+
+            # Purge stale entries from each pool
+            $config[:pools].each do |pool|
+              pool_name = pool['name']
+
+              # Purge pending queue
+              purged_pending = purge_pending_queue(pool_name, redis)
+              total_purged += purged_pending
+
+              # Purge ready queue
+              purged_ready = purge_ready_queue(pool_name, redis)
+              total_purged += purged_ready
+
+              # Purge completed queue
+              purged_completed = purge_completed_queue(pool_name, redis)
+              total_purged += purged_completed
+            end
+
+            # Purge orphaned VM metadata
+            purged_orphaned = purge_orphaned_metadata(redis)
+            total_purged += purged_orphaned
+
+            purge_duration = Time.now - purge_start
+            $logger.log('s', "[*] [purge] Completed purge cycle in #{purge_duration.round(2)}s: #{total_purged} entries purged")
+            $metrics.gauge('vmpooler_purge.cycle.duration', purge_duration)
+            $metrics.gauge('vmpooler_purge.total.count', total_purged)
+          end
+        rescue StandardError => e
+          $logger.log('s', "[!] [purge] Failed during purge cycle: #{e}")
+        end
+      end
+    end
+
+    def purge_pending_queue(pool_name, redis)
+      queue_key = "vmpooler__pending__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          clone_time_str = redis.hget("vmpooler__vm__#{vm}", 'clone')
+          next unless clone_time_str
+
+          clone_time = Time.parse(clone_time_str)
+          age = Time.now - clone_time
+
+          if age > max_pending_age
+            request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
+            pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias')
+
+            purged_count += 1
+
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale pending VM '#{vm}' (age: #{age.round(0)}s, max: #{max_pending_age}s)")
+            else
+              # Move to DLQ before removing (skip DLQ metric since we're tracking purge metric)
+              move_to_dlq(vm, pool_name, 'pending', 'Purge',
+                          "Stale pending VM (age: #{age.round(0)}s > max: #{max_pending_age}s)",
+                          redis, request_id: request_id, pool_alias: pool_alias, skip_metrics: true)
+
+              redis.srem(queue_key, vm)
+
+              # Set expiration on VM metadata if data_ttl is configured
+              if $config[:redis] && $config[:redis]['data_ttl']
+                expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
+                redis.expire("vmpooler__vm__#{vm}", expiration_ttl)
+              end
+
+              $logger.log('d', "[!] [purge] Purged stale pending VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.pending.#{pool_name}.count")
+            end
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking pending VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_ready_queue(pool_name, redis)
+      queue_key = "vmpooler__ready__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          ready_time_str = redis.hget("vmpooler__vm__#{vm}", 'ready')
+          next unless ready_time_str
+
+          ready_time = Time.parse(ready_time_str)
+          age = Time.now - ready_time
+
+          if age > max_ready_age
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale ready VM '#{vm}' (age: #{age.round(0)}s, max: #{max_ready_age}s)")
+            else
+              redis.smove(queue_key, "vmpooler__completed__#{pool_name}", vm)
+              $logger.log('d', "[!] [purge] Moved stale ready VM '#{vm}' from '#{pool_name}' to completed (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.ready.#{pool_name}.count")
+            end
+            purged_count += 1
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking ready VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_completed_queue(pool_name, redis)
+      queue_key = "vmpooler__completed__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          # Check destroy time or last activity time
+          destroy_time_str = redis.hget("vmpooler__vm__#{vm}", 'destroy')
+          checkout_time_str = redis.hget("vmpooler__vm__#{vm}", 'checkout')
+
+          # Use the most recent timestamp
+          timestamp_str = destroy_time_str || checkout_time_str
+          next unless timestamp_str
+
+          timestamp = Time.parse(timestamp_str)
+          age = Time.now - timestamp
+
+          if age > max_completed_age
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale completed VM '#{vm}' (age: #{age.round(0)}s, max: #{max_completed_age}s)")
+            else
+              redis.srem(queue_key, vm)
+              $logger.log('d', "[!] [purge] Removed stale completed VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.completed.#{pool_name}.count")
+            end
+            purged_count += 1
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking completed VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_orphaned_metadata(redis)
+      # Find VM metadata that doesn't belong to any queue
+      all_vm_keys = redis.keys('vmpooler__vm__*')
+      purged_count = 0
+
+      all_vm_keys.each do |vm_key|
+        begin
+          vm = vm_key.sub('vmpooler__vm__', '')
+
+          # Check if VM exists in any queue
+          pool_name = redis.hget(vm_key, 'pool')
+          next unless pool_name
+
+          in_pending = redis.sismember("vmpooler__pending__#{pool_name}", vm)
+          in_ready = redis.sismember("vmpooler__ready__#{pool_name}", vm)
+          in_running = redis.sismember("vmpooler__running__#{pool_name}", vm)
+          in_completed = redis.sismember("vmpooler__completed__#{pool_name}", vm)
+          in_discovered = redis.sismember("vmpooler__discovered__#{pool_name}", vm)
+          in_migrating = redis.sismember("vmpooler__migrating__#{pool_name}", vm)
+
+          # VM is orphaned if not in any queue
+          unless in_pending || in_ready || in_running || in_completed || in_discovered || in_migrating
+            # Check age
+            clone_time_str = redis.hget(vm_key, 'clone')
+            next unless clone_time_str
+
+            clone_time = Time.parse(clone_time_str)
+            age = Time.now - clone_time
+
+            if age > max_orphaned_age
+              if purge_dry_run?
+                $logger.log('d', "[*] [purge][dry-run] Would purge orphaned metadata for '#{vm}' (age: #{age.round(0)}s, max: #{max_orphaned_age}s)")
+              else
+                expiration_ttl = 3600 # 1 hour
+                redis.expire(vm_key, expiration_ttl)
+                $logger.log('d', "[!] [purge] Set expiration on orphaned metadata for '#{vm}' (age: #{age.round(0)}s)")
+                $metrics.increment('vmpooler_purge.orphaned.count')
+              end
+              purged_count += 1
+            end
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking orphaned metadata '#{vm_key}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    # Health checks for Redis queues
+    def health_check_enabled?
+      $config[:config] && $config[:config]['health_check_enabled'] == true
+    end
+
+    def health_thresholds
+      defaults = {
+        'pending_queue_max' => 100,
+        'ready_queue_max' => 500,
+        'dlq_max_warning' => 100,
+        'dlq_max_critical' => 1000,
+        'stuck_vm_age_threshold' => 7200, # 2 hours
+        'stuck_vm_max_warning' => 10,
+        'stuck_vm_max_critical' => 50
+      }
+
+      if $config[:config] && $config[:config]['health_thresholds']
+        defaults.merge($config[:config]['health_thresholds'])
+      else
+        defaults
+      end
+    end
+
+    def check_queue_health
+      return unless health_check_enabled?
+
+      Thread.new do
+        begin
+          $logger.log('d', '[*] [health] Running queue health check')
+          health_start = Time.now
+
+          @redis.with_metrics do |redis|
+            health_metrics = calculate_health_metrics(redis)
+            health_status = determine_health_status(health_metrics)
+
+            # Store health metrics in Redis for API consumption
+            # Convert nested hash to JSON for storage
+            require 'json'
+            redis.hset('vmpooler__health', 'metrics', health_metrics.to_json)
+            redis.hset('vmpooler__health', 'status', health_status)
+            redis.hset('vmpooler__health', 'last_check', Time.now.iso8601)
+            redis.expire('vmpooler__health', 3600) # Expire after 1 hour
+
+            # Log health summary
+            log_health_summary(health_metrics, health_status)
+
+            # Push metrics
+            push_health_metrics(health_metrics, health_status)
+
+            health_duration = Time.now - health_start
+            $metrics.gauge('vmpooler_health.check.duration', health_duration)
+          end
+        rescue StandardError => e
+          $logger.log('s', "[!] [health] Failed during health check: #{e}")
+        end
+      end
+    end
+
+    def calculate_health_metrics(redis)
+      metrics = {
+        'queues' => {},
+        'tasks' => {},
+        'errors' => {}
+      }
+
+      total_stuck_vms = 0
+      total_dlq_size = 0
+      thresholds = health_thresholds
+
+      # Check each pool's queues
+      $config[:pools].each do |pool|
+        pool_name = pool['name']
+        metrics['queues'][pool_name] = {}
+
+        # Pending queue metrics
+        pending_key = "vmpooler__pending__#{pool_name}"
+        pending_vms = redis.smembers(pending_key)
+        pending_ages = calculate_queue_ages(pending_vms, 'clone', redis)
+        stuck_pending = pending_ages.count { |age| age > thresholds['stuck_vm_age_threshold'] }
+        total_stuck_vms += stuck_pending
+
+        metrics['queues'][pool_name]['pending'] = {
+          'size' => pending_vms.size,
+          'oldest_age' => pending_ages.max || 0,
+          'avg_age' => pending_ages.empty? ? 0 : (pending_ages.sum / pending_ages.size).round(0),
+          'stuck_count' => stuck_pending
+        }
+
+        # Ready queue metrics
+        ready_key = "vmpooler__ready__#{pool_name}"
+        ready_vms = redis.smembers(ready_key)
+        ready_ages = calculate_queue_ages(ready_vms, 'ready', redis)
+
+        metrics['queues'][pool_name]['ready'] = {
+          'size' => ready_vms.size,
+          'oldest_age' => ready_ages.max || 0,
+          'avg_age' => ready_ages.empty? ? 0 : (ready_ages.sum / ready_ages.size).round(0)
+        }
+
+        # Completed queue metrics
+        completed_key = "vmpooler__completed__#{pool_name}"
+        completed_size = redis.scard(completed_key)
+        metrics['queues'][pool_name]['completed'] = { 'size' => completed_size }
+      end
+
+      # Task queue metrics
+      clone_active = redis.get('vmpooler__tasks__clone').to_i
+      ondemand_active = redis.get('vmpooler__tasks__ondemandclone').to_i
+      odcreate_pending = redis.zcard('vmpooler__odcreate__task')
+
+      metrics['tasks']['clone'] = { 'active' => clone_active }
+      metrics['tasks']['ondemand'] = { 'active' => ondemand_active, 'pending' => odcreate_pending }
+
+      # DLQ metrics
+      if dlq_enabled?
+        dlq_keys = redis.keys('vmpooler__dlq__*')
+        dlq_keys.each do |dlq_key|
+          queue_type = dlq_key.sub('vmpooler__dlq__', '')
+          dlq_size = redis.zcard(dlq_key)
+          total_dlq_size += dlq_size
+          metrics['queues']['dlq'] ||= {}
+          metrics['queues']['dlq'][queue_type] = { 'size' => dlq_size }
+        end
+      end
+
+      # Error metrics
+      metrics['errors']['dlq_total_size'] = total_dlq_size
+      metrics['errors']['stuck_vm_count'] = total_stuck_vms
+
+      # Orphaned metadata count
+      orphaned_count = count_orphaned_metadata(redis)
+      metrics['errors']['orphaned_metadata_count'] = orphaned_count
+
+      metrics
+    end
+
+    def calculate_queue_ages(vms, timestamp_field, redis)
+      ages = []
+      vms.each do |vm|
+        begin
+          timestamp_str = redis.hget("vmpooler__vm__#{vm}", timestamp_field)
+          next unless timestamp_str
+
+          timestamp = Time.parse(timestamp_str)
+          age = (Time.now - timestamp).to_i
+          ages << age
+        rescue StandardError
+          # Skip VMs with invalid timestamps
+        end
+      end
+      ages
+    end
+
+    def count_orphaned_metadata(redis)
+      all_vm_keys = redis.keys('vmpooler__vm__*')
+      orphaned_count = 0
+
+      all_vm_keys.each do |vm_key|
+        begin
+          vm = vm_key.sub('vmpooler__vm__', '')
+          pool_name = redis.hget(vm_key, 'pool')
+          next unless pool_name
+
+          in_any_queue = redis.sismember("vmpooler__pending__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__ready__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__running__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__completed__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__discovered__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__migrating__#{pool_name}", vm)
+
+          orphaned_count += 1 unless in_any_queue
+        rescue StandardError
+          # Skip on error
+        end
+      end
+
+      orphaned_count
+    end
+
+    def determine_health_status(metrics)
+      thresholds = health_thresholds
+
+      # Check DLQ size
+      dlq_size = metrics['errors']['dlq_total_size']
+      return 'unhealthy' if dlq_size > thresholds['dlq_max_critical']
+
+      # Check stuck VM count
+      stuck_count = metrics['errors']['stuck_vm_count']
+      return 'unhealthy' if stuck_count > thresholds['stuck_vm_max_critical']
+
+      # Check queue sizes
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        pending_size = begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        ready_size = begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+
+        return 'unhealthy' if pending_size > thresholds['pending_queue_max'] * 2
+        return 'unhealthy' if ready_size > thresholds['ready_queue_max'] * 2
+      end
+
+      # Check for degraded conditions
+      return 'degraded' if dlq_size > thresholds['dlq_max_warning']
+      return 'degraded' if stuck_count > thresholds['stuck_vm_max_warning']
+
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        pending_size = begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        ready_size = begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+
+        return 'degraded' if pending_size > thresholds['pending_queue_max']
+        return 'degraded' if ready_size > thresholds['ready_queue_max']
+      end
+
+      'healthy'
+    end
+
+    def log_health_summary(metrics, status)
+      summary = "[*] [health] Status: #{status.upcase}"
+
+      # Queue summary
+      total_pending = 0
+      total_ready = 0
+      total_completed = 0
+
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        total_pending += begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        total_ready += begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+        total_completed += begin
+          queues['completed']['size']
+        rescue StandardError
+          0
+        end
+      end
+
+      summary += " | Queues: P=#{total_pending} R=#{total_ready} C=#{total_completed}"
+      summary += " | DLQ=#{metrics['errors']['dlq_total_size']}"
+      summary += " | Stuck=#{metrics['errors']['stuck_vm_count']}"
+      summary += " | Orphaned=#{metrics['errors']['orphaned_metadata_count']}"
+
+      log_level = status == 'healthy' ? 's' : 'd'
+      $logger.log(log_level, summary)
+    end
+
+    def push_health_metrics(metrics, status)
+      # Push error metrics first
+      $metrics.gauge('vmpooler_health.dlq.total_size', metrics['errors']['dlq_total_size'])
+      $metrics.gauge('vmpooler_health.stuck_vms.count', metrics['errors']['stuck_vm_count'])
+      $metrics.gauge('vmpooler_health.orphaned_metadata.count', metrics['errors']['orphaned_metadata_count'])
+
+      # Push per-pool queue metrics
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.size", queues['pending']['size'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.oldest_age", queues['pending']['oldest_age'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.stuck_count", queues['pending']['stuck_count'])
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.size", queues['ready']['size'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.oldest_age", queues['ready']['oldest_age'])
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.completed.size", queues['completed']['size'])
+      end
+
+      # Push DLQ metrics
+      metrics['queues']['dlq']&.each do |queue_type, dlq_metrics|
+        $metrics.gauge("vmpooler_health.dlq.#{queue_type}.size", dlq_metrics['size'])
+      end
+
+      # Push task metrics
+      $metrics.gauge('vmpooler_health.tasks.clone.active', metrics['tasks']['clone']['active'])
+      $metrics.gauge('vmpooler_health.tasks.ondemand.active', metrics['tasks']['ondemand']['active'])
+      $metrics.gauge('vmpooler_health.tasks.ondemand.pending', metrics['tasks']['ondemand']['pending'])
+
+      # Push status last (0=healthy, 1=degraded, 2=unhealthy)
+      status_value = { 'healthy' => 0, 'degraded' => 1, 'unhealthy' => 2 }[status] || 2
+      $metrics.gauge('vmpooler_health.status', status_value)
+    end
+
     def create_vm_disk(pool_name, vm, disk_size, provider)
       Thread.new do
         begin
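`check_queue_health` writes its latest snapshot to the `vmpooler__health` hash: the nested metrics as JSON under 'metrics', plus 'status' and 'last_check'. A minimal sketch of reading it back, for example from a console or a monitoring script, again assuming a plain redis-rb connection (not code from the gem):

```ruby
require 'json'
require 'redis'

redis = Redis.new # assumes default connection settings

health = redis.hgetall('vmpooler__health')
if health.empty?
  puts 'no health snapshot yet (health_check_enabled may be off, or the key has expired)'
else
  metrics = JSON.parse(health['metrics'])
  puts "status:     #{health['status']}"   # healthy / degraded / unhealthy
  puts "last check: #{health['last_check']}"
  puts "stuck VMs:  #{metrics['errors']['stuck_vm_count']}"
  puts "DLQ size:   #{metrics['errors']['dlq_total_size']}"
end
```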
@@ -982,7 +1687,12 @@ module Vmpooler
 
         sync_pool_template(pool)
         loop do
+          start_time = Time.now
           result = _check_pool(pool, provider)
+          duration = Time.now - start_time
+
+          $metrics.gauge("vmpooler_performance.check_pool.#{pool['name']}", duration)
+          $logger.log('d', "[!] check_pool for #{pool['name']} took #{duration.round(2)}s") if duration > 5
 
           if result[:cloned_vms] > 0 || result[:checked_pending_vms] > 0 || result[:discovered_vms] > 0
             loop_delay = loop_delay_min
@@ -1541,6 +2251,15 @@ module Vmpooler
             redis.zrem('vmpooler__provisioning__request', request_id)
             return
           end
+
+          # Check if request was already marked as failed (e.g., by delete endpoint)
+          request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
+          if request_status == 'failed'
+            $logger.log('s', "Request '#{request_id}' already marked as failed, skipping VM creation")
+            redis.zrem('vmpooler__provisioning__request', request_id)
+            return
+          end
+
           score = redis.zscore('vmpooler__provisioning__request', request_id)
           requested = requested.split(',')
 
@@ -1764,6 +2483,48 @@ module Vmpooler
         check_ondemand_requests(check_loop_delay_min, check_loop_delay_max, check_loop_delay_decay)
       end
 
+      # Queue purge thread
+      if purge_enabled?
+        purge_interval = ($config[:config] && $config[:config]['purge_interval']) || 3600 # default 1 hour
+        if !$threads['queue_purge']
+          $threads['queue_purge'] = Thread.new do
+            loop do
+              purge_stale_queue_entries
+              sleep(purge_interval)
+            end
+          end
+        elsif !$threads['queue_purge'].alive?
+          $logger.log('d', '[!] [queue_purge] worker thread died, restarting')
+          $threads['queue_purge'] = Thread.new do
+            loop do
+              purge_stale_queue_entries
+              sleep(purge_interval)
+            end
+          end
+        end
+      end
+
+      # Health check thread
+      if health_check_enabled?
+        health_interval = ($config[:config] && $config[:config]['health_check_interval']) || 300 # default 5 minutes
+        if !$threads['health_check']
+          $threads['health_check'] = Thread.new do
+            loop do
+              check_queue_health
+              sleep(health_interval)
+            end
+          end
+        elsif !$threads['health_check'].alive?
+          $logger.log('d', '[!] [health_check] worker thread died, restarting')
+          $threads['health_check'] = Thread.new do
+            loop do
+              check_queue_health
+              sleep(health_interval)
+            end
+          end
+        end
+      end
+
       sleep(loop_delay)
 
       unless maxloop == 0