vmpooler 3.6.0 → 3.8.1
This diff shows the changes between two publicly available package versions as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/vmpooler/api/helpers.rb +32 -8
- data/lib/vmpooler/api/input_validator.rb +159 -0
- data/lib/vmpooler/api/rate_limiter.rb +116 -0
- data/lib/vmpooler/api/v3.rb +133 -16
- data/lib/vmpooler/metrics/promstats.rb +24 -0
- data/lib/vmpooler/pool_manager.rb +774 -12
- data/lib/vmpooler/version.rb +1 -1
- metadata +83 -77
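
The pool_manager changes below read a number of new optional settings from $config[:config]. As a quick reference, here is a minimal Ruby sketch of those keys with the fallback defaults visible in the diff; how they are actually set in a deployment's configuration file is not shown in this diff, so treat the snippet as illustrative rather than a complete configuration reference.

    # Illustrative only: keys and defaults are taken from the pool_manager.rb changes below
    $config[:config].merge!(
      'max_vm_retries'        => 3,        # clone retries before an on-demand request is failed
      'dlq_enabled'           => false,    # dead-letter queue for failed/stale VMs
      'dlq_ttl'               => 168,      # hours before a vmpooler__dlq__* key expires
      'dlq_max_entries'       => 10_000,   # oldest DLQ entries are trimmed beyond this
      'purge_enabled'         => false,    # auto-purge stale queue entries
      'purge_dry_run'         => false,    # log what would be purged without purging
      'purge_interval'        => 3600,     # seconds between purge cycles
      'max_pending_age'       => 7200,     # seconds a VM may sit in a pending queue
      'max_ready_age'         => 86_400,   # seconds a VM may sit in a ready queue
      'max_completed_age'     => 3600,     # seconds a VM may sit in a completed queue
      'max_orphaned_age'      => 86_400,   # seconds before orphaned VM metadata is expired
      'health_check_enabled'  => false,    # periodic queue health checks
      'health_check_interval' => 300,      # seconds between health checks
      'health_thresholds'     => {}        # overrides merged over the built-in defaults
    )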
@@ -145,7 +145,8 @@ module Vmpooler
           "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}"
         elsif timing_out_soon
           time_remaining = timeout - timeout_notification
-
+          open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
+          "[!] [#{pool}] '#{vm}' impending failure in #{time_remaining} minutes with error: #{open_socket_error}"
         else
           "[!] [#{pool}] '#{vm}' This error is wholly unexpected"
         end
@@ -160,16 +161,80 @@ module Vmpooler
       request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
       pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
       open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
+      retry_count = redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count').to_i if request_id
+
+      # Move to DLQ before moving to completed queue
+      move_to_dlq(vm, pool, 'pending', 'Timeout',
+                  open_socket_error || 'VM timed out during pending phase',
+                  redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
+
+      clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
+      clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
       redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
+
       if request_id
         ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
         if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
-          #
-          redis.
+          # Check retry count and max retry limit before retrying
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+
+          $logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
+
+          # Determine if error is likely permanent (configuration issues)
+          permanent_error = permanent_error?(clone_error, clone_error_class)
+          $logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")
+
+          if retry_count < max_retries && !permanent_error
+            # Increment retry count and retry VM creation
+            redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
+            $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
+          else
+            # Max retries exceeded or permanent error, mark request as permanently failed
+            failure_reason = if permanent_error
+              "Configuration error: #{clone_error}"
+            else
+              'Max retry attempts exceeded'
+            end
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
+            $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
+            $metrics.increment("vmpooler_errors.permanently_failed.#{pool}")
+          end
         end
       end
-      $metrics.increment("
-      open_socket_error
+      $metrics.increment("vmpooler_errors.markedasfailed.#{pool}")
+      open_socket_error || clone_error
+    end
+
+    # Determine if an error is likely permanent (configuration issue) vs transient
+    def permanent_error?(error_message, error_class)
+      return false if error_message.nil? || error_class.nil?
+
+      permanent_error_patterns = [
+        /template.*not found/i,
+        /template.*does not exist/i,
+        /invalid.*path/i,
+        /folder.*not found/i,
+        /datastore.*not found/i,
+        /resource pool.*not found/i,
+        /permission.*denied/i,
+        /authentication.*failed/i,
+        /invalid.*credentials/i,
+        /configuration.*error/i
+      ]
+
+      permanent_error_classes = [
+        'ArgumentError',
+        'NoMethodError',
+        'NameError'
+      ]
+
+      # Check error message patterns
+      permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
+        # Check error class types
+        permanent_error_classes.include?(error_class)
+    end
+
     def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
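
With the patterns and classes above, permanent_error? short-circuits retries for failures that look like configuration problems. For example (hypothetical error messages; the return values follow directly from the regex and class lists in the hunk above):

    permanent_error?("Template 'debian-12-x86_64' not found", 'RuntimeError')  # => true, message matches /template.*not found/i
    permanent_error?('something went sideways', 'ArgumentError')               # => true, ArgumentError is listed as permanent
    permanent_error?('connection reset by peer', 'RuntimeError')               # => false, treated as transient and retried

A request hitting a permanent error is marked failed immediately with failure_reason "Configuration error: ..."; transient errors are re-queued until retry_count reaches max_vm_retries (default 3).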
@@ -222,8 +287,16 @@ module Vmpooler
       return true if provider.vm_ready?(pool_name, vm_name, redis)
 
       raise("VM #{vm_name} is not ready")
-    rescue StandardError
+    rescue StandardError => e
       open_socket_error = redis.hget("vmpooler__vm__#{vm_name}", 'open_socket_error')
+      request_id = redis.hget("vmpooler__vm__#{vm_name}", 'request_id')
+      pool_alias = redis.hget("vmpooler__vm__#{vm_name}", 'pool_alias')
+
+      # Move to DLQ before moving to completed queue
+      move_to_dlq(vm_name, pool_name, 'ready', e.class.name,
+                  open_socket_error || 'VM became unreachable in ready queue',
+                  redis, request_id: request_id, pool_alias: pool_alias)
+
       move_vm_queue(pool_name, vm_name, 'ready', 'completed', redis, "removed from 'ready' queue. vm unreachable with error: #{open_socket_error}")
     end
 
@@ -356,6 +429,60 @@ module Vmpooler
       $logger.log('d', "[!] [#{pool}] '#{vm}' #{msg}") if msg
     end
 
+    # Dead-Letter Queue (DLQ) helper methods
+    def dlq_enabled?
+      $config[:config] && $config[:config]['dlq_enabled'] == true
+    end
+
+    def dlq_ttl
+      ($config[:config] && $config[:config]['dlq_ttl']) || 168 # default 7 days in hours
+    end
+
+    def dlq_max_entries
+      ($config[:config] && $config[:config]['dlq_max_entries']) || 10_000
+    end
+
+    def move_to_dlq(vm, pool, queue_type, error_class, error_message, redis, request_id: nil, pool_alias: nil, retry_count: 0, skip_metrics: false)
+      return unless dlq_enabled?
+
+      dlq_key = "vmpooler__dlq__#{queue_type}"
+      timestamp = Time.now.to_i
+
+      # Build DLQ entry
+      dlq_entry = {
+        'vm' => vm,
+        'pool' => pool,
+        'queue_from' => queue_type,
+        'error_class' => error_class.to_s,
+        'error_message' => error_message.to_s,
+        'failed_at' => Time.now.iso8601,
+        'retry_count' => retry_count,
+        'request_id' => request_id,
+        'pool_alias' => pool_alias
+      }.compact
+
+      # Use sorted set with timestamp as score for easy age-based queries and TTL
+      dlq_entry_json = dlq_entry.to_json
+      redis.zadd(dlq_key, timestamp, "#{vm}:#{timestamp}:#{dlq_entry_json}")
+
+      # Enforce max entries limit by removing oldest entries
+      current_size = redis.zcard(dlq_key)
+      if current_size > dlq_max_entries
+        remove_count = current_size - dlq_max_entries
+        redis.zremrangebyrank(dlq_key, 0, remove_count - 1)
+        $logger.log('d', "[!] [dlq] Trimmed #{remove_count} oldest entries from #{dlq_key}")
+      end
+
+      # Set expiration on the entire DLQ (will be refreshed on next write)
+      ttl_seconds = dlq_ttl * 3600
+      redis.expire(dlq_key, ttl_seconds)
+
+      $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics
+      $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}")
+    rescue StandardError => e
+      $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}")
+    end
+
     # Clone a VM
     def clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = nil)
       Thread.new do
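
move_to_dlq records each failure in a per-origin sorted set (vmpooler__dlq__pending, vmpooler__dlq__ready, vmpooler__dlq__clone), scored by failure time, with the member encoded as "<vm>:<epoch>:<json>". A minimal read-side sketch for inspecting recent entries with the redis gem; the key name and member layout come from move_to_dlq above, while the inspection script itself is an assumption, not an API shipped by vmpooler:

    require 'json'
    require 'redis'

    redis = Redis.new
    # DLQ entries from the last hour of failed clones, oldest first
    redis.zrangebyscore('vmpooler__dlq__clone', Time.now.to_i - 3600, '+inf').each do |member|
      vm, epoch, json = member.split(':', 3)
      entry = JSON.parse(json)
      puts "#{vm} failed at #{Time.at(epoch.to_i)}: #{entry['error_class']} - #{entry['error_message']}"
    end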
@@ -365,7 +492,13 @@ module Vmpooler
           if request_id
             $logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
             @redis.with_metrics do |redis|
-
+              # Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
+              request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
+              if request_status != 'failed'
+                redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+              else
+                $logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
+              end
             end
           else
             $logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
@@ -418,10 +551,10 @@ module Vmpooler
         hostname_retries += 1
 
         if !hostname_available
-          $metrics.increment("
+          $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}")
           $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})")
         elsif !dns_available
-          $metrics.increment("
+          $metrics.increment("vmpooler_errors.staledns.#{pool_name}")
           $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS")
         end
       end
@@ -467,7 +600,7 @@ module Vmpooler
       provider.create_vm(pool_name, new_vmname)
       finish = format('%<time>.2f', time: Time.now - start)
       $logger.log('s', "[+] [#{pool_name}] '#{new_vmname}' cloned in #{finish} seconds")
-      $metrics.
+      $metrics.gauge("vmpooler_clone.#{pool_name}", finish)
 
       $logger.log('d', "[ ] [#{pool_name}] Obtaining IP for '#{new_vmname}'")
       ip_start = Time.now
@@ -488,14 +621,50 @@ module Vmpooler
 
       dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
       dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
-    rescue StandardError
+    rescue StandardError => e
+      # Store error details for retry decision making
       @redis.with_metrics do |redis|
+        # Get retry count before moving to DLQ
+        retry_count = 0
+        if request_id
+          ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
+          retry_count = ondemandrequest_hash['retry_count'].to_i if ondemandrequest_hash
+        end
+
+        # Move to DLQ before removing from pending queue
+        move_to_dlq(new_vmname, pool_name, 'clone', e.class.name, e.message,
+                    redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
+
         redis.pipelined do |pipeline|
           pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
           expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
           pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
         end
+
+        # Handle retry logic for on-demand requests
+        if request_id
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+          is_permanent = permanent_error?(e.message, e.class.name)
+
+          $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
+
+          if is_permanent || retry_count >= max_retries
+            reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
+            $logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
+          else
+            # Increment retry count and re-queue for retry
+            redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
+            $logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+          end
+        end
       end
+      $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
       raise
     ensure
       @redis.with_metrics do |redis|
@@ -545,7 +714,7 @@ module Vmpooler
 
           finish = format('%<time>.2f', time: Time.now - start)
           $logger.log('s', "[-] [#{pool}] '#{vm}' destroyed in #{finish} seconds")
-          $metrics.
+          $metrics.gauge("vmpooler_destroy.#{pool}", finish)
         end
       end
       dereference_mutex(vm)
@@ -581,6 +750,543 @@ module Vmpooler
       provider.purge_unconfigured_resources(allowlist)
     end
 
+    # Auto-purge stale queue entries
+    def purge_enabled?
+      $config[:config] && $config[:config]['purge_enabled'] == true
+    end
+
+    def purge_dry_run?
+      $config[:config] && $config[:config]['purge_dry_run'] == true
+    end
+
+    def max_pending_age
+      ($config[:config] && $config[:config]['max_pending_age']) || 7200 # default 2 hours in seconds
+    end
+
+    def max_ready_age
+      ($config[:config] && $config[:config]['max_ready_age']) || 86_400 # default 24 hours in seconds
+    end
+
+    def max_completed_age
+      ($config[:config] && $config[:config]['max_completed_age']) || 3600 # default 1 hour in seconds
+    end
+
+    def max_orphaned_age
+      ($config[:config] && $config[:config]['max_orphaned_age']) || 86_400 # default 24 hours in seconds
+    end
+
+    def purge_stale_queue_entries
+      return unless purge_enabled?
+
+      Thread.new do
+        begin
+          $logger.log('d', '[*] [purge] Starting stale queue entry purge cycle')
+          purge_start = Time.now
+
+          @redis.with_metrics do |redis|
+            total_purged = 0
+
+            # Purge stale entries from each pool
+            $config[:pools].each do |pool|
+              pool_name = pool['name']
+
+              # Purge pending queue
+              purged_pending = purge_pending_queue(pool_name, redis)
+              total_purged += purged_pending
+
+              # Purge ready queue
+              purged_ready = purge_ready_queue(pool_name, redis)
+              total_purged += purged_ready
+
+              # Purge completed queue
+              purged_completed = purge_completed_queue(pool_name, redis)
+              total_purged += purged_completed
+            end
+
+            # Purge orphaned VM metadata
+            purged_orphaned = purge_orphaned_metadata(redis)
+            total_purged += purged_orphaned
+
+            purge_duration = Time.now - purge_start
+            $logger.log('s', "[*] [purge] Completed purge cycle in #{purge_duration.round(2)}s: #{total_purged} entries purged")
+            $metrics.gauge('vmpooler_purge.cycle.duration', purge_duration)
+            $metrics.gauge('vmpooler_purge.total.count', total_purged)
+          end
+        rescue StandardError => e
+          $logger.log('s', "[!] [purge] Failed during purge cycle: #{e}")
+        end
+      end
+    end
+
+    def purge_pending_queue(pool_name, redis)
+      queue_key = "vmpooler__pending__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          clone_time_str = redis.hget("vmpooler__vm__#{vm}", 'clone')
+          next unless clone_time_str
+
+          clone_time = Time.parse(clone_time_str)
+          age = Time.now - clone_time
+
+          if age > max_pending_age
+            request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
+            pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias')
+
+            purged_count += 1
+
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale pending VM '#{vm}' (age: #{age.round(0)}s, max: #{max_pending_age}s)")
+            else
+              # Move to DLQ before removing (skip DLQ metric since we're tracking purge metric)
+              move_to_dlq(vm, pool_name, 'pending', 'Purge',
+                          "Stale pending VM (age: #{age.round(0)}s > max: #{max_pending_age}s)",
+                          redis, request_id: request_id, pool_alias: pool_alias, skip_metrics: true)
+
+              redis.srem(queue_key, vm)
+
+              # Set expiration on VM metadata if data_ttl is configured
+              if $config[:redis] && $config[:redis]['data_ttl']
+                expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
+                redis.expire("vmpooler__vm__#{vm}", expiration_ttl)
+              end
+
+              $logger.log('d', "[!] [purge] Purged stale pending VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.pending.#{pool_name}.count")
+            end
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking pending VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_ready_queue(pool_name, redis)
+      queue_key = "vmpooler__ready__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          ready_time_str = redis.hget("vmpooler__vm__#{vm}", 'ready')
+          next unless ready_time_str
+
+          ready_time = Time.parse(ready_time_str)
+          age = Time.now - ready_time
+
+          if age > max_ready_age
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale ready VM '#{vm}' (age: #{age.round(0)}s, max: #{max_ready_age}s)")
+            else
+              redis.smove(queue_key, "vmpooler__completed__#{pool_name}", vm)
+              $logger.log('d', "[!] [purge] Moved stale ready VM '#{vm}' from '#{pool_name}' to completed (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.ready.#{pool_name}.count")
+            end
+            purged_count += 1
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking ready VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_completed_queue(pool_name, redis)
+      queue_key = "vmpooler__completed__#{pool_name}"
+      vms = redis.smembers(queue_key)
+      purged_count = 0
+
+      vms.each do |vm|
+        begin
+          # Check destroy time or last activity time
+          destroy_time_str = redis.hget("vmpooler__vm__#{vm}", 'destroy')
+          checkout_time_str = redis.hget("vmpooler__vm__#{vm}", 'checkout')
+
+          # Use the most recent timestamp
+          timestamp_str = destroy_time_str || checkout_time_str
+          next unless timestamp_str
+
+          timestamp = Time.parse(timestamp_str)
+          age = Time.now - timestamp
+
+          if age > max_completed_age
+            if purge_dry_run?
+              $logger.log('d', "[*] [purge][dry-run] Would purge stale completed VM '#{vm}' (age: #{age.round(0)}s, max: #{max_completed_age}s)")
+            else
+              redis.srem(queue_key, vm)
+              $logger.log('d', "[!] [purge] Removed stale completed VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
+              $metrics.increment("vmpooler_purge.completed.#{pool_name}.count")
+            end
+            purged_count += 1
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking completed VM '#{vm}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    def purge_orphaned_metadata(redis)
+      # Find VM metadata that doesn't belong to any queue
+      all_vm_keys = redis.keys('vmpooler__vm__*')
+      purged_count = 0
+
+      all_vm_keys.each do |vm_key|
+        begin
+          vm = vm_key.sub('vmpooler__vm__', '')
+
+          # Check if VM exists in any queue
+          pool_name = redis.hget(vm_key, 'pool')
+          next unless pool_name
+
+          in_pending = redis.sismember("vmpooler__pending__#{pool_name}", vm)
+          in_ready = redis.sismember("vmpooler__ready__#{pool_name}", vm)
+          in_running = redis.sismember("vmpooler__running__#{pool_name}", vm)
+          in_completed = redis.sismember("vmpooler__completed__#{pool_name}", vm)
+          in_discovered = redis.sismember("vmpooler__discovered__#{pool_name}", vm)
+          in_migrating = redis.sismember("vmpooler__migrating__#{pool_name}", vm)
+
+          # VM is orphaned if not in any queue
+          unless in_pending || in_ready || in_running || in_completed || in_discovered || in_migrating
+            # Check age
+            clone_time_str = redis.hget(vm_key, 'clone')
+            next unless clone_time_str
+
+            clone_time = Time.parse(clone_time_str)
+            age = Time.now - clone_time
+
+            if age > max_orphaned_age
+              if purge_dry_run?
+                $logger.log('d', "[*] [purge][dry-run] Would purge orphaned metadata for '#{vm}' (age: #{age.round(0)}s, max: #{max_orphaned_age}s)")
+              else
+                expiration_ttl = 3600 # 1 hour
+                redis.expire(vm_key, expiration_ttl)
+                $logger.log('d', "[!] [purge] Set expiration on orphaned metadata for '#{vm}' (age: #{age.round(0)}s)")
+                $metrics.increment('vmpooler_purge.orphaned.count')
+              end
+              purged_count += 1
+            end
+          end
+        rescue StandardError => e
+          $logger.log('d', "[!] [purge] Error checking orphaned metadata '#{vm_key}': #{e}")
+        end
+      end
+
+      purged_count
+    end
+
+    # Health checks for Redis queues
+    def health_check_enabled?
+      $config[:config] && $config[:config]['health_check_enabled'] == true
+    end
+
+    def health_thresholds
+      defaults = {
+        'pending_queue_max' => 100,
+        'ready_queue_max' => 500,
+        'dlq_max_warning' => 100,
+        'dlq_max_critical' => 1000,
+        'stuck_vm_age_threshold' => 7200, # 2 hours
+        'stuck_vm_max_warning' => 10,
+        'stuck_vm_max_critical' => 50
+      }
+
+      if $config[:config] && $config[:config]['health_thresholds']
+        defaults.merge($config[:config]['health_thresholds'])
+      else
+        defaults
+      end
+    end
+
+    def check_queue_health
+      return unless health_check_enabled?
+
+      Thread.new do
+        begin
+          $logger.log('d', '[*] [health] Running queue health check')
+          health_start = Time.now
+
+          @redis.with_metrics do |redis|
+            health_metrics = calculate_health_metrics(redis)
+            health_status = determine_health_status(health_metrics)
+
+            # Store health metrics in Redis for API consumption
+            # Convert nested hash to JSON for storage
+            require 'json'
+            redis.hset('vmpooler__health', 'metrics', health_metrics.to_json)
+            redis.hset('vmpooler__health', 'status', health_status)
+            redis.hset('vmpooler__health', 'last_check', Time.now.iso8601)
+            redis.expire('vmpooler__health', 3600) # Expire after 1 hour
+
+            # Log health summary
+            log_health_summary(health_metrics, health_status)
+
+            # Push metrics
+            push_health_metrics(health_metrics, health_status)
+
+            health_duration = Time.now - health_start
+            $metrics.gauge('vmpooler_health.check.duration', health_duration)
+          end
+        rescue StandardError => e
+          $logger.log('s', "[!] [health] Failed during health check: #{e}")
+        end
+      end
+    end
+
+    def calculate_health_metrics(redis)
+      metrics = {
+        'queues' => {},
+        'tasks' => {},
+        'errors' => {}
+      }
+
+      total_stuck_vms = 0
+      total_dlq_size = 0
+      thresholds = health_thresholds
+
+      # Check each pool's queues
+      $config[:pools].each do |pool|
+        pool_name = pool['name']
+        metrics['queues'][pool_name] = {}
+
+        # Pending queue metrics
+        pending_key = "vmpooler__pending__#{pool_name}"
+        pending_vms = redis.smembers(pending_key)
+        pending_ages = calculate_queue_ages(pending_vms, 'clone', redis)
+        stuck_pending = pending_ages.count { |age| age > thresholds['stuck_vm_age_threshold'] }
+        total_stuck_vms += stuck_pending
+
+        metrics['queues'][pool_name]['pending'] = {
+          'size' => pending_vms.size,
+          'oldest_age' => pending_ages.max || 0,
+          'avg_age' => pending_ages.empty? ? 0 : (pending_ages.sum / pending_ages.size).round(0),
+          'stuck_count' => stuck_pending
+        }
+
+        # Ready queue metrics
+        ready_key = "vmpooler__ready__#{pool_name}"
+        ready_vms = redis.smembers(ready_key)
+        ready_ages = calculate_queue_ages(ready_vms, 'ready', redis)
+
+        metrics['queues'][pool_name]['ready'] = {
+          'size' => ready_vms.size,
+          'oldest_age' => ready_ages.max || 0,
+          'avg_age' => ready_ages.empty? ? 0 : (ready_ages.sum / ready_ages.size).round(0)
+        }
+
+        # Completed queue metrics
+        completed_key = "vmpooler__completed__#{pool_name}"
+        completed_size = redis.scard(completed_key)
+        metrics['queues'][pool_name]['completed'] = { 'size' => completed_size }
+      end
+
+      # Task queue metrics
+      clone_active = redis.get('vmpooler__tasks__clone').to_i
+      ondemand_active = redis.get('vmpooler__tasks__ondemandclone').to_i
+      odcreate_pending = redis.zcard('vmpooler__odcreate__task')
+
+      metrics['tasks']['clone'] = { 'active' => clone_active }
+      metrics['tasks']['ondemand'] = { 'active' => ondemand_active, 'pending' => odcreate_pending }
+
+      # DLQ metrics
+      if dlq_enabled?
+        dlq_keys = redis.keys('vmpooler__dlq__*')
+        dlq_keys.each do |dlq_key|
+          queue_type = dlq_key.sub('vmpooler__dlq__', '')
+          dlq_size = redis.zcard(dlq_key)
+          total_dlq_size += dlq_size
+          metrics['queues']['dlq'] ||= {}
+          metrics['queues']['dlq'][queue_type] = { 'size' => dlq_size }
+        end
+      end
+
+      # Error metrics
+      metrics['errors']['dlq_total_size'] = total_dlq_size
+      metrics['errors']['stuck_vm_count'] = total_stuck_vms
+
+      # Orphaned metadata count
+      orphaned_count = count_orphaned_metadata(redis)
+      metrics['errors']['orphaned_metadata_count'] = orphaned_count
+
+      metrics
+    end
+
+    def calculate_queue_ages(vms, timestamp_field, redis)
+      ages = []
+      vms.each do |vm|
+        begin
+          timestamp_str = redis.hget("vmpooler__vm__#{vm}", timestamp_field)
+          next unless timestamp_str
+
+          timestamp = Time.parse(timestamp_str)
+          age = (Time.now - timestamp).to_i
+          ages << age
+        rescue StandardError
+          # Skip VMs with invalid timestamps
+        end
+      end
+      ages
+    end
+
+    def count_orphaned_metadata(redis)
+      all_vm_keys = redis.keys('vmpooler__vm__*')
+      orphaned_count = 0
+
+      all_vm_keys.each do |vm_key|
+        begin
+          vm = vm_key.sub('vmpooler__vm__', '')
+          pool_name = redis.hget(vm_key, 'pool')
+          next unless pool_name
+
+          in_any_queue = redis.sismember("vmpooler__pending__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__ready__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__running__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__completed__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__discovered__#{pool_name}", vm) ||
+                         redis.sismember("vmpooler__migrating__#{pool_name}", vm)
+
+          orphaned_count += 1 unless in_any_queue
+        rescue StandardError
+          # Skip on error
+        end
+      end
+
+      orphaned_count
+    end
+
+    def determine_health_status(metrics)
+      thresholds = health_thresholds
+
+      # Check DLQ size
+      dlq_size = metrics['errors']['dlq_total_size']
+      return 'unhealthy' if dlq_size > thresholds['dlq_max_critical']
+
+      # Check stuck VM count
+      stuck_count = metrics['errors']['stuck_vm_count']
+      return 'unhealthy' if stuck_count > thresholds['stuck_vm_max_critical']
+
+      # Check queue sizes
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        pending_size = begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        ready_size = begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+
+        return 'unhealthy' if pending_size > thresholds['pending_queue_max'] * 2
+        return 'unhealthy' if ready_size > thresholds['ready_queue_max'] * 2
+      end
+
+      # Check for degraded conditions
+      return 'degraded' if dlq_size > thresholds['dlq_max_warning']
+      return 'degraded' if stuck_count > thresholds['stuck_vm_max_warning']
+
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        pending_size = begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        ready_size = begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+
+        return 'degraded' if pending_size > thresholds['pending_queue_max']
+        return 'degraded' if ready_size > thresholds['ready_queue_max']
+      end
+
+      'healthy'
+    end
+
+    def log_health_summary(metrics, status)
+      summary = "[*] [health] Status: #{status.upcase}"
+
+      # Queue summary
+      total_pending = 0
+      total_ready = 0
+      total_completed = 0
+
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        total_pending += begin
+          queues['pending']['size']
+        rescue StandardError
+          0
+        end
+        total_ready += begin
+          queues['ready']['size']
+        rescue StandardError
+          0
+        end
+        total_completed += begin
+          queues['completed']['size']
+        rescue StandardError
+          0
+        end
+      end
+
+      summary += " | Queues: P=#{total_pending} R=#{total_ready} C=#{total_completed}"
+      summary += " | DLQ=#{metrics['errors']['dlq_total_size']}"
+      summary += " | Stuck=#{metrics['errors']['stuck_vm_count']}"
+      summary += " | Orphaned=#{metrics['errors']['orphaned_metadata_count']}"
+
+      log_level = status == 'healthy' ? 's' : 'd'
+      $logger.log(log_level, summary)
+    end
+
+    def push_health_metrics(metrics, status)
+      # Push error metrics first
+      $metrics.gauge('vmpooler_health.dlq.total_size', metrics['errors']['dlq_total_size'])
+      $metrics.gauge('vmpooler_health.stuck_vms.count', metrics['errors']['stuck_vm_count'])
+      $metrics.gauge('vmpooler_health.orphaned_metadata.count', metrics['errors']['orphaned_metadata_count'])
+
+      # Push per-pool queue metrics
+      metrics['queues'].each do |pool_name, queues|
+        next if pool_name == 'dlq'
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.size", queues['pending']['size'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.oldest_age", queues['pending']['oldest_age'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.stuck_count", queues['pending']['stuck_count'])
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.size", queues['ready']['size'])
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.oldest_age", queues['ready']['oldest_age'])
+
+        $metrics.gauge("vmpooler_health.queue.#{pool_name}.completed.size", queues['completed']['size'])
+      end
+
+      # Push DLQ metrics
+      metrics['queues']['dlq']&.each do |queue_type, dlq_metrics|
+        $metrics.gauge("vmpooler_health.dlq.#{queue_type}.size", dlq_metrics['size'])
+      end
+
+      # Push task metrics
+      $metrics.gauge('vmpooler_health.tasks.clone.active', metrics['tasks']['clone']['active'])
+      $metrics.gauge('vmpooler_health.tasks.ondemand.active', metrics['tasks']['ondemand']['active'])
+      $metrics.gauge('vmpooler_health.tasks.ondemand.pending', metrics['tasks']['ondemand']['pending'])
+
+      # Push status last (0=healthy, 1=degraded, 2=unhealthy)
+      status_value = { 'healthy' => 0, 'degraded' => 1, 'unhealthy' => 2 }[status] || 2
+      $metrics.gauge('vmpooler_health.status', status_value)
+    end
+
     def create_vm_disk(pool_name, vm, disk_size, provider)
       Thread.new do
        begin
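
check_queue_health writes its snapshot to a single Redis hash (vmpooler__health) with the fields metrics (JSON), status and last_check, and the key expires after an hour. A minimal out-of-process reader, again only a sketch built on the field names set above rather than an API shipped by vmpooler:

    require 'json'
    require 'redis'

    health = Redis.new.hgetall('vmpooler__health')
    if health.empty?
      puts 'no health report yet (health_check_enabled is off, or the key has expired)'
    else
      metrics = JSON.parse(health['metrics'])
      puts "status=#{health['status']} last_check=#{health['last_check']}"
      puts "dlq_total=#{metrics['errors']['dlq_total_size']} stuck_vms=#{metrics['errors']['stuck_vm_count']}"
    end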
@@ -981,7 +1687,12 @@ module Vmpooler
 
       sync_pool_template(pool)
       loop do
+        start_time = Time.now
         result = _check_pool(pool, provider)
+        duration = Time.now - start_time
+
+        $metrics.gauge("vmpooler_performance.check_pool.#{pool['name']}", duration)
+        $logger.log('d', "[!] check_pool for #{pool['name']} took #{duration.round(2)}s") if duration > 5
 
         if result[:cloned_vms] > 0 || result[:checked_pending_vms] > 0 || result[:discovered_vms] > 0
           loop_delay = loop_delay_min
@@ -1540,6 +2251,15 @@ module Vmpooler
         redis.zrem('vmpooler__provisioning__request', request_id)
         return
       end
+
+      # Check if request was already marked as failed (e.g., by delete endpoint)
+      request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
+      if request_status == 'failed'
+        $logger.log('s', "Request '#{request_id}' already marked as failed, skipping VM creation")
+        redis.zrem('vmpooler__provisioning__request', request_id)
+        return
+      end
+
       score = redis.zscore('vmpooler__provisioning__request', request_id)
       requested = requested.split(',')
 
@@ -1763,6 +2483,48 @@ module Vmpooler
         check_ondemand_requests(check_loop_delay_min, check_loop_delay_max, check_loop_delay_decay)
       end
 
+      # Queue purge thread
+      if purge_enabled?
+        purge_interval = ($config[:config] && $config[:config]['purge_interval']) || 3600 # default 1 hour
+        if !$threads['queue_purge']
+          $threads['queue_purge'] = Thread.new do
+            loop do
+              purge_stale_queue_entries
+              sleep(purge_interval)
+            end
+          end
+        elsif !$threads['queue_purge'].alive?
+          $logger.log('d', '[!] [queue_purge] worker thread died, restarting')
+          $threads['queue_purge'] = Thread.new do
+            loop do
+              purge_stale_queue_entries
+              sleep(purge_interval)
+            end
+          end
+        end
+      end
+
+      # Health check thread
+      if health_check_enabled?
+        health_interval = ($config[:config] && $config[:config]['health_check_interval']) || 300 # default 5 minutes
+        if !$threads['health_check']
+          $threads['health_check'] = Thread.new do
+            loop do
+              check_queue_health
+              sleep(health_interval)
+            end
+          end
+        elsif !$threads['health_check'].alive?
+          $logger.log('d', '[!] [health_check] worker thread died, restarting')
+          $threads['health_check'] = Thread.new do
+            loop do
+              check_queue_health
+              sleep(health_interval)
+            end
+          end
+        end
+      end
+
       sleep(loop_delay)
 
       unless maxloop == 0