vmpooler 3.6.0 → 3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,8 @@ module Vmpooler
145
145
  "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}"
146
146
  elsif timing_out_soon
147
147
  time_remaining = timeout - timeout_notification
148
- "[!] [#{pool}] '#{vm}' will be marked as 'failed' in #{time_remaining} minutes with error: #{open_socket_error}"
148
+ open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
149
+ "[!] [#{pool}] '#{vm}' impending failure in #{time_remaining} minutes with error: #{open_socket_error}"
149
150
  else
150
151
  "[!] [#{pool}] '#{vm}' This error is wholly unexpected"
151
152
  end
@@ -160,16 +161,80 @@ module Vmpooler
160
161
  request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
161
162
  pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
162
163
  open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
164
+ retry_count = redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count').to_i if request_id
165
+
166
+ # Move to DLQ before moving to completed queue
167
+ move_to_dlq(vm, pool, 'pending', 'Timeout',
168
+ open_socket_error || 'VM timed out during pending phase',
169
+ redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
170
+
171
+ clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
172
+ clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
163
173
  redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
174
+
164
175
  if request_id
165
176
  ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
166
177
  if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
167
- # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
168
- redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
178
+ # Check retry count and max retry limit before retrying
179
+ retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
180
+ max_retries = $config[:config]['max_vm_retries'] || 3
181
+
182
+ $logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
183
+
184
+ # Determine if error is likely permanent (configuration issues)
185
+ permanent_error = permanent_error?(clone_error, clone_error_class)
186
+ $logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")
187
+
188
+ if retry_count < max_retries && !permanent_error
189
+ # Increment retry count and retry VM creation
190
+ redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
191
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
192
+ $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
193
+ else
194
+ # Max retries exceeded or permanent error, mark request as permanently failed
195
+ failure_reason = if permanent_error
196
+ "Configuration error: #{clone_error}"
197
+ else
198
+ 'Max retry attempts exceeded'
199
+ end
200
+ redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
201
+ redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
202
+ $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
203
+ $metrics.increment("vmpooler_errors.permanently_failed.#{pool}")
204
+ end
169
205
  end
170
206
  end
171
- $metrics.increment("errors.markedasfailed.#{pool}")
172
- open_socket_error
207
+ $metrics.increment("vmpooler_errors.markedasfailed.#{pool}")
208
+ open_socket_error || clone_error
209
+ end
210
+
211
# Decide whether a failure looks permanent (a configuration problem that a
# retry cannot fix) rather than transient.
#
# A failure is permanent when its message matches one of the known
# configuration-error patterns OR its class name is one of the listed classes.
# Each signal is evaluated on its own, so a missing message does not hide a
# permanent error class and a missing class does not hide a matching message.
#
# @param error_message [String, nil] text of the error
# @param error_class [String, nil] class name of the error
# @return [Boolean] true when the failure should not be retried
def permanent_error?(error_message, error_class)
  permanent_error_patterns = [
    /template.*not found/i,
    /template.*does not exist/i,
    /invalid.*path/i,
    /folder.*not found/i,
    /datastore.*not found/i,
    /resource pool.*not found/i,
    /permission.*denied/i,
    /authentication.*failed/i,
    /invalid.*credentials/i,
    /configuration.*error/i
  ].freeze

  permanent_error_classes = %w[ArgumentError NoMethodError NameError].freeze

  # nil.to_s is "", which matches no pattern and no class name.
  message = error_message.to_s
  return true if permanent_error_patterns.any? { |pattern| message.match?(pattern) }

  permanent_error_classes.include?(error_class.to_s)
end
174
239
 
175
240
  def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
@@ -222,8 +287,16 @@ module Vmpooler
222
287
  return true if provider.vm_ready?(pool_name, vm_name, redis)
223
288
 
224
289
  raise("VM #{vm_name} is not ready")
225
- rescue StandardError
290
+ rescue StandardError => e
226
291
  open_socket_error = redis.hget("vmpooler__vm__#{vm_name}", 'open_socket_error')
292
+ request_id = redis.hget("vmpooler__vm__#{vm_name}", 'request_id')
293
+ pool_alias = redis.hget("vmpooler__vm__#{vm_name}", 'pool_alias')
294
+
295
+ # Move to DLQ before moving to completed queue
296
+ move_to_dlq(vm_name, pool_name, 'ready', e.class.name,
297
+ open_socket_error || 'VM became unreachable in ready queue',
298
+ redis, request_id: request_id, pool_alias: pool_alias)
299
+
227
300
  move_vm_queue(pool_name, vm_name, 'ready', 'completed', redis, "removed from 'ready' queue. vm unreachable with error: #{open_socket_error}")
228
301
  end
229
302
 
@@ -356,6 +429,60 @@ module Vmpooler
356
429
  $logger.log('d', "[!] [#{pool}] '#{vm}' #{msg}") if msg
357
430
  end
358
431
 
432
+ # Dead-Letter Queue (DLQ) helper methods
433
# Whether the dead-letter queue is switched on.
#
# @return [Boolean, nil] true only when $config[:config]['dlq_enabled'] is
#   exactly `true`; nil when the :config section is absent
def dlq_enabled?
  settings = $config[:config]
  settings && settings['dlq_enabled'] == true
end
436
+
437
# Retention period for DLQ keys, in hours.
#
# @return [Object] $config[:config]['dlq_ttl'] when set, otherwise 168
#   (7 days expressed in hours)
def dlq_ttl
  configured = $config[:config] && $config[:config]['dlq_ttl']
  configured || 168
end
440
+
441
# Upper bound on the number of members kept in a single DLQ sorted set.
#
# @return [Object] $config[:config]['dlq_max_entries'] when set, otherwise 10_000
def dlq_max_entries
  configured = $config[:config] && $config[:config]['dlq_max_entries']
  configured || 10_000
end
444
+
445
# Record a failed VM in the dead-letter queue (DLQ) of the queue it failed in.
#
# The entry is stored in the sorted set "vmpooler__dlq__<queue_type>" with the
# failure time (epoch seconds) as score and "<vm>:<epoch>:<json>" as member.
# Does nothing when the DLQ is disabled. Any StandardError raised while writing
# is logged and swallowed, so a DLQ problem never interrupts the caller.
#
# @param vm [String] VM name
# @param pool [String] pool the VM belongs to
# @param queue_type [String] queue the VM failed in; also the DLQ key suffix
# @param error_class [#to_s] class/category of the failure
# @param error_message [#to_s] human-readable failure description
# @param redis [Object] Redis connection (zadd, zremrangebyrank, expire)
# @param request_id [String, nil] on-demand request id; omitted from the entry when nil
# @param pool_alias [String, nil] pool alias; omitted from the entry when nil
# @param retry_count [Integer, nil] retries so far; omitted from the entry when nil
# @param skip_metrics [Boolean] when true the DLQ counter is not incremented
def move_to_dlq(vm, pool, queue_type, error_class, error_message, redis, request_id: nil, pool_alias: nil, retry_count: 0, skip_metrics: false)
  return unless dlq_enabled?

  dlq_key = "vmpooler__dlq__#{queue_type}"
  # Read the clock once so the score, the member prefix and 'failed_at'
  # all describe the same instant.
  failed_at = Time.now
  timestamp = failed_at.to_i

  dlq_entry = {
    'vm' => vm,
    'pool' => pool,
    'queue_from' => queue_type,
    'error_class' => error_class.to_s,
    'error_message' => error_message.to_s,
    'failed_at' => failed_at.iso8601,
    'retry_count' => retry_count,
    'request_id' => request_id,
    'pool_alias' => pool_alias
  }.compact

  # Timestamp as score keeps members ordered oldest-first.
  redis.zadd(dlq_key, timestamp, "#{vm}:#{timestamp}:#{dlq_entry.to_json}")

  # Keep only the newest dlq_max_entries members. A negative stop rank makes
  # this a single idempotent command, so concurrent writers cannot over-trim
  # the way a separate ZCARD followed by a computed removal count could.
  removed = redis.zremrangebyrank(dlq_key, 0, -(dlq_max_entries + 1))
  $logger.log('d', "[!] [dlq] Trimmed #{removed} oldest entries from #{dlq_key}") if removed.to_i > 0

  # The expiry covers the whole DLQ key and is refreshed on every write.
  redis.expire(dlq_key, dlq_ttl * 3600)

  $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics
  $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}")
rescue StandardError => e
  $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}")
end
485
+
359
486
  # Clone a VM
360
487
  def clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = nil)
361
488
  Thread.new do
@@ -365,7 +492,13 @@ module Vmpooler
365
492
  if request_id
366
493
  $logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
367
494
  @redis.with_metrics do |redis|
368
- redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
495
+ # Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
496
+ request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
497
+ if request_status != 'failed'
498
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
499
+ else
500
+ $logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
501
+ end
369
502
  end
370
503
  else
371
504
  $logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
@@ -418,10 +551,10 @@ module Vmpooler
418
551
  hostname_retries += 1
419
552
 
420
553
  if !hostname_available
421
- $metrics.increment("errors.duplicatehostname.#{pool_name}")
554
+ $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}")
422
555
  $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})")
423
556
  elsif !dns_available
424
- $metrics.increment("errors.staledns.#{pool_name}")
557
+ $metrics.increment("vmpooler_errors.staledns.#{pool_name}")
425
558
  $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS")
426
559
  end
427
560
  end
@@ -467,7 +600,7 @@ module Vmpooler
467
600
  provider.create_vm(pool_name, new_vmname)
468
601
  finish = format('%<time>.2f', time: Time.now - start)
469
602
  $logger.log('s', "[+] [#{pool_name}] '#{new_vmname}' cloned in #{finish} seconds")
470
- $metrics.timing("clone.#{pool_name}", finish)
603
+ $metrics.gauge("vmpooler_clone.#{pool_name}", finish)
471
604
 
472
605
  $logger.log('d', "[ ] [#{pool_name}] Obtaining IP for '#{new_vmname}'")
473
606
  ip_start = Time.now
@@ -488,14 +621,50 @@ module Vmpooler
488
621
 
489
622
  dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
490
623
  dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
491
- rescue StandardError
624
+ rescue StandardError => e
625
+ # Store error details for retry decision making
492
626
  @redis.with_metrics do |redis|
627
+ # Get retry count before moving to DLQ
628
+ retry_count = 0
629
+ if request_id
630
+ ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
631
+ retry_count = ondemandrequest_hash['retry_count'].to_i if ondemandrequest_hash
632
+ end
633
+
634
+ # Move to DLQ before removing from pending queue
635
+ move_to_dlq(new_vmname, pool_name, 'clone', e.class.name, e.message,
636
+ redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
637
+
493
638
  redis.pipelined do |pipeline|
494
639
  pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
640
+ pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
641
+ pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
495
642
  expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
496
643
  pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
497
644
  end
645
+
646
+ # Handle retry logic for on-demand requests
647
+ if request_id
648
+ retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
649
+ max_retries = $config[:config]['max_vm_retries'] || 3
650
+ is_permanent = permanent_error?(e.message, e.class.name)
651
+
652
+ $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
653
+
654
+ if is_permanent || retry_count >= max_retries
655
+ reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
656
+ $logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
657
+ redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
658
+ redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
659
+ else
660
+ # Increment retry count and re-queue for retry
661
+ redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
662
+ $logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
663
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
664
+ end
665
+ end
498
666
  end
667
+ $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
499
668
  raise
500
669
  ensure
501
670
  @redis.with_metrics do |redis|
@@ -545,7 +714,7 @@ module Vmpooler
545
714
 
546
715
  finish = format('%<time>.2f', time: Time.now - start)
547
716
  $logger.log('s', "[-] [#{pool}] '#{vm}' destroyed in #{finish} seconds")
548
- $metrics.timing("destroy.#{pool}", finish)
717
+ $metrics.gauge("vmpooler_destroy.#{pool}", finish)
549
718
  end
550
719
  end
551
720
  dereference_mutex(vm)
@@ -581,6 +750,543 @@ module Vmpooler
581
750
  provider.purge_unconfigured_resources(allowlist)
582
751
  end
583
752
 
753
+ # Auto-purge stale queue entries
754
# Whether automatic purging of stale queue entries is switched on.
#
# @return [Boolean, nil] true only when $config[:config]['purge_enabled'] is
#   exactly `true`; nil when the :config section is absent
def purge_enabled?
  settings = $config[:config]
  settings && settings['purge_enabled'] == true
end
757
+
758
# Whether the purge should only report what it would remove.
#
# @return [Boolean, nil] true only when $config[:config]['purge_dry_run'] is
#   exactly `true`; nil when the :config section is absent
def purge_dry_run?
  settings = $config[:config]
  settings && settings['purge_dry_run'] == true
end
761
+
762
# Age in seconds after which a pending VM is considered stale.
#
# @return [Object] $config[:config]['max_pending_age'] when set, otherwise
#   7200 (2 hours)
def max_pending_age
  configured = $config[:config] && $config[:config]['max_pending_age']
  configured || 7200
end
765
+
766
# Age in seconds after which a ready VM is considered stale.
#
# @return [Object] $config[:config]['max_ready_age'] when set, otherwise
#   86_400 (24 hours)
def max_ready_age
  configured = $config[:config] && $config[:config]['max_ready_age']
  configured || 86_400
end
769
+
770
# Age in seconds after which a completed-queue entry is considered stale.
#
# @return [Object] $config[:config]['max_completed_age'] when set, otherwise
#   3600 (1 hour)
def max_completed_age
  configured = $config[:config] && $config[:config]['max_completed_age']
  configured || 3600
end
773
+
774
# Age in seconds after which VM metadata that is in no queue is considered stale.
#
# @return [Object] $config[:config]['max_orphaned_age'] when set, otherwise
#   86_400 (24 hours)
def max_orphaned_age
  configured = $config[:config] && $config[:config]['max_orphaned_age']
  configured || 86_400
end
777
+
778
# Run one purge cycle on a new thread.
#
# For every configured pool the pending, ready and completed queues are purged
# (in that order), then orphaned VM metadata is purged. The cycle duration and
# the summed counts are logged and pushed as gauges. Any StandardError raised
# inside the cycle is logged rather than propagated out of the thread.
#
# @return [Thread, nil] the worker thread, or nil when purging is disabled
def purge_stale_queue_entries
  return unless purge_enabled?

  Thread.new do
    begin
      $logger.log('d', '[*] [purge] Starting stale queue entry purge cycle')
      cycle_started = Time.now

      @redis.with_metrics do |redis|
        # Operands are evaluated left to right: pending, ready, completed.
        pool_total = $config[:pools].sum do |pool|
          name = pool['name']
          purge_pending_queue(name, redis) + purge_ready_queue(name, redis) + purge_completed_queue(name, redis)
        end
        total_purged = pool_total + purge_orphaned_metadata(redis)

        elapsed = Time.now - cycle_started
        $logger.log('s', "[*] [purge] Completed purge cycle in #{elapsed.round(2)}s: #{total_purged} entries purged")
        $metrics.gauge('vmpooler_purge.cycle.duration', elapsed)
        $metrics.gauge('vmpooler_purge.total.count', total_purged)
      end
    rescue StandardError => e
      $logger.log('s', "[!] [purge] Failed during purge cycle: #{e}")
    end
  end
end
820
+
821
# Purge VMs that have been in a pool's pending queue for longer than
# max_pending_age, judged by the 'clone' timestamp in their metadata hash.
#
# A stale VM is recorded in the DLQ (without the DLQ metric), removed from the
# pending set, and its metadata is given an expiry when redis data_ttl is
# configured. In dry-run mode the VM is only logged. VMs without a 'clone'
# timestamp are skipped. An error on one VM is logged and does not stop the loop.
#
# @param pool_name [String] pool whose pending queue is inspected
# @param redis [Object] Redis connection
# @return [Integer] VMs purged (or, in dry-run mode, that would be purged)
def purge_pending_queue(pool_name, redis)
  queue_key = "vmpooler__pending__#{pool_name}"
  purged_count = 0

  redis.smembers(queue_key).each do |vm|
    begin
      vm_key = "vmpooler__vm__#{vm}"
      clone_time_str = redis.hget(vm_key, 'clone')
      next unless clone_time_str

      age = Time.now - Time.parse(clone_time_str)
      next unless age > max_pending_age

      if purge_dry_run?
        $logger.log('d', "[*] [purge][dry-run] Would purge stale pending VM '#{vm}' (age: #{age.round(0)}s, max: #{max_pending_age}s)")
      else
        request_id = redis.hget(vm_key, 'request_id')
        pool_alias = redis.hget(vm_key, 'pool_alias')

        # Move to DLQ before removing (skip DLQ metric since the purge metric tracks it)
        move_to_dlq(vm, pool_name, 'pending', 'Purge',
                    "Stale pending VM (age: #{age.round(0)}s > max: #{max_pending_age}s)",
                    redis, request_id: request_id, pool_alias: pool_alias, skip_metrics: true)

        redis.srem(queue_key, vm)

        # Set expiration on VM metadata if data_ttl is configured
        if $config[:redis] && $config[:redis]['data_ttl']
          expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
          redis.expire(vm_key, expiration_ttl)
        end

        $logger.log('d', "[!] [purge] Purged stale pending VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
        $metrics.increment("vmpooler_purge.pending.#{pool_name}.count")
      end

      # Count only after the purge succeeded, matching the ready and completed
      # purgers; a VM whose removal raised is not reported as purged.
      purged_count += 1
    rescue StandardError => e
      $logger.log('d', "[!] [purge] Error checking pending VM '#{vm}': #{e}")
    end
  end

  purged_count
end
867
+
868
# Move VMs that have sat in a pool's ready queue longer than max_ready_age
# (judged by the 'ready' timestamp in their metadata) to the completed queue.
#
# In dry-run mode the VM is only logged. VMs with no 'ready' timestamp are
# skipped. An error on one VM is logged and does not stop the loop.
#
# @param pool_name [String] pool whose ready queue is inspected
# @param redis [Object] Redis connection
# @return [Integer] VMs moved (or, in dry-run mode, that would be moved)
def purge_ready_queue(pool_name, redis)
  ready_key = "vmpooler__ready__#{pool_name}"
  stale_total = 0

  redis.smembers(ready_key).each do |vm|
    begin
      became_ready = redis.hget("vmpooler__vm__#{vm}", 'ready')
      next unless became_ready

      age = Time.now - Time.parse(became_ready)
      next unless age > max_ready_age

      if purge_dry_run?
        $logger.log('d', "[*] [purge][dry-run] Would purge stale ready VM '#{vm}' (age: #{age.round(0)}s, max: #{max_ready_age}s)")
      else
        redis.smove(ready_key, "vmpooler__completed__#{pool_name}", vm)
        $logger.log('d', "[!] [purge] Moved stale ready VM '#{vm}' from '#{pool_name}' to completed (age: #{age.round(0)}s)")
        $metrics.increment("vmpooler_purge.ready.#{pool_name}.count")
      end
      stale_total += 1
    rescue StandardError => e
      $logger.log('d', "[!] [purge] Error checking ready VM '#{vm}': #{e}")
    end
  end

  stale_total
end
898
+
899
# Remove entries that have lingered in a pool's completed queue for longer
# than max_completed_age.
#
# Age is measured from the VM's 'destroy' timestamp when present, otherwise
# from its 'checkout' timestamp; VMs with neither are skipped. In dry-run mode
# the VM is only logged. An error on one VM is logged and does not stop the loop.
#
# @param pool_name [String] pool whose completed queue is inspected
# @param redis [Object] Redis connection
# @return [Integer] entries removed (or, in dry-run mode, that would be removed)
def purge_completed_queue(pool_name, redis)
  completed_key = "vmpooler__completed__#{pool_name}"
  stale_total = 0

  redis.smembers(completed_key).each do |vm|
    begin
      destroyed_at = redis.hget("vmpooler__vm__#{vm}", 'destroy')
      checked_out_at = redis.hget("vmpooler__vm__#{vm}", 'checkout')

      # 'destroy' wins whenever it is set; 'checkout' is only the fallback.
      reference = destroyed_at || checked_out_at
      next unless reference

      age = Time.now - Time.parse(reference)
      next unless age > max_completed_age

      if purge_dry_run?
        $logger.log('d', "[*] [purge][dry-run] Would purge stale completed VM '#{vm}' (age: #{age.round(0)}s, max: #{max_completed_age}s)")
      else
        redis.srem(completed_key, vm)
        $logger.log('d', "[!] [purge] Removed stale completed VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
        $metrics.increment("vmpooler_purge.completed.#{pool_name}.count")
      end
      stale_total += 1
    rescue StandardError => e
      $logger.log('d', "[!] [purge] Error checking completed VM '#{vm}': #{e}")
    end
  end

  stale_total
end
934
+
935
# Schedule expiry of VM metadata hashes ("vmpooler__vm__*") that belong to no
# queue of their pool and whose 'clone' timestamp is older than
# max_orphaned_age.
#
# Hashes without a 'pool' or 'clone' field are skipped. In dry-run mode the
# orphan is only logged. An orphan whose key already expires within the next
# hour is left alone and not counted again. An error on one key is logged and
# does not stop the loop.
#
# @param redis [Object] Redis connection
# @return [Integer] orphans newly scheduled for expiry (or, in dry-run mode,
#   orphans that would be)
def purge_orphaned_metadata(redis)
  queue_names = %w[pending ready running completed discovered migrating]
  expiration_ttl = 3600 # 1 hour
  purged_count = 0

  # SCAN walks the keyspace incrementally instead of blocking the server the
  # way KEYS does. It may return a key more than once, hence uniq.
  redis.scan_each(match: 'vmpooler__vm__*').to_a.uniq.each do |vm_key|
    begin
      vm = vm_key.sub('vmpooler__vm__', '')

      pool_name = redis.hget(vm_key, 'pool')
      next unless pool_name

      # Orphaned means: not a member of any of the pool's queues.
      next if queue_names.any? { |queue| redis.sismember("vmpooler__#{queue}__#{pool_name}", vm) }

      clone_time_str = redis.hget(vm_key, 'clone')
      next unless clone_time_str

      age = Time.now - Time.parse(clone_time_str)
      next unless age > max_orphaned_age

      if purge_dry_run?
        $logger.log('d', "[*] [purge][dry-run] Would purge orphaned metadata for '#{vm}' (age: #{age.round(0)}s, max: #{max_orphaned_age}s)")
      else
        # Re-applying the expiry every cycle would keep pushing the deadline
        # out, so with a purge interval under an hour the key would never
        # expire. Only (re)schedule when there is no deadline or a later one.
        remaining = redis.ttl(vm_key)
        next if remaining.between?(0, expiration_ttl)

        redis.expire(vm_key, expiration_ttl)
        $logger.log('d', "[!] [purge] Set expiration on orphaned metadata for '#{vm}' (age: #{age.round(0)}s)")
        $metrics.increment('vmpooler_purge.orphaned.count')
      end
      purged_count += 1
    rescue StandardError => e
      $logger.log('d', "[!] [purge] Error checking orphaned metadata '#{vm_key}': #{e}")
    end
  end

  purged_count
end
983
+
984
+ # Health checks for Redis queues
985
# Whether the periodic queue health check is switched on.
#
# @return [Boolean, nil] true only when $config[:config]['health_check_enabled']
#   is exactly `true`; nil when the :config section is absent
def health_check_enabled?
  settings = $config[:config]
  settings && settings['health_check_enabled'] == true
end
988
+
989
# Thresholds used to grade queue health.
#
# Built-in defaults are overlaid with whatever is configured under
# $config[:config]['health_thresholds']; configured keys win.
#
# @return [Hash{String => Object}] threshold name => limit
def health_thresholds
  overrides = $config[:config] && $config[:config]['health_thresholds']

  builtin = {
    'pending_queue_max' => 100,
    'ready_queue_max' => 500,
    'dlq_max_warning' => 100,
    'dlq_max_critical' => 1000,
    'stuck_vm_age_threshold' => 7200, # 2 hours
    'stuck_vm_max_warning' => 10,
    'stuck_vm_max_critical' => 50
  }

  overrides ? builtin.merge(overrides) : builtin
end
1006
+
1007
# Run one queue health check on a new thread.
#
# Health metrics are calculated and graded, stored in the 'vmpooler__health'
# hash (fields 'metrics' as JSON, 'status', 'last_check') with a one-hour
# expiry, then logged and pushed as metrics together with the check duration.
# Any StandardError raised inside the check is logged rather than propagated
# out of the thread.
#
# @return [Thread, nil] the worker thread, or nil when health checks are disabled
def check_queue_health
  return unless health_check_enabled?

  Thread.new do
    begin
      $logger.log('d', '[*] [health] Running queue health check')
      check_started = Time.now

      @redis.with_metrics do |redis|
        snapshot = calculate_health_metrics(redis)
        verdict = determine_health_status(snapshot)

        # The nested metrics hash is flattened to JSON for storage.
        require 'json'
        health_key = 'vmpooler__health'
        redis.hset(health_key, 'metrics', snapshot.to_json)
        redis.hset(health_key, 'status', verdict)
        redis.hset(health_key, 'last_check', Time.now.iso8601)
        redis.expire(health_key, 3600) # Expire after 1 hour

        log_health_summary(snapshot, verdict)
        push_health_metrics(snapshot, verdict)

        $metrics.gauge('vmpooler_health.check.duration', Time.now - check_started)
      end
    rescue StandardError => e
      $logger.log('s', "[!] [health] Failed during health check: #{e}")
    end
  end
end
1041
+
1042
# Collect a snapshot of queue, task and error metrics from Redis.
#
# Result layout:
#   'queues' => { <pool> => { 'pending' => {size, oldest_age, avg_age, stuck_count},
#                             'ready' => {size, oldest_age, avg_age},
#                             'completed' => {size} },
#                 'dlq' => { <queue_type> => {size} } }   # only when a DLQ key exists
#   'tasks'  => { 'clone' => {active}, 'ondemand' => {active, pending} }
#   'errors' => { dlq_total_size, stuck_vm_count, orphaned_metadata_count }
#
# A pending VM counts as stuck when its age exceeds the
# 'stuck_vm_age_threshold' health threshold.
#
# @param redis [Object] Redis connection
# @return [Hash] the metrics snapshot described above
def calculate_health_metrics(redis)
  metrics = { 'queues' => {}, 'tasks' => {}, 'errors' => {} }
  stuck_age = health_thresholds['stuck_vm_age_threshold']
  total_stuck_vms = 0
  total_dlq_size = 0

  $config[:pools].each do |pool|
    pool_name = pool['name']

    pending_vms = redis.smembers("vmpooler__pending__#{pool_name}")
    pending_ages = calculate_queue_ages(pending_vms, 'clone', redis)
    stuck_pending = pending_ages.count { |age| age > stuck_age }
    total_stuck_vms += stuck_pending

    ready_vms = redis.smembers("vmpooler__ready__#{pool_name}")
    ready_ages = calculate_queue_ages(ready_vms, 'ready', redis)

    metrics['queues'][pool_name] = {
      'pending' => {
        'size' => pending_vms.size,
        'oldest_age' => pending_ages.max || 0,
        'avg_age' => pending_ages.empty? ? 0 : (pending_ages.sum / pending_ages.size).round(0),
        'stuck_count' => stuck_pending
      },
      'ready' => {
        'size' => ready_vms.size,
        'oldest_age' => ready_ages.max || 0,
        'avg_age' => ready_ages.empty? ? 0 : (ready_ages.sum / ready_ages.size).round(0)
      },
      'completed' => { 'size' => redis.scard("vmpooler__completed__#{pool_name}") }
    }
  end

  # Task counters; a missing counter key reads as 0.
  metrics['tasks']['clone'] = { 'active' => redis.get('vmpooler__tasks__clone').to_i }
  metrics['tasks']['ondemand'] = {
    'active' => redis.get('vmpooler__tasks__ondemandclone').to_i,
    'pending' => redis.zcard('vmpooler__odcreate__task')
  }

  if dlq_enabled?
    # SCAN instead of KEYS so the lookup does not block the Redis server;
    # SCAN may return a key more than once, hence uniq.
    redis.scan_each(match: 'vmpooler__dlq__*').to_a.uniq.each do |dlq_key|
      dlq_size = redis.zcard(dlq_key)
      total_dlq_size += dlq_size
      metrics['queues']['dlq'] ||= {}
      metrics['queues']['dlq'][dlq_key.sub('vmpooler__dlq__', '')] = { 'size' => dlq_size }
    end
  end

  metrics['errors']['dlq_total_size'] = total_dlq_size
  metrics['errors']['stuck_vm_count'] = total_stuck_vms
  metrics['errors']['orphaned_metadata_count'] = count_orphaned_metadata(redis)

  metrics
end
1119
+
1120
# Compute how long ago, in whole seconds, each VM's given timestamp field was set.
#
# VMs whose metadata hash lacks the field, or whose value cannot be parsed
# (or otherwise raises a StandardError), contribute nothing to the result.
#
# @param vms [Array<String>] VM names
# @param timestamp_field [String] metadata hash field holding the timestamp
# @param redis [Object] Redis connection
# @return [Array<Integer>] ages in seconds, in the order of the usable VMs
def calculate_queue_ages(vms, timestamp_field, redis)
  vms.each_with_object([]) do |vm, ages|
    begin
      recorded = redis.hget("vmpooler__vm__#{vm}", timestamp_field)
      ages << (Time.now - Time.parse(recorded)).to_i if recorded
    rescue StandardError
      # Skip VMs with invalid timestamps
    end
  end
end
1136
+
1137
# Count VM metadata hashes ("vmpooler__vm__*") that name a pool but are not a
# member of any of that pool's queues (pending, ready, running, completed,
# discovered, migrating).
#
# Hashes without a 'pool' field are not counted; a key whose lookup raises a
# StandardError is skipped.
#
# @param redis [Object] Redis connection
# @return [Integer] number of orphaned metadata hashes
def count_orphaned_metadata(redis)
  queue_names = %w[pending ready running completed discovered migrating]

  # SCAN instead of KEYS so the walk does not block the Redis server; SCAN may
  # return a key more than once, hence uniq.
  redis.scan_each(match: 'vmpooler__vm__*').to_a.uniq.count do |vm_key|
    begin
      vm = vm_key.sub('vmpooler__vm__', '')
      pool_name = redis.hget(vm_key, 'pool')

      pool_name && queue_names.none? { |queue| redis.sismember("vmpooler__#{queue}__#{pool_name}", vm) }
    rescue StandardError
      # Skip on error
      false
    end
  end
end
1162
+
1163
# Grade a metrics snapshot as 'healthy', 'degraded' or 'unhealthy'.
#
# 'unhealthy' when the total DLQ size exceeds 'dlq_max_critical', the stuck VM
# count exceeds 'stuck_vm_max_critical', or any pool's pending/ready queue is
# more than twice its configured maximum. Otherwise 'degraded' when the DLQ
# size exceeds 'dlq_max_warning', the stuck count exceeds
# 'stuck_vm_max_warning', or any pool's pending/ready queue exceeds its
# maximum. Otherwise 'healthy'. The 'dlq' entry under 'queues' is not a pool
# and is ignored; a pool with a missing queue or size counts as size 0.
#
# @param metrics [Hash] snapshot with 'errors' and 'queues' sections
# @return [String] 'healthy', 'degraded' or 'unhealthy'
def determine_health_status(metrics)
  thresholds = health_thresholds
  dlq_size = metrics['errors']['dlq_total_size']
  stuck_count = metrics['errors']['stuck_vm_count']

  # One pass over the pools yields [pending, ready] sizes used by both grades.
  pool_sizes = metrics['queues'].reject { |pool_name, _| pool_name == 'dlq' }.values.map do |queues|
    [queues&.dig('pending', 'size').to_i, queues&.dig('ready', 'size').to_i]
  end
  any_pool_over = lambda do |factor|
    pool_sizes.any? do |pending_size, ready_size|
      pending_size > thresholds['pending_queue_max'] * factor ||
        ready_size > thresholds['ready_queue_max'] * factor
    end
  end

  # Critical conditions win over warnings, regardless of which pool trips them.
  return 'unhealthy' if dlq_size > thresholds['dlq_max_critical']
  return 'unhealthy' if stuck_count > thresholds['stuck_vm_max_critical']
  return 'unhealthy' if any_pool_over.call(2)

  return 'degraded' if dlq_size > thresholds['dlq_max_warning']
  return 'degraded' if stuck_count > thresholds['stuck_vm_max_warning']
  return 'degraded' if any_pool_over.call(1)

  'healthy'
end
1217
+
1218
# Log a one-line summary of a health snapshot.
#
# The line carries the upper-cased status, the pending/ready/completed sizes
# summed over all pools (the 'dlq' entry is not a pool and is skipped), the
# total DLQ size, the stuck VM count and the orphaned metadata count. A pool
# whose queue section is missing contributes 0 to that total.
#
# @param metrics [Hash] snapshot with 'queues' and 'errors' sections
# @param status [String] health status
def log_health_summary(metrics, status)
  pools = metrics['queues'].reject { |pool_name, _| pool_name == 'dlq' }.values

  size_or_zero = lambda do |queues, queue|
    begin
      queues[queue]['size']
    rescue StandardError
      0
    end
  end

  total_pending = 0
  total_ready = 0
  total_completed = 0
  pools.each do |queues|
    total_pending += size_or_zero.call(queues, 'pending')
    total_ready += size_or_zero.call(queues, 'ready')
    total_completed += size_or_zero.call(queues, 'completed')
  end

  errors = metrics['errors']
  summary = [
    "[*] [health] Status: #{status.upcase}",
    "Queues: P=#{total_pending} R=#{total_ready} C=#{total_completed}",
    "DLQ=#{errors['dlq_total_size']}",
    "Stuck=#{errors['stuck_vm_count']}",
    "Orphaned=#{errors['orphaned_metadata_count']}"
  ].join(' | ')

  # NOTE(review): non-healthy states log at 'd' and healthy at 's' — this
  # looks inverted; confirm against how the logger treats levels.
  log_level = status == 'healthy' ? 's' : 'd'
  $logger.log(log_level, summary)
end
1254
+
1255
# Publish a health snapshot as gauges.
#
# Emission order: error totals, per-pool queue gauges (the 'dlq' entry is not a
# pool and is skipped), per-DLQ sizes when a 'dlq' section exists, task
# gauges, and finally the status as a number (0=healthy, 1=degraded,
# 2=unhealthy; any other status is reported as 2).
#
# @param metrics [Hash] snapshot with 'errors', 'queues' and 'tasks' sections
# @param status [String] health status
def push_health_metrics(metrics, status)
  errors = metrics['errors']
  $metrics.gauge('vmpooler_health.dlq.total_size', errors['dlq_total_size'])
  $metrics.gauge('vmpooler_health.stuck_vms.count', errors['stuck_vm_count'])
  $metrics.gauge('vmpooler_health.orphaned_metadata.count', errors['orphaned_metadata_count'])

  # [queue, field] pairs in the order their gauges are emitted for each pool.
  pool_gauges = [
    %w[pending size], %w[pending oldest_age], %w[pending stuck_count],
    %w[ready size], %w[ready oldest_age],
    %w[completed size]
  ]
  metrics['queues'].each do |pool_name, queues|
    next if pool_name == 'dlq'

    pool_gauges.each do |queue, field|
      $metrics.gauge("vmpooler_health.queue.#{pool_name}.#{queue}.#{field}", queues[queue][field])
    end
  end

  dlq_section = metrics['queues']['dlq']
  if dlq_section
    dlq_section.each do |queue_type, dlq_metrics|
      $metrics.gauge("vmpooler_health.dlq.#{queue_type}.size", dlq_metrics['size'])
    end
  end

  tasks = metrics['tasks']
  $metrics.gauge('vmpooler_health.tasks.clone.active', tasks['clone']['active'])
  $metrics.gauge('vmpooler_health.tasks.ondemand.active', tasks['ondemand']['active'])
  $metrics.gauge('vmpooler_health.tasks.ondemand.pending', tasks['ondemand']['pending'])

  # Status goes last (0=healthy, 1=degraded, 2=unhealthy)
  status_value = { 'healthy' => 0, 'degraded' => 1, 'unhealthy' => 2 }.fetch(status, 2)
  $metrics.gauge('vmpooler_health.status', status_value)
end
1289
+
584
1290
  def create_vm_disk(pool_name, vm, disk_size, provider)
585
1291
  Thread.new do
586
1292
  begin
@@ -981,7 +1687,12 @@ module Vmpooler
981
1687
 
982
1688
  sync_pool_template(pool)
983
1689
  loop do
1690
+ start_time = Time.now
984
1691
  result = _check_pool(pool, provider)
1692
+ duration = Time.now - start_time
1693
+
1694
+ $metrics.gauge("vmpooler_performance.check_pool.#{pool['name']}", duration)
1695
+ $logger.log('d', "[!] check_pool for #{pool['name']} took #{duration.round(2)}s") if duration > 5
985
1696
 
986
1697
  if result[:cloned_vms] > 0 || result[:checked_pending_vms] > 0 || result[:discovered_vms] > 0
987
1698
  loop_delay = loop_delay_min
@@ -1540,6 +2251,15 @@ module Vmpooler
1540
2251
  redis.zrem('vmpooler__provisioning__request', request_id)
1541
2252
  return
1542
2253
  end
2254
+
2255
+ # Check if request was already marked as failed (e.g., by delete endpoint)
2256
+ request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
2257
+ if request_status == 'failed'
2258
+ $logger.log('s', "Request '#{request_id}' already marked as failed, skipping VM creation")
2259
+ redis.zrem('vmpooler__provisioning__request', request_id)
2260
+ return
2261
+ end
2262
+
1543
2263
  score = redis.zscore('vmpooler__provisioning__request', request_id)
1544
2264
  requested = requested.split(',')
1545
2265
 
@@ -1763,6 +2483,48 @@ module Vmpooler
1763
2483
  check_ondemand_requests(check_loop_delay_min, check_loop_delay_max, check_loop_delay_decay)
1764
2484
  end
1765
2485
 
2486
+ # Queue purge thread
2487
+ if purge_enabled?
2488
+ purge_interval = ($config[:config] && $config[:config]['purge_interval']) || 3600 # default 1 hour
2489
+ if !$threads['queue_purge']
2490
+ $threads['queue_purge'] = Thread.new do
2491
+ loop do
2492
+ purge_stale_queue_entries
2493
+ sleep(purge_interval)
2494
+ end
2495
+ end
2496
+ elsif !$threads['queue_purge'].alive?
2497
+ $logger.log('d', '[!] [queue_purge] worker thread died, restarting')
2498
+ $threads['queue_purge'] = Thread.new do
2499
+ loop do
2500
+ purge_stale_queue_entries
2501
+ sleep(purge_interval)
2502
+ end
2503
+ end
2504
+ end
2505
+ end
2506
+
2507
+ # Health check thread
2508
+ if health_check_enabled?
2509
+ health_interval = ($config[:config] && $config[:config]['health_check_interval']) || 300 # default 5 minutes
2510
+ if !$threads['health_check']
2511
+ $threads['health_check'] = Thread.new do
2512
+ loop do
2513
+ check_queue_health
2514
+ sleep(health_interval)
2515
+ end
2516
+ end
2517
+ elsif !$threads['health_check'].alive?
2518
+ $logger.log('d', '[!] [health_check] worker thread died, restarting')
2519
+ $threads['health_check'] = Thread.new do
2520
+ loop do
2521
+ check_queue_health
2522
+ sleep(health_interval)
2523
+ end
2524
+ end
2525
+ end
2526
+ end
2527
+
1766
2528
  sleep(loop_delay)
1767
2529
 
1768
2530
  unless maxloop == 0