vmpooler 3.7.0 → 3.8.1

This diff shows the changes between two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -161,16 +161,80 @@ module Vmpooler
161
161
  request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
162
162
  pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
163
163
  open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
164
+ retry_count = redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count').to_i if request_id
165
+
166
+ # Move to DLQ before moving to completed queue
167
+ move_to_dlq(vm, pool, 'pending', 'Timeout',
168
+ open_socket_error || 'VM timed out during pending phase',
169
+ redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
170
+
171
+ clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
172
+ clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
164
173
  redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
174
+
165
175
  if request_id
166
176
  ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
167
177
  if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
168
- # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
169
- redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
178
+ # Check retry count and max retry limit before retrying
179
+ retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
180
+ max_retries = $config[:config]['max_vm_retries'] || 3
181
+
182
+ $logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
183
+
184
+ # Determine if error is likely permanent (configuration issues)
185
+ permanent_error = permanent_error?(clone_error, clone_error_class)
186
+ $logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")
187
+
188
+ if retry_count < max_retries && !permanent_error
189
+ # Increment retry count and retry VM creation
190
+ redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
191
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
192
+ $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
193
+ else
194
+ # Max retries exceeded or permanent error, mark request as permanently failed
195
+ failure_reason = if permanent_error
196
+ "Configuration error: #{clone_error}"
197
+ else
198
+ 'Max retry attempts exceeded'
199
+ end
200
+ redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
201
+ redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
202
+ $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
203
+ $metrics.increment("vmpooler_errors.permanently_failed.#{pool}")
204
+ end
170
205
  end
171
206
  end
172
- $metrics.increment("errors.markedasfailed.#{pool}")
173
- open_socket_error
207
+ $metrics.increment("vmpooler_errors.markedasfailed.#{pool}")
208
+ open_socket_error || clone_error
209
+ end
210
+
211
+ # Determine if an error is likely permanent (configuration issue) vs transient
212
+ def permanent_error?(error_message, error_class)
213
+ return false if error_message.nil? || error_class.nil?
214
+
215
+ permanent_error_patterns = [
216
+ /template.*not found/i,
217
+ /template.*does not exist/i,
218
+ /invalid.*path/i,
219
+ /folder.*not found/i,
220
+ /datastore.*not found/i,
221
+ /resource pool.*not found/i,
222
+ /permission.*denied/i,
223
+ /authentication.*failed/i,
224
+ /invalid.*credentials/i,
225
+ /configuration.*error/i
226
+ ]
227
+
228
+ permanent_error_classes = [
229
+ 'ArgumentError',
230
+ 'NoMethodError',
231
+ 'NameError'
232
+ ]
233
+
234
+ # Check error message patterns
235
+ permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
236
+ # Check error class types
237
+ permanent_error_classes.include?(error_class)
174
238
  end
175
239
 
176
240
  def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
@@ -223,8 +287,16 @@ module Vmpooler
223
287
  return true if provider.vm_ready?(pool_name, vm_name, redis)
224
288
 
225
289
  raise("VM #{vm_name} is not ready")
226
- rescue StandardError
290
+ rescue StandardError => e
227
291
  open_socket_error = redis.hget("vmpooler__vm__#{vm_name}", 'open_socket_error')
292
+ request_id = redis.hget("vmpooler__vm__#{vm_name}", 'request_id')
293
+ pool_alias = redis.hget("vmpooler__vm__#{vm_name}", 'pool_alias')
294
+
295
+ # Move to DLQ before moving to completed queue
296
+ move_to_dlq(vm_name, pool_name, 'ready', e.class.name,
297
+ open_socket_error || 'VM became unreachable in ready queue',
298
+ redis, request_id: request_id, pool_alias: pool_alias)
299
+
228
300
  move_vm_queue(pool_name, vm_name, 'ready', 'completed', redis, "removed from 'ready' queue. vm unreachable with error: #{open_socket_error}")
229
301
  end
230
302
 
@@ -357,6 +429,60 @@ module Vmpooler
357
429
  $logger.log('d', "[!] [#{pool}] '#{vm}' #{msg}") if msg
358
430
  end
359
431
 
432
+ # Dead-Letter Queue (DLQ) helper methods
433
+ def dlq_enabled?
434
+ $config[:config] && $config[:config]['dlq_enabled'] == true
435
+ end
436
+
437
+ def dlq_ttl
438
+ ($config[:config] && $config[:config]['dlq_ttl']) || 168 # default 7 days in hours
439
+ end
440
+
441
+ def dlq_max_entries
442
+ ($config[:config] && $config[:config]['dlq_max_entries']) || 10_000
443
+ end
444
+
445
+ def move_to_dlq(vm, pool, queue_type, error_class, error_message, redis, request_id: nil, pool_alias: nil, retry_count: 0, skip_metrics: false)
446
+ return unless dlq_enabled?
447
+
448
+ dlq_key = "vmpooler__dlq__#{queue_type}"
449
+ timestamp = Time.now.to_i
450
+
451
+ # Build DLQ entry
452
+ dlq_entry = {
453
+ 'vm' => vm,
454
+ 'pool' => pool,
455
+ 'queue_from' => queue_type,
456
+ 'error_class' => error_class.to_s,
457
+ 'error_message' => error_message.to_s,
458
+ 'failed_at' => Time.now.iso8601,
459
+ 'retry_count' => retry_count,
460
+ 'request_id' => request_id,
461
+ 'pool_alias' => pool_alias
462
+ }.compact
463
+
464
+ # Use sorted set with timestamp as score for easy age-based queries and TTL
465
+ dlq_entry_json = dlq_entry.to_json
466
+ redis.zadd(dlq_key, timestamp, "#{vm}:#{timestamp}:#{dlq_entry_json}")
467
+
468
+ # Enforce max entries limit by removing oldest entries
469
+ current_size = redis.zcard(dlq_key)
470
+ if current_size > dlq_max_entries
471
+ remove_count = current_size - dlq_max_entries
472
+ redis.zremrangebyrank(dlq_key, 0, remove_count - 1)
473
+ $logger.log('d', "[!] [dlq] Trimmed #{remove_count} oldest entries from #{dlq_key}")
474
+ end
475
+
476
+ # Set expiration on the entire DLQ (will be refreshed on next write)
477
+ ttl_seconds = dlq_ttl * 3600
478
+ redis.expire(dlq_key, ttl_seconds)
479
+
480
+ $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics
481
+ $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}")
482
+ rescue StandardError => e
483
+ $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}")
484
+ end
485
+
360
486
  # Clone a VM
361
487
  def clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = nil)
362
488
  Thread.new do
@@ -366,7 +492,13 @@ module Vmpooler
366
492
  if request_id
367
493
  $logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
368
494
  @redis.with_metrics do |redis|
369
- redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
495
+ # Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
496
+ request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
497
+ if request_status != 'failed'
498
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
499
+ else
500
+ $logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
501
+ end
370
502
  end
371
503
  else
372
504
  $logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
@@ -419,10 +551,10 @@ module Vmpooler
419
551
  hostname_retries += 1
420
552
 
421
553
  if !hostname_available
422
- $metrics.increment("errors.duplicatehostname.#{pool_name}")
554
+ $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}")
423
555
  $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})")
424
556
  elsif !dns_available
425
- $metrics.increment("errors.staledns.#{pool_name}")
557
+ $metrics.increment("vmpooler_errors.staledns.#{pool_name}")
426
558
  $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS")
427
559
  end
428
560
  end
@@ -468,7 +600,7 @@ module Vmpooler
468
600
  provider.create_vm(pool_name, new_vmname)
469
601
  finish = format('%<time>.2f', time: Time.now - start)
470
602
  $logger.log('s', "[+] [#{pool_name}] '#{new_vmname}' cloned in #{finish} seconds")
471
- $metrics.timing("clone.#{pool_name}", finish)
603
+ $metrics.gauge("vmpooler_clone.#{pool_name}", finish)
472
604
 
473
605
  $logger.log('d', "[ ] [#{pool_name}] Obtaining IP for '#{new_vmname}'")
474
606
  ip_start = Time.now
@@ -489,14 +621,50 @@ module Vmpooler
489
621
 
490
622
  dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
491
623
  dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
492
- rescue StandardError
624
+ rescue StandardError => e
625
+ # Store error details for retry decision making
493
626
  @redis.with_metrics do |redis|
627
+ # Get retry count before moving to DLQ
628
+ retry_count = 0
629
+ if request_id
630
+ ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
631
+ retry_count = ondemandrequest_hash['retry_count'].to_i if ondemandrequest_hash
632
+ end
633
+
634
+ # Move to DLQ before removing from pending queue
635
+ move_to_dlq(new_vmname, pool_name, 'clone', e.class.name, e.message,
636
+ redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
637
+
494
638
  redis.pipelined do |pipeline|
495
639
  pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
640
+ pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
641
+ pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
496
642
  expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
497
643
  pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
498
644
  end
645
+
646
+ # Handle retry logic for on-demand requests
647
+ if request_id
648
+ retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
649
+ max_retries = $config[:config]['max_vm_retries'] || 3
650
+ is_permanent = permanent_error?(e.message, e.class.name)
651
+
652
+ $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
653
+
654
+ if is_permanent || retry_count >= max_retries
655
+ reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
656
+ $logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
657
+ redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
658
+ redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
659
+ else
660
+ # Increment retry count and re-queue for retry
661
+ redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
662
+ $logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
663
+ redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
664
+ end
665
+ end
499
666
  end
667
+ $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
500
668
  raise
501
669
  ensure
502
670
  @redis.with_metrics do |redis|
@@ -546,7 +714,7 @@ module Vmpooler
546
714
 
547
715
  finish = format('%<time>.2f', time: Time.now - start)
548
716
  $logger.log('s', "[-] [#{pool}] '#{vm}' destroyed in #{finish} seconds")
549
- $metrics.timing("destroy.#{pool}", finish)
717
+ $metrics.gauge("vmpooler_destroy.#{pool}", finish)
550
718
  end
551
719
  end
552
720
  dereference_mutex(vm)
@@ -582,6 +750,543 @@ module Vmpooler
582
750
  provider.purge_unconfigured_resources(allowlist)
583
751
  end
584
752
 
753
+ # Auto-purge stale queue entries
754
+ def purge_enabled?
755
+ $config[:config] && $config[:config]['purge_enabled'] == true
756
+ end
757
+
758
+ def purge_dry_run?
759
+ $config[:config] && $config[:config]['purge_dry_run'] == true
760
+ end
761
+
762
+ def max_pending_age
763
+ ($config[:config] && $config[:config]['max_pending_age']) || 7200 # default 2 hours in seconds
764
+ end
765
+
766
+ def max_ready_age
767
+ ($config[:config] && $config[:config]['max_ready_age']) || 86_400 # default 24 hours in seconds
768
+ end
769
+
770
+ def max_completed_age
771
+ ($config[:config] && $config[:config]['max_completed_age']) || 3600 # default 1 hour in seconds
772
+ end
773
+
774
+ def max_orphaned_age
775
+ ($config[:config] && $config[:config]['max_orphaned_age']) || 86_400 # default 24 hours in seconds
776
+ end
777
+
778
+ def purge_stale_queue_entries
779
+ return unless purge_enabled?
780
+
781
+ Thread.new do
782
+ begin
783
+ $logger.log('d', '[*] [purge] Starting stale queue entry purge cycle')
784
+ purge_start = Time.now
785
+
786
+ @redis.with_metrics do |redis|
787
+ total_purged = 0
788
+
789
+ # Purge stale entries from each pool
790
+ $config[:pools].each do |pool|
791
+ pool_name = pool['name']
792
+
793
+ # Purge pending queue
794
+ purged_pending = purge_pending_queue(pool_name, redis)
795
+ total_purged += purged_pending
796
+
797
+ # Purge ready queue
798
+ purged_ready = purge_ready_queue(pool_name, redis)
799
+ total_purged += purged_ready
800
+
801
+ # Purge completed queue
802
+ purged_completed = purge_completed_queue(pool_name, redis)
803
+ total_purged += purged_completed
804
+ end
805
+
806
+ # Purge orphaned VM metadata
807
+ purged_orphaned = purge_orphaned_metadata(redis)
808
+ total_purged += purged_orphaned
809
+
810
+ purge_duration = Time.now - purge_start
811
+ $logger.log('s', "[*] [purge] Completed purge cycle in #{purge_duration.round(2)}s: #{total_purged} entries purged")
812
+ $metrics.gauge('vmpooler_purge.cycle.duration', purge_duration)
813
+ $metrics.gauge('vmpooler_purge.total.count', total_purged)
814
+ end
815
+ rescue StandardError => e
816
+ $logger.log('s', "[!] [purge] Failed during purge cycle: #{e}")
817
+ end
818
+ end
819
+ end
820
+
821
+ def purge_pending_queue(pool_name, redis)
822
+ queue_key = "vmpooler__pending__#{pool_name}"
823
+ vms = redis.smembers(queue_key)
824
+ purged_count = 0
825
+
826
+ vms.each do |vm|
827
+ begin
828
+ clone_time_str = redis.hget("vmpooler__vm__#{vm}", 'clone')
829
+ next unless clone_time_str
830
+
831
+ clone_time = Time.parse(clone_time_str)
832
+ age = Time.now - clone_time
833
+
834
+ if age > max_pending_age
835
+ request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
836
+ pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias')
837
+
838
+ purged_count += 1
839
+
840
+ if purge_dry_run?
841
+ $logger.log('d', "[*] [purge][dry-run] Would purge stale pending VM '#{vm}' (age: #{age.round(0)}s, max: #{max_pending_age}s)")
842
+ else
843
+ # Move to DLQ before removing (skip DLQ metric since we're tracking purge metric)
844
+ move_to_dlq(vm, pool_name, 'pending', 'Purge',
845
+ "Stale pending VM (age: #{age.round(0)}s > max: #{max_pending_age}s)",
846
+ redis, request_id: request_id, pool_alias: pool_alias, skip_metrics: true)
847
+
848
+ redis.srem(queue_key, vm)
849
+
850
+ # Set expiration on VM metadata if data_ttl is configured
851
+ if $config[:redis] && $config[:redis]['data_ttl']
852
+ expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
853
+ redis.expire("vmpooler__vm__#{vm}", expiration_ttl)
854
+ end
855
+
856
+ $logger.log('d', "[!] [purge] Purged stale pending VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
857
+ $metrics.increment("vmpooler_purge.pending.#{pool_name}.count")
858
+ end
859
+ end
860
+ rescue StandardError => e
861
+ $logger.log('d', "[!] [purge] Error checking pending VM '#{vm}': #{e}")
862
+ end
863
+ end
864
+
865
+ purged_count
866
+ end
867
+
868
+ def purge_ready_queue(pool_name, redis)
869
+ queue_key = "vmpooler__ready__#{pool_name}"
870
+ vms = redis.smembers(queue_key)
871
+ purged_count = 0
872
+
873
+ vms.each do |vm|
874
+ begin
875
+ ready_time_str = redis.hget("vmpooler__vm__#{vm}", 'ready')
876
+ next unless ready_time_str
877
+
878
+ ready_time = Time.parse(ready_time_str)
879
+ age = Time.now - ready_time
880
+
881
+ if age > max_ready_age
882
+ if purge_dry_run?
883
+ $logger.log('d', "[*] [purge][dry-run] Would purge stale ready VM '#{vm}' (age: #{age.round(0)}s, max: #{max_ready_age}s)")
884
+ else
885
+ redis.smove(queue_key, "vmpooler__completed__#{pool_name}", vm)
886
+ $logger.log('d', "[!] [purge] Moved stale ready VM '#{vm}' from '#{pool_name}' to completed (age: #{age.round(0)}s)")
887
+ $metrics.increment("vmpooler_purge.ready.#{pool_name}.count")
888
+ end
889
+ purged_count += 1
890
+ end
891
+ rescue StandardError => e
892
+ $logger.log('d', "[!] [purge] Error checking ready VM '#{vm}': #{e}")
893
+ end
894
+ end
895
+
896
+ purged_count
897
+ end
898
+
899
+ def purge_completed_queue(pool_name, redis)
900
+ queue_key = "vmpooler__completed__#{pool_name}"
901
+ vms = redis.smembers(queue_key)
902
+ purged_count = 0
903
+
904
+ vms.each do |vm|
905
+ begin
906
+ # Check destroy time or last activity time
907
+ destroy_time_str = redis.hget("vmpooler__vm__#{vm}", 'destroy')
908
+ checkout_time_str = redis.hget("vmpooler__vm__#{vm}", 'checkout')
909
+
910
+ # Use the most recent timestamp
911
+ timestamp_str = destroy_time_str || checkout_time_str
912
+ next unless timestamp_str
913
+
914
+ timestamp = Time.parse(timestamp_str)
915
+ age = Time.now - timestamp
916
+
917
+ if age > max_completed_age
918
+ if purge_dry_run?
919
+ $logger.log('d', "[*] [purge][dry-run] Would purge stale completed VM '#{vm}' (age: #{age.round(0)}s, max: #{max_completed_age}s)")
920
+ else
921
+ redis.srem(queue_key, vm)
922
+ $logger.log('d', "[!] [purge] Removed stale completed VM '#{vm}' from '#{pool_name}' (age: #{age.round(0)}s)")
923
+ $metrics.increment("vmpooler_purge.completed.#{pool_name}.count")
924
+ end
925
+ purged_count += 1
926
+ end
927
+ rescue StandardError => e
928
+ $logger.log('d', "[!] [purge] Error checking completed VM '#{vm}': #{e}")
929
+ end
930
+ end
931
+
932
+ purged_count
933
+ end
934
+
935
+ def purge_orphaned_metadata(redis)
936
+ # Find VM metadata that doesn't belong to any queue
937
+ all_vm_keys = redis.keys('vmpooler__vm__*')
938
+ purged_count = 0
939
+
940
+ all_vm_keys.each do |vm_key|
941
+ begin
942
+ vm = vm_key.sub('vmpooler__vm__', '')
943
+
944
+ # Check if VM exists in any queue
945
+ pool_name = redis.hget(vm_key, 'pool')
946
+ next unless pool_name
947
+
948
+ in_pending = redis.sismember("vmpooler__pending__#{pool_name}", vm)
949
+ in_ready = redis.sismember("vmpooler__ready__#{pool_name}", vm)
950
+ in_running = redis.sismember("vmpooler__running__#{pool_name}", vm)
951
+ in_completed = redis.sismember("vmpooler__completed__#{pool_name}", vm)
952
+ in_discovered = redis.sismember("vmpooler__discovered__#{pool_name}", vm)
953
+ in_migrating = redis.sismember("vmpooler__migrating__#{pool_name}", vm)
954
+
955
+ # VM is orphaned if not in any queue
956
+ unless in_pending || in_ready || in_running || in_completed || in_discovered || in_migrating
957
+ # Check age
958
+ clone_time_str = redis.hget(vm_key, 'clone')
959
+ next unless clone_time_str
960
+
961
+ clone_time = Time.parse(clone_time_str)
962
+ age = Time.now - clone_time
963
+
964
+ if age > max_orphaned_age
965
+ if purge_dry_run?
966
+ $logger.log('d', "[*] [purge][dry-run] Would purge orphaned metadata for '#{vm}' (age: #{age.round(0)}s, max: #{max_orphaned_age}s)")
967
+ else
968
+ expiration_ttl = 3600 # 1 hour
969
+ redis.expire(vm_key, expiration_ttl)
970
+ $logger.log('d', "[!] [purge] Set expiration on orphaned metadata for '#{vm}' (age: #{age.round(0)}s)")
971
+ $metrics.increment('vmpooler_purge.orphaned.count')
972
+ end
973
+ purged_count += 1
974
+ end
975
+ end
976
+ rescue StandardError => e
977
+ $logger.log('d', "[!] [purge] Error checking orphaned metadata '#{vm_key}': #{e}")
978
+ end
979
+ end
980
+
981
+ purged_count
982
+ end
983
+
984
+ # Health checks for Redis queues
985
+ def health_check_enabled?
986
+ $config[:config] && $config[:config]['health_check_enabled'] == true
987
+ end
988
+
989
+ def health_thresholds
990
+ defaults = {
991
+ 'pending_queue_max' => 100,
992
+ 'ready_queue_max' => 500,
993
+ 'dlq_max_warning' => 100,
994
+ 'dlq_max_critical' => 1000,
995
+ 'stuck_vm_age_threshold' => 7200, # 2 hours
996
+ 'stuck_vm_max_warning' => 10,
997
+ 'stuck_vm_max_critical' => 50
998
+ }
999
+
1000
+ if $config[:config] && $config[:config]['health_thresholds']
1001
+ defaults.merge($config[:config]['health_thresholds'])
1002
+ else
1003
+ defaults
1004
+ end
1005
+ end
1006
+
1007
+ def check_queue_health
1008
+ return unless health_check_enabled?
1009
+
1010
+ Thread.new do
1011
+ begin
1012
+ $logger.log('d', '[*] [health] Running queue health check')
1013
+ health_start = Time.now
1014
+
1015
+ @redis.with_metrics do |redis|
1016
+ health_metrics = calculate_health_metrics(redis)
1017
+ health_status = determine_health_status(health_metrics)
1018
+
1019
+ # Store health metrics in Redis for API consumption
1020
+ # Convert nested hash to JSON for storage
1021
+ require 'json'
1022
+ redis.hset('vmpooler__health', 'metrics', health_metrics.to_json)
1023
+ redis.hset('vmpooler__health', 'status', health_status)
1024
+ redis.hset('vmpooler__health', 'last_check', Time.now.iso8601)
1025
+ redis.expire('vmpooler__health', 3600) # Expire after 1 hour
1026
+
1027
+ # Log health summary
1028
+ log_health_summary(health_metrics, health_status)
1029
+
1030
+ # Push metrics
1031
+ push_health_metrics(health_metrics, health_status)
1032
+
1033
+ health_duration = Time.now - health_start
1034
+ $metrics.gauge('vmpooler_health.check.duration', health_duration)
1035
+ end
1036
+ rescue StandardError => e
1037
+ $logger.log('s', "[!] [health] Failed during health check: #{e}")
1038
+ end
1039
+ end
1040
+ end
1041
+
1042
+ def calculate_health_metrics(redis)
1043
+ metrics = {
1044
+ 'queues' => {},
1045
+ 'tasks' => {},
1046
+ 'errors' => {}
1047
+ }
1048
+
1049
+ total_stuck_vms = 0
1050
+ total_dlq_size = 0
1051
+ thresholds = health_thresholds
1052
+
1053
+ # Check each pool's queues
1054
+ $config[:pools].each do |pool|
1055
+ pool_name = pool['name']
1056
+ metrics['queues'][pool_name] = {}
1057
+
1058
+ # Pending queue metrics
1059
+ pending_key = "vmpooler__pending__#{pool_name}"
1060
+ pending_vms = redis.smembers(pending_key)
1061
+ pending_ages = calculate_queue_ages(pending_vms, 'clone', redis)
1062
+ stuck_pending = pending_ages.count { |age| age > thresholds['stuck_vm_age_threshold'] }
1063
+ total_stuck_vms += stuck_pending
1064
+
1065
+ metrics['queues'][pool_name]['pending'] = {
1066
+ 'size' => pending_vms.size,
1067
+ 'oldest_age' => pending_ages.max || 0,
1068
+ 'avg_age' => pending_ages.empty? ? 0 : (pending_ages.sum / pending_ages.size).round(0),
1069
+ 'stuck_count' => stuck_pending
1070
+ }
1071
+
1072
+ # Ready queue metrics
1073
+ ready_key = "vmpooler__ready__#{pool_name}"
1074
+ ready_vms = redis.smembers(ready_key)
1075
+ ready_ages = calculate_queue_ages(ready_vms, 'ready', redis)
1076
+
1077
+ metrics['queues'][pool_name]['ready'] = {
1078
+ 'size' => ready_vms.size,
1079
+ 'oldest_age' => ready_ages.max || 0,
1080
+ 'avg_age' => ready_ages.empty? ? 0 : (ready_ages.sum / ready_ages.size).round(0)
1081
+ }
1082
+
1083
+ # Completed queue metrics
1084
+ completed_key = "vmpooler__completed__#{pool_name}"
1085
+ completed_size = redis.scard(completed_key)
1086
+ metrics['queues'][pool_name]['completed'] = { 'size' => completed_size }
1087
+ end
1088
+
1089
+ # Task queue metrics
1090
+ clone_active = redis.get('vmpooler__tasks__clone').to_i
1091
+ ondemand_active = redis.get('vmpooler__tasks__ondemandclone').to_i
1092
+ odcreate_pending = redis.zcard('vmpooler__odcreate__task')
1093
+
1094
+ metrics['tasks']['clone'] = { 'active' => clone_active }
1095
+ metrics['tasks']['ondemand'] = { 'active' => ondemand_active, 'pending' => odcreate_pending }
1096
+
1097
+ # DLQ metrics
1098
+ if dlq_enabled?
1099
+ dlq_keys = redis.keys('vmpooler__dlq__*')
1100
+ dlq_keys.each do |dlq_key|
1101
+ queue_type = dlq_key.sub('vmpooler__dlq__', '')
1102
+ dlq_size = redis.zcard(dlq_key)
1103
+ total_dlq_size += dlq_size
1104
+ metrics['queues']['dlq'] ||= {}
1105
+ metrics['queues']['dlq'][queue_type] = { 'size' => dlq_size }
1106
+ end
1107
+ end
1108
+
1109
+ # Error metrics
1110
+ metrics['errors']['dlq_total_size'] = total_dlq_size
1111
+ metrics['errors']['stuck_vm_count'] = total_stuck_vms
1112
+
1113
+ # Orphaned metadata count
1114
+ orphaned_count = count_orphaned_metadata(redis)
1115
+ metrics['errors']['orphaned_metadata_count'] = orphaned_count
1116
+
1117
+ metrics
1118
+ end
1119
+
1120
+ def calculate_queue_ages(vms, timestamp_field, redis)
1121
+ ages = []
1122
+ vms.each do |vm|
1123
+ begin
1124
+ timestamp_str = redis.hget("vmpooler__vm__#{vm}", timestamp_field)
1125
+ next unless timestamp_str
1126
+
1127
+ timestamp = Time.parse(timestamp_str)
1128
+ age = (Time.now - timestamp).to_i
1129
+ ages << age
1130
+ rescue StandardError
1131
+ # Skip VMs with invalid timestamps
1132
+ end
1133
+ end
1134
+ ages
1135
+ end
1136
+
1137
+ def count_orphaned_metadata(redis)
1138
+ all_vm_keys = redis.keys('vmpooler__vm__*')
1139
+ orphaned_count = 0
1140
+
1141
+ all_vm_keys.each do |vm_key|
1142
+ begin
1143
+ vm = vm_key.sub('vmpooler__vm__', '')
1144
+ pool_name = redis.hget(vm_key, 'pool')
1145
+ next unless pool_name
1146
+
1147
+ in_any_queue = redis.sismember("vmpooler__pending__#{pool_name}", vm) ||
1148
+ redis.sismember("vmpooler__ready__#{pool_name}", vm) ||
1149
+ redis.sismember("vmpooler__running__#{pool_name}", vm) ||
1150
+ redis.sismember("vmpooler__completed__#{pool_name}", vm) ||
1151
+ redis.sismember("vmpooler__discovered__#{pool_name}", vm) ||
1152
+ redis.sismember("vmpooler__migrating__#{pool_name}", vm)
1153
+
1154
+ orphaned_count += 1 unless in_any_queue
1155
+ rescue StandardError
1156
+ # Skip on error
1157
+ end
1158
+ end
1159
+
1160
+ orphaned_count
1161
+ end
1162
+
1163
+ def determine_health_status(metrics)
1164
+ thresholds = health_thresholds
1165
+
1166
+ # Check DLQ size
1167
+ dlq_size = metrics['errors']['dlq_total_size']
1168
+ return 'unhealthy' if dlq_size > thresholds['dlq_max_critical']
1169
+
1170
+ # Check stuck VM count
1171
+ stuck_count = metrics['errors']['stuck_vm_count']
1172
+ return 'unhealthy' if stuck_count > thresholds['stuck_vm_max_critical']
1173
+
1174
+ # Check queue sizes
1175
+ metrics['queues'].each do |pool_name, queues|
1176
+ next if pool_name == 'dlq'
1177
+
1178
+ pending_size = begin
1179
+ queues['pending']['size']
1180
+ rescue StandardError
1181
+ 0
1182
+ end
1183
+ ready_size = begin
1184
+ queues['ready']['size']
1185
+ rescue StandardError
1186
+ 0
1187
+ end
1188
+
1189
+ return 'unhealthy' if pending_size > thresholds['pending_queue_max'] * 2
1190
+ return 'unhealthy' if ready_size > thresholds['ready_queue_max'] * 2
1191
+ end
1192
+
1193
+ # Check for degraded conditions
1194
+ return 'degraded' if dlq_size > thresholds['dlq_max_warning']
1195
+ return 'degraded' if stuck_count > thresholds['stuck_vm_max_warning']
1196
+
1197
+ metrics['queues'].each do |pool_name, queues|
1198
+ next if pool_name == 'dlq'
1199
+
1200
+ pending_size = begin
1201
+ queues['pending']['size']
1202
+ rescue StandardError
1203
+ 0
1204
+ end
1205
+ ready_size = begin
1206
+ queues['ready']['size']
1207
+ rescue StandardError
1208
+ 0
1209
+ end
1210
+
1211
+ return 'degraded' if pending_size > thresholds['pending_queue_max']
1212
+ return 'degraded' if ready_size > thresholds['ready_queue_max']
1213
+ end
1214
+
1215
+ 'healthy'
1216
+ end
1217
+
1218
+ def log_health_summary(metrics, status)
1219
+ summary = "[*] [health] Status: #{status.upcase}"
1220
+
1221
+ # Queue summary
1222
+ total_pending = 0
1223
+ total_ready = 0
1224
+ total_completed = 0
1225
+
1226
+ metrics['queues'].each do |pool_name, queues|
1227
+ next if pool_name == 'dlq'
1228
+
1229
+ total_pending += begin
1230
+ queues['pending']['size']
1231
+ rescue StandardError
1232
+ 0
1233
+ end
1234
+ total_ready += begin
1235
+ queues['ready']['size']
1236
+ rescue StandardError
1237
+ 0
1238
+ end
1239
+ total_completed += begin
1240
+ queues['completed']['size']
1241
+ rescue StandardError
1242
+ 0
1243
+ end
1244
+ end
1245
+
1246
+ summary += " | Queues: P=#{total_pending} R=#{total_ready} C=#{total_completed}"
1247
+ summary += " | DLQ=#{metrics['errors']['dlq_total_size']}"
1248
+ summary += " | Stuck=#{metrics['errors']['stuck_vm_count']}"
1249
+ summary += " | Orphaned=#{metrics['errors']['orphaned_metadata_count']}"
1250
+
1251
+ log_level = status == 'healthy' ? 's' : 'd'
1252
+ $logger.log(log_level, summary)
1253
+ end
1254
+
1255
+ def push_health_metrics(metrics, status)
1256
+ # Push error metrics first
1257
+ $metrics.gauge('vmpooler_health.dlq.total_size', metrics['errors']['dlq_total_size'])
1258
+ $metrics.gauge('vmpooler_health.stuck_vms.count', metrics['errors']['stuck_vm_count'])
1259
+ $metrics.gauge('vmpooler_health.orphaned_metadata.count', metrics['errors']['orphaned_metadata_count'])
1260
+
1261
+ # Push per-pool queue metrics
1262
+ metrics['queues'].each do |pool_name, queues|
1263
+ next if pool_name == 'dlq'
1264
+
1265
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.size", queues['pending']['size'])
1266
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.oldest_age", queues['pending']['oldest_age'])
1267
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.pending.stuck_count", queues['pending']['stuck_count'])
1268
+
1269
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.size", queues['ready']['size'])
1270
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.ready.oldest_age", queues['ready']['oldest_age'])
1271
+
1272
+ $metrics.gauge("vmpooler_health.queue.#{pool_name}.completed.size", queues['completed']['size'])
1273
+ end
1274
+
1275
+ # Push DLQ metrics
1276
+ metrics['queues']['dlq']&.each do |queue_type, dlq_metrics|
1277
+ $metrics.gauge("vmpooler_health.dlq.#{queue_type}.size", dlq_metrics['size'])
1278
+ end
1279
+
1280
+ # Push task metrics
1281
+ $metrics.gauge('vmpooler_health.tasks.clone.active', metrics['tasks']['clone']['active'])
1282
+ $metrics.gauge('vmpooler_health.tasks.ondemand.active', metrics['tasks']['ondemand']['active'])
1283
+ $metrics.gauge('vmpooler_health.tasks.ondemand.pending', metrics['tasks']['ondemand']['pending'])
1284
+
1285
+ # Push status last (0=healthy, 1=degraded, 2=unhealthy)
1286
+ status_value = { 'healthy' => 0, 'degraded' => 1, 'unhealthy' => 2 }[status] || 2
1287
+ $metrics.gauge('vmpooler_health.status', status_value)
1288
+ end
1289
+
585
1290
  def create_vm_disk(pool_name, vm, disk_size, provider)
586
1291
  Thread.new do
587
1292
  begin
@@ -982,7 +1687,12 @@ module Vmpooler
982
1687
 
983
1688
  sync_pool_template(pool)
984
1689
  loop do
1690
+ start_time = Time.now
985
1691
  result = _check_pool(pool, provider)
1692
+ duration = Time.now - start_time
1693
+
1694
+ $metrics.gauge("vmpooler_performance.check_pool.#{pool['name']}", duration)
1695
+ $logger.log('d', "[!] check_pool for #{pool['name']} took #{duration.round(2)}s") if duration > 5
986
1696
 
987
1697
  if result[:cloned_vms] > 0 || result[:checked_pending_vms] > 0 || result[:discovered_vms] > 0
988
1698
  loop_delay = loop_delay_min
@@ -1541,6 +2251,15 @@ module Vmpooler
1541
2251
  redis.zrem('vmpooler__provisioning__request', request_id)
1542
2252
  return
1543
2253
  end
2254
+
2255
+ # Check if request was already marked as failed (e.g., by delete endpoint)
2256
+ request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
2257
+ if request_status == 'failed'
2258
+ $logger.log('s', "Request '#{request_id}' already marked as failed, skipping VM creation")
2259
+ redis.zrem('vmpooler__provisioning__request', request_id)
2260
+ return
2261
+ end
2262
+
1544
2263
  score = redis.zscore('vmpooler__provisioning__request', request_id)
1545
2264
  requested = requested.split(',')
1546
2265
 
@@ -1764,6 +2483,48 @@ module Vmpooler
1764
2483
  check_ondemand_requests(check_loop_delay_min, check_loop_delay_max, check_loop_delay_decay)
1765
2484
  end
1766
2485
 
2486
+ # Queue purge thread
2487
+ if purge_enabled?
2488
+ purge_interval = ($config[:config] && $config[:config]['purge_interval']) || 3600 # default 1 hour
2489
+ if !$threads['queue_purge']
2490
+ $threads['queue_purge'] = Thread.new do
2491
+ loop do
2492
+ purge_stale_queue_entries
2493
+ sleep(purge_interval)
2494
+ end
2495
+ end
2496
+ elsif !$threads['queue_purge'].alive?
2497
+ $logger.log('d', '[!] [queue_purge] worker thread died, restarting')
2498
+ $threads['queue_purge'] = Thread.new do
2499
+ loop do
2500
+ purge_stale_queue_entries
2501
+ sleep(purge_interval)
2502
+ end
2503
+ end
2504
+ end
2505
+ end
2506
+
2507
+ # Health check thread
2508
+ if health_check_enabled?
2509
+ health_interval = ($config[:config] && $config[:config]['health_check_interval']) || 300 # default 5 minutes
2510
+ if !$threads['health_check']
2511
+ $threads['health_check'] = Thread.new do
2512
+ loop do
2513
+ check_queue_health
2514
+ sleep(health_interval)
2515
+ end
2516
+ end
2517
+ elsif !$threads['health_check'].alive?
2518
+ $logger.log('d', '[!] [health_check] worker thread died, restarting')
2519
+ $threads['health_check'] = Thread.new do
2520
+ loop do
2521
+ check_queue_health
2522
+ sleep(health_interval)
2523
+ end
2524
+ end
2525
+ end
2526
+ end
2527
+
1767
2528
  sleep(loop_delay)
1768
2529
 
1769
2530
  unless maxloop == 0