hybrid_platforms_conductor 32.7.3 → 32.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 83352b7821e41bc5b0282c693c47f0cac0f5a9915a45dfa37affd3d1fcaadd8c
4
- data.tar.gz: fba8bf35c2569716b2b8f934a46684c8b29843e50fe49b8133d3eeae580938d1
3
+ metadata.gz: 4f1beb4de64ac4dcdc1de3e5063980ea4dab1b8ceef204baae0a9f4bf209f1f3
4
+ data.tar.gz: 0c5e65553da39646554d737c8a690939497a8c89f602a4eae01a4987fff994b7
5
5
  SHA512:
6
- metadata.gz: 69c667f1f6c626372ecef7c21c9afdb2865a38e418ad6add2ec4d04d9a0eeda6892fed168d546d89d8805d843c31945eb885d53b0cf5208a65a7eb89ba75ea74
7
- data.tar.gz: b9478d38660f80b7c9f62411d72d0e890e74ec509973ceb3d417721d21f5f9923ffc5dfbc96b857af6301d7bd708630638d393dc3bbf2abcd4cf7d12e22a7af6
6
+ metadata.gz: 3129d3b37140c6488b2b6a8b6c1c8cb22ce745181a643925d9bb2fdc77de97a331660c8d5193dde940535d58b545576796d22a29008d181c73bbf7bb384c2251
7
+ data.tar.gz: 187c6215c37e89b924693c1819fea12f8cc444858ac575f9445b535ae9866a396b01dbc5baaf85ca4e8163d60c8865c931eca319c4824148fdd06029920b9710
@@ -505,15 +505,25 @@ module HybridPlatformsConductor
505
505
  if @nodes_handler.get_ssh_session_exec_of(node) == 'false'
506
506
  # Here we have to create a ControlMaster using an interactive session, as the SSH server prohibits ExecSession, and so command executions.
507
507
  # We'll do that using another terminal spawned in the background.
508
- Thread.new do
509
- log_debug "[ ControlMaster - #{ssh_url} ] - Spawn interactive ControlMaster in separate terminal"
510
- @cmd_runner.run_cmd "xterm -e '#{ssh_exec} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url}'", log_to_stdout: log_debug?
511
- log_debug "[ ControlMaster - #{ssh_url} ] - Separate interactive ControlMaster closed"
508
+ if ENV['hpc_interactive'] == 'false'
509
+ error = "Can't spawn interactive ControlMaster to #{node} in non-interactive mode. You may want to change the hpc_interactive env variable."
510
+ if no_exception
511
+ log_error error
512
+ exit_status = :non_interactive
513
+ else
514
+ raise error
515
+ end
516
+ else
517
+ Thread.new do
518
+ log_debug "[ ControlMaster - #{ssh_url} ] - Spawn interactive ControlMaster in separate terminal"
519
+ @cmd_runner.run_cmd "xterm -e '#{ssh_exec} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url}'", log_to_stdout: log_debug?
520
+ log_debug "[ ControlMaster - #{ssh_url} ] - Separate interactive ControlMaster closed"
521
+ end
522
+ out 'External ControlMaster has been spawned.'
523
+ out 'Please login into it, keep its session opened and press enter here when done...'
524
+ $stdin.gets
525
+ exit_status = 0
512
526
  end
513
- out 'External ControlMaster has been spawned.'
514
- out 'Please login into it, keep its session opened and press enter here when done...'
515
- $stdin.gets
516
- exit_status = 0
517
527
  else
518
528
  # Create the control master
519
529
  ssh_control_master_start_cmd = "#{ssh_exec}#{@passwords.key?(node) || @auth_password ? '' : ' -o BatchMode=yes'} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url} true"
@@ -23,6 +23,13 @@ module HybridPlatformsConductor
23
23
  super
24
24
  end
25
25
 
26
+ # Re-authenticate the Proxmox instance
27
+ # This can be useful when the API returns errors due to invalidated tokens
28
+ def reauthenticate
29
+ log_info 'Force re-authentication to Proxmox'
30
+ @auth_params = create_ticket
31
+ end
32
+
26
33
  end
27
34
  ::Proxmox::Proxmox.prepend ProxmoxPatches
28
35
 
@@ -74,13 +81,13 @@ module HybridPlatformsConductor
74
81
  # First check if we already have a test container that corresponds to this node and environment
75
82
  @lxc_details = nil
76
83
  with_proxmox do |proxmox|
77
- proxmox.get('nodes').each do |node_info|
84
+ proxmox_get(proxmox, 'nodes').each do |node_info|
78
85
  if proxmox_test_info[:test_config][:pve_nodes].include?(node_info['node']) && node_info['status'] == 'online'
79
- proxmox.get("nodes/#{node_info['node']}/lxc").each do |lxc_info|
86
+ proxmox_get(proxmox, "nodes/#{node_info['node']}/lxc").each do |lxc_info|
80
87
  vm_id = Integer(lxc_info['vmid'])
81
88
  if vm_id.between?(*proxmox_test_info[:test_config][:vm_ids_range])
82
89
  # Check if the description contains our ID
83
- lxc_config = proxmox.get("nodes/#{node_info['node']}/lxc/#{vm_id}/config")
90
+ lxc_config = proxmox_get(proxmox, "nodes/#{node_info['node']}/lxc/#{vm_id}/config")
84
91
  vm_description_lines = (lxc_config['description'] || '').split("\n")
85
92
  hpc_marker_idx = vm_description_lines.index('===== HPC info =====')
86
93
  unless hpc_marker_idx.nil?
@@ -222,8 +229,8 @@ module HybridPlatformsConductor
222
229
  with_proxmox do |proxmox|
223
230
  vm_id_str = @lxc_details[:vm_id].to_s
224
231
  status =
225
- if proxmox.get("nodes/#{@lxc_details[:pve_node]}/lxc").any? { |data_info| data_info['vmid'] == vm_id_str }
226
- status_info = proxmox.get("nodes/#{@lxc_details[:pve_node]}/lxc/#{@lxc_details[:vm_id]}/status/current")
232
+ if proxmox_get(proxmox, "nodes/#{@lxc_details[:pve_node]}/lxc").any? { |data_info| data_info['vmid'] == vm_id_str }
233
+ status_info = proxmox_get(proxmox, "nodes/#{@lxc_details[:pve_node]}/lxc/#{@lxc_details[:vm_id]}/status/current")
227
234
  # Careful that it is possible that somebody destroyed the VM and so its status is missing
228
235
  status = status_info.key?('status') ? status_info['status'].to_sym : :missing
229
236
  status = :exited if status == :stopped
@@ -292,11 +299,29 @@ module HybridPlatformsConductor
292
299
  end
293
300
  end
294
301
 
295
- # Maximum number of retries to perform on the Proxmox API.
296
- NBR_RETRIES_MAX = 5
297
-
298
- # Minimum seconds to wait between retries
299
- RETRY_WAIT_TIME_SECS = 5
302
+ # Perform a get operation on the API
303
+ # Protect the get API methods with a retry mechanism in case of 5xx errors.
304
+ #
305
+ # Parameters::
306
+ # * *proxmox* (Proxmox): The Proxmox instance
307
+ # * *path* (String): Path to get
308
+ # Result::
309
+ # * Object: API response
310
+ def proxmox_get(proxmox, path)
311
+ response = nil
312
+ idx_try = 0
313
+ loop do
314
+ response = proxmox.get(path)
315
+ break if !(response.is_a?(String)) || !(response =~ /^NOK: error code = 5\d\d$/)
316
+ log_warn "[ #{@node}/#{@environment} ] - Proxmox API call get #{path} returned error #{response} (attempt ##{idx_try}/#{proxmox_test_info[:api_max_retries]})"
317
+ raise "[ #{@node}/#{@environment} ] - Proxmox API call get #{path} returns #{response} continuously (tried #{idx_try + 1} times)" if idx_try >= proxmox_test_info[:api_max_retries]
318
+ idx_try += 1
319
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
320
+ proxmox.reauthenticate
321
+ sleep proxmox_test_info[:api_wait_between_retries_secs] + rand(5)
322
+ end
323
+ response
324
+ end
300
325
 
301
326
  # Run a Proxmox task.
302
327
  # Handle a retry mechanism in case of 5xx errors.
@@ -313,11 +338,13 @@ module HybridPlatformsConductor
313
338
  while task.nil? do
314
339
  task = proxmox.send(http_method, "nodes/#{pve_node}/#{sub_path}", *args)
315
340
  if task =~ /^NOK: error code = 5\d\d$/
316
- log_warn "[ #{@node}/#{@environment} ] - Proxmox API call #{http_method} nodes/#{pve_node}/#{sub_path} #{args} returned error #{task} (attempt ##{idx_try}/#{NBR_RETRIES_MAX})"
341
+ log_warn "[ #{@node}/#{@environment} ] - Proxmox API call #{http_method} nodes/#{pve_node}/#{sub_path} #{args} returned error #{task} (attempt ##{idx_try}/#{proxmox_test_info[:api_max_retries]})"
317
342
  task = nil
343
+ break if idx_try >= proxmox_test_info[:api_max_retries]
318
344
  idx_try += 1
319
- break if idx_try == NBR_RETRIES_MAX
320
- sleep RETRY_WAIT_TIME_SECS + rand(5)
345
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
346
+ proxmox.reauthenticate
347
+ sleep proxmox_test_info[:api_wait_between_retries_secs] + rand(5)
321
348
  end
322
349
  end
323
350
  if task.nil?
@@ -358,7 +385,7 @@ module HybridPlatformsConductor
358
385
  # Result::
359
386
  # * String: The task status
360
387
  def task_status(proxmox, pve_node, task)
361
- status_info = proxmox.get("nodes/#{pve_node}/tasks/#{task}/status")
388
+ status_info = proxmox_get(proxmox, "nodes/#{pve_node}/tasks/#{task}/status")
362
389
  "#{status_info['status']}#{status_info['exitstatus'] ? ":#{status_info['exitstatus']}" : ''}"
363
390
  end
364
391
 
@@ -377,7 +404,9 @@ module HybridPlatformsConductor
377
404
  (proxmox_test_info[:test_config].merge(
378
405
  proxmox_api_url: proxmox_test_info[:api_url],
379
406
  futex_file: '/tmp/hpc_proxmox_allocations.futex',
380
- logs_dir: '/tmp/hpc_proxmox_waiter_logs'
407
+ logs_dir: '/tmp/hpc_proxmox_waiter_logs',
408
+ api_max_retries: proxmox_test_info[:api_max_retries],
409
+ api_wait_between_retries_secs: proxmox_test_info[:api_wait_between_retries_secs]
381
410
  )).to_json
382
411
  )
383
412
  result = nil
@@ -486,7 +515,7 @@ module HybridPlatformsConductor
486
515
  # So remaining length is 255 - 13 = 242 characters.
487
516
  MAX_FILE_ID_SIZE = 242
488
517
 
489
- # Get an ID unique for theis node/environment and that can be used in file names.
518
+ # Get an ID unique for this node/environment and that can be used in file names.
490
519
  #
491
520
  # Result::
492
521
  # * String: ID
@@ -506,6 +535,8 @@ module HybridPlatformsConductor
506
535
  # Result::
507
536
  # * Hash<Symbol,Object>: Configuration of the Proxmox instance to be used:
508
537
  # * *api_url* (String): The Proxmox API URL
538
+ # * *api_max_retries* (Integer): Max number of API retries
539
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries
509
540
  # * *sync_node* (String): Node to be used to synchronize Proxmox resources acquisition
510
541
  # * *test_config* (Hash<Symbol,Object>): The test configuration. Check ProxmoxWaiter#initialize (config_file structure) method to get details.
511
542
  # * *vm_config* (Hash<Symbol,Object>): Extra configuration of a created container. Check #request_lxc_creation_for results to get details.
@@ -26,6 +26,8 @@ class ProxmoxWaiter
26
26
  # * *proxmox_api_url* (String): Proxmox API URL.
27
27
  # * *futex_file* (String): Path to the file serving as a futex.
28
28
  # * *logs_dir* (String): Path to the directory containing logs [default: '.']
29
+ # * *api_max_retries* (Integer): Max number of API retries
30
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries
29
31
  # * *pve_nodes* (Array<String>): List of PVE nodes allowed to spawn new containers [default: all]
30
32
  # * *vm_ips_list* (Array<String>): The list of IPs that are available for the Proxomx containers.
31
33
  # * *vm_ids_range* ([Integer, Integer]): Minimum and maximum reservable VM ID
@@ -637,11 +639,26 @@ class ProxmoxWaiter
637
639
 
638
640
  # Get a path from the API and return its JSON result.
639
641
  # Keep a cache of it, whose lifespan is this ProxmoxWaiter instance.
642
+ # Have a retry mechanism to make sure occasional non-deterministic 5xx errors are not an issue.
640
643
  #
641
644
  # Parameters::
642
645
  # * *path* (String): API path to query
643
- def api_get(path)
644
- @gets_cache[path] = @proxmox.get(path) unless @gets_cache.key?(path)
646
+ # Result::
647
+ # * Object: The API response
648
+ def api_get(path, nbr_retries: 3, wait_between_retry_secs: 10)
649
+ unless @gets_cache.key?(path)
650
+ idx_try = 0
651
+ loop do
652
+ @gets_cache[path] = @proxmox.get(path)
653
+ break unless @gets_cache[path].is_a?(String) && @gets_cache[path] =~ /^NOK: error code = 5\d\d$/
654
+ raise "Proxmox API get #{path} returns #{@gets_cache[path]} continuously (tried #{idx_try + 1} times)" if idx_try >= @config['api_max_retries']
655
+ idx_try += 1
656
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
657
+ # TODO: Provide a way to do it properly in the official gem
658
+ @proxmox.instance_variable_set(:@auth_params, @proxmox.send(:create_ticket))
659
+ sleep @config['api_wait_between_retries_secs']
660
+ end
661
+ end
645
662
  @gets_cache[path]
646
663
  end
647
664
 
@@ -428,9 +428,11 @@ module HybridPlatformsConductor
428
428
  end
429
429
  end
430
430
  # Compute the timeout that will be applied, from the max timeout sum for every node that has tests to run
431
- timeout = CONNECTION_TIMEOUT + @cmds_to_run.map do |_node, cmds_list|
432
- cmds_list.inject(0) { |total_timeout, (_cmd, test_info)| test_info[:timeout] + total_timeout }
433
- end.max
431
+ timeout = CONNECTION_TIMEOUT + (
432
+ @cmds_to_run.map do |_node, cmds_list|
433
+ cmds_list.inject(0) { |total_timeout, (_cmd, test_info)| test_info[:timeout] + total_timeout }
434
+ end.max || 0
435
+ )
434
436
  # Run commands on nodes, in grouped way to avoid too many connections, per node
435
437
  # Hash< String, Array<String> >
436
438
  @test_cmds = Hash[@cmds_to_run.map do |node, cmds_list|
@@ -464,33 +466,35 @@ module HybridPlatformsConductor
464
466
  end,
465
467
  test_execution: proc do |test|
466
468
  exit_status, stdout, stderr = @actions_result[test.node]
467
- if exit_status.is_a?(Symbol)
468
- test.error "Error while executing tests: #{exit_status}: #{stderr}"
469
- else
470
- log_debug <<~EOS
471
- ----- Commands for #{test.node}:
472
- #{@test_cmds[test.node][:remote_bash].join("\n")}
473
- ----- STDOUT:
474
- #{stdout}
475
- ----- STDERR:
476
- #{stderr}
477
- -----
478
- EOS
479
- # Skip the first section, as it can contain SSH banners
480
- cmd_stdouts = stdout.split("#{CMD_SEPARATOR}\n")[1..-1]
481
- cmd_stdouts = [] if cmd_stdouts.nil?
482
- cmd_stderrs = stderr.split("#{CMD_SEPARATOR}\n")[1..-1]
483
- cmd_stderrs = [] if cmd_stderrs.nil?
484
- @cmds_to_run[test.node].zip(cmd_stdouts, cmd_stderrs).each do |(cmd, test_info), cmd_stdout, cmd_stderr|
485
- # Find the section that corresponds to this test
486
- if test_info[:test] == test
487
- cmd_stdout = '' if cmd_stdout.nil?
488
- cmd_stderr = '' if cmd_stderr.nil?
489
- stdout_lines = cmd_stdout.split("\n")
490
- # Last line of stdout is the return code
491
- return_code = stdout_lines.empty? ? :command_cant_run : Integer(stdout_lines.last)
492
- test.error "Command '#{cmd}' returned error code #{return_code}", "----- STDOUT:\n#{stdout_lines[0..-2].join("\n")}\n----- STDERR:\n#{cmd_stderr}" unless return_code == 0
493
- test_info[:validator].call(stdout_lines[0..-2], cmd_stderr.split("\n"), return_code)
469
+ unless exit_status.nil?
470
+ if exit_status.is_a?(Symbol)
471
+ test.error "Error while executing tests: #{exit_status}: #{stderr}"
472
+ else
473
+ log_debug <<~EOS
474
+ ----- Commands for #{test.node}:
475
+ #{@test_cmds[test.node][:remote_bash].join("\n")}
476
+ ----- STDOUT:
477
+ #{stdout}
478
+ ----- STDERR:
479
+ #{stderr}
480
+ -----
481
+ EOS
482
+ # Skip the first section, as it can contain SSH banners
483
+ cmd_stdouts = stdout.split("#{CMD_SEPARATOR}\n")[1..-1]
484
+ cmd_stdouts = [] if cmd_stdouts.nil?
485
+ cmd_stderrs = stderr.split("#{CMD_SEPARATOR}\n")[1..-1]
486
+ cmd_stderrs = [] if cmd_stderrs.nil?
487
+ @cmds_to_run[test.node].zip(cmd_stdouts, cmd_stderrs).each do |(cmd, test_info), cmd_stdout, cmd_stderr|
488
+ # Find the section that corresponds to this test
489
+ if test_info[:test] == test
490
+ cmd_stdout = '' if cmd_stdout.nil?
491
+ cmd_stderr = '' if cmd_stderr.nil?
492
+ stdout_lines = cmd_stdout.split("\n")
493
+ # Last line of stdout is the return code
494
+ return_code = stdout_lines.empty? ? :command_cant_run : Integer(stdout_lines.last)
495
+ test.error "Command '#{cmd}' returned error code #{return_code}", "----- STDOUT:\n#{stdout_lines[0..-2].join("\n")}\n----- STDERR:\n#{cmd_stderr}" unless return_code == 0
496
+ test_info[:validator].call(stdout_lines[0..-2], cmd_stderr.split("\n"), return_code)
497
+ end
494
498
  end
495
499
  end
496
500
  end
@@ -1,5 +1,5 @@
1
1
  module HybridPlatformsConductor
2
2
 
3
- VERSION = '32.7.3'
3
+ VERSION = '32.9.1'
4
4
 
5
5
  end
@@ -94,6 +94,7 @@ module HybridPlatformsConductorTest
94
94
  ENV.delete 'hpc_password_for_thycotic'
95
95
  ENV.delete 'hpc_domain_for_thycotic'
96
96
  ENV.delete 'hpc_certificates'
97
+ ENV.delete 'hpc_interactive'
97
98
  # Set the necessary Hybrid Platforms Conductor environment variables
98
99
  ENV['hpc_ssh_user'] = 'test_user'
99
100
  HybridPlatformsConductor::ServicesHandler.packaged_deployments.clear
@@ -44,6 +44,58 @@ describe HybridPlatformsConductor::ActionsExecutor do
44
44
  end
45
45
  end
46
46
 
47
+ it 'can\'t create an SSH master to 1 node not having Session Exec capabilities when hpc_interactive is false' do
48
+ with_test_platform(nodes: { 'node' => { meta: { host_ip: '192.168.42.42', ssh_session_exec: 'false' } } }) do
49
+ ENV['hpc_interactive'] = 'false'
50
+ with_cmd_runner_mocked(
51
+ [
52
+ ['which env', proc { [0, "/usr/bin/env\n", ''] }],
53
+ ['ssh -V 2>&1', proc { [0, "OpenSSH_7.4p1 Debian-10+deb9u7, OpenSSL 1.0.2u 20 Dec 2019\n", ''] }]
54
+ ] + ssh_expected_commands_for(
55
+ { 'node' => { connection: '192.168.42.42', user: 'test_user' } },
56
+ with_control_master_create: false,
57
+ with_control_master_destroy: false
58
+ )
59
+ ) do
60
+ test_connector.ssh_user = 'test_user'
61
+ expect do
62
+ test_connector.with_connection_to(['node']) do
63
+ end
64
+ end.to raise_error 'Can\'t spawn interactive ControlMaster to node in non-interactive mode. You may want to change the hpc_interactive env variable.'
65
+ end
66
+ end
67
+ end
68
+
69
+ it 'fails without creating exception when creating an SSH master to 1 node not having Session Exec capabilities when hpc_interactive is false and we use no_exception' do
70
+ with_test_platform(nodes: {
71
+ 'node1' => { meta: { host_ip: '192.168.42.1' } },
72
+ 'node2' => { meta: { host_ip: '192.168.42.2', ssh_session_exec: 'false' } },
73
+ 'node3' => { meta: { host_ip: '192.168.42.3' } }
74
+ }) do
75
+ ENV['hpc_interactive'] = 'false'
76
+ with_cmd_runner_mocked(
77
+ [
78
+ ['which env', proc { [0, "/usr/bin/env\n", ''] }],
79
+ ['ssh -V 2>&1', proc { [0, "OpenSSH_7.4p1 Debian-10+deb9u7, OpenSSL 1.0.2u 20 Dec 2019\n", ''] }]
80
+ ] + ssh_expected_commands_for(
81
+ 'node1' => { connection: '192.168.42.1', user: 'test_user' },
82
+ 'node3' => { connection: '192.168.42.3', user: 'test_user' }
83
+ ) + ssh_expected_commands_for(
84
+ {
85
+ 'node2' => { connection: '192.168.42.2', user: 'test_user' }
86
+ },
87
+ with_control_master_create: false,
88
+ with_control_master_destroy: false
89
+ )
90
+ ) do
91
+ test_connector.ssh_user = 'test_user'
92
+ test_connector.with_connection_to(%w[node1 node2 node3], no_exception: true) do |connected_nodes|
93
+ expect(connected_nodes.sort).to eq %w[node1 node3].sort
94
+ end
95
+ end
96
+ end
97
+ end
98
+
47
99
  it 'creates SSH master to several nodes' do
48
100
  with_test_platform(nodes: {
49
101
  'node1' => { meta: { host_ip: '192.168.42.1' } },
@@ -10,6 +10,7 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
10
10
  with_sync_node do
11
11
  mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => {} }] * 5)
12
12
  expect(call_reserve_proxmox_container(2, 128 * 1024, 4, max_retries: 5)).to eq(error: 'not_enough_resources')
13
+ expect_proxmox_actions_to_be []
13
14
  end
14
15
  end
15
16
 
@@ -25,6 +26,69 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
25
26
  vm_id: 1000,
26
27
  vm_ip: '192.168.0.100'
27
28
  )
29
+ expect_proxmox_actions_to_be [
30
+ [
31
+ :post,
32
+ 'nodes/pve_node_name/lxc',
33
+ {
34
+ 'ostemplate' => 'test_template.iso',
35
+ 'hostname' => 'test.hostname.my-domain.com',
36
+ 'description' => /node: test_node\nenvironment: test_env/,
37
+ 'cores' => 2,
38
+ 'cpulimit' => 2,
39
+ 'memory' => 1024,
40
+ 'rootfs' => 'local-lvm:4',
41
+ 'net0' => 'name=eth0,bridge=vmbr0,gw=172.16.16.16,ip=192.168.0.100/32',
42
+ 'vmid' => 1000
43
+ }
44
+ ]
45
+ ]
46
+ end
47
+ end
48
+
49
+ it 'retries a few times before ending in error for a 5xx API error' do
50
+ with_sync_node do
51
+ mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => { error_strings: ['NOK: error code = 500'] * 5 } }])
52
+ result = call_reserve_proxmox_container(2, 1024, 4, config: { api_max_retries: 4 })
53
+ expect(result[:error]).not_to eq nil
54
+ expect(result[:error]).to match /Unhandled exception from reserve_proxmox_container: Proxmox API get nodes\/pve_node_name\/lxc returns NOK: error code = 500 continuously \(tried 5 times\)/
55
+ expect_proxmox_actions_to_be [
56
+ [:create_ticket],
57
+ [:create_ticket],
58
+ [:create_ticket],
59
+ [:create_ticket]
60
+ ]
61
+ end
62
+ end
63
+
64
+ it 'retries API errors a few times until it gets resolved' do
65
+ with_sync_node do
66
+ mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => { error_strings: ['NOK: error code = 500'] * 3 } }])
67
+ expect(call_reserve_proxmox_container(2, 1024, 4, config: { api_max_retries: 4 })).to eq(
68
+ pve_node: 'pve_node_name',
69
+ vm_id: 1000,
70
+ vm_ip: '192.168.0.100'
71
+ )
72
+ expect_proxmox_actions_to_be [
73
+ [:create_ticket],
74
+ [:create_ticket],
75
+ [:create_ticket],
76
+ [
77
+ :post,
78
+ 'nodes/pve_node_name/lxc',
79
+ {
80
+ 'ostemplate' => 'test_template.iso',
81
+ 'hostname' => 'test.hostname.my-domain.com',
82
+ 'description' => /node: test_node\nenvironment: test_env/,
83
+ 'cores' => 2,
84
+ 'cpulimit' => 2,
85
+ 'memory' => 1024,
86
+ 'rootfs' => 'local-lvm:4',
87
+ 'net0' => 'name=eth0,bridge=vmbr0,gw=172.16.16.16,ip=192.168.0.100/32',
88
+ 'vmid' => 1000
89
+ }
90
+ ]
91
+ ]
28
92
  end
29
93
  end
30
94
 
@@ -39,16 +39,7 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
39
39
  mock_proxmox_to_start_node(nbr_api_errors: 3)
40
40
  ]
41
41
  instance.create
42
- # To speed up the test, alter the wait time between retries.
43
- old_wait_secs = HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:RETRY_WAIT_TIME_SECS)
44
- begin
45
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
46
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, 1)
47
- instance.start
48
- ensure
49
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
50
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, old_wait_secs)
51
- end
42
+ instance.start
52
43
  end
53
44
  end
54
45
 
@@ -58,19 +49,10 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
58
49
  # 1 - The info on existing containers
59
50
  mock_proxmox_to_get_nodes_info,
60
51
  # 2 - The start of the container - fail too many times
61
- mock_proxmox_to_start_node(nbr_api_errors: HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:NBR_RETRIES_MAX), task_status: nil)
52
+ mock_proxmox_to_start_node(nbr_api_errors: 4, task_status: nil)
62
53
  ]
63
54
  instance.create
64
- # To speed up the test, alter the wait time between retries.
65
- old_wait_secs = HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:RETRY_WAIT_TIME_SECS)
66
- begin
67
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
68
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, 1)
69
- expect { instance.start }.to raise_error '[ node/test ] - Proxmox API call post nodes/pve_node_name/lxc/1024/status/start [] is constantly failing. Giving up.'
70
- ensure
71
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
72
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, old_wait_secs)
73
- end
55
+ expect { instance.start }.to raise_error '[ node/test ] - Proxmox API call post nodes/pve_node_name/lxc/1024/status/start [] is constantly failing. Giving up.'
74
56
  end
75
57
  end
76
58
 
@@ -23,6 +23,32 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
23
23
  end
24
24
  end
25
25
 
26
+ it 'retries calls to the API when getting back errors 5xx' do
27
+ with_test_proxmox_platform do |instance|
28
+ mock_proxmox_calls_with [
29
+ # 1 - The info on existing containers
30
+ mock_proxmox_to_get_nodes_info,
31
+ # 2 - The status of the container
32
+ mock_proxmox_to_status_node(nbr_api_errors: 3)
33
+ ]
34
+ instance.create
35
+ expect(instance.state).to eq :created
36
+ end
37
+ end
38
+
39
+ it 'fails to get an instance\'s status when the Proxmox API fails too many times' do
40
+ with_test_proxmox_platform do |instance|
41
+ mock_proxmox_calls_with [
42
+ # 1 - The info on existing containers
43
+ mock_proxmox_to_get_nodes_info,
44
+ # 2 - The status of the container
45
+ mock_proxmox_to_status_node(nbr_api_errors: 4, status: nil)
46
+ ]
47
+ instance.create
48
+ expect { instance.state }.to raise_error '[ node/test ] - Proxmox API call get nodes/pve_node_name/lxc returns NOK: error code = 500 continuously (tried 4 times)'
49
+ end
50
+ end
51
+
26
52
  end
27
53
 
28
54
  end
@@ -69,7 +69,7 @@ describe HybridPlatformsConductor::TestsRunner do
69
69
  'node12' => { 'test_node12.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node12', stdout, stderr, exit_code] } },
70
70
  'node21' => { 'test_node21.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node21', stdout, stderr, exit_code] } },
71
71
  'node22' => { 'test_node22.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node22', stdout, stderr, exit_code] } }
72
- }}
72
+ } }
73
73
  expect(test_tests_runner.run_tests([{ all: true }])).to eq 0
74
74
  expect(ssh_executions.sort).to eq [
75
75
  ['node11', ['stdout11'], ['stderr11'], 0],
@@ -88,7 +88,7 @@ describe HybridPlatformsConductor::TestsRunner do
88
88
  HybridPlatformsConductorTest::TestPlugins::NodeSsh.node_tests = { node_ssh_test: {
89
89
  'node12' => { 'test_node12.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node12', stdout, stderr, exit_code] } },
90
90
  'node22' => { 'test_node22.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node22', stdout, stderr, exit_code] } }
91
- }}
91
+ } }
92
92
  expect(test_tests_runner.run_tests(%w[node12 node22])).to eq 0
93
93
  expect(ssh_executions.sort).to eq [
94
94
  ['node12', ['stdout12'], ['stderr12'], 0],
@@ -97,6 +97,19 @@ describe HybridPlatformsConductor::TestsRunner do
97
97
  end
98
98
  end
99
99
 
100
+ it 'does not execute anything when the tests report no command' do
101
+ with_test_platform_for_node_connection_tests do
102
+ test_tests_runner.tests = [:node_ssh_test]
103
+ ssh_executions = []
104
+ HybridPlatformsConductorTest::TestPlugins::NodeSsh.node_tests = { node_ssh_test: {
105
+ 'node12' => {},
106
+ 'node22' => {}
107
+ } }
108
+ expect(test_tests_runner.run_tests(%w[node12 node22])).to eq 0
109
+ expect(ssh_executions).to eq []
110
+ end
111
+ end
112
+
100
113
  it 'executes several SSH node tests once per node with the correct command, grouping commands' do
101
114
  with_test_platform_for_node_connection_tests do
102
115
  expect_actions_executor_runs([proc do |actions|
@@ -54,13 +54,15 @@ module HybridPlatformsConductorTest
54
54
  if with_session_exec
55
55
  /^.+\/ssh #{with_batch_mode ? '-o BatchMode=yes ' : ''}-o ControlMaster=yes -o ControlPersist=yes hpc\.#{Regexp.escape(node)} true$/
56
56
  else
57
- # Mock the user hitting enter as the Control Master will be created in another thread and the main thread waits for user input.
58
- expect($stdin).to receive(:gets) do
59
- # We have to wait for the Control Master creation thread to actually create the Control Master before hitting Enter.
60
- while !control_master_created do
61
- sleep 0.1
57
+ unless ENV['hpc_interactive'] == 'false'
58
+ # Mock the user hitting enter as the Control Master will be created in another thread and the main thread waits for user input.
59
+ expect($stdin).to receive(:gets) do
60
+ # We have to wait for the Control Master creation thread to actually create the Control Master before hitting Enter.
61
+ while !control_master_created do
62
+ sleep 0.1
63
+ end
64
+ "\n"
62
65
  end
63
- "\n"
64
66
  end
65
67
  /^xterm -e '.+\/ssh -o ControlMaster=yes -o ControlPersist=yes hpc\.#{Regexp.escape(node)}'$/
66
68
  end,
@@ -23,6 +23,8 @@ module HybridPlatformsConductorTest
23
23
  test_platform path: '#{repository}'
24
24
  proxmox(
25
25
  api_url: 'https://my-proxmox.my-domain.com:8006',
26
+ api_max_retries: 3,
27
+ api_wait_between_retries_secs: 0,
26
28
  sync_node: 'node',
27
29
  test_config: {
28
30
  pve_nodes: ['pve_node_name'],
@@ -75,12 +77,20 @@ module HybridPlatformsConductorTest
75
77
  # * *proxmox_password* (String or nil): Proxmox password used to connect to Proxmox API [default: nil]
76
78
  # * *proxmox_realm* (String or nil): Proxmox realm used to connect to Proxmox API [default: 'pam']
77
79
  # * *nodes_info* (Array<Hash>): Nodes info returned by the Proxmox API [default: []]
80
+ # * *nbr_api_errors* (Integer): Number of API errors 500 to mock before getting a successful query [default: 0]
78
81
  # * *extra_expects* (Proc or nil): Code called for additional expectations on the proxmox instance, or nil if none [default: nil]
79
82
  # * Parameters::
80
83
  # * *proxmox* (Double): The mocked Proxmox instance
81
84
  # Result::
82
85
  # * Proc: Code called in place of Proxmox.new. Signature is the same as Proxmox.new.
83
- def mock_proxmox_to_get_nodes_info(proxmox_user: nil, proxmox_password: nil, proxmox_realm: 'pam', nodes_info: [], extra_expects: nil)
86
+ def mock_proxmox_to_get_nodes_info(
87
+ proxmox_user: nil,
88
+ proxmox_password: nil,
89
+ proxmox_realm: 'pam',
90
+ nodes_info: [],
91
+ nbr_api_errors: 0,
92
+ extra_expects: nil
93
+ )
84
94
  proc do |url, pve_node, user, password, realm, options|
85
95
  expect(url).to eq 'https://my-proxmox.my-domain.com:8006/api2/json/'
86
96
  expect(pve_node).to eq 'my-proxmox'
@@ -97,8 +107,10 @@ module HybridPlatformsConductorTest
97
107
  # Nothing
98
108
  end
99
109
  # Mock checking existing nodes
100
- expect(proxmox).to receive(:get).with('nodes') do
101
- nodes_info
110
+ idx_try = 0
111
+ expect(proxmox).to receive(:get).exactly(nbr_api_errors + 1).times.with('nodes') do
112
+ idx_try += 1
113
+ idx_try <= nbr_api_errors ? 'NOK: error code = 500' : nodes_info
102
114
  end
103
115
  extra_expects.call(proxmox) unless extra_expects.nil?
104
116
  proxmox
@@ -144,6 +156,7 @@ module HybridPlatformsConductorTest
144
156
  idx_try += 1
145
157
  idx_try <= nbr_api_errors ? 'NOK: error code = 500' : task_name
146
158
  end
159
+ expect(proxmox).to receive(:reauthenticate).exactly(nbr_api_errors - (task_status.nil? ? 1 : 0)).times
147
160
  # Mock checking task status
148
161
  unless task_status.nil?
149
162
  # Mock checking task status
@@ -243,13 +256,15 @@ module HybridPlatformsConductorTest
243
256
  # Parameters::
244
257
  # * *proxmox_user* (String or nil): Proxmox user used to connect to Proxmox API [default: nil]
245
258
  # * *proxmox_password* (String or nil): Proxmox password used to connect to Proxmox API [default: nil]
246
- # * *status* (String): Mocked status [default: 'created']
259
+ # * *status* (String or nil): Mocked status, or nil if it should not be asked [default: 'created']
260
+ # * *nbr_api_errors* (Integer): Number of API errors 500 to mock before getting a successful query [default: 0]
247
261
  # Result::
248
262
  # * Proc: Code called in place of Proxmox.new. Signature is the same as Proxmox.new.
249
263
  def mock_proxmox_to_status_node(
250
264
  proxmox_user: nil,
251
265
  proxmox_password: nil,
252
- task_status: 'OK'
266
+ status: 'created',
267
+ nbr_api_errors: 0
253
268
  )
254
269
  proc do |url, pve_node, user, password, realm, options|
255
270
  expect(url).to eq 'https://my-proxmox.my-domain.com:8006/api2/json/'
@@ -267,17 +282,26 @@ module HybridPlatformsConductorTest
267
282
  # Nothing
268
283
  end
269
284
  # Mock getting status of a container
270
- expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc') do
271
- [
285
+ idx_try = 0
286
+ expect(proxmox).to receive(:get).exactly(nbr_api_errors + (status.nil? ? 0 : 1)).times.with('nodes/pve_node_name/lxc') do
287
+ idx_try += 1
288
+ if idx_try <= nbr_api_errors
289
+ 'NOK: error code = 500'
290
+ else
291
+ [
292
+ {
293
+ 'vmid' => '1024'
294
+ }
295
+ ]
296
+ end
297
+ end
298
+ expect(proxmox).to receive(:reauthenticate).exactly(nbr_api_errors - (status.nil? ? 1 : 0)).times
299
+ unless status.nil?
300
+ expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc/1024/status/current') do
272
301
  {
273
- 'vmid' => '1024'
302
+ 'status' => status
274
303
  }
275
- ]
276
- end
277
- expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc/1024/status/current') do
278
- {
279
- 'status' => 'created'
280
- }
304
+ end
281
305
  end
282
306
  proxmox
283
307
  end
@@ -548,13 +572,17 @@ module HybridPlatformsConductorTest
548
572
  ]
549
573
  when /^nodes\/([^\/]+)\/lxc$/
550
574
  pve_node_name = $1
551
- pve_nodes[pve_node_name][:lxc_containers].map do |vm_id, vm_info|
552
- {
553
- 'vmid' => vm_id.to_s,
554
- 'maxdisk' => vm_info[:maxdisk],
555
- 'maxmem' => vm_info[:maxmem],
556
- 'cpus' => vm_info[:cpus]
557
- }
575
+ if pve_nodes[pve_node_name][:error_strings].nil? || pve_nodes[pve_node_name][:error_strings].empty?
576
+ pve_nodes[pve_node_name][:lxc_containers].map do |vm_id, vm_info|
577
+ {
578
+ 'vmid' => vm_id.to_s,
579
+ 'maxdisk' => vm_info[:maxdisk],
580
+ 'maxmem' => vm_info[:maxmem],
581
+ 'cpus' => vm_info[:cpus]
582
+ }
583
+ end
584
+ else
585
+ pve_nodes[pve_node_name][:error_strings].shift
558
586
  end
559
587
  when /^nodes\/([^\/]+)\/lxc\/([^\/]+)\/config$/
560
588
  pve_node_name = $1
@@ -615,6 +643,10 @@ module HybridPlatformsConductorTest
615
643
  raise "Unknown Proxmox API post call: #{path}. Please adapt the test framework."
616
644
  end
617
645
  end
646
+ # Mock create_ticket
647
+ allow(proxmox).to receive(:create_ticket) do
648
+ @proxmox_actions << [:create_ticket]
649
+ end
618
650
  proxmox
619
651
  end
620
652
  end,
@@ -642,14 +674,26 @@ module HybridPlatformsConductorTest
642
674
  # * *wait_before_retry* (Integer): Specify the number of seconds to wait before retry [default: 0]
643
675
  # * *create* (Hash or nil): Create file content, or nil if none [default: nil]
644
676
  # * *destroy* (Hash or nil): Destroy file content, or nil if none [default: nil]
677
+ # * *api_max_retries* (Integer): Max number of API retries [default: 3]
678
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries [default: 0]
645
679
  # Result::
646
680
  # * Hash: JSON result of the call
647
- def call_reserve_proxmox_container_with(config: {}, max_retries: 1, wait_before_retry: 0, create: nil, destroy: nil)
681
+ def call_reserve_proxmox_container_with(
682
+ config: {},
683
+ max_retries: 1,
684
+ wait_before_retry: 0,
685
+ create: nil,
686
+ destroy: nil,
687
+ api_max_retries: 3,
688
+ api_wait_between_retries_secs: 0
689
+ )
648
690
  # Make sure we set default values in the config
649
691
  config = {
650
692
  proxmox_api_url: 'https://my-proxmox.my-domain.com:8006',
651
693
  futex_file: "#{@repository}/proxmox/allocations.futex",
652
694
  logs_dir: "#{Dir.tmpdir}/hpc_test_proxmox_waiter_logs",
695
+ api_max_retries: api_max_retries,
696
+ api_wait_between_retries_secs: api_wait_between_retries_secs,
653
697
  pve_nodes: ['pve_node_name'],
654
698
  vm_ips_list: %w[
655
699
  192.168.0.100
@@ -716,7 +760,14 @@ module HybridPlatformsConductorTest
716
760
  # * *wait_before_retry* (Integer): Specify the number of seconds to wait before retry [default: 0]
717
761
  # Result::
718
762
  # * Hash: JSON result of the call
719
- def call_reserve_proxmox_container(cpus, ram_mb, disk_gb, config: {}, max_retries: 1, wait_before_retry: 0)
763
+ def call_reserve_proxmox_container(
764
+ cpus,
765
+ ram_mb,
766
+ disk_gb,
767
+ config: {},
768
+ max_retries: 1,
769
+ wait_before_retry: 0
770
+ )
720
771
  call_reserve_proxmox_container_with(
721
772
  config: config,
722
773
  max_retries: max_retries,
@@ -763,7 +814,13 @@ module HybridPlatformsConductorTest
763
814
  # Parameters::
764
815
  # * *expected_proxmox_actions* (Array<Array>): Expected Proxmox actions
765
816
  def expect_proxmox_actions_to_be(expected_proxmox_actions)
766
- expect(@proxmox_actions.size).to eq expected_proxmox_actions.size
817
+ expect(@proxmox_actions.size).to eq(expected_proxmox_actions.size), <<~EOS
818
+ Expected #{expected_proxmox_actions.size} Proxmox actions, but got #{@proxmox_actions.size} instead:
819
+ ----- Received:
820
+ #{@proxmox_actions.map(&:inspect).join("\n")}
821
+ ----- Expected:
822
+ #{expected_proxmox_actions.map(&:inspect).join("\n")}
823
+ EOS
767
824
  @proxmox_actions.zip(expected_proxmox_actions).each do |proxmox_action, expected_proxmox_action|
768
825
  expect(proxmox_action.size).to eq expected_proxmox_action.size
769
826
  expect(proxmox_action[0..1]).to eq expected_proxmox_action[0..1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybrid_platforms_conductor
3
3
  version: !ruby/object:Gem::Version
4
- version: 32.7.3
4
+ version: 32.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Muriel Salvan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-12 00:00:00.000000000 Z
11
+ date: 2021-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: range_operators