hybrid_platforms_conductor 32.7.3 → 32.9.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 83352b7821e41bc5b0282c693c47f0cac0f5a9915a45dfa37affd3d1fcaadd8c
4
- data.tar.gz: fba8bf35c2569716b2b8f934a46684c8b29843e50fe49b8133d3eeae580938d1
3
+ metadata.gz: 4f1beb4de64ac4dcdc1de3e5063980ea4dab1b8ceef204baae0a9f4bf209f1f3
4
+ data.tar.gz: 0c5e65553da39646554d737c8a690939497a8c89f602a4eae01a4987fff994b7
5
5
  SHA512:
6
- metadata.gz: 69c667f1f6c626372ecef7c21c9afdb2865a38e418ad6add2ec4d04d9a0eeda6892fed168d546d89d8805d843c31945eb885d53b0cf5208a65a7eb89ba75ea74
7
- data.tar.gz: b9478d38660f80b7c9f62411d72d0e890e74ec509973ceb3d417721d21f5f9923ffc5dfbc96b857af6301d7bd708630638d393dc3bbf2abcd4cf7d12e22a7af6
6
+ metadata.gz: 3129d3b37140c6488b2b6a8b6c1c8cb22ce745181a643925d9bb2fdc77de97a331660c8d5193dde940535d58b545576796d22a29008d181c73bbf7bb384c2251
7
+ data.tar.gz: 187c6215c37e89b924693c1819fea12f8cc444858ac575f9445b535ae9866a396b01dbc5baaf85ca4e8163d60c8865c931eca319c4824148fdd06029920b9710
@@ -505,15 +505,25 @@ module HybridPlatformsConductor
505
505
  if @nodes_handler.get_ssh_session_exec_of(node) == 'false'
506
506
  # Here we have to create a ControlMaster using an interactive session, as the SSH server prohibits ExecSession, and so command executions.
507
507
  # We'll do that using another terminal spawned in the background.
508
- Thread.new do
509
- log_debug "[ ControlMaster - #{ssh_url} ] - Spawn interactive ControlMaster in separate terminal"
510
- @cmd_runner.run_cmd "xterm -e '#{ssh_exec} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url}'", log_to_stdout: log_debug?
511
- log_debug "[ ControlMaster - #{ssh_url} ] - Separate interactive ControlMaster closed"
508
+ if ENV['hpc_interactive'] == 'false'
509
+ error = "Can't spawn interactive ControlMaster to #{node} in non-interactive mode. You may want to change the hpc_interactive env variable."
510
+ if no_exception
511
+ log_error error
512
+ exit_status = :non_interactive
513
+ else
514
+ raise error
515
+ end
516
+ else
517
+ Thread.new do
518
+ log_debug "[ ControlMaster - #{ssh_url} ] - Spawn interactive ControlMaster in separate terminal"
519
+ @cmd_runner.run_cmd "xterm -e '#{ssh_exec} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url}'", log_to_stdout: log_debug?
520
+ log_debug "[ ControlMaster - #{ssh_url} ] - Separate interactive ControlMaster closed"
521
+ end
522
+ out 'External ControlMaster has been spawned.'
523
+ out 'Please login into it, keep its session opened and press enter here when done...'
524
+ $stdin.gets
525
+ exit_status = 0
512
526
  end
513
- out 'External ControlMaster has been spawned.'
514
- out 'Please login into it, keep its session opened and press enter here when done...'
515
- $stdin.gets
516
- exit_status = 0
517
527
  else
518
528
  # Create the control master
519
529
  ssh_control_master_start_cmd = "#{ssh_exec}#{@passwords.key?(node) || @auth_password ? '' : ' -o BatchMode=yes'} -o ControlMaster=yes -o ControlPersist=yes #{ssh_url} true"
@@ -23,6 +23,13 @@ module HybridPlatformsConductor
23
23
  super
24
24
  end
25
25
 
26
+ # Re-authenticate the Proxmox instance
27
+ # This can be useful when the API returns errors due to invalidated tokens
28
+ def reauthenticate
29
+ log_info 'Force re-authentication to Proxmox'
30
+ @auth_params = create_ticket
31
+ end
32
+
26
33
  end
27
34
  ::Proxmox::Proxmox.prepend ProxmoxPatches
28
35
 
@@ -74,13 +81,13 @@ module HybridPlatformsConductor
74
81
  # First check if we already have a test container that corresponds to this node and environment
75
82
  @lxc_details = nil
76
83
  with_proxmox do |proxmox|
77
- proxmox.get('nodes').each do |node_info|
84
+ proxmox_get(proxmox, 'nodes').each do |node_info|
78
85
  if proxmox_test_info[:test_config][:pve_nodes].include?(node_info['node']) && node_info['status'] == 'online'
79
- proxmox.get("nodes/#{node_info['node']}/lxc").each do |lxc_info|
86
+ proxmox_get(proxmox, "nodes/#{node_info['node']}/lxc").each do |lxc_info|
80
87
  vm_id = Integer(lxc_info['vmid'])
81
88
  if vm_id.between?(*proxmox_test_info[:test_config][:vm_ids_range])
82
89
  # Check if the description contains our ID
83
- lxc_config = proxmox.get("nodes/#{node_info['node']}/lxc/#{vm_id}/config")
90
+ lxc_config = proxmox_get(proxmox, "nodes/#{node_info['node']}/lxc/#{vm_id}/config")
84
91
  vm_description_lines = (lxc_config['description'] || '').split("\n")
85
92
  hpc_marker_idx = vm_description_lines.index('===== HPC info =====')
86
93
  unless hpc_marker_idx.nil?
@@ -222,8 +229,8 @@ module HybridPlatformsConductor
222
229
  with_proxmox do |proxmox|
223
230
  vm_id_str = @lxc_details[:vm_id].to_s
224
231
  status =
225
- if proxmox.get("nodes/#{@lxc_details[:pve_node]}/lxc").any? { |data_info| data_info['vmid'] == vm_id_str }
226
- status_info = proxmox.get("nodes/#{@lxc_details[:pve_node]}/lxc/#{@lxc_details[:vm_id]}/status/current")
232
+ if proxmox_get(proxmox, "nodes/#{@lxc_details[:pve_node]}/lxc").any? { |data_info| data_info['vmid'] == vm_id_str }
233
+ status_info = proxmox_get(proxmox, "nodes/#{@lxc_details[:pve_node]}/lxc/#{@lxc_details[:vm_id]}/status/current")
227
234
  # Careful that it is possible that somebody destroyed the VM and so its status is missing
228
235
  status = status_info.key?('status') ? status_info['status'].to_sym : :missing
229
236
  status = :exited if status == :stopped
@@ -292,11 +299,29 @@ module HybridPlatformsConductor
292
299
  end
293
300
  end
294
301
 
295
- # Maximum number of retries to perform on the Proxmox API.
296
- NBR_RETRIES_MAX = 5
297
-
298
- # Minimum seconds to wait between retries
299
- RETRY_WAIT_TIME_SECS = 5
302
+ # Perform a get operation on the API
303
+ # Protect the get API methods with a retry mechanism in case of 5xx errors.
304
+ #
305
+ # Parameters::
306
+ # * *proxmox* (Proxmox): The Proxmox instance
307
+ # * *path* (String): Path to get
308
+ # Result::
309
+ # * Object: API response
310
+ def proxmox_get(proxmox, path)
311
+ response = nil
312
+ idx_try = 0
313
+ loop do
314
+ response = proxmox.get(path)
315
+ break if !(response.is_a?(String)) || !(response =~ /^NOK: error code = 5\d\d$/)
316
+ log_warn "[ #{@node}/#{@environment} ] - Proxmox API call get #{path} returned error #{response} (attempt ##{idx_try}/#{proxmox_test_info[:api_max_retries]})"
317
+ raise "[ #{@node}/#{@environment} ] - Proxmox API call get #{path} returns #{response} continuously (tried #{idx_try + 1} times)" if idx_try >= proxmox_test_info[:api_max_retries]
318
+ idx_try += 1
319
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
320
+ proxmox.reauthenticate
321
+ sleep proxmox_test_info[:api_wait_between_retries_secs] + rand(5)
322
+ end
323
+ response
324
+ end
300
325
 
301
326
  # Run a Proxmox task.
302
327
  # Handle a retry mechanism in case of 5xx errors.
@@ -313,11 +338,13 @@ module HybridPlatformsConductor
313
338
  while task.nil? do
314
339
  task = proxmox.send(http_method, "nodes/#{pve_node}/#{sub_path}", *args)
315
340
  if task =~ /^NOK: error code = 5\d\d$/
316
- log_warn "[ #{@node}/#{@environment} ] - Proxmox API call #{http_method} nodes/#{pve_node}/#{sub_path} #{args} returned error #{task} (attempt ##{idx_try}/#{NBR_RETRIES_MAX})"
341
+ log_warn "[ #{@node}/#{@environment} ] - Proxmox API call #{http_method} nodes/#{pve_node}/#{sub_path} #{args} returned error #{task} (attempt ##{idx_try}/#{proxmox_test_info[:api_max_retries]})"
317
342
  task = nil
343
+ break if idx_try >= proxmox_test_info[:api_max_retries]
318
344
  idx_try += 1
319
- break if idx_try == NBR_RETRIES_MAX
320
- sleep RETRY_WAIT_TIME_SECS + rand(5)
345
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
346
+ proxmox.reauthenticate
347
+ sleep proxmox_test_info[:api_wait_between_retries_secs] + rand(5)
321
348
  end
322
349
  end
323
350
  if task.nil?
@@ -358,7 +385,7 @@ module HybridPlatformsConductor
358
385
  # Result::
359
386
  # * String: The task status
360
387
  def task_status(proxmox, pve_node, task)
361
- status_info = proxmox.get("nodes/#{pve_node}/tasks/#{task}/status")
388
+ status_info = proxmox_get(proxmox, "nodes/#{pve_node}/tasks/#{task}/status")
362
389
  "#{status_info['status']}#{status_info['exitstatus'] ? ":#{status_info['exitstatus']}" : ''}"
363
390
  end
364
391
 
@@ -377,7 +404,9 @@ module HybridPlatformsConductor
377
404
  (proxmox_test_info[:test_config].merge(
378
405
  proxmox_api_url: proxmox_test_info[:api_url],
379
406
  futex_file: '/tmp/hpc_proxmox_allocations.futex',
380
- logs_dir: '/tmp/hpc_proxmox_waiter_logs'
407
+ logs_dir: '/tmp/hpc_proxmox_waiter_logs',
408
+ api_max_retries: proxmox_test_info[:api_max_retries],
409
+ api_wait_between_retries_secs: proxmox_test_info[:api_wait_between_retries_secs]
381
410
  )).to_json
382
411
  )
383
412
  result = nil
@@ -486,7 +515,7 @@ module HybridPlatformsConductor
486
515
  # So remaining length is 255 - 13 = 242 characters.
487
516
  MAX_FILE_ID_SIZE = 242
488
517
 
489
- # Get an ID unique for theis node/environment and that can be used in file names.
518
+ # Get an ID unique for this node/environment and that can be used in file names.
490
519
  #
491
520
  # Result::
492
521
  # * String: ID
@@ -506,6 +535,8 @@ module HybridPlatformsConductor
506
535
  # Result::
507
536
  # * Hash<Symbol,Object>: Configuration of the Proxmox instance to be used:
508
537
  # * *api_url* (String): The Proxmox API URL
538
+ # * *api_max_retries* (Integer): Max number of API retries
539
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries
509
540
  # * *sync_node* (String): Node to be used to synchronize Proxmox resources acquisition
510
541
  # * *test_config* (Hash<Symbol,Object>): The test configuration. Check ProxmoxWaiter#initialize (config_file structure) method to get details.
511
542
  # * *vm_config* (Hash<Symbol,Object>): Extra configuration of a created container. Check #request_lxc_creation_for results to get details.
@@ -26,6 +26,8 @@ class ProxmoxWaiter
26
26
  # * *proxmox_api_url* (String): Proxmox API URL.
27
27
  # * *futex_file* (String): Path to the file serving as a futex.
28
28
  # * *logs_dir* (String): Path to the directory containing logs [default: '.']
29
+ # * *api_max_retries* (Integer): Max number of API retries
30
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries
29
31
  # * *pve_nodes* (Array<String>): List of PVE nodes allowed to spawn new containers [default: all]
30
32
  # * *vm_ips_list* (Array<String>): The list of IPs that are available for the Proxmox containers.
31
33
  # * *vm_ids_range* ([Integer, Integer]): Minimum and maximum reservable VM ID
@@ -637,11 +639,26 @@ class ProxmoxWaiter
637
639
 
638
640
  # Get a path from the API and return its JSON result.
639
641
  # Keep a cache of it, whose lifespan is this ProxmoxWaiter instance.
642
+ # Have a retry mechanism to make sure possible non-deterministic 5xx errors are not an issue.
640
643
  #
641
644
  # Parameters::
642
645
  # * *path* (String): API path to query
643
- def api_get(path)
644
- @gets_cache[path] = @proxmox.get(path) unless @gets_cache.key?(path)
646
+ # Result::
647
+ # * Object: The API response
648
+ def api_get(path, nbr_retries: 3, wait_between_retry_secs: 10)
649
+ unless @gets_cache.key?(path)
650
+ idx_try = 0
651
+ loop do
652
+ @gets_cache[path] = @proxmox.get(path)
653
+ break unless @gets_cache[path].is_a?(String) && @gets_cache[path] =~ /^NOK: error code = 5\d\d$/
654
+ raise "Proxmox API get #{path} returns #{@gets_cache[path]} continuously (tried #{idx_try + 1} times)" if idx_try >= @config['api_max_retries']
655
+ idx_try += 1
656
+ # We have to reauthenticate: errors 500 raised by Proxmox are often due to the token being wrongly invalidated
657
+ # TODO: Provide a way to do it properly in the official gem
658
+ @proxmox.instance_variable_set(:@auth_params, @proxmox.send(:create_ticket))
659
+ sleep @config['api_wait_between_retries_secs']
660
+ end
661
+ end
645
662
  @gets_cache[path]
646
663
  end
647
664
 
@@ -428,9 +428,11 @@ module HybridPlatformsConductor
428
428
  end
429
429
  end
430
430
  # Compute the timeout that will be applied, from the max timeout sum for every node that has tests to run
431
- timeout = CONNECTION_TIMEOUT + @cmds_to_run.map do |_node, cmds_list|
432
- cmds_list.inject(0) { |total_timeout, (_cmd, test_info)| test_info[:timeout] + total_timeout }
433
- end.max
431
+ timeout = CONNECTION_TIMEOUT + (
432
+ @cmds_to_run.map do |_node, cmds_list|
433
+ cmds_list.inject(0) { |total_timeout, (_cmd, test_info)| test_info[:timeout] + total_timeout }
434
+ end.max || 0
435
+ )
434
436
  # Run commands on nodes, in grouped way to avoid too many connections, per node
435
437
  # Hash< String, Array<String> >
436
438
  @test_cmds = Hash[@cmds_to_run.map do |node, cmds_list|
@@ -464,33 +466,35 @@ module HybridPlatformsConductor
464
466
  end,
465
467
  test_execution: proc do |test|
466
468
  exit_status, stdout, stderr = @actions_result[test.node]
467
- if exit_status.is_a?(Symbol)
468
- test.error "Error while executing tests: #{exit_status}: #{stderr}"
469
- else
470
- log_debug <<~EOS
471
- ----- Commands for #{test.node}:
472
- #{@test_cmds[test.node][:remote_bash].join("\n")}
473
- ----- STDOUT:
474
- #{stdout}
475
- ----- STDERR:
476
- #{stderr}
477
- -----
478
- EOS
479
- # Skip the first section, as it can contain SSH banners
480
- cmd_stdouts = stdout.split("#{CMD_SEPARATOR}\n")[1..-1]
481
- cmd_stdouts = [] if cmd_stdouts.nil?
482
- cmd_stderrs = stderr.split("#{CMD_SEPARATOR}\n")[1..-1]
483
- cmd_stderrs = [] if cmd_stderrs.nil?
484
- @cmds_to_run[test.node].zip(cmd_stdouts, cmd_stderrs).each do |(cmd, test_info), cmd_stdout, cmd_stderr|
485
- # Find the section that corresponds to this test
486
- if test_info[:test] == test
487
- cmd_stdout = '' if cmd_stdout.nil?
488
- cmd_stderr = '' if cmd_stderr.nil?
489
- stdout_lines = cmd_stdout.split("\n")
490
- # Last line of stdout is the return code
491
- return_code = stdout_lines.empty? ? :command_cant_run : Integer(stdout_lines.last)
492
- test.error "Command '#{cmd}' returned error code #{return_code}", "----- STDOUT:\n#{stdout_lines[0..-2].join("\n")}\n----- STDERR:\n#{cmd_stderr}" unless return_code == 0
493
- test_info[:validator].call(stdout_lines[0..-2], cmd_stderr.split("\n"), return_code)
469
+ unless exit_status.nil?
470
+ if exit_status.is_a?(Symbol)
471
+ test.error "Error while executing tests: #{exit_status}: #{stderr}"
472
+ else
473
+ log_debug <<~EOS
474
+ ----- Commands for #{test.node}:
475
+ #{@test_cmds[test.node][:remote_bash].join("\n")}
476
+ ----- STDOUT:
477
+ #{stdout}
478
+ ----- STDERR:
479
+ #{stderr}
480
+ -----
481
+ EOS
482
+ # Skip the first section, as it can contain SSH banners
483
+ cmd_stdouts = stdout.split("#{CMD_SEPARATOR}\n")[1..-1]
484
+ cmd_stdouts = [] if cmd_stdouts.nil?
485
+ cmd_stderrs = stderr.split("#{CMD_SEPARATOR}\n")[1..-1]
486
+ cmd_stderrs = [] if cmd_stderrs.nil?
487
+ @cmds_to_run[test.node].zip(cmd_stdouts, cmd_stderrs).each do |(cmd, test_info), cmd_stdout, cmd_stderr|
488
+ # Find the section that corresponds to this test
489
+ if test_info[:test] == test
490
+ cmd_stdout = '' if cmd_stdout.nil?
491
+ cmd_stderr = '' if cmd_stderr.nil?
492
+ stdout_lines = cmd_stdout.split("\n")
493
+ # Last line of stdout is the return code
494
+ return_code = stdout_lines.empty? ? :command_cant_run : Integer(stdout_lines.last)
495
+ test.error "Command '#{cmd}' returned error code #{return_code}", "----- STDOUT:\n#{stdout_lines[0..-2].join("\n")}\n----- STDERR:\n#{cmd_stderr}" unless return_code == 0
496
+ test_info[:validator].call(stdout_lines[0..-2], cmd_stderr.split("\n"), return_code)
497
+ end
494
498
  end
495
499
  end
496
500
  end
@@ -1,5 +1,5 @@
1
1
  module HybridPlatformsConductor
2
2
 
3
- VERSION = '32.7.3'
3
+ VERSION = '32.9.1'
4
4
 
5
5
  end
@@ -94,6 +94,7 @@ module HybridPlatformsConductorTest
94
94
  ENV.delete 'hpc_password_for_thycotic'
95
95
  ENV.delete 'hpc_domain_for_thycotic'
96
96
  ENV.delete 'hpc_certificates'
97
+ ENV.delete 'hpc_interactive'
97
98
  # Set the necessary Hybrid Platforms Conductor environment variables
98
99
  ENV['hpc_ssh_user'] = 'test_user'
99
100
  HybridPlatformsConductor::ServicesHandler.packaged_deployments.clear
@@ -44,6 +44,58 @@ describe HybridPlatformsConductor::ActionsExecutor do
44
44
  end
45
45
  end
46
46
 
47
+ it 'can\'t create an SSH master to 1 node not having Session Exec capabilities when hpc_interactive is false' do
48
+ with_test_platform(nodes: { 'node' => { meta: { host_ip: '192.168.42.42', ssh_session_exec: 'false' } } }) do
49
+ ENV['hpc_interactive'] = 'false'
50
+ with_cmd_runner_mocked(
51
+ [
52
+ ['which env', proc { [0, "/usr/bin/env\n", ''] }],
53
+ ['ssh -V 2>&1', proc { [0, "OpenSSH_7.4p1 Debian-10+deb9u7, OpenSSL 1.0.2u 20 Dec 2019\n", ''] }]
54
+ ] + ssh_expected_commands_for(
55
+ { 'node' => { connection: '192.168.42.42', user: 'test_user' } },
56
+ with_control_master_create: false,
57
+ with_control_master_destroy: false
58
+ )
59
+ ) do
60
+ test_connector.ssh_user = 'test_user'
61
+ expect do
62
+ test_connector.with_connection_to(['node']) do
63
+ end
64
+ end.to raise_error 'Can\'t spawn interactive ControlMaster to node in non-interactive mode. You may want to change the hpc_interactive env variable.'
65
+ end
66
+ end
67
+ end
68
+
69
+ it 'fails without creating exception when creating an SSH master to 1 node not having Session Exec capabilities when hpc_interactive is false and we use no_exception' do
70
+ with_test_platform(nodes: {
71
+ 'node1' => { meta: { host_ip: '192.168.42.1' } },
72
+ 'node2' => { meta: { host_ip: '192.168.42.2', ssh_session_exec: 'false' } },
73
+ 'node3' => { meta: { host_ip: '192.168.42.3' } }
74
+ }) do
75
+ ENV['hpc_interactive'] = 'false'
76
+ with_cmd_runner_mocked(
77
+ [
78
+ ['which env', proc { [0, "/usr/bin/env\n", ''] }],
79
+ ['ssh -V 2>&1', proc { [0, "OpenSSH_7.4p1 Debian-10+deb9u7, OpenSSL 1.0.2u 20 Dec 2019\n", ''] }]
80
+ ] + ssh_expected_commands_for(
81
+ 'node1' => { connection: '192.168.42.1', user: 'test_user' },
82
+ 'node3' => { connection: '192.168.42.3', user: 'test_user' }
83
+ ) + ssh_expected_commands_for(
84
+ {
85
+ 'node2' => { connection: '192.168.42.2', user: 'test_user' }
86
+ },
87
+ with_control_master_create: false,
88
+ with_control_master_destroy: false
89
+ )
90
+ ) do
91
+ test_connector.ssh_user = 'test_user'
92
+ test_connector.with_connection_to(%w[node1 node2 node3], no_exception: true) do |connected_nodes|
93
+ expect(connected_nodes.sort).to eq %w[node1 node3].sort
94
+ end
95
+ end
96
+ end
97
+ end
98
+
47
99
  it 'creates SSH master to several nodes' do
48
100
  with_test_platform(nodes: {
49
101
  'node1' => { meta: { host_ip: '192.168.42.1' } },
@@ -10,6 +10,7 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
10
10
  with_sync_node do
11
11
  mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => {} }] * 5)
12
12
  expect(call_reserve_proxmox_container(2, 128 * 1024, 4, max_retries: 5)).to eq(error: 'not_enough_resources')
13
+ expect_proxmox_actions_to_be []
13
14
  end
14
15
  end
15
16
 
@@ -25,6 +26,69 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
25
26
  vm_id: 1000,
26
27
  vm_ip: '192.168.0.100'
27
28
  )
29
+ expect_proxmox_actions_to_be [
30
+ [
31
+ :post,
32
+ 'nodes/pve_node_name/lxc',
33
+ {
34
+ 'ostemplate' => 'test_template.iso',
35
+ 'hostname' => 'test.hostname.my-domain.com',
36
+ 'description' => /node: test_node\nenvironment: test_env/,
37
+ 'cores' => 2,
38
+ 'cpulimit' => 2,
39
+ 'memory' => 1024,
40
+ 'rootfs' => 'local-lvm:4',
41
+ 'net0' => 'name=eth0,bridge=vmbr0,gw=172.16.16.16,ip=192.168.0.100/32',
42
+ 'vmid' => 1000
43
+ }
44
+ ]
45
+ ]
46
+ end
47
+ end
48
+
49
+ it 'retries a few times before ending in error for a 5xx API error' do
50
+ with_sync_node do
51
+ mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => { error_strings: ['NOK: error code = 500'] * 5 } }])
52
+ result = call_reserve_proxmox_container(2, 1024, 4, config: { api_max_retries: 4 })
53
+ expect(result[:error]).not_to eq nil
54
+ expect(result[:error]).to match /Unhandled exception from reserve_proxmox_container: Proxmox API get nodes\/pve_node_name\/lxc returns NOK: error code = 500 continuously \(tried 5 times\)/
55
+ expect_proxmox_actions_to_be [
56
+ [:create_ticket],
57
+ [:create_ticket],
58
+ [:create_ticket],
59
+ [:create_ticket]
60
+ ]
61
+ end
62
+ end
63
+
64
+ it 'retries API errors a few times until it gets resolved' do
65
+ with_sync_node do
66
+ mock_proxmox(mocked_pve_nodes: [{ 'pve_node_name' => { error_strings: ['NOK: error code = 500'] * 3 } }])
67
+ expect(call_reserve_proxmox_container(2, 1024, 4, config: { api_max_retries: 4 })).to eq(
68
+ pve_node: 'pve_node_name',
69
+ vm_id: 1000,
70
+ vm_ip: '192.168.0.100'
71
+ )
72
+ expect_proxmox_actions_to_be [
73
+ [:create_ticket],
74
+ [:create_ticket],
75
+ [:create_ticket],
76
+ [
77
+ :post,
78
+ 'nodes/pve_node_name/lxc',
79
+ {
80
+ 'ostemplate' => 'test_template.iso',
81
+ 'hostname' => 'test.hostname.my-domain.com',
82
+ 'description' => /node: test_node\nenvironment: test_env/,
83
+ 'cores' => 2,
84
+ 'cpulimit' => 2,
85
+ 'memory' => 1024,
86
+ 'rootfs' => 'local-lvm:4',
87
+ 'net0' => 'name=eth0,bridge=vmbr0,gw=172.16.16.16,ip=192.168.0.100/32',
88
+ 'vmid' => 1000
89
+ }
90
+ ]
91
+ ]
28
92
  end
29
93
  end
30
94
 
@@ -39,16 +39,7 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
39
39
  mock_proxmox_to_start_node(nbr_api_errors: 3)
40
40
  ]
41
41
  instance.create
42
- # To speed up the test, alter the wait time between retries.
43
- old_wait_secs = HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:RETRY_WAIT_TIME_SECS)
44
- begin
45
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
46
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, 1)
47
- instance.start
48
- ensure
49
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
50
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, old_wait_secs)
51
- end
42
+ instance.start
52
43
  end
53
44
  end
54
45
 
@@ -58,19 +49,10 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
58
49
  # 1 - The info on existing containers
59
50
  mock_proxmox_to_get_nodes_info,
60
51
  # 2 - The start of the container - fail too many times
61
- mock_proxmox_to_start_node(nbr_api_errors: HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:NBR_RETRIES_MAX), task_status: nil)
52
+ mock_proxmox_to_start_node(nbr_api_errors: 4, task_status: nil)
62
53
  ]
63
54
  instance.create
64
- # To speed up the test, alter the wait time between retries.
65
- old_wait_secs = HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_get(:RETRY_WAIT_TIME_SECS)
66
- begin
67
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
68
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, 1)
69
- expect { instance.start }.to raise_error '[ node/test ] - Proxmox API call post nodes/pve_node_name/lxc/1024/status/start [] is constantly failing. Giving up.'
70
- ensure
71
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.send(:remove_const, :RETRY_WAIT_TIME_SECS)
72
- HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox.const_set(:RETRY_WAIT_TIME_SECS, old_wait_secs)
73
- end
55
+ expect { instance.start }.to raise_error '[ node/test ] - Proxmox API call post nodes/pve_node_name/lxc/1024/status/start [] is constantly failing. Giving up.'
74
56
  end
75
57
  end
76
58
 
@@ -23,6 +23,32 @@ describe HybridPlatformsConductor::HpcPlugins::Provisioner::Proxmox do
23
23
  end
24
24
  end
25
25
 
26
+ it 'retries calls to the API when getting back errors 5xx' do
27
+ with_test_proxmox_platform do |instance|
28
+ mock_proxmox_calls_with [
29
+ # 1 - The info on existing containers
30
+ mock_proxmox_to_get_nodes_info,
31
+ # 2 - The status of the container
32
+ mock_proxmox_to_status_node(nbr_api_errors: 3)
33
+ ]
34
+ instance.create
35
+ expect(instance.state).to eq :created
36
+ end
37
+ end
38
+
39
+ it 'fails to get an instance\'s status when the Proxmox API fails too many times' do
40
+ with_test_proxmox_platform do |instance|
41
+ mock_proxmox_calls_with [
42
+ # 1 - The info on existing containers
43
+ mock_proxmox_to_get_nodes_info,
44
+ # 2 - The status of the container
45
+ mock_proxmox_to_status_node(nbr_api_errors: 4, status: nil)
46
+ ]
47
+ instance.create
48
+ expect { instance.state }.to raise_error '[ node/test ] - Proxmox API call get nodes/pve_node_name/lxc returns NOK: error code = 500 continuously (tried 4 times)'
49
+ end
50
+ end
51
+
26
52
  end
27
53
 
28
54
  end
@@ -69,7 +69,7 @@ describe HybridPlatformsConductor::TestsRunner do
69
69
  'node12' => { 'test_node12.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node12', stdout, stderr, exit_code] } },
70
70
  'node21' => { 'test_node21.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node21', stdout, stderr, exit_code] } },
71
71
  'node22' => { 'test_node22.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node22', stdout, stderr, exit_code] } }
72
- }}
72
+ } }
73
73
  expect(test_tests_runner.run_tests([{ all: true }])).to eq 0
74
74
  expect(ssh_executions.sort).to eq [
75
75
  ['node11', ['stdout11'], ['stderr11'], 0],
@@ -88,7 +88,7 @@ describe HybridPlatformsConductor::TestsRunner do
88
88
  HybridPlatformsConductorTest::TestPlugins::NodeSsh.node_tests = { node_ssh_test: {
89
89
  'node12' => { 'test_node12.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node12', stdout, stderr, exit_code] } },
90
90
  'node22' => { 'test_node22.sh' => proc { |stdout, stderr, exit_code| ssh_executions << ['node22', stdout, stderr, exit_code] } }
91
- }}
91
+ } }
92
92
  expect(test_tests_runner.run_tests(%w[node12 node22])).to eq 0
93
93
  expect(ssh_executions.sort).to eq [
94
94
  ['node12', ['stdout12'], ['stderr12'], 0],
@@ -97,6 +97,19 @@ describe HybridPlatformsConductor::TestsRunner do
97
97
  end
98
98
  end
99
99
 
100
+ it 'does not execute anything when the tests report no command' do
101
+ with_test_platform_for_node_connection_tests do
102
+ test_tests_runner.tests = [:node_ssh_test]
103
+ ssh_executions = []
104
+ HybridPlatformsConductorTest::TestPlugins::NodeSsh.node_tests = { node_ssh_test: {
105
+ 'node12' => {},
106
+ 'node22' => {}
107
+ } }
108
+ expect(test_tests_runner.run_tests(%w[node12 node22])).to eq 0
109
+ expect(ssh_executions).to eq []
110
+ end
111
+ end
112
+
100
113
  it 'executes several SSH node tests once per node with the correct command, grouping commands' do
101
114
  with_test_platform_for_node_connection_tests do
102
115
  expect_actions_executor_runs([proc do |actions|
@@ -54,13 +54,15 @@ module HybridPlatformsConductorTest
54
54
  if with_session_exec
55
55
  /^.+\/ssh #{with_batch_mode ? '-o BatchMode=yes ' : ''}-o ControlMaster=yes -o ControlPersist=yes hpc\.#{Regexp.escape(node)} true$/
56
56
  else
57
- # Mock the user hitting enter as the Control Master will be created in another thread and the main thread waits for user input.
58
- expect($stdin).to receive(:gets) do
59
- # We have to wait for the Control Master creation thread to actually create the Control Master before hitting Enter.
60
- while !control_master_created do
61
- sleep 0.1
57
+ unless ENV['hpc_interactive'] == 'false'
58
+ # Mock the user hitting enter as the Control Master will be created in another thread and the main thread waits for user input.
59
+ expect($stdin).to receive(:gets) do
60
+ # We have to wait for the Control Master creation thread to actually create the Control Master before hitting Enter.
61
+ while !control_master_created do
62
+ sleep 0.1
63
+ end
64
+ "\n"
62
65
  end
63
- "\n"
64
66
  end
65
67
  /^xterm -e '.+\/ssh -o ControlMaster=yes -o ControlPersist=yes hpc\.#{Regexp.escape(node)}'$/
66
68
  end,
@@ -23,6 +23,8 @@ module HybridPlatformsConductorTest
23
23
  test_platform path: '#{repository}'
24
24
  proxmox(
25
25
  api_url: 'https://my-proxmox.my-domain.com:8006',
26
+ api_max_retries: 3,
27
+ api_wait_between_retries_secs: 0,
26
28
  sync_node: 'node',
27
29
  test_config: {
28
30
  pve_nodes: ['pve_node_name'],
@@ -75,12 +77,20 @@ module HybridPlatformsConductorTest
75
77
  # * *proxmox_password* (String or nil): Proxmox password used to connect to Proxmox API [default: nil]
76
78
  # * *proxmox_realm* (String or nil): Proxmox realm used to connect to Proxmox API [default: 'pam']
77
79
  # * *nodes_info* (Array<Hash>): Nodes info returned by the Proxmox API [default: []]
80
+ # * *nbr_api_errors* (Integer): Number of API 500 errors to mock before getting a successful query [default: 0]
78
81
  # * *extra_expects* (Proc or nil): Code called for additional expectations on the proxmox instance, or nil if none [default: nil]
79
82
  # * Parameters::
80
83
  # * *proxmox* (Double): The mocked Proxmox instance
81
84
  # Result::
82
85
  # * Proc: Code called in place of Proxmox.new. Signature is the same as Proxmox.new.
83
- def mock_proxmox_to_get_nodes_info(proxmox_user: nil, proxmox_password: nil, proxmox_realm: 'pam', nodes_info: [], extra_expects: nil)
86
+ def mock_proxmox_to_get_nodes_info(
87
+ proxmox_user: nil,
88
+ proxmox_password: nil,
89
+ proxmox_realm: 'pam',
90
+ nodes_info: [],
91
+ nbr_api_errors: 0,
92
+ extra_expects: nil
93
+ )
84
94
  proc do |url, pve_node, user, password, realm, options|
85
95
  expect(url).to eq 'https://my-proxmox.my-domain.com:8006/api2/json/'
86
96
  expect(pve_node).to eq 'my-proxmox'
@@ -97,8 +107,10 @@ module HybridPlatformsConductorTest
97
107
  # Nothing
98
108
  end
99
109
  # Mock checking existing nodes
100
- expect(proxmox).to receive(:get).with('nodes') do
101
- nodes_info
110
+ idx_try = 0
111
+ expect(proxmox).to receive(:get).exactly(nbr_api_errors + 1).times.with('nodes') do
112
+ idx_try += 1
113
+ idx_try <= nbr_api_errors ? 'NOK: error code = 500' : nodes_info
102
114
  end
103
115
  extra_expects.call(proxmox) unless extra_expects.nil?
104
116
  proxmox
@@ -144,6 +156,7 @@ module HybridPlatformsConductorTest
144
156
  idx_try += 1
145
157
  idx_try <= nbr_api_errors ? 'NOK: error code = 500' : task_name
146
158
  end
159
+ expect(proxmox).to receive(:reauthenticate).exactly(nbr_api_errors - (task_status.nil? ? 1 : 0)).times
147
160
  # Mock checking task status
148
161
  unless task_status.nil?
149
162
  # Mock checking task status
@@ -243,13 +256,15 @@ module HybridPlatformsConductorTest
243
256
  # Parameters::
244
257
  # * *proxmox_user* (String or nil): Proxmox user used to connect to Proxmox API [default: nil]
245
258
  # * *proxmox_password* (String or nil): Proxmox password used to connect to Proxmox API [default: nil]
246
- # * *status* (String): Mocked status [default: 'created']
259
+ # * *status* (String or nil): Mocked status, or nil if it should not be asked [default: 'created']
260
+ # * *nbr_api_errors* (Integer): Number of API 500 errors to mock before getting a successful query [default: 0]
247
261
  # Result::
248
262
  # * Proc: Code called in place of Proxmox.new. Signature is the same as Proxmox.new.
249
263
  def mock_proxmox_to_status_node(
250
264
  proxmox_user: nil,
251
265
  proxmox_password: nil,
252
- task_status: 'OK'
266
+ status: 'created',
267
+ nbr_api_errors: 0
253
268
  )
254
269
  proc do |url, pve_node, user, password, realm, options|
255
270
  expect(url).to eq 'https://my-proxmox.my-domain.com:8006/api2/json/'
@@ -267,17 +282,26 @@ module HybridPlatformsConductorTest
267
282
  # Nothing
268
283
  end
269
284
  # Mock getting status of a container
270
- expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc') do
271
- [
285
+ idx_try = 0
286
+ expect(proxmox).to receive(:get).exactly(nbr_api_errors + (status.nil? ? 0 : 1)).times.with('nodes/pve_node_name/lxc') do
287
+ idx_try += 1
288
+ if idx_try <= nbr_api_errors
289
+ 'NOK: error code = 500'
290
+ else
291
+ [
292
+ {
293
+ 'vmid' => '1024'
294
+ }
295
+ ]
296
+ end
297
+ end
298
+ expect(proxmox).to receive(:reauthenticate).exactly(nbr_api_errors - (status.nil? ? 1 : 0)).times
299
+ unless status.nil?
300
+ expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc/1024/status/current') do
272
301
  {
273
- 'vmid' => '1024'
302
+ 'status' => status
274
303
  }
275
- ]
276
- end
277
- expect(proxmox).to receive(:get).with('nodes/pve_node_name/lxc/1024/status/current') do
278
- {
279
- 'status' => 'created'
280
- }
304
+ end
281
305
  end
282
306
  proxmox
283
307
  end
@@ -548,13 +572,17 @@ module HybridPlatformsConductorTest
548
572
  ]
549
573
  when /^nodes\/([^\/]+)\/lxc$/
550
574
  pve_node_name = $1
551
- pve_nodes[pve_node_name][:lxc_containers].map do |vm_id, vm_info|
552
- {
553
- 'vmid' => vm_id.to_s,
554
- 'maxdisk' => vm_info[:maxdisk],
555
- 'maxmem' => vm_info[:maxmem],
556
- 'cpus' => vm_info[:cpus]
557
- }
575
+ if pve_nodes[pve_node_name][:error_strings].nil? || pve_nodes[pve_node_name][:error_strings].empty?
576
+ pve_nodes[pve_node_name][:lxc_containers].map do |vm_id, vm_info|
577
+ {
578
+ 'vmid' => vm_id.to_s,
579
+ 'maxdisk' => vm_info[:maxdisk],
580
+ 'maxmem' => vm_info[:maxmem],
581
+ 'cpus' => vm_info[:cpus]
582
+ }
583
+ end
584
+ else
585
+ pve_nodes[pve_node_name][:error_strings].shift
558
586
  end
559
587
  when /^nodes\/([^\/]+)\/lxc\/([^\/]+)\/config$/
560
588
  pve_node_name = $1
@@ -615,6 +643,10 @@ module HybridPlatformsConductorTest
615
643
  raise "Unknown Proxmox API post call: #{path}. Please adapt the test framework."
616
644
  end
617
645
  end
646
+ # Mock create_ticket
647
+ allow(proxmox).to receive(:create_ticket) do
648
+ @proxmox_actions << [:create_ticket]
649
+ end
618
650
  proxmox
619
651
  end
620
652
  end,
@@ -642,14 +674,26 @@ module HybridPlatformsConductorTest
642
674
  # * *wait_before_retry* (Integer): Specify the number of seconds to wait before retry [default: 0]
643
675
  # * *create* (Hash or nil): Create file content, or nil if none [default: nil]
644
676
  # * *destroy* (Hash or nil): Destroy file content, or nil if none [default: nil]
677
+ # * *api_max_retries* (Integer): Max number of API retries [default: 3]
678
+ # * *api_wait_between_retries_secs* (Integer): Number of seconds to wait between API retries [default: 0]
645
679
  # Result::
646
680
  # * Hash: JSON result of the call
647
- def call_reserve_proxmox_container_with(config: {}, max_retries: 1, wait_before_retry: 0, create: nil, destroy: nil)
681
+ def call_reserve_proxmox_container_with(
682
+ config: {},
683
+ max_retries: 1,
684
+ wait_before_retry: 0,
685
+ create: nil,
686
+ destroy: nil,
687
+ api_max_retries: 3,
688
+ api_wait_between_retries_secs: 0
689
+ )
648
690
  # Make sure we set default values in the config
649
691
  config = {
650
692
  proxmox_api_url: 'https://my-proxmox.my-domain.com:8006',
651
693
  futex_file: "#{@repository}/proxmox/allocations.futex",
652
694
  logs_dir: "#{Dir.tmpdir}/hpc_test_proxmox_waiter_logs",
695
+ api_max_retries: api_max_retries,
696
+ api_wait_between_retries_secs: api_wait_between_retries_secs,
653
697
  pve_nodes: ['pve_node_name'],
654
698
  vm_ips_list: %w[
655
699
  192.168.0.100
@@ -716,7 +760,14 @@ module HybridPlatformsConductorTest
716
760
  # * *wait_before_retry* (Integer): Specify the number of seconds to wait before retry [default: 0]
717
761
  # Result::
718
762
  # * Hash: JSON result of the call
719
- def call_reserve_proxmox_container(cpus, ram_mb, disk_gb, config: {}, max_retries: 1, wait_before_retry: 0)
763
+ def call_reserve_proxmox_container(
764
+ cpus,
765
+ ram_mb,
766
+ disk_gb,
767
+ config: {},
768
+ max_retries: 1,
769
+ wait_before_retry: 0
770
+ )
720
771
  call_reserve_proxmox_container_with(
721
772
  config: config,
722
773
  max_retries: max_retries,
@@ -763,7 +814,13 @@ module HybridPlatformsConductorTest
763
814
  # Parameters::
764
815
  # * *expected_proxmox_actions* (Array<Array>): Expected Proxmox actions
765
816
  def expect_proxmox_actions_to_be(expected_proxmox_actions)
766
- expect(@proxmox_actions.size).to eq expected_proxmox_actions.size
817
+ expect(@proxmox_actions.size).to eq(expected_proxmox_actions.size), <<~EOS
818
+ Expected #{expected_proxmox_actions.size} Proxmox actions, but got #{@proxmox_actions.size} instead:
819
+ ----- Received:
820
+ #{@proxmox_actions.map(&:inspect).join("\n")}
821
+ ----- Expected:
822
+ #{expected_proxmox_actions.map(&:inspect).join("\n")}
823
+ EOS
767
824
  @proxmox_actions.zip(expected_proxmox_actions).each do |proxmox_action, expected_proxmox_action|
768
825
  expect(proxmox_action.size).to eq expected_proxmox_action.size
769
826
  expect(proxmox_action[0..1]).to eq expected_proxmox_action[0..1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybrid_platforms_conductor
3
3
  version: !ruby/object:Gem::Version
4
- version: 32.7.3
4
+ version: 32.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Muriel Salvan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-12 00:00:00.000000000 Z
11
+ date: 2021-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: range_operators