ood_core 0.9.3 → 0.11.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,274 @@
1
require 'erb'
require 'etc'
require 'open3'
require 'pathname'
require 'securerandom'
require 'shellwords'
require 'time'
7
+
8
+ # Object used for simplified communication with SSH hosts
9
+ #
10
+ # @api private
11
+ class OodCore::Job::Adapters::LinuxHost::Launcher
12
+ attr_reader :contain, :debug, :site_timeout, :session_name_label, :singularity_bin,
13
+ :site_singularity_bindpath, :default_singularity_image, :ssh_hosts,
14
+ :strict_host_checking, :tmux_bin, :username
15
+ # The root exception class that all LinuxHost adapter-specific exceptions inherit
16
+ # from
17
+ class Error < StandardError; end
18
+
19
+ UNIT_SEPARATOR = "\x1F"
20
+
21
# Build a launcher from the adapter's configuration values. Unknown keyword
# arguments are accepted and ignored.
#
# @param contain Whether `--contain` is passed to Singularity; coerced to true/false
# @param debug Whether the adapter should be used in debug mode; coerced to true/false
# @param site_timeout [#to_i] A period after which the job should be killed or nil
# @param singularity_bin Path to the Singularity executable
# @param singularity_bindpath [#to_s] A comma delimited string of host paths to bindmount into the guest; sets SINGULARITY_BINDPATH environment variable
# @param singularity_image Path to the default Singularity image
# @param ssh_hosts List of hosts to check when scanning for running jobs
# @param strict_host_checking Allow SSH to perform strict host checking
# @param submit_host The SSH-able host
# @param tmux_bin Path to the tmux executable
def initialize(
  contain: false,
  debug: false,
  site_timeout: nil,
  singularity_bin:,
  singularity_bindpath: '/etc,/media,/mnt,/opt,/run,/srv,/usr,/var,/users',
  singularity_image:,
  ssh_hosts:,
  strict_host_checking: false,
  submit_host:,
  tmux_bin:,
  **_
)
  # Flags are normalized to real booleans
  @contain = contain ? true : false
  @debug = debug ? true : false

  # nil.to_i is 0, which the rest of the class treats as "no site limit"
  @site_timeout = site_timeout.to_i
  @session_name_label = 'launched-by-ondemand'

  # Singularity settings
  @singularity_bin = Pathname.new(singularity_bin)
  @site_singularity_bindpath = singularity_bindpath.to_s
  @default_singularity_image = Pathname.new(singularity_image)

  # SSH / tmux settings
  @ssh_hosts = ssh_hosts
  @strict_host_checking = strict_host_checking
  @submit_host = submit_host
  @tmux_bin = tmux_bin
  @username = Etc.getlogin
end
56
+
57
# Start the script in a new, uniquely named tmux session by piping the
# wrapped script into bash on the submit host over SSH.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @return [String] "<session name>@<hostname>", where the hostname is the
#   whitespace-stripped output of the remote command
def start_remote_session(script)
  remote_bash = ssh_cmd(submit_host(script), ['/usr/bin/env', 'bash'])
  session_name = unique_session_name

  reported_host = call(*remote_bash, stdin: wrapped_script(script, session_name)).strip

  "#{session_name}@#{reported_host}"
end
68
+
69
# Stop a running session by piping a small kill script into bash on the
# given host over SSH.
#
# @param session_name [#to_s] tmux session name, interpolated into the grep pattern
# @param hostname [#to_s] host the session is running on
# @return [String, nil] output of the remote command, or nil when a tolerated
#   error was swallowed
# @raise [Error] when the remote command fails for any other reason
def stop_remote_session(session_name, hostname)
  remote_bash = ssh_cmd(hostname, ['/usr/bin/env', 'bash'])

  # \#{...} stays literal for tmux's own format syntax; #{session_name} is Ruby
  kill_script = <<~SCRIPT
    # Get the tmux pane PID for the target session
    pane_pid=$(tmux list-panes -aF '\#{session_name} \#{pane_pid}' | grep '#{session_name}' | cut -f 2 -d ' ')
    # Find the Singularity sinit PID child of the pane process
    pane_sinit_pid=$(pstree -p -l "$pane_pid" | grep -o 'sinit([[:digit:]]*' | grep -o '[[:digit:]]*')
    # Kill sinit which stops both Singularity-based processes and the tmux session
    kill "$pane_sinit_pid"
  SCRIPT

  call(*remote_bash, stdin: kill_script)
rescue Error => e
  tolerated = [
    # The tmux server not running is not an error
    'failed to connect to server',
    # The session not being found is not an error
    "session not found: #{session_name_label}"
  ]
  raise e unless tolerated.any? { |fragment| e.message.include?(fragment) }
end
90
+
91
# List the sessions found on one host, or on every configured SSH host.
#
# @param host the single host to query; when nil every entry of ssh_hosts is queried
# @return [Array<Hash>] session hashes sorted by their :session_name
def list_remote_sessions(host: nil)
  targets = host ? [host] : ssh_hosts
  sessions = targets.map { |hostname| list_remote_tmux_session(hostname) }.flatten

  sessions.sort_by { |session| session[:session_name] }
end
100
+
101
# The host that work is submitted to.
#
# @param script optional script; when its native attributes contain a truthy
#   'submit_host_override' entry, that value wins over the configured host
# @return the override if present, otherwise the configured submit host
def submit_host(script = nil)
  native = script && script.native
  override = native && native['submit_host_override']

  override || @submit_host
end
108
+
109
+ private
110
+
111
# Run an external command with Open3.capture3 and return what it wrote to
# stdout. Requires the 'open3' standard library to be loaded.
#
# @param cmd the executable to run
# @param args [Array<#to_s>] arguments passed to the executable
# @param env [#to_h] environment variables set for the child process
# @param stdin [#to_s] data written to the child's standard input
# @return [String] the command's standard output
# @raise [Error] with the command's standard error as message when the exit
#   status is not successful
def call(cmd, *args, env: {}, stdin: "")
  args = args.map(&:to_s)
  env = env.to_h
  stdout, stderr, status = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
  raise Error, stderr unless status.success?

  stdout
end
118
+
119
# The full command to ssh into the destination host and execute the command.
# SSH options include:
# -t Force pseudo-terminal allocation (required to allow tmux to run)
# -o BatchMode=yes (set mode to be non-interactive)
# unless strict_host_checking is enabled, additionally:
# -o UserKnownHostsFile=/dev/null (do not update the user's known hosts file)
# -o StrictHostKeyChecking=no (do not check the user's known hosts file)
#
# @param destination_host [#to_s] the destination host you wish to ssh into
# @param cmd [Array<#to_s>] the command to be executed on the destination host
# @return [Array] a new argv array; cmd itself is not modified
def ssh_cmd(destination_host, cmd)
  options = ['-o', 'BatchMode=yes']
  unless strict_host_checking
    options += [
      '-o', 'UserKnownHostsFile=/dev/null',
      '-o', 'StrictHostKeyChecking=no'
    ]
  end

  ['ssh', '-t', *options, "#{username}@#{destination_host}"].concat(cmd)
end
146
+
147
# The interpreter used for the shebang added to scripts that lack one.
#
# @return [String] the SHELL environment variable, or '/bin/bash' when it is unset
def shell
  ENV.fetch('SHELL', '/bin/bash')
end
150
+
151
# Wraps a user-provided script into a Tmux invocation
#
# Renders templates/script_wrapper.erb.sh (resolved relative to this file)
# with the values below exposed to the template as local variables. A
# "#!<shell>" line is prepended to the script content when the script does
# not already start with a shebang.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @param session_name the tmux session name the script is started under
# @return [String] the rendered wrapper script
def wrapped_script(script, session_name)
  content = script.content
  content = "#!#{shell}\n#{content}" unless user_script_has_shebang?(script)

  renderer = ERB.new(
    File.read(Pathname.new(__dir__).join('templates/script_wrapper.erb.sh'))
  )

  # Template inputs are injected into this method's binding as locals
  context = binding
  {
    'arguments' => script_arguments(script),
    'cd_to_workdir' => script.workdir ? "cd #{script.workdir}" : '',
    'contain' => contain ? '--contain' : '',
    'debug' => debug,
    'email_on_terminated' => script_email_on_event(script, 'terminated'),
    'email_on_start' => script_email_on_event(script, 'started'),
    'environment' => export_env(script),
    'error_path' => script.error_path ? script.error_path.to_s : '/dev/null',
    'job_name' => script.job_name.to_s,
    'output_path' => script.output_path ? script.output_path.to_s : '/dev/null',
    'script_content' => content,
    'script_timeout' => script_timeout(script),
    'session_name' => session_name,
    'singularity_bin' => singularity_bin,
    'singularity_image' => singularity_image(script.native),
    'tmux_bin' => tmux_bin,
  }.each { |name, value| context.local_variable_set(name, value) }

  renderer.result(context)
end
183
+
184
# Generate the environment export block for this script
#
# SINGULARITY_BINDPATH is always added (replacing any value of that name in
# the script's environment). The script's own job_environment hash is left
# untouched: the entry is merged into a copy rather than written in place.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @return [String] newline-separated, sorted "export KEY=value" lines with
#   shell-escaped values
def export_env(script)
  environment = script.job_environment || {}
  exports = environment.merge(
    'SINGULARITY_BINDPATH' => singularity_bindpath(script.native)
  )

  exports.map { |key, value| "export #{key}=#{Shellwords.escape(value)}" }.sort.join("\n")
end
194
+
195
# Pick the Singularity image for a job.
#
# @param native the script's native attributes; may be nil
# @return native[:singularity_container] when it is set, otherwise the
#   default image configured for the adapter
def singularity_image(native)
  (native && native[:singularity_container]) || default_singularity_image
end
202
+
203
# Pick the bind path list exported as SINGULARITY_BINDPATH for a job.
#
# @param native the script's native attributes; may be nil
# @return native[:singularity_bindpath] when it is set, otherwise the
#   site-wide bind path configured for the adapter
def singularity_bindpath(native)
  if native && native[:singularity_bindpath]
    native[:singularity_bindpath]
  else
    site_singularity_bindpath
  end
end
208
+
209
# Work out the timeout, in seconds, applied to the job.
#
# A value of 0 on either side means "not set": without a wall time the site
# timeout is used, without a site timeout the wall time is used, and when
# both are set the smaller one wins.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @return [Integer]
def script_timeout(script)
  requested = script.wall_time.to_i

  if requested == 0
    site_timeout
  elsif site_timeout == 0
    requested
  else
    [requested, site_timeout].min
  end
end
216
+
217
# Turn the script's arguments into a single shell-escaped string.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @return [String] the arguments joined with Shellwords.join, or '' when the
#   script has no arguments
def script_arguments(script)
  script.args ? Shellwords.join(script.args) : ''
end
222
+
223
# Render the shell snippet that mails the job's recipients about an event.
#
# Renders templates/email.erb.sh (resolved relative to this file) with
# email_recipients, job_name and job_status exposed as template locals.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @param event [String] event name; the script is asked for "email_on_<event>"
# @return [String, false] the rendered snippet, or false when the script has
#   no email addresses or does not request mail for this event
def script_email_on_event(script, event)
  return false unless script.email && script.send("email_on_#{event}")

  renderer = ERB.new(
    File.read(Pathname.new(__dir__).join('templates/email.erb.sh'))
  )

  # Template inputs are injected into this method's binding as locals
  context = binding
  {
    'email_recipients' => script.email.map { |addr| Shellwords.escape(addr) }.join(', '),
    'job_name' => script.job_name ? script.job_name : 'LinuxHost_Adapter_Job',
    'job_status' => event
  }.each { |name, value| context.local_variable_set(name, value) }

  renderer.result(context)
end
238
+
239
# Generate a fresh tmux session name.
#
# @return [String] the session name label followed by "-" and a random UUID
def unique_session_name
  [session_name_label, SecureRandom.uuid].join('-')
end
242
+
243
# List all Tmux sessions on destination_host started by this adapter
# Additional tmux ls options available: http://man7.org/linux/man-pages/man1/tmux.1.html#FORMATS
#
# @param destination_host the host to query over SSH
# @return [Array<Hash>] one hash per matching pane with the keys :session_name,
#   :session_created, :session_pid, :destination_host and :id; an empty array
#   when no tmux server is running on the host
# @raise [Error] when the remote command fails for any other reason
def list_remote_tmux_session(destination_host)
  # Note that the tmux variable substitution looks like Ruby string sub,
  # these must either be single quoted strings or Ruby-string escaped as well
  tmux_fields = ['#{session_name}', '#{session_created}', '#{pane_pid}']
  format_str = Shellwords.escape(tmux_fields.join(UNIT_SEPARATOR))
  keys = [:session_name, :session_created, :session_pid]
  cmd = ssh_cmd(destination_host, ['tmux', 'list-panes', '-aF', format_str])

  sessions = call(*cmd).split("\n").map do |line|
    session = keys.zip(line.split(UNIT_SEPARATOR)).to_h
    session[:destination_host] = destination_host
    session[:id] = "#{session[:session_name]}@#{destination_host}"
    session
  end

  # Only sessions named with this adapter's label are reported
  sessions.select { |session| session[:session_name].start_with?(session_name_label) }
rescue Error => e
  # The tmux server not running is not an error
  raise e unless e.message.include?('failed to connect to server')
  []
end
269
+
270
# Whether the user's script already begins with a shebang line.
#
# The content is checked directly instead of via its first split line, so
# content made up solely of newlines (which splits into an empty array) is
# reported as false rather than raising NoMethodError.
#
# @param script [OodCore::Job::Script] The script object defining the work
# @return [Boolean] true when the content starts with "#!/"
def user_script_has_shebang?(script)
  script.content.to_s.start_with?('#!/')
end
274
+ end
@@ -0,0 +1,9 @@
1
# Mail the job status to the recipients when a `mail` command is available.
# The subject is shell-escaped and the heredoc delimiter is quoted so that the
# shell never expands characters contained in the job name.
if command -v mail > /dev/null 2>&1; then
cat << 'EMAIL_CONTENT' | mail -s <%= Shellwords.escape("Job #{job_name} has #{job_status}") %> <%= email_recipients %>
Greetings,

Your job <%= job_name %> has <%= job_status %>.

- The OnDemand Linux Host Adapter
EMAIL_CONTENT
fi
@@ -0,0 +1,64 @@
1
#!/bin/bash
# Wrapper rendered by the LinuxHost launcher and piped into bash on the target
# host. It writes two temp files (the user's script and a tmux launcher that
# runs it under Singularity), then starts the launcher in a detached tmux session.

# The hostname is the only output; the launcher uses it to build the job id
hostname

# Put the script into a temp file on localhost
<% if debug %>
singularity_tmp_file=$(mktemp -p "$HOME" --suffix '_sing')
tmux_tmp_file=$(mktemp -p "$HOME" --suffix "_tmux")
<% else %>
singularity_tmp_file=$(mktemp)
tmux_tmp_file=$(mktemp)
<% end %>

# Create an executable to run in a tmux session
# The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
cat << 'TMUX_LAUNCHER' | sed "s#\$singularity_tmp_file#${singularity_tmp_file}#" > "$tmux_tmp_file"
#!/bin/bash
<% if email_on_terminated %>
exit_script() {
<%# DO NOT INDENT email_on_terminated may have HEREDOCS %>
<%= email_on_terminated %>
trap - SIGINT SIGTERM # clear the trap
kill -- -$$ # Sends SIGTERM to child/sub processes
}
trap exit_script SIGINT SIGTERM
<% end %>

<%= cd_to_workdir %>
<%= environment %>

<%# Render nothing (instead of the word "false") when no start email is requested %>
<%= email_on_start if email_on_start %>

# Redirect stdout and stderr to separate files for all commands run within the curly braces
# https://unix.stackexchange.com/a/6431/204548
# Swap stderr and stdout after stdout has been redirected
# https://unix.stackexchange.com/a/61932/204548
OUTPUT_PATH=<%= output_path %>
ERROR_PATH=<%= error_path %>
({
timeout <%= script_timeout %>s <%= singularity_bin %> exec <%= contain %> --pid <%= singularity_image %> /bin/bash --login "$singularity_tmp_file" <%= arguments %>
} | tee "$OUTPUT_PATH") 3>&1 1>&2 2>&3 | tee "$ERROR_PATH"

<%# Render nothing (instead of the word "false") when no termination email is requested %>
<%= email_on_terminated if email_on_terminated %>

# Exit the tmux session when we are complete
exit 0
TMUX_LAUNCHER

# Create an executable for Singularity to run
# Escaped HEREDOC means that we do not have to worry about Shell.escape-ing script_content
cat << 'SINGULARITY_LAUNCHER' > "$singularity_tmp_file"
<%= script_content %>
SINGULARITY_LAUNCHER

# Run the script inside a tmux session
chmod +x "$singularity_tmp_file"
chmod +x "$tmux_tmp_file"
<%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"

# Remove the file
<% unless debug %>
# Wait 1 second to ensure that tmux session has started before the file is removed
sleep 1
rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
<% end %>
@@ -167,6 +167,10 @@ module OodCore
167
167
  raise JobAdapterError, e.message
168
168
  end
169
169
 
170
# @return [String] the literal '#BSUB'
# NOTE(review): presumably the marker that starts scheduler directive lines in
# a job script for this adapter - confirm against callers.
def directive_prefix
  '#BSUB'
end
173
+
170
174
  private
171
175
  # Determine state from LSF state code
172
176
  def get_state(st)
@@ -92,6 +92,15 @@ class OodCore::Job::Adapters::Lsf::Helper
92
92
  args += ["-W", (script.wall_time / 60).to_i] unless script.wall_time.nil?
93
93
  args += ["-L", script.shell_path.to_s] unless script.shell_path.nil?
94
94
 
95
+ # environment
96
+ env = script.job_environment || {}
97
+ # To preserve pre-existing behavior we only act when true or false, when nil we do nothing
98
+ if script.copy_environment?
99
+ args += ["-env", (["all"] + env.keys).join(",")]
100
+ elsif script.copy_environment? == false
101
+ args += ["-env", (["none"] + env.keys).join(",")]
102
+ end
103
+
95
104
  # input and output files
96
105
  args += ["-i", script.input_path] unless script.input_path.nil?
97
106
  args += ["-o", script.output_path] unless script.output_path.nil?
@@ -104,9 +113,6 @@ class OodCore::Job::Adapters::Lsf::Helper
104
113
 
105
114
  args += script.native unless script.native.nil?
106
115
 
107
- # environment
108
- env = script.job_environment || {}
109
-
110
116
  {args: args, env: env}
111
117
  end
112
118
  end
@@ -261,6 +261,7 @@ module OodCore
261
261
  # Set environment variables
262
262
  envvars = script.job_environment.to_h
263
263
  args += ["-v", envvars.map{|k,v| "#{k}=#{v}"}.join(",")] unless envvars.empty?
264
+ args += ["-V"] if script.copy_environment?
264
265
 
265
266
  # If error_path is not specified we join stdout & stderr (as this
266
267
  # mimics what the other resource managers do)
@@ -397,6 +398,10 @@ module OodCore
397
398
  raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
398
399
  end
399
400
 
401
# @return [String] the literal '#PBS'
# NOTE(review): presumably the marker that starts scheduler directive lines in
# a job script for this adapter - confirm against callers.
def directive_prefix
  '#PBS'
end
404
+
400
405
  private
401
406
  # Convert duration to seconds
402
407
  def duration_in_seconds(time)
@@ -157,6 +157,10 @@ module OodCore
157
157
  rescue Batch::Error => e
158
158
  raise JobAdapterError, e.message
159
159
  end
160
+
161
# @return [String] the literal '#$' (kept single-quoted: inside double quotes
#   "#$" would begin a global-variable interpolation)
# NOTE(review): presumably the marker that starts scheduler directive lines in
# a job script for this adapter - confirm against callers.
def directive_prefix
  '#$'
end
160
164
  end
161
165
  end
162
166
  end
@@ -33,8 +33,7 @@ class OodCore::Job::Adapters::Sge::Batch
33
33
  # @see Factory.build_sge
34
34
  def initialize(config)
35
35
  @cluster = config.fetch(:cluster, nil)
36
- @conf = Pathname.new(config.fetch(:conf, nil))
37
- @bin = Pathname.new(config.fetch(:bin, nil))
36
+ @bin = Pathname.new(config.fetch(:bin, nil).to_s)
38
37
  @sge_root = Pathname.new(config[:sge_root] || ENV['SGE_ROOT'] || "/var/lib/gridengine")
39
38
  @bin_overrides = config.fetch(:bin_overrides, {})
40
39
 
@@ -20,6 +20,7 @@ class OodCore::Job::Adapters::Sge::Helper
20
20
  args += ['-h'] if script.submit_as_hold
21
21
  args += ['-r', 'yes'] if script.rerunnable
22
22
  script.job_environment.each_pair {|k, v| args += ['-v', "#{k.to_s}=#{v.to_s}"]} unless script.job_environment.nil?
23
+ args += ["-V"] if script.copy_environment?
23
24
 
24
25
  if script.workdir
25
26
  args += ['-wd', script.workdir]
@@ -27,13 +27,15 @@ class QstatXmlJRListener
27
27
  @parsed_job = {
28
28
  :tasks => [],
29
29
  :status => :queued,
30
- :procs => 1, # un-knowable from SGE qstat output
30
+ :procs => 1,
31
31
  :native => {} # TODO: improve native attribute reporting
32
32
  }
33
33
  @current_text = nil
34
34
  @current_request = nil
35
35
 
36
36
  @processing_job_array_spec = false
37
+ @adding_slots = false
38
+
37
39
  @job_array_spec = {
38
40
  start: nil,
39
41
  stop: nil,
@@ -46,6 +48,8 @@ class QstatXmlJRListener
46
48
  case name
47
49
  when 'task_id_range'
48
50
  toggle_processing_array_spec
51
+ when 'JB_pe_range'
52
+ toggle_adding_slots
49
53
  end
50
54
  end
51
55
 
@@ -78,13 +82,16 @@ class QstatXmlJRListener
78
82
  when 'djob_info'
79
83
  finalize_parsed_job
80
84
  when 'RN_min'
81
- set_job_array_piece(:start)
85
+ set_job_array_piece(:start) if @processing_job_array_spec
86
+ set_slots if @adding_slots
82
87
  when 'RN_max'
83
- set_job_array_piece(:stop)
88
+ set_job_array_piece(:stop) if @processing_job_array_spec
84
89
  when 'RN_step'
85
- set_job_array_piece(:step)
90
+ set_job_array_piece(:step) if @processing_job_array_spec
86
91
  when 'task_id_range'
87
92
  toggle_processing_array_spec
93
+ when 'JB_pe_range'
94
+ toggle_adding_slots
88
95
  end
89
96
  end
90
97
 
@@ -186,5 +193,13 @@ class QstatXmlJRListener
186
193
  def toggle_processing_array_spec
187
194
  @processing_job_array_spec = ! @processing_job_array_spec
188
195
  end
196
+
197
# Flip the flag that records whether the parser is currently inside a
# JB_pe_range element.
def toggle_adding_slots
  @adding_slots = !@adding_slots
end
200
+
201
# Record the current element text, converted with to_i, as the parsed job's
# :procs value.
def set_slots
  slots = @current_text.to_i
  @parsed_job[:procs] = slots
end
189
204
  end
190
205