ood_core 0.18.1 → 0.20.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a110306c66de3f349e7a5569cedc3ea02dfedd5dfaa360b352f8689af113f98b
- data.tar.gz: ae608343e63bb98e6383fea71af53943d7136bdf72d8bf46b653f0c19801fcec
+ metadata.gz: 0ad31cf26a48b0b388b5c4f1bc83583515837a82c1a9419e51f724d135c4820c
+ data.tar.gz: 9eccd2b14920adf24f4273abd9355b4061a6a082913011757c3d636ba9d7d485
  SHA512:
- metadata.gz: 8a6b9928561a6dba1b84cbb2ac58d389b84e8317589648c483382c166c81982859fb74f68f76297a25319faed06712c6256abdf1c6a5e0175be939aa0392f283
- data.tar.gz: 21396c77e39329f9d7b6112c7900dd7ffa51d695b137d15089c487799ed16e3f74aea1f1dfab9958e2928fb98f49db098f865906c53abf667a8ed64ceda5dc53
+ metadata.gz: 126c139985ca62fdfca217ee8ea2fade4292d2f8aac110d51ad94edda36aff6e8c28fbe1876981010dc2ac395eb5cebea0439add3cafbb844a023ff699b4d841
+ data.tar.gz: 6d17cc409aff1b7d7098b409451c93e0660a855c4b4b36f8a49afea5cc306aeff5181e76305c15b056a5234e401f613bc428a3e49d98e292c3b584aaf4394a9a
data/CHANGELOG.md CHANGED
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

  ## [Unreleased]

+ ## [0.20.1] - 07-21-2022
+
+ - Fixed a TurboVNC compatibility issue with the -nohttpd flag in [767](https://github.com/OSC/ood_core/pull/767).
+
+ ## [0.20.0] - 06-03-2022
+
+ - Adapters can now respond to `cluster_info` in [752](https://github.com/OSC/ood_core/pull/752). This returns information about the cluster, such as how many nodes are available. Only Slurm is supported in this release.
+ - `OodCore::Job::Info` now has a `gpus` attribute in [753](https://github.com/OSC/ood_core/pull/753). Only Slurm is supported in this release.
+ - Support for Ruby 3 in [759](https://github.com/OSC/ood_core/pull/759).
+
+ ## [0.19.0] - 02-03-2022
+
+ ### Added
+
+ - Systemd adapter in [743](https://github.com/OSC/ood_core/pull/743).
+
+ ### Fixed
+
+ - The Linux host adapter is a little more portable in [333](https://github.com/OSC/ood_core/pull/333).
+ - Improved pod security for the k8s adapter in [748](https://github.com/OSC/ood_core/pull/748).
+
  ## [0.18.1] - 10-18-2021

  ### Fixed
@@ -411,7 +432,10 @@ Functionally the same as [0.17.3] but with some CI updates.
  ### Added
  - Initial release!

- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.18.1...HEAD
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.1...HEAD
+ [0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
+ [0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
+ [0.19.0]: https://github.com/OSC/ood_core/compare/v0.18.1...v0.19.0
  [0.18.1]: https://github.com/OSC/ood_core/compare/v0.18.0...v0.18.1
  [0.18.0]: https://github.com/OSC/ood_core/compare/v0.17.8...v0.18.0
  [0.17.6]: https://github.com/OSC/ood_core/compare/v0.17.5...v0.17.6
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
  The MIT License (MIT)

- Copyright (c) 2017-2018 Ohio Supercomputer Center
+ Copyright (c) 2017-2022 Ohio Supercomputer Center

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
data/lib/ood_core/batch_connect/templates/vnc.rb CHANGED
@@ -86,8 +86,13 @@ module OodCore
  # Clean up any old VNC sessions that weren't cleaned before
  #{vnc_clean}

+ # for TurboVNC 3.0 compatibility.
+ if timeout 2 vncserver --help 2>&1 | grep 'nohttpd' >/dev/null 2>&1; then
+ HTTPD_OPT='-nohttpd'
+ fi
+
  # Attempt to start VNC server
- VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" -nohttpd -noxstartup #{vnc_args} 2>&1)
+ VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" $HTTPD_OPT -noxstartup #{vnc_args} 2>&1)
  VNC_PID=$(pgrep -s 0 Xvnc) # the script above will daemonize the Xvnc process
  echo "${VNC_OUT}"

data/lib/ood_core/job/adapter.rb CHANGED
@@ -33,6 +33,15 @@ module OodCore
  raise NotImplementedError, "subclass did not define #submit"
  end

+ # Retrieve the number of active and total cpus, nodes, and gpus
+ # @abstract Subclass is expected to implement {#cluster_info}
+ # @raise [NotImplementedError] if subclass did not define {#cluster_info}
+ # @return [ClusterInfo] Object containing quantified statistics about the
+ # cluster's active/total cpus, nodes, and gpus
+ def cluster_info
+ raise NotImplementedError, "subclass did not define #cluster_info"
+ end
+
  # Retrieve info for all jobs from the resource manager
  # @abstract Subclass is expected to implement {#info_all}
  # @raise [NotImplementedError] if subclass did not define {#info_all}
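Taken together with the Slurm implementation further down, the new entry point can be exercised like this; a minimal sketch, assuming a reachable Slurm cluster (the cluster name and bin path are hypothetical):

```ruby
require "ood_core"

# Hypothetical Slurm configuration; adjust cluster and bin for your site.
adapter = OodCore::Job::Factory.build({ adapter: "slurm", cluster: "owens", bin: "/usr/bin" })

info = adapter.cluster_info   # => OodCore::Job::ClusterInfo
puts "#{info.active_nodes}/#{info.total_nodes} nodes allocated, " \
     "#{info.active_gpus}/#{info.total_gpus} GPUs in use"
```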
data/lib/ood_core/job/adapters/ccq.rb CHANGED
@@ -228,7 +228,7 @@ module OodCore
  data_hash[:submission_time] = raw['dateSubmitted'].to_i
  data_hash[:queue_name] = raw['criteriaPriority']

- Info.new(data_hash)
+ Info.new(**data_hash)
  end

  # extended data is just lines of 'key: value' value, so parse
@@ -242,7 +242,7 @@ module OodCore

  data.to_s.lines.drop(1).each do |line|
  match_data = ccqstat_regex.match(line)
- infos << Info.new(ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
+ infos << Info.new(**ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
  end

  infos
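These `**` splats (which recur in the Kubernetes and LSF hunks below) are needed because Ruby 3 no longer converts a trailing Hash argument into keyword arguments. A minimal illustration outside the gem (`greet` is a made-up method):

```ruby
def greet(name:, greeting: "hello")
  "#{greeting}, #{name}"
end

opts = { name: "world" }
greet(**opts)   # => "hello, world" on Ruby 2.7 and 3.x
greet(opts)     # ArgumentError on Ruby 3: the hash stays a positional argument
```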
data/lib/ood_core/job/adapters/kubernetes/batch.rb CHANGED
@@ -93,7 +93,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch

  def info(id)
  pod_json = safe_call('get', 'pod', id)
- return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?
+ return OodCore::Job::Info.new(**{ id: id, status: 'completed' }) if pod_json.empty?

  service_json = safe_call('get', 'service', service_name(id))
  secret_json = safe_call('get', 'secret', secret_name(id))
data/lib/ood_core/job/adapters/kubernetes/helper.rb CHANGED
@@ -31,7 +31,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper

  pod_hash.deep_merge!(service_hash)
  pod_hash.deep_merge!(secret_hash)
- OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
+ OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(**pod_hash)
  rescue NoMethodError
  raise K8sDataError, "unable to read data correctly from json"
  end
data/lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb CHANGED
@@ -2,8 +2,8 @@
  class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
  attr_reader :ood_connection_info

- def initialize(ood_connection_info: {}, **options)
- super(options)
- @ood_connection_info = ood_connection_info
+ def initialize(options)
+ super(**options)
+ @ood_connection_info = options[:ood_connection_info]
  end
  end
data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb CHANGED
@@ -106,7 +106,9 @@ spec:
  allowPrivilegeEscalation: false
  capabilities:
  drop:
- - all
+ - ALL
+ seccompProfile:
+ type: RuntimeDefault
  privileged: false
  <%- unless spec.init_containers.empty? -%>
  initContainers:
@@ -152,7 +154,9 @@ spec:
  allowPrivilegeEscalation: false
  capabilities:
  drop:
- - all
+ - ALL
+ seccompProfile:
+ type: RuntimeDefault
  privileged: false
  <%- end # init container loop -%>
  <%- end # if init containers -%>
data/lib/ood_core/job/adapters/linux_host/launcher.rb CHANGED
@@ -73,7 +73,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
  # Get the tmux pane PID for the target session
  pane_pid=$(tmux list-panes -aF '\#{session_name} \#{pane_pid}' | grep '#{session_name}' | cut -f 2 -d ' ')
  # Find the Singularity sinit PID child of the pane process
- pane_sinit_pid=$(pstree -p -l "$pane_pid" | grep -o 'sinit([[:digit:]]*' | grep -o '[[:digit:]]*')
+ pane_sinit_pid=$(pstree -p -l "$pane_pid" | egrep -o 'sinit[(][[:digit:]]*|shim-init[(][[:digit:]]*' | grep -o '[[:digit:]]*')
  # Kill sinit which stops both Singularity-based processes and the tmux session
  kill "$pane_sinit_pid"
  SCRIPT
data/lib/ood_core/job/adapters/lsf.rb CHANGED
@@ -16,7 +16,7 @@ module OodCore
  # @option config [#to_h] :bin_overrides ({}) Optional overrides to LSF client executables
  # @option config [#to_s] :submit_host ('') Host to submit commands to
  def self.build_lsf(config)
- batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
+ batch = Adapters::Lsf::Batch.new(**config.to_h.symbolize_keys)
  Adapters::Lsf.new(batch: batch)
  end
  end
data/lib/ood_core/job/adapters/slurm.rb CHANGED
@@ -36,6 +36,13 @@ module OodCore
  using Refinements::HashExtensions
  using Refinements::ArrayExtensions

+ # Get integer representing the number of gpus used by a node or job,
+ # calculated from gres string
+ # @return [Integer] the number of gpus in gres
+ def gpus_from_gres(gres)
+ gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
+ end
+
  # Object used for simplified communication with a Slurm batch server
  # @api private
  class Batch
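A few illustrative GRES strings and what the helper added above returns for them (the helper is re-defined standalone here just to show its behaviour; values are invented):

```ruby
# Copied from the patch above: sum the trailing count of every gpu GRES entry.
def gpus_from_gres(gres)
  gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
end

gpus_from_gres(nil)                       # => 0  (no GRES configured)
gpus_from_gres("gpu:v100:2")              # => 2
gpus_from_gres("gpu:a100:4,gpu:v100:2")   # => 6  (summed across gpu entries)
gpus_from_gres("mps:100")                 # => 0  (non-gpu GRES is ignored)
```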
@@ -98,6 +105,22 @@ module OodCore
  @strict_host_checking = strict_host_checking
  end

+ # Get a ClusterInfo object containing information about the given cluster
+ # @return [ClusterInfo] object containing cluster details
+ def get_cluster_info
+ node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/')
+ gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2
+ gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}")
+ .lines.uniq.map(&:split)
+ ClusterInfo.new(active_nodes: node_cpu_info[0].to_i,
+ total_nodes: node_cpu_info[2].to_i,
+ active_processors: node_cpu_info[3].to_i,
+ total_processors: node_cpu_info[6].to_i,
+ active_gpus: gres_lines.sum { |line| gpus_from_gres(line[2]) },
+ total_gpus: gres_lines.sum { |line| gpus_from_gres(line[1]) }
+ )
+ end
+
  # Get a list of hashes detailing each of the jobs on the batch server
  # @example Status info for all jobs
  # my_batch.get_jobs
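For reference, `sinfo -aho %A/%D/%C` in `get_cluster_info` above prints (roughly) allocated/idle node counts, the node total, and allocated/idle/other/total CPU counts joined by slashes, which is what the indices pick apart. A hedged illustration with made-up numbers:

```ruby
# Illustrative sinfo output:
# "alloc_nodes/idle_nodes/total_nodes/alloc_cpus/idle_cpus/other_cpus/total_cpus"
node_cpu_info = "12/8/20/480/300/20/800".strip.split('/')

node_cpu_info[0].to_i   # => 12   active (allocated) nodes
node_cpu_info[2].to_i   # => 20   total nodes
node_cpu_info[3].to_i   # => 480  active (allocated) processors
node_cpu_info[6].to_i   # => 800  total processors
```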
@@ -454,6 +477,12 @@ module OodCore
  raise JobAdapterError, e.message
  end

+ # Retrieve info about active and total cpus, gpus, and nodes
+ # @return [ClusterInfo] information about cluster usage
+ def cluster_info
+ @slurm.get_cluster_info
+ end
+
  # Retrieve info for all jobs from the resource manager
  # @raise [JobAdapterError] if something goes wrong getting job info
  # @return [Array<Info>] information describing submitted jobs
@@ -643,7 +672,8 @@ module OodCore
  cpu_time: nil,
  submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
  dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
- native: v
+ native: v,
+ gpus: gpus_from_gres(v[:gres])
  )
  end

data/lib/ood_core/job/adapters/systemd/launcher.rb ADDED
@@ -0,0 +1,252 @@
+ require 'erb'
+ require 'etc'
+ require 'pathname'
+ require 'securerandom'
+ require 'shellwords'
+ require 'time'
+
+ # Object used for simplified communication with SSH hosts
+ #
+ # @api private
+ class OodCore::Job::Adapters::LinuxSystemd::Launcher
+ attr_reader :debug, :site_timeout, :session_name_label, :ssh_hosts,
+ :strict_host_checking, :username
+ # The root exception class that all LinuxSystemd adapter-specific exceptions inherit
+ # from
+ class Error < StandardError; end
+
+ # @param debug Whether the adapter should be used in debug mode
+ # @param site_timeout [#to_i] A period after which the job should be killed or nil
+ # @param ssh_hosts List of hosts to check when scanning for running jobs
+ # @param strict_host_checking Allow SSH to perform strict host checking
+ # @param submit_host The SSH-able host
+ def initialize(
+ debug: false,
+ site_timeout: nil,
+ ssh_hosts:,
+ strict_host_checking: false,
+ submit_host:,
+ **_
+ )
+ @debug = !! debug
+ @site_timeout = site_timeout.to_i
+ @session_name_label = 'ondemand'
+ @ssh_hosts = ssh_hosts
+ @strict_host_checking = strict_host_checking
+ @submit_host = submit_host
+ @username = Etc.getlogin
+ end
+
+ # @param hostname [#to_s] The hostname to submit the work to
+ # @param script [OodCore::Job::Script] The script object defining the work
+ def start_remote_session(script)
+ cmd = ssh_cmd(submit_host(script), ['/usr/bin/env', 'bash'])
+
+ session_name = unique_session_name
+ output = call(*cmd, stdin: wrapped_script(script, session_name))
+ hostname = parse_hostname(output)
+
+ "#{session_name}@#{hostname}"
+ end
+
+ def stop_remote_session(session_name, hostname)
+ cmd = ssh_cmd(hostname, ['/usr/bin/env', 'bash'])
+
+ kill_cmd = <<~SCRIPT
+ # stop the session by name
+ systemctl --user stop #{session_name}.service
+ SCRIPT
+
+ call(*cmd, stdin: kill_cmd)
+ rescue Error => e
+ interpret_and_raise(e)
+ end
+
+ def list_remote_sessions(host: nil)
+ host_list = (host) ? [host] : ssh_hosts
+
+ host_list.map {
+ |hostname| list_remote_systemd_session(hostname)
+ }.flatten.sort_by {
+ |hsh| hsh[:session_name]
+ }
+ end
+
+ def submit_host(script = nil)
+ if script && script.native && script.native['submit_host_override']
+ script.native['submit_host_override']
+ else
+ @submit_host
+ end
+ end
+
+ private
+
+ # Call a forked command and return its output, raising Error on failure
+ def call(cmd, *args, env: {}, stdin: "")
+ args = args.map(&:to_s)
+ env = env.to_h
+ o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
+ s.success? ? o : raise(Error, e)
+ end
+
+ # The full command to ssh into the destination host and execute the command.
+ # SSH options include:
+ # -t Force pseudo-terminal allocation (required to allow tmux to run)
+ # -o BatchMode=yes (set mode to be non-interactive)
+ # if ! strict_host_checking
+ # -o UserKnownHostsFile=/dev/null (do not update the user's known hosts file)
+ # -o StrictHostKeyChecking=no (do not check the user's known hosts file)
+ #
+ # @param destination_host [#to_s] the destination host you wish to ssh into
+ # @param cmd [Array<#to_s>] the command to be executed on the destination host
+ def ssh_cmd(destination_host, cmd)
+ if strict_host_checking
+ [
+ 'ssh', '-t',
+ '-o', 'BatchMode=yes',
+ "#{username}@#{destination_host}"
+ ].concat(cmd)
+ else
+ [
+ 'ssh', '-t',
+ '-o', 'BatchMode=yes',
+ '-o', 'UserKnownHostsFile=/dev/null',
+ '-o', 'StrictHostKeyChecking=no',
+ "#{username}@#{destination_host}"
+ ].concat(cmd)
+ end
+ end
+
+ def shell
+ ENV['SHELL'] || '/bin/bash'
+ end
+
+ # Wraps a user-provided script into a systemd-run transient service
+ def wrapped_script(script, session_name)
+ content = script.content
+ unless user_script_has_shebang?(script)
+ content = "#!#{shell}\n#{content}"
+ end
+
+ ERB.new(
+ File.read(Pathname.new(__dir__).join('templates/script_wrapper.erb.sh'))
+ ).result(binding.tap {|bnd|
+ {
+ 'arguments' => script_arguments(script),
+ 'cd_to_workdir' => (script.workdir) ? "cd #{script.workdir}" : '',
+ 'debug' => debug,
+ 'email_on_terminated' => script_email_on_event(script, 'terminated'),
+ 'email_on_start' => script_email_on_event(script, 'started'),
+ 'environment' => export_env(script),
+ 'error_path' => error_path(script),
+ 'job_name' => script.job_name.to_s,
+ 'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
+ 'script_content' => content,
+ 'script_timeout' => script_timeout(script),
+ 'session_name' => session_name,
+ 'ssh_hosts' => ssh_hosts,
+ 'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
+ }.each{
+ |key, value| bnd.local_variable_set(key, value)
+ }
+ })
+ end
+
+ # Generate the environment export block for this script
+ def export_env(script)
+ environment = script.job_environment
+ (environment ? environment : {}).map{
+ |key, value| "export #{key}=#{Shellwords.escape(value)}"
+ }.sort.join("\n")
+ end
+
+ def script_timeout(script)
+ wall_time = script.wall_time.to_i
+ if wall_time == 0
+ # this is the only way it can be 0
+ # so make it into infinity for systemd to never terminate
+ site_timeout == 0 ? 'infinity' : site_timeout
+ elsif site_timeout != 0
+ [wall_time, site_timeout].min
+ else
+ wall_time
+ end
+ end
+
+ def script_arguments(script)
+ return '' unless script.args
+
+ Shellwords.join(script.args)
+ end
+
+ def script_email_on_event(script, event)
+ return false unless script.email && script.send("email_on_#{event}")
+
+ ERB.new(
+ File.read(Pathname.new(__dir__).join('templates/email.erb.sh'))
+ ).result(binding.tap {|bnd|
+ {
+ 'email_recipients' => script.email.map{|addr| Shellwords.escape(addr)}.join(', '),
+ 'job_name' => (script.job_name) ? script.job_name : 'LinuxHost_Adapter_Job',
+ 'job_status' => event
+ }.each{
+ |key, value| bnd.local_variable_set(key, value)
+ }
+ })
+ end
+
+ def unique_session_name
+ "#{session_name_label}-#{SecureRandom.alphanumeric(10)}"
+ end
+
+ # List all Systemd sessions on destination_host started by this adapter
+ def list_remote_systemd_session(destination_host)
+ cmd = ssh_cmd(destination_host, ['systemctl', '--user', 'show', '-t', 'service', '--state=running', "#{session_name_label}-*"])
+
+ # individual units are separated with an empty line
+ call(*cmd).split("\n\n").map do |oneunit|
+ Hash[oneunit.split("\n").map{ |line| line.split('=',2) }].tap do |session_hash|
+ session_hash[:session_name] = session_hash['Id'].delete_suffix('.service')
+ session_hash[:destination_host] = destination_host
+ session_hash[:id] = "#{session_hash[:session_name]}@#{destination_host}"
+ session_hash[:session_created] = Time.parse(session_hash['ExecMainStartTimestamp'])
+ session_hash[:job_name] = session_hash['Description']
+ end
+ end
+ rescue Error => e
+ interpret_and_raise(e)
+ []
+ end
+
+ def user_script_has_shebang?(script)
+ return false if script.content.empty?
+ script.content.split("\n").first.start_with?('#!/')
+ end
+
+ def error_path(script)
+ return script.error_path.to_s if script.error_path
+ return script.output_path.to_s if script.output_path
+
+ '/dev/null'
+ end
+
+ # under some conditions tmux returns status code 1 but it's not an actual
+ # error. These are when the session is not found or there are no sessions
+ # at all.
+ def interpret_and_raise(error)
+ if error.message.include?('failed to connect to server') # no sessions in tmux 1.8
+ nil
+ elsif error.message.include?('no server running on') # no sessions in tmux 2.7+ message
+ nil
+ else
+ raise error
+ end
+ end
+
+ def parse_hostname(output)
+ output.split($/).map do |line|
+ line[/^HOSTNAME:(.*)$/, 1]
+ end.compact.last.to_s
+ end
+ end
data/lib/ood_core/job/adapters/systemd/templates/email.erb.sh ADDED
@@ -0,0 +1,9 @@
+ if command -v mail; then
+ cat << EMAIL_CONTENT | mail -s "Job <%= job_name %> has <%= job_status %>" <%= email_recipients %>
+ Greetings,
+
+ Your job <%= job_name %> has <%= job_status %>.
+
+ - The OnDemand Linux Systemd Adapter
+ EMAIL_CONTENT
+ fi
data/lib/ood_core/job/adapters/systemd/templates/script_wrapper.erb.sh ADDED
@@ -0,0 +1,56 @@
+ #!/bin/bash
+ SSH_HOSTS=(<%= ssh_hosts.join(' ').to_s %>)
+ hostnames=`hostname -A`
+ for host in ${SSH_HOSTS[@]}
+ do
+ if [[ " ${hostnames[@]} " =~ " ${host} " ]]; then
+ hostname=$host
+ fi
+ done
+
+ if [ -z "$hostname" ]; then
+ printf >&2 "ERROR: Can't start job on [${hostnames[@]}] because it does not match any hostname configured \nin ssh_hosts [${SSH_HOSTS[@]}]. The output of 'hostname -A' must match an entry in ssh_hosts \nfrom the cluster configuration."
+ exit 1
+ fi
+
+ echo ""
+ echo "HOSTNAME:$hostname"
+
+ # we need this user to be enabled for lingering or else the newly started
+ # service will end as soon as the ssh session that started it has exited
+ loginctl enable-linger
+
+ # Put the script into a temp file on localhost
+ systemd_service_file="<%= workdir %>/systemd_service.sh"
+ systemd_service_file_pre="<%= workdir %>/systemd_pre.sh"
+ systemd_service_file_post="<%= workdir %>/systemd_post.sh"
+
+ cat << 'SYSTEMD_EXEC_PRE' > "$systemd_service_file_pre"
+ #!/bin/bash
+ <%= cd_to_workdir %>
+ <% if email_on_start %>
+ <%= email_on_start %>
+ <% end %>
+ SYSTEMD_EXEC_PRE
+
+ cat << 'SYSTEMD_EXEC_POST' > "$systemd_service_file_post"
+ #!/bin/bash
+ <%= cd_to_workdir %>
+ <% if email_on_terminated %>
+ <%= email_on_terminated %>
+ <% end %>
+ SYSTEMD_EXEC_POST
+
+ # Create an executable for systemd service to run
+ # Escaped HEREDOC means that we do not have to worry about Shell.escape-ing script_content
+ cat << 'SYSTEMD_EXEC' > "$systemd_service_file"
+ <%= script_content %>
+ SYSTEMD_EXEC
+
+ # Run the script inside a transient systemd user service
+ chmod +x "$systemd_service_file_pre" "$systemd_service_file" "$systemd_service_file_post"
+ <%= cd_to_workdir %>
+ systemd-run --user -r --no-block --unit=<%= session_name %> -p RuntimeMaxSec=<%= script_timeout %> \
+ -p ExecStartPre="$systemd_service_file_pre" -p ExecStartPost="$systemd_service_file_post" \
+ -p StandardOutput="file:<%= output_path %>" -p StandardError="file:<%= error_path %>" \
+ -p Description="<%= job_name %>" "$systemd_service_file"
data/lib/ood_core/job/adapters/systemd.rb ADDED
@@ -0,0 +1,230 @@
+ require "ood_core/refinements/hash_extensions"
+ require "ood_core/refinements/array_extensions"
+ require "ood_core/job/adapters/helper"
+ require "set"
+
+ module OodCore
+ module Job
+ class Factory
+ using Refinements::HashExtensions
+
+ # Build the LinuxSystemd adapter from a configuration
+ # @param config [#to_h] the configuration for job adapter
+ # @option config [Object] :debug (false) Use the adapter in a debug mode
+ # @option config [Object] :max_timeout (nil) The longest 'wall_clock' permissible
+ # @option config [Object] :ssh_hosts (nil) The list of permissible hosts, defaults to :submit_host
+ # @option config [Object] :strict_host_checking (true) Set to false to disable strict host checking and updating the known_hosts file
+ # @option config [Object] :submit_host The SSH target to connect to, may be the head of a round-robin
+ def self.build_systemd(config)
+ c = config.to_h.symbolize_keys
+ debug = c.fetch(:debug, false)
+ max_timeout = c.fetch(:max_timeout, nil)
+ ssh_hosts = c.fetch(:ssh_hosts, [c[:submit_host]])
+ strict_host_checking = c.fetch(:strict_host_checking, true)
+ submit_host = c[:submit_host]
+
+ Adapters::LinuxSystemd.new(
+ ssh_hosts: ssh_hosts,
+ launcher: Adapters::LinuxSystemd::Launcher.new(
+ debug: debug,
+ max_timeout: max_timeout,
+ ssh_hosts: ssh_hosts,
+ strict_host_checking: strict_host_checking,
+ submit_host: submit_host,
+ )
+ )
+ end
+ end
+
+ module Adapters
+ # An adapter object that describes the communication with a remote host
+ # for job management.
+ class LinuxSystemd < Adapter
+ using Refinements::ArrayExtensions
+
+ require "ood_core/job/adapters/systemd/launcher"
+
+ def initialize(ssh_hosts:, launcher:)
+ @launcher = launcher
+ @ssh_hosts = Set.new(ssh_hosts)
+ end
+
+ # Submit a job with the attributes defined in the job template instance
+ # @param script [Script] script object that describes the script and
+ # attributes for the submitted job
+ # @param after [#to_s, Array<#to_s>] Not supported; setting this raises a JobAdapterError
+ # @param afterok [#to_s, Array<#to_s>] Not supported; setting this raises a JobAdapterError
+ # @param afternotok [#to_s, Array<#to_s>] Not supported; setting this raises a JobAdapterError
+ # @param afterany [#to_s, Array<#to_s>] Not supported; setting this raises a JobAdapterError
+ # @raise [JobAdapterError] if something goes wrong submitting a job
+ # @return [String] the job id returned after successfully submitting a
+ # job
+ # @see Adapter#submit
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+ unless (after.empty? && afterok.empty? && afternotok.empty? && afterany.empty?)
+ raise JobAdapterError, 'Scheduling subsequent jobs is not available.'
+ end
+
+ @launcher.start_remote_session(script)
+ rescue Launcher::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve info for all jobs from the resource manager
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Array<Info>] information describing submitted jobs
+ # @see Adapter#info_all
+ def info_all(attrs: nil, host: nil)
+ host_permitted?(host) if host
+
+ @launcher.list_remote_sessions(host: host).map{
+ |ls_output| ls_to_info(ls_output)
+ }
+ rescue Launcher::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve info for all jobs for a given owner or owners from the
+ # resource manager
+ # Note: owner and attrs are present only to complete the interface and are ignored
+ # Note: since this API is used in production no errors or warnings are thrown / issued
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Array<Info>] information describing submitted jobs
+ def info_where_owner(_, attrs: nil)
+ info_all
+ end
+
+ # Iterate over each job Info object
+ # @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
+ # @yield [Info] of each job to block
+ # @return [Enumerator] if no block given
+ def info_all_each(attrs: nil)
+ return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+ info_all(attrs: attrs).each do |job|
+ yield job
+ end
+ end
+
+ # Iterate over each job Info object
+ # @param owner [#to_s, Array<#to_s>] owner is present only to complete the interface and is ignored
+ # @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
+ # @yield [Info] of each job to block
+ # @return [Enumerator] if no block given
+ def info_where_owner_each(owner, attrs: nil)
+ return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+ info_where_owner(owner, attrs: attrs).each do |job|
+ yield job
+ end
+ end
+
+ # Whether the adapter supports job arrays
+ # @return [Boolean] - false
+ def supports_job_arrays?
+ false
+ end
+
+ # Retrieve job info from the SSH host
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Info] information describing submitted job
+ # @see Adapter#info
+ def info(id)
+ _, host = parse_job_id(id)
+ job = info_all(host: host).select{|info| info.id == id}.first
+ (job) ? job : Info.new(id: id, status: :completed)
+ rescue Launcher::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve job status from resource manager
+ # @note Optimized slightly over retrieving complete job information from server
+ # @abstract Subclass is expected to implement {#status}
+ # @raise [NotImplementedError] if subclass did not define {#status}
+ # @param id [#to_s] the id of the job
+ # @return [Status] status of job
+ def status(id)
+ _, host = parse_job_id(id)
+ job = info_all(host: host).select{|info| info.id == id}.first
+
+ Status.new(state: (job) ? :running : :completed)
+ rescue Launcher::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Put the submitted job on hold
+ # @abstract Subclass is expected to implement {#hold}
+ # @raise [NotImplementedError] if subclass did not define {#hold}
+ # @param id [#to_s] the id of the job
+ # @return [void]
+ def hold(id)
+ # Consider sending SIGSTOP?
+ raise NotImplementedError, "subclass did not define #hold"
+ end
+
+ # Release the job that is on hold
+ # @abstract Subclass is expected to implement {#release}
+ # @raise [NotImplementedError] if subclass did not define {#release}
+ # @param id [#to_s] the id of the job
+ # @return [void]
+ def release(id)
+ # Consider sending SIGCONT
+ raise NotImplementedError, "subclass did not define #release"
+ end
+
+ # Delete the submitted job
+ # @abstract Subclass is expected to implement {#delete}
+ # @raise [NotImplementedError] if subclass did not define {#delete}
+ # @param id [#to_s] the id of the job
+ # @return [void]
+ def delete(id)
+ session_name, destination_host = parse_job_id(id)
+ @launcher.stop_remote_session(session_name, destination_host)
+ rescue Launcher::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ def directive_prefix
+ nil
+ end
+
+ private
+
+ def host_permitted?(destination_host)
+ raise JobAdapterError, "Requested destination host (#{destination_host}) not permitted" unless @ssh_hosts.include?(destination_host)
+ end
+
+ def parse_job_id(id)
+ raise JobAdapterError, "#{id} is not a valid LinuxSystemd adapter id because it is missing the '@'." unless id.include?('@')
+
+ return id.split('@')
+ end
+
+ # Convert the returned Hash into an Info object
+ def ls_to_info(ls_output)
+ started = ls_output[:session_created].to_i
+ now = Time.now.to_i
+ ellapsed = now - started
+ Info.new(
+ accounting_id: nil,
+ allocated_nodes: [NodeInfo.new(name: ls_output[:destination_host], procs: 1)],
+ cpu_time: ellapsed,
+ dispatch_time: started,
+ id: ls_output[:id],
+ job_name: ls_output[:job_name],
+ job_owner: Etc.getlogin,
+ native: ls_output,
+ procs: 1,
+ queue_name: "LinuxSystemd adapter for #{@submit_host}",
+ status: :running,
+ submission_time: ellapsed,
+ submit_host: @submit_host,
+ wallclock_time: ellapsed
+ )
+ end
+ end
+ end
+ end
+ end
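For completeness, here is a hedged sketch of wiring the new adapter up from Ruby; the host names are made up, and a real deployment would normally configure this through a cluster YAML rather than code:

```ruby
require "ood_core"

# Hypothetical hosts; SSH keys must allow passwordless access for the current user.
adapter = OodCore::Job::Factory.build({
  adapter: "systemd",
  submit_host: "login.example.edu",
  ssh_hosts: ["node01.example.edu", "node02.example.edu"],
  strict_host_checking: false
})

script = OodCore::Job::Script.new(content: "sleep 120", workdir: "/tmp")
id = adapter.submit(script)   # => e.g. "ondemand-AbC123XyZ0@node01.example.edu"
adapter.status(id)            # => an OodCore::Job::Status (running while the unit is active)
adapter.delete(id)            # stops the transient systemd unit
```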
data/lib/ood_core/job/cluster_info.rb ADDED
@@ -0,0 +1,32 @@
+ module OodCore
+ module Job
+ # An object that contains details about the cluster's active and total nodes, processors, and gpus
+ class ClusterInfo
+ using Refinements::HashExtensions
+
+ attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes,
+ :total_gpu_nodes, :active_gpus, :total_gpus
+
+ def initialize(opts = {})
+ opts = opts.transform_keys(&:to_sym)
+ @active_nodes = opts.fetch(:active_nodes, nil).to_i
+ @total_nodes = opts.fetch(:total_nodes, nil).to_i
+ @active_processors = opts.fetch(:active_processors, nil).to_i
+ @total_processors = opts.fetch(:total_processors, nil).to_i
+ @active_gpus = opts.fetch(:active_gpus, nil).to_i
+ @total_gpus = opts.fetch(:total_gpus, nil).to_i
+ end
+
+ def to_h
+ {
+ active_nodes: active_nodes,
+ total_nodes: total_nodes,
+ active_processors: active_processors,
+ total_processors: total_processors,
+ active_gpus: active_gpus,
+ total_gpus: total_gpus
+ }
+ end
+ end
+ end
+ end
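Because `ClusterInfo` coerces every option with `to_i`, keys an adapter never sets simply come back as zeros; a small sketch with invented numbers:

```ruby
info = OodCore::Job::ClusterInfo.new(active_nodes: 12, total_nodes: 20)
info.active_nodes   # => 12
info.active_gpus    # => 0 (unset keys fall back to nil.to_i)
info.to_h           # => { active_nodes: 12, total_nodes: 20, active_processors: 0, ... }
```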
data/lib/ood_core/job/info.rb CHANGED
@@ -65,6 +65,10 @@ module OodCore
  # @return [Object] native info
  attr_reader :native

+ # Number of gpus allocated for job
+ # @return [Integer, nil] allocated total number of gpus
+ attr_reader :gpus
+
  # List of job array child task statuses
  # @note only relevant for job arrays
  # @return [Array<Task>] tasks
@@ -86,15 +90,16 @@ module OodCore
  # @param dispatch_time [#to_i, nil] dispatch time
  # @param tasks [Array<Hash>] tasks e.g. { id: '12345.owens-batch', status: :running }
  # @param native [Object] native info
+ # @param gpus [#to_i, 0] allocated total number of gpus
  def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
  job_name: nil, job_owner: nil, accounting_id: nil,
  procs: nil, queue_name: nil, wallclock_time: nil,
  wallclock_limit: nil, cpu_time: nil, submission_time: nil,
- dispatch_time: nil, native: nil, tasks: [],
+ dispatch_time: nil, native: nil, gpus: 0, tasks: [],
  **_)
  @id = id.to_s
  @status = Status.new(state: status.to_sym)
- @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
+ @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(**n.to_h) }
  @submit_host = submit_host && submit_host.to_s
  @job_name = job_name && job_name.to_s
  @job_owner = job_owner && job_owner.to_s
@@ -111,6 +116,7 @@ module OodCore
  @status = job_array_aggregate_status unless @tasks.empty?

  @native = native
+ @gpus = gpus && gpus.to_i
  end

  # Create a new Info for a child task
@@ -147,10 +153,15 @@ module OodCore
  submission_time: submission_time,
  dispatch_time: dispatch_time,
  native: native,
+ gpus: gpus,
  tasks: tasks
  }
  end

+ def gpu?
+ gpus.positive?
+ end
+
  # The comparison operator
  # @param other [#to_h] object to compare against
  # @return [Boolean] whether objects are equivalent
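Downstream code can use the new reader and predicate directly; a small sketch with invented values:

```ruby
info = OodCore::Job::Info.new(id: "12345", status: :running, gpus: 2)
info.gpus   # => 2
info.gpu?   # => true

# gpus defaults to 0, so adapters that never set it simply report no GPUs
OodCore::Job::Info.new(id: "12346", status: :queued).gpu?   # => false
```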
data/lib/ood_core/version.rb CHANGED
@@ -1,4 +1,4 @@
  module OodCore
  # The current version of {OodCore}
- VERSION = "0.18.1"
+ VERSION = "0.20.1"
  end
data/lib/ood_core.rb CHANGED
@@ -11,6 +11,7 @@ module OodCore
  require "ood_core/job/node_info"
  require "ood_core/job/script"
  require "ood_core/job/info"
+ require "ood_core/job/cluster_info"
  require "ood_core/job/status"
  require "ood_core/job/adapter"
  require "ood_core/job/factory"
data/ood_core.gemspec CHANGED
@@ -20,14 +20,15 @@ Gem::Specification.new do |spec|
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
- spec.required_ruby_version = ">= 2.2.0"
+ spec.required_ruby_version = ">= 2.7.0"

  spec.add_runtime_dependency "ood_support", "~> 0.0.2"
  spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
+ spec.add_runtime_dependency "rexml", "~> 3.2"
  spec.add_development_dependency "bundler", "~> 2.1"
  spec.add_development_dependency "rake", "~> 13.0.1"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "pry", "~> 0.10"
  spec.add_development_dependency "timecop", "~> 0.8"
- spec.add_development_dependency "climate_control", "~> 0.2.0"
+ spec.add_development_dependency "climate_control", "~> 1.1.1"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: ood_core
  version: !ruby/object:Gem::Version
- version: 0.18.1
+ version: 0.20.1
  platform: ruby
  authors:
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2021-10-18 00:00:00.000000000 Z
+ date: 2022-07-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: ood_support
@@ -46,6 +46,20 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 1.9.6
+ - !ruby/object:Gem::Dependency
+ name: rexml
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '3.2'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '3.2'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
@@ -122,14 +136,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.2.0
+ version: 1.1.1
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.2.0
+ version: 1.1.1
  description: Open OnDemand core library that provides support for an HPC Center to
  globally define HPC services that web applications can then take advantage of.
  email:
@@ -186,12 +200,17 @@ files:
  - lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb
  - lib/ood_core/job/adapters/sge/qstat_xml_r_listener.rb
  - lib/ood_core/job/adapters/slurm.rb
+ - lib/ood_core/job/adapters/systemd.rb
+ - lib/ood_core/job/adapters/systemd/launcher.rb
+ - lib/ood_core/job/adapters/systemd/templates/email.erb.sh
+ - lib/ood_core/job/adapters/systemd/templates/script_wrapper.erb.sh
  - lib/ood_core/job/adapters/torque.rb
  - lib/ood_core/job/adapters/torque/attributes.rb
  - lib/ood_core/job/adapters/torque/batch.rb
  - lib/ood_core/job/adapters/torque/error.rb
  - lib/ood_core/job/adapters/torque/ffi.rb
  - lib/ood_core/job/array_ids.rb
+ - lib/ood_core/job/cluster_info.rb
  - lib/ood_core/job/factory.rb
  - lib/ood_core/job/info.rb
  - lib/ood_core/job/node_info.rb
@@ -215,7 +234,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: 2.2.0
+ version: 2.7.0
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="