ood_core 0.18.1 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a110306c66de3f349e7a5569cedc3ea02dfedd5dfaa360b352f8689af113f98b
4
- data.tar.gz: ae608343e63bb98e6383fea71af53943d7136bdf72d8bf46b653f0c19801fcec
3
+ metadata.gz: 0ad31cf26a48b0b388b5c4f1bc83583515837a82c1a9419e51f724d135c4820c
4
+ data.tar.gz: 9eccd2b14920adf24f4273abd9355b4061a6a082913011757c3d636ba9d7d485
5
5
  SHA512:
6
- metadata.gz: 8a6b9928561a6dba1b84cbb2ac58d389b84e8317589648c483382c166c81982859fb74f68f76297a25319faed06712c6256abdf1c6a5e0175be939aa0392f283
7
- data.tar.gz: 21396c77e39329f9d7b6112c7900dd7ffa51d695b137d15089c487799ed16e3f74aea1f1dfab9958e2928fb98f49db098f865906c53abf667a8ed64ceda5dc53
6
+ metadata.gz: 126c139985ca62fdfca217ee8ea2fade4292d2f8aac110d51ad94edda36aff6e8c28fbe1876981010dc2ac395eb5cebea0439add3cafbb844a023ff699b4d841
7
+ data.tar.gz: 6d17cc409aff1b7d7098b409451c93e0660a855c4b4b36f8a49afea5cc306aeff5181e76305c15b056a5234e401f613bc428a3e49d98e292c3b584aaf4394a9a
data/CHANGELOG.md CHANGED
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.20.1] - 07-21-2022
11
+
12
+ - Fixed a TurboVNC compatibility issue with the `-nohttpd` flag in [767](https://github.com/OSC/ood_core/pull/767).
13
+
14
+ ## [0.20.0] - 06-03-2022
15
+
16
+ - Adapters can now respond to `cluster_info` in [752](https://github.com/OSC/ood_core/pull/752). This returns information about the cluster, such as how many nodes are available. Only Slurm is supported in this release.
17
+ - `OodCore::Job::Info` now has a `gpus` attribute in [753](https://github.com/OSC/ood_core/pull/753). Only Slurm is supported in this release.
18
+ - Added support for Ruby 3 in [759](https://github.com/OSC/ood_core/pull/759).
19
+
20
+ ## [0.19.0] - 02-03-2022
21
+
22
+ ### Added
23
+
24
+ - Systemd adapter in [743](https://github.com/OSC/ood_core/pull/743).
25
+
26
+ ### Fixed
27
+
28
+ - The Linux host adapter is a little more portable in [333](https://github.com/OSC/ood_core/pull/333).
29
+ - Improved pod security for the k8s adapter in [748](https://github.com/OSC/ood_core/pull/748).
30
+
10
31
  ## [0.18.1] - 10-18-2021
11
32
 
12
33
  ### Fixed
@@ -411,7 +432,10 @@ Functionally the same as [0.17.3] but with some CI updates.
411
432
  ### Added
412
433
  - Initial release!
413
434
 
414
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.18.1...HEAD
435
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.1...HEAD
436
+ [0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
437
+ [0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
438
+ [0.19.0]: https://github.com/OSC/ood_core/compare/v0.18.1...v0.19.0
415
439
  [0.18.1]: https://github.com/OSC/ood_core/compare/v0.18.0...v0.18.1
416
440
  [0.18.0]: https://github.com/OSC/ood_core/compare/v0.17.8...v0.18.0
417
441
  [0.17.6]: https://github.com/OSC/ood_core/compare/v0.17.5...v0.17.6
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2017-2018 Ohio Supercomputer Center
3
+ Copyright (c) 2017-2022 Ohio Supercomputer Center
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -86,8 +86,13 @@ module OodCore
86
86
  # Clean up any old VNC sessions that weren't cleaned before
87
87
  #{vnc_clean}
88
88
 
89
+ # For TurboVNC 3.0 compatibility: only pass -nohttpd when the installed vncserver supports it.
90
+ if timeout 2 vncserver --help 2>&1 | grep 'nohttpd' >/dev/null 2>&1; then
91
+ HTTPD_OPT='-nohttpd'
92
+ fi
93
+
89
94
  # Attempt to start VNC server
90
- VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" -nohttpd -noxstartup #{vnc_args} 2>&1)
95
+ VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" $HTTPD_OPT -noxstartup #{vnc_args} 2>&1)
91
96
  VNC_PID=$(pgrep -s 0 Xvnc) # the script above will daemonize the Xvnc process
92
97
  echo "${VNC_OUT}"
93
98
 
@@ -33,6 +33,15 @@ module OodCore
33
33
  raise NotImplementedError, "subclass did not define #submit"
34
34
  end
35
35
 
36
+ # Retrieve the number of active and total cpus, nodes, and gpus
37
+ # @abstract Subclass is expected to implement {#cluster_info}
38
+ # @raise [NotImplementedError] if subclass did not define {#cluster_info}
39
+ # @return [ClusterInfo] Object containing quantified statistics about the
40
+ # cluster's active/total cpus, nodes, and gpus
41
+ def cluster_info
42
+ raise NotImplementedError, "subclass did not define #cluster_info"
43
+ end
44
+
36
45
  # Retrieve info for all jobs from the resource manager
37
46
  # @abstract Subclass is expected to implement {#info_all}
38
47
  # @raise [NotImplementedError] if subclass did not define {#info_all}
@@ -228,7 +228,7 @@ module OodCore
228
228
  data_hash[:submission_time] = raw['dateSubmitted'].to_i
229
229
  data_hash[:queue_name] = raw['criteriaPriority']
230
230
 
231
- Info.new(data_hash)
231
+ Info.new(**data_hash)
232
232
  end
233
233
 
234
234
  # extended data is just lines of 'key: value' value, so parse
@@ -242,7 +242,7 @@ module OodCore
242
242
 
243
243
  data.to_s.lines.drop(1).each do |line|
244
244
  match_data = ccqstat_regex.match(line)
245
- infos << Info.new(ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
245
+ infos << Info.new(**ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
246
246
  end
247
247
 
248
248
  infos
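
The `Info.new(**data_hash)` changes in this and the surrounding hunks are part of the Ruby 3 support noted in the changelog: Ruby 3 no longer converts a trailing hash into keyword arguments, so the hash has to be splatted explicitly. A tiny standalone illustration (not from the gem):

```ruby
def demo(id:, status: nil)
  [id, status]
end

args = { id: "123", status: :running }

demo(**args)  # => ["123", :running] on Ruby 2.7 and 3.x
demo(args)    # ArgumentError on Ruby 3.x (deprecation warning on 2.7)
```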
@@ -93,7 +93,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
93
93
 
94
94
  def info(id)
95
95
  pod_json = safe_call('get', 'pod', id)
96
- return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?
96
+ return OodCore::Job::Info.new(**{ id: id, status: 'completed' }) if pod_json.empty?
97
97
 
98
98
  service_json = safe_call('get', 'service', service_name(id))
99
99
  secret_json = safe_call('get', 'secret', secret_name(id))
@@ -31,7 +31,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
31
31
 
32
32
  pod_hash.deep_merge!(service_hash)
33
33
  pod_hash.deep_merge!(secret_hash)
34
- OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
34
+ OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(**pod_hash)
35
35
  rescue NoMethodError
36
36
  raise K8sDataError, "unable to read data correctly from json"
37
37
  end
@@ -2,8 +2,8 @@
2
2
  class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
3
3
  attr_reader :ood_connection_info
4
4
 
5
- def initialize(ood_connection_info: {}, **options)
6
- super(options)
7
- @ood_connection_info = ood_connection_info
5
+ def initialize(options)
6
+ super(**options)
7
+ @ood_connection_info = options[:ood_connection_info]
8
8
  end
9
9
  end
@@ -106,7 +106,9 @@ spec:
106
106
  allowPrivilegeEscalation: false
107
107
  capabilities:
108
108
  drop:
109
- - all
109
+ - ALL
110
+ seccompProfile:
111
+ type: RuntimeDefault
110
112
  privileged: false
111
113
  <%- unless spec.init_containers.empty? -%>
112
114
  initContainers:
@@ -152,7 +154,9 @@ spec:
152
154
  allowPrivilegeEscalation: false
153
155
  capabilities:
154
156
  drop:
155
- - all
157
+ - ALL
158
+ seccompProfile:
159
+ type: RuntimeDefault
156
160
  privileged: false
157
161
  <%- end # init container loop -%>
158
162
  <%- end # if init containers -%>
@@ -73,7 +73,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
73
73
  # Get the tmux pane PID for the target session
74
74
  pane_pid=$(tmux list-panes -aF '\#{session_name} \#{pane_pid}' | grep '#{session_name}' | cut -f 2 -d ' ')
75
75
  # Find the Singularity sinit PID child of the pane process
76
- pane_sinit_pid=$(pstree -p -l "$pane_pid" | grep -o 'sinit([[:digit:]]*' | grep -o '[[:digit:]]*')
76
+ pane_sinit_pid=$(pstree -p -l "$pane_pid" | egrep -o 'sinit[(][[:digit:]]*|shim-init[(][[:digit:]]*' | grep -o '[[:digit:]]*')
77
77
  # Kill sinit which stops both Singularity-based processes and the tmux session
78
78
  kill "$pane_sinit_pid"
79
79
  SCRIPT
@@ -16,7 +16,7 @@ module OodCore
16
16
  # @option config [#to_h] :bin_overrides ({}) Optional overrides to LSF client executables
17
17
  # @option config [#to_s] :submit_host ('') Host to submit commands to
18
18
  def self.build_lsf(config)
19
- batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
19
+ batch = Adapters::Lsf::Batch.new(**config.to_h.symbolize_keys)
20
20
  Adapters::Lsf.new(batch: batch)
21
21
  end
22
22
  end
@@ -36,6 +36,13 @@ module OodCore
36
36
  using Refinements::HashExtensions
37
37
  using Refinements::ArrayExtensions
38
38
 
39
+ # Get integer representing the number of gpus used by a node or job,
40
+ # calculated from gres string
41
+ # @return [Integer] the number of gpus in gres
42
+ def gpus_from_gres(gres)
43
+ gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
44
+ end
45
+
39
46
  # Object used for simplified communication with a Slurm batch server
40
47
  # @api private
41
48
  class Batch
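
A rough sketch of what `gpus_from_gres` yields for a few typical Slurm GRES strings; the strings are invented for illustration, and `adapter` is assumed to be a Slurm adapter instance as in the earlier sketch:

```ruby
adapter.gpus_from_gres("gpu:2")               # => 2
adapter.gpus_from_gres("gpu:v100:4")          # => 4
adapter.gpus_from_gres("gpu:a100:2,mps:100")  # => 2  (non-GPU GRES entries are ignored)
adapter.gpus_from_gres(nil)                   # => 0
```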
@@ -98,6 +105,22 @@ module OodCore
98
105
  @strict_host_checking = strict_host_checking
99
106
  end
100
107
 
108
+ # Get a ClusterInfo object containing information about the given cluster
109
+ # @return [ClusterInfo] object containing cluster details
110
+ def get_cluster_info
111
+ node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/')
112
+ gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2
113
+ gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}")
114
+ .lines.uniq.map(&:split)
115
+ ClusterInfo.new(active_nodes: node_cpu_info[0].to_i,
116
+ total_nodes: node_cpu_info[2].to_i,
117
+ active_processors: node_cpu_info[3].to_i,
118
+ total_processors: node_cpu_info[6].to_i,
119
+ active_gpus: gres_lines.sum { |line| gpus_from_gres(line[2]) },
120
+ total_gpus: gres_lines.sum { |line| gpus_from_gres(line[1]) }
121
+ )
122
+ end
123
+
101
124
  # Get a list of hashes detailing each of the jobs on the batch server
102
125
  # @example Status info for all jobs
103
126
  # my_batch.get_jobs
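
For context on the field indices used in `get_cluster_info` above: per Slurm's `sinfo` format specifiers, `%A` prints "allocated/idle" node counts, `%D` the node total, and `%C` "allocated/idle/other/total" CPU counts, so the output of `sinfo -aho %A/%D/%C` splits into seven slash-separated fields. A sketch with invented numbers:

```ruby
fields = "12/88/100/480/3360/160/4000".split('/')

fields[0]  # => "12"    active (allocated) nodes, from %A
fields[2]  # => "100"   total nodes, from %D
fields[3]  # => "480"   active (allocated) CPUs, from %C
fields[6]  # => "4000"  total CPUs, from %C
```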
@@ -454,6 +477,12 @@ module OodCore
454
477
  raise JobAdapterError, e.message
455
478
  end
456
479
 
480
+ # Retrieve info about active and total cpus, gpus, and nodes
481
+ # @return [ClusterInfo] information about cluster usage
482
+ def cluster_info
483
+ @slurm.get_cluster_info
484
+ end
485
+
457
486
  # Retrieve info for all jobs from the resource manager
458
487
  # @raise [JobAdapterError] if something goes wrong getting job info
459
488
  # @return [Array<Info>] information describing submitted jobs
@@ -643,7 +672,8 @@ module OodCore
643
672
  cpu_time: nil,
644
673
  submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
645
674
  dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
646
- native: v
675
+ native: v,
676
+ gpus: gpus_from_gres(v[:gres])
647
677
  )
648
678
  end
649
679
 
@@ -0,0 +1,252 @@
1
+ require 'erb'
2
+ require 'etc'
3
+ require 'pathname'
4
+ require 'securerandom'
5
+ require 'shellwords'
6
+ require 'time'
7
+
8
+ # Object used for simplified communication with SSH hosts
9
+ #
10
+ # @api private
11
+ class OodCore::Job::Adapters::LinuxSystemd::Launcher
12
+ attr_reader :debug, :site_timeout, :session_name_label, :ssh_hosts,
13
+ :strict_host_checking, :username
14
+ # The root exception class that all LinuxSystemd adapter-specific exceptions inherit
15
+ # from
16
+ class Error < StandardError; end
17
+
18
+ # @param debug Whether the adapter should be used in debug mode
19
+ # @param site_timeout [#to_i] A period after which the job should be killed or nil
20
+ # @param ssh_hosts List of hosts to check when scanning for running jobs
21
+ # @param strict_host_checking Allow SSH to perform strict host checking
22
+ # @param submit_host The SSH-able host
23
+ def initialize(
24
+ debug: false,
25
+ site_timeout: nil,
26
+ ssh_hosts:,
27
+ strict_host_checking: false,
28
+ submit_host:,
29
+ **_
30
+ )
31
+ @debug = !! debug
32
+ @site_timeout = site_timeout.to_i
33
+ @session_name_label = 'ondemand'
34
+ @ssh_hosts = ssh_hosts
35
+ @strict_host_checking = strict_host_checking
36
+ @submit_host = submit_host
37
+ @username = Etc.getlogin
38
+ end
39
+
40
+ # Start the script in a transient systemd user service on the submit host
41
+ # @param script [OodCore::Job::Script] The script object defining the work
42
+ def start_remote_session(script)
43
+ cmd = ssh_cmd(submit_host(script), ['/usr/bin/env', 'bash'])
44
+
45
+ session_name = unique_session_name
46
+ output = call(*cmd, stdin: wrapped_script(script, session_name))
47
+ hostname = parse_hostname(output)
48
+
49
+ "#{session_name}@#{hostname}"
50
+ end
51
+
52
+ def stop_remote_session(session_name, hostname)
53
+ cmd = ssh_cmd(hostname, ['/usr/bin/env', 'bash'])
54
+
55
+ kill_cmd = <<~SCRIPT
56
+ # stop the session by name
57
+ systemctl --user stop #{session_name}.service
58
+ SCRIPT
59
+
60
+ call(*cmd, stdin: kill_cmd)
61
+ rescue Error => e
62
+ interpret_and_raise(e)
63
+ end
64
+
65
+ def list_remote_sessions(host: nil)
66
+ host_list = (host) ? [host] : ssh_hosts
67
+
68
+ host_list.map {
69
+ |hostname| list_remote_systemd_session(hostname)
70
+ }.flatten.sort_by {
71
+ |hsh| hsh[:session_name]
72
+ }
73
+ end
74
+
75
+ def submit_host(script = nil)
76
+ if script && script.native && script.native['submit_host_override']
77
+ script.native['submit_host_override']
78
+ else
79
+ @submit_host
80
+ end
81
+ end
82
+
83
+ private
84
+
85
+ # Call a forked Slurm command for a given cluster
86
+ def call(cmd, *args, env: {}, stdin: "")
87
+ args = args.map(&:to_s)
88
+ env = env.to_h
89
+ o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
90
+ s.success? ? o : raise(Error, e)
91
+ end
92
+
93
+ # The full command to ssh into the destination host and execute the command.
94
+ # SSH options include:
95
+ # -t Force pseudo-terminal allocation
96
+ # -o BatchMode=yes (set mode to be non-interactive)
97
+ # if ! strict_host_checking
98
+ # -o UserKnownHostsFile=/dev/null (do not update the user's known hosts file)
99
+ # -o StrictHostKeyChecking=no (do not check the user's known hosts file)
100
+ #
101
+ # @param destination_host [#to_s] the destination host you wish to ssh into
102
+ # @param cmd [Array<#to_s>] the command to be executed on the destination host
103
+ def ssh_cmd(destination_host, cmd)
104
+ if strict_host_checking
105
+ [
106
+ 'ssh', '-t',
107
+ '-o', 'BatchMode=yes',
108
+ "#{username}@#{destination_host}"
109
+ ].concat(cmd)
110
+ else
111
+ [
112
+ 'ssh', '-t',
113
+ '-o', 'BatchMode=yes',
114
+ '-o', 'UserKnownHostsFile=/dev/null',
115
+ '-o', 'StrictHostKeyChecking=no',
116
+ "#{username}@#{destination_host}"
117
+ ].concat(cmd)
118
+ end
119
+ end
120
+
121
+ def shell
122
+ ENV['SHELL'] || '/bin/bash'
123
+ end
124
+
125
+ # Wraps a user-provided script into a systemd-run transient service
126
+ def wrapped_script(script, session_name)
127
+ content = script.content
128
+ unless user_script_has_shebang?(script)
129
+ content = "#!#{shell}\n#{content}"
130
+ end
131
+
132
+ ERB.new(
133
+ File.read(Pathname.new(__dir__).join('templates/script_wrapper.erb.sh'))
134
+ ).result(binding.tap {|bnd|
135
+ {
136
+ 'arguments' => script_arguments(script),
137
+ 'cd_to_workdir' => (script.workdir) ? "cd #{script.workdir}" : '',
138
+ 'debug' => debug,
139
+ 'email_on_terminated' => script_email_on_event(script, 'terminated'),
140
+ 'email_on_start' => script_email_on_event(script, 'started'),
141
+ 'environment' => export_env(script),
142
+ 'error_path' => error_path(script),
143
+ 'job_name' => script.job_name.to_s,
144
+ 'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
145
+ 'script_content' => content,
146
+ 'script_timeout' => script_timeout(script),
147
+ 'session_name' => session_name,
148
+ 'ssh_hosts' => ssh_hosts,
149
+ 'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
150
+ }.each{
151
+ |key, value| bnd.local_variable_set(key, value)
152
+ }
153
+ })
154
+ end
155
+
156
+ # Generate the environment export block for this script
157
+ def export_env(script)
158
+ environment = script.job_environment
159
+ (environment ? environment : {}).map{
160
+ |key, value| "export #{key}=#{Shellwords.escape(value)}"
161
+ }.sort.join("\n")
162
+ end
163
+
164
+ def script_timeout(script)
165
+ wall_time = script.wall_time.to_i
166
+ if wall_time == 0
167
+ # this is the only way it can be 0
168
+ # so make it into infinify for systemd to never terminate
169
+ site_timeout == 0 ? 'infinity' : site_timeout
170
+ elsif site_timeout != 0
171
+ [wall_time, site_timeout].min
172
+ else
173
+ wall_time
174
+ end
175
+ end
176
+
177
+ def script_arguments(script)
178
+ return '' unless script.args
179
+
180
+ Shellwords.join(script.args)
181
+ end
182
+
183
+ def script_email_on_event(script, event)
184
+ return false unless script.email && script.send("email_on_#{event}")
185
+
186
+ ERB.new(
187
+ File.read(Pathname.new(__dir__).join('templates/email.erb.sh'))
188
+ ).result(binding.tap {|bnd|
189
+ {
190
+ 'email_recipients' => script.email.map{|addr| Shellwords.escape(addr)}.join(', '),
191
+ 'job_name' => (script.job_name) ? script.job_name : 'LinuxHost_Adapter_Job',
192
+ 'job_status' => event
193
+ }.each{
194
+ |key, value| bnd.local_variable_set(key, value)
195
+ }
196
+ })
197
+ end
198
+
199
+ def unique_session_name
200
+ "#{session_name_label}-#{SecureRandom.alphanumeric(10)}"
201
+ end
202
+
203
+ # List all Systemd sessions on destination_host started by this adapter
204
+ def list_remote_systemd_session(destination_host)
205
+ cmd = ssh_cmd(destination_host, ['systemctl', '--user', 'show', '-t', 'service', '--state=running', "#{session_name_label}-*"])
206
+
207
+ # individual units are separated with an empty line
208
+ call(*cmd).split("\n\n").map do |oneunit|
209
+ Hash[oneunit.split("\n").map{ |line| line.split('=',2) }].tap do |session_hash|
210
+ session_hash[:session_name] = session_hash['Id'].delete_suffix('.service')
211
+ session_hash[:destination_host] = destination_host
212
+ session_hash[:id] = "#{session_hash[:session_name]}@#{destination_host}"
213
+ session_hash[:session_created] = Time.parse(session_hash['ExecMainStartTimestamp'])
214
+ session_hash[:job_name] = session_hash['Description']
215
+ end
216
+ end
217
+ rescue Error => e
218
+ interpret_and_raise(e)
219
+ []
220
+ end
221
+
222
+ def user_script_has_shebang?(script)
223
+ return false if script.content.empty?
224
+ script.content.split("\n").first.start_with?('#!/')
225
+ end
226
+
227
+ def error_path(script)
228
+ return script.error_path.to_s if script.error_path
229
+ return script.output_path.to_s if script.output_path
230
+
231
+ '/dev/null'
232
+ end
233
+
234
+ # under some conditions tmux returns status code 1 but it's not an actual
235
+ # error. These are when the session is not found or there are no sessions
236
+ # at all.
237
+ def interpret_and_raise(error)
238
+ if error.message.include?('failed to connect to server') # no sessions in tmux 1.8
239
+ nil
240
+ elsif error.message.include?('no server running on') # no sessions in tmux 2.7+ message
241
+ nil
242
+ else
243
+ raise error
244
+ end
245
+ end
246
+
247
+ def parse_hostname(output)
248
+ output.split($/).map do |line|
249
+ line[/^HOSTNAME:(.*)$/, 1]
250
+ end.compact.last.to_s
251
+ end
252
+ end
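
Job ids from this launcher are the transient unit name joined to the execution host with an '@'. A short sketch of the round trip (host and random suffix are invented):

```ruby
# start_remote_session returns "#{session_name}@#{hostname}":
id = "ondemand-a1B2c3D4e5@node01.example.edu"

session_name, destination_host = id.split('@')
# => ["ondemand-a1B2c3D4e5", "node01.example.edu"]

# stop_remote_session(session_name, destination_host) then runs
#   systemctl --user stop ondemand-a1B2c3D4e5.service
# on node01.example.edu over ssh.
```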
@@ -0,0 +1,9 @@
1
+ if command -v mail; then
2
+ cat << EMAIL_CONTENT | mail -s "Job <%= job_name %> has <%= job_status %>" <%= email_recipients %>
3
+ Greetings,
4
+
5
+ Your job <%= job_name %> has <%= job_status %>.
6
+
7
+ - The OnDemand Linux Systemd Adapter
8
+ EMAIL_CONTENT
9
+ fi
@@ -0,0 +1,56 @@
1
+ #!/bin/bash
2
+ SSH_HOSTS=(<%= ssh_hosts.join(' ').to_s %>)
3
+ hostnames=`hostname -A`
4
+ for host in ${SSH_HOSTS[@]}
5
+ do
6
+ if [[ " ${hostnames[@]} " =~ " ${host} " ]]; then
7
+ hostname=$host
8
+ fi
9
+ done
10
+
11
+ if [ -z "$hostname" ]; then
12
+ printf >&2 "ERROR: Can't start job on [${hostnames[@]}] because it does not match any hostname configured \nin ssh_hosts [${SSH_HOSTS[@]}]. The output of 'hostname -A' must match an entry in ssh_hosts \nfrom the cluster configuration."
13
+ exit 1
14
+ fi
15
+
16
+ echo ""
17
+ echo "HOSTNAME:$hostname"
18
+
19
+ # we need this user to be enabled for lingering or else the newly started
20
+ # service will end as soon as the ssh session starting has exited
21
+ loginctl enable-linger
22
+
23
+ # Put the script into a temp file on localhost
24
+ systemd_service_file="<%= workdir %>/systemd_service.sh"
25
+ systemd_service_file_pre="<%= workdir %>/systemd_pre.sh"
26
+ systemd_service_file_post="<%= workdir %>/systemd_post.sh"
27
+
28
+ cat << 'SYSTEMD_EXEC_PRE' > "$systemd_service_file_pre"
29
+ #!/bin/bash
30
+ <%= cd_to_workdir %>
31
+ <% if email_on_start %>
32
+ <%= email_on_start %>
33
+ <% end %>
34
+ SYSTEMD_EXEC_PRE
35
+
36
+ cat << 'SYSTEMD_EXEC_POST' > "$systemd_service_file_post"
37
+ #!/bin/bash
38
+ <%= cd_to_workdir %>
39
+ <% if email_on_terminated %>
40
+ <%= email_on_terminated %>
41
+ <% end %>
42
+ SYSTEMD_EXEC_POST
43
+
44
+ # Create an executable for systemd service to run
45
+ # Escaped HEREDOC means that we do not have to worry about Shell.escape-ing script_content
46
+ cat << 'SYSTEMD_EXEC' > "$systemd_service_file"
47
+ <%= script_content %>
48
+ SYSTEMD_EXEC
49
+
50
+ # Run the script inside a transient systemd user service
51
+ chmod +x "$systemd_service_file_pre" "$systemd_service_file" "$systemd_service_file_post"
52
+ <%= cd_to_workdir %>
53
+ systemd-run --user -r --no-block --unit=<%= session_name %> -p RuntimeMaxSec=<%= script_timeout %> \
54
+ -p ExecStartPre="$systemd_service_file_pre" -p ExecStartPost="$systemd_service_file_post" \
55
+ -p StandardOutput="file:<%= output_path %>" -p StandardError="file:<%= error_path %>" \
56
+ -p Description="<%= job_name %>" "$systemd_service_file"
@@ -0,0 +1,230 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+ require "ood_core/refinements/array_extensions"
3
+ require "ood_core/job/adapters/helper"
4
+ require "set"
5
+
6
+ module OodCore
7
+ module Job
8
+ class Factory
9
+ using Refinements::HashExtensions
10
+
11
+ # Build the LinuxSystemd adapter from a configuration
12
+ # @param config [#to_h] the configuration for job adapter
13
+ # @option config [Object] :debug (false) Use the adapter in a debug mode
14
+ # @option config [Object] :max_timeout (nil) The longest 'wall_clock' permissible
15
+ # @option config [Object] :ssh_hosts (nil) The list of permissible hosts, defaults to :submit_host
16
+ # @option config [Object] :strict_host_checking (true) Set to false to disable strict host checking and updating the known_hosts file
17
+ # @option config [Object] :submit_host The SSH target to connect to, may be the head of a round-robin
18
+ def self.build_systemd(config)
19
+ c = config.to_h.symbolize_keys
20
+ debug = c.fetch(:debug, false)
21
+ max_timeout = c.fetch(:max_timeout, nil)
22
+ ssh_hosts = c.fetch(:ssh_hosts, [c[:submit_host]])
23
+ strict_host_checking = c.fetch(:strict_host_checking, true)
24
+ submit_host = c[:submit_host]
25
+
26
+ Adapters::LinuxSystemd.new(
27
+ ssh_hosts: ssh_hosts,
28
+ launcher: Adapters::LinuxSystemd::Launcher.new(
29
+ debug: debug,
30
+ max_timeout: max_timeout,
31
+ ssh_hosts: ssh_hosts,
32
+ strict_host_checking: strict_host_checking,
33
+ submit_host: submit_host,
34
+ )
35
+ )
36
+ end
37
+ end
38
+
39
+ module Adapters
40
+ # An adapter object that describes the communication with a remote host
41
+ # for job management.
42
+ class LinuxSystemd < Adapter
43
+ using Refinements::ArrayExtensions
44
+
45
+ require "ood_core/job/adapters/systemd/launcher"
46
+
47
+ def initialize(ssh_hosts:, launcher:)
48
+ @launcher = launcher
49
+ @ssh_hosts = Set.new(ssh_hosts)
50
+ end
51
+
52
+ # Submit a job with the attributes defined in the job template instance
53
+ # @param script [Script] script object that describes the script and
54
+ # attributes for the submitted job
55
+ # @param after [#to_s, Array<#to_s>] Scheduling dependencies are not supported; setting this raises JobAdapterError
56
+ # @param afterok [#to_s, Array<#to_s>] Scheduling dependencies are not supported; setting this raises JobAdapterError
57
+ # @param afternotok [#to_s, Array<#to_s>] Scheduling dependencies are not supported; setting this raises JobAdapterError
58
+ # @param afterany [#to_s, Array<#to_s>] Scheduling dependencies are not supported; setting this raises JobAdapterError
59
+ # @raise [JobAdapterError] if something goes wrong submitting a job
60
+ # @return [String] the job id returned after successfully submitting a
61
+ # job
62
+ # @see Adapter#submit
63
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
64
+ unless (after.empty? && afterok.empty? && afternotok.empty? && afterany.empty?)
65
+ raise JobAdapterError, 'Scheduling subsequent jobs is not available.'
66
+ end
67
+
68
+ @launcher.start_remote_session(script)
69
+ rescue Launcher::Error => e
70
+ raise JobAdapterError, e.message
71
+ end
72
+
73
+ # Retrieve info for all jobs from the resource manager
74
+ # @raise [JobAdapterError] if something goes wrong getting job info
75
+ # @return [Array<Info>] information describing submitted jobs
76
+ # @see Adapter#info_all
77
+ def info_all(attrs: nil, host: nil)
78
+ host_permitted?(host) if host
79
+
80
+ @launcher.list_remote_sessions(host: host).map{
81
+ |ls_output| ls_to_info(ls_output)
82
+ }
83
+ rescue Launcher::Error => e
84
+ raise JobAdapterError, e.message
85
+ end
86
+
87
+ # Retrieve info for all jobs for a given owner or owners from the
88
+ # resource manager
89
+ # Note: owner and attrs are present only to complete the interface and are ignored
90
+ # Note: since this API is used in production no errors or warnings are thrown / issued
91
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
92
+ # @raise [JobAdapterError] if something goes wrong getting job info
93
+ # @return [Array<Info>] information describing submitted jobs
94
+ def info_where_owner(_, attrs: nil)
95
+ info_all
96
+ end
97
+
98
+ # Iterate over each job Info object
99
+ # @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
100
+ # @yield [Info] of each job to block
101
+ # @return [Enumerator] if no block given
102
+ def info_all_each(attrs: nil)
103
+ return to_enum(:info_all_each, attrs: attrs) unless block_given?
104
+
105
+ info_all(attrs: attrs).each do |job|
106
+ yield job
107
+ end
108
+ end
109
+
110
+ # Iterate over each job Info object
111
+ # @param owner [#to_s, Array<#to_s>] owner is present only to complete the interface and is ignored
112
+ # @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
113
+ # @yield [Info] of each job to block
114
+ # @return [Enumerator] if no block given
115
+ def info_where_owner_each(owner, attrs: nil)
116
+ return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
117
+
118
+ info_where_owner(owner, attrs: attrs).each do |job|
119
+ yield job
120
+ end
121
+ end
122
+
123
+ # Whether the adapter supports job arrays
124
+ # @return [Boolean] - false
125
+ def supports_job_arrays?
126
+ false
127
+ end
128
+
129
+ # Retrieve job info from the SSH host
130
+ # @param id [#to_s] the id of the job
131
+ # @raise [JobAdapterError] if something goes wrong getting job info
132
+ # @return [Info] information describing submitted job
133
+ # @see Adapter#info
134
+ def info(id)
135
+ _, host = parse_job_id(id)
136
+ job = info_all(host: host).select{|info| info.id == id}.first
137
+ (job) ? job : Info.new(id: id, status: :completed)
138
+ rescue Launcher::Error => e
139
+ raise JobAdapterError, e.message
140
+ end
141
+
142
+ # Retrieve job status from resource manager
143
+ # @note Optimized slightly over retrieving complete job information from server
144
+ # @abstract Subclass is expected to implement {#status}
145
+ # @raise [NotImplementedError] if subclass did not define {#status}
146
+ # @param id [#to_s] the id of the job
147
+ # @return [Status] status of job
148
+ def status(id)
149
+ _, host = parse_job_id(id)
150
+ job = info_all(host: host).select{|info| info.id == id}.first
151
+
152
+ Status.new(state: (job) ? :running : :completed)
153
+ rescue Launcher::Error => e
154
+ raise JobAdapterError, e.message
155
+ end
156
+
157
+ # Put the submitted job on hold
158
+ # @abstract Subclass is expected to implement {#hold}
159
+ # @raise [NotImplementedError] if subclass did not define {#hold}
160
+ # @param id [#to_s] the id of the job
161
+ # @return [void]
162
+ def hold(id)
163
+ # Consider sending SIGSTOP?
164
+ raise NotImplementedError, "subclass did not define #hold"
165
+ end
166
+
167
+ # Release the job that is on hold
168
+ # @abstract Subclass is expected to implement {#release}
169
+ # @raise [NotImplementedError] if subclass did not define {#release}
170
+ # @param id [#to_s] the id of the job
171
+ # @return [void]
172
+ def release(id)
173
+ # Consider sending SIGCONT
174
+ raise NotImplementedError, "subclass did not define #release"
175
+ end
176
+
177
+ # Delete the submitted job
178
+ # @abstract Subclass is expected to implement {#delete}
179
+ # @raise [NotImplementedError] if subclass did not define {#delete}
180
+ # @param id [#to_s] the id of the job
181
+ # @return [void]
182
+ def delete(id)
183
+ session_name, destination_host = parse_job_id(id)
184
+ @launcher.stop_remote_session(session_name, destination_host)
185
+ rescue Launcher::Error => e
186
+ raise JobAdapterError, e.message
187
+ end
188
+
189
+ def directive_prefix
190
+ nil
191
+ end
192
+
193
+ private
194
+
195
+ def host_permitted?(destination_host)
196
+ raise JobAdapterError, "Requested destination host (#{destination_host}) not permitted" unless @ssh_hosts.include?(destination_host)
197
+ end
198
+
199
+ def parse_job_id(id)
200
+ raise JobAdapterError, "#{id} is not a valid LinuxSystemd adapter id because it is missing the '@'." unless id.include?('@')
201
+
202
+ return id.split('@')
203
+ end
204
+
205
+ # Convert the returned Hash into an Info object
206
+ def ls_to_info(ls_output)
207
+ started = ls_output[:session_created].to_i
208
+ now = Time.now.to_i
209
+ ellapsed = now - started
210
+ Info.new(
211
+ accounting_id: nil,
212
+ allocated_nodes: [NodeInfo.new(name: ls_output[:destination_host], procs: 1)],
213
+ cpu_time: ellapsed,
214
+ dispatch_time: started,
215
+ id: ls_output[:id],
216
+ job_name: ls_output[:job_name],
217
+ job_owner: Etc.getlogin,
218
+ native: ls_output,
219
+ procs: 1,
220
+ queue_name: "LinuxSystemd adapter for #{@submit_host}",
221
+ status: :running,
222
+ submission_time: ellapsed,
223
+ submit_host: @submit_host,
224
+ wallclock_time: ellapsed
225
+ )
226
+ end
227
+ end
228
+ end
229
+ end
230
+ end
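
A minimal sketch of building the new LinuxSystemd adapter through the factory and driving a job through it; the host names are placeholders, and in a real deployment these options would normally come from a cluster configuration file rather than an inline hash.

```ruby
require "ood_core"

adapter = OodCore::Job::Factory.build(
  adapter:     "systemd",
  submit_host: "login01.example.edu",
  ssh_hosts:   ["node01.example.edu", "node02.example.edu"]
)

script = OodCore::Job::Script.new(content: "#!/bin/bash\nsleep 300", job_name: "demo")

id = adapter.submit(script)  # e.g. "ondemand-XXXXXXXXXX@node01.example.edu"
adapter.status(id)           # => Status (:running while the unit is active, :completed otherwise)
adapter.delete(id)           # stops the transient systemd unit over ssh
```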
@@ -0,0 +1,32 @@
1
+ module OodCore
2
+ module Job
3
+ # An object that contains details about the cluster's active and total nodes, processors, and gpus
4
+ class ClusterInfo
5
+ using Refinements::HashExtensions
6
+
7
+ attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes,
8
+ :total_gpu_nodes, :active_gpus, :total_gpus
9
+
10
+ def initialize(opts = {})
11
+ opts = opts.transform_keys(&:to_sym)
12
+ @active_nodes = opts.fetch(:active_nodes, nil).to_i
13
+ @total_nodes = opts.fetch(:total_nodes, nil).to_i
14
+ @active_processors = opts.fetch(:active_processors, nil).to_i
15
+ @total_processors = opts.fetch(:total_processors, nil).to_i
16
+ @active_gpus = opts.fetch(:active_gpus, nil).to_i
17
+ @total_gpus = opts.fetch(:total_gpus, nil).to_i
18
+ end
19
+
20
+ def to_h
21
+ {
22
+ active_nodes: active_nodes,
23
+ total_nodes: total_nodes,
24
+ active_processors: active_processors,
25
+ total_processors: total_processors,
26
+ active_gpus: active_gpus,
27
+ total_gpus: total_gpus
28
+ }
29
+ end
30
+ end
31
+ end
32
+ end
@@ -65,6 +65,10 @@ module OodCore
65
65
  # @return [Object] native info
66
66
  attr_reader :native
67
67
 
68
+ # Number of gpus allocated for job
69
+ # @return [Integer, nil] allocated total number of gpus
70
+ attr_reader :gpus
71
+
68
72
  # List of job array child task statuses
69
73
  # @note only relevant for job arrays
70
74
  # @return [Array<Task>] tasks
@@ -86,15 +90,16 @@ module OodCore
86
90
  # @param dispatch_time [#to_i, nil] dispatch time
87
91
  # @param tasks [Array<Hash>] tasks e.g. { id: '12345.owens-batch', status: :running }
88
92
  # @param native [Object] native info
93
+ # @param gpus [#to_i, 0] allocated total number of gpus
89
94
  def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
90
95
  job_name: nil, job_owner: nil, accounting_id: nil,
91
96
  procs: nil, queue_name: nil, wallclock_time: nil,
92
97
  wallclock_limit: nil, cpu_time: nil, submission_time: nil,
93
- dispatch_time: nil, native: nil, tasks: [],
98
+ dispatch_time: nil, native: nil, gpus: 0, tasks: [],
94
99
  **_)
95
100
  @id = id.to_s
96
101
  @status = Status.new(state: status.to_sym)
97
- @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
102
+ @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(**n.to_h) }
98
103
  @submit_host = submit_host && submit_host.to_s
99
104
  @job_name = job_name && job_name.to_s
100
105
  @job_owner = job_owner && job_owner.to_s
@@ -111,6 +116,7 @@ module OodCore
111
116
  @status = job_array_aggregate_status unless @tasks.empty?
112
117
 
113
118
  @native = native
119
+ @gpus = gpus && gpus.to_i
114
120
  end
115
121
 
116
122
  # Create a new Info for a child task
@@ -147,10 +153,15 @@ module OodCore
147
153
  submission_time: submission_time,
148
154
  dispatch_time: dispatch_time,
149
155
  native: native,
156
+ gpus: gpus,
150
157
  tasks: tasks
151
158
  }
152
159
  end
153
160
 
161
+ def gpu?
162
+ gpus.positive?
163
+ end
164
+
154
165
  # The comparison operator
155
166
  # @param other [#to_h] object to compare against
156
167
  # @return [Boolean] whether objects are equivalent
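
A small illustration of the new GPU fields on `OodCore::Job::Info` (ids and counts are made up):

```ruby
require "ood_core"

info = OodCore::Job::Info.new(id: "1234", status: :running, gpus: 2)
info.gpus  # => 2
info.gpu?  # => true

# gpus defaults to 0, so adapters that never set it report no GPUs:
OodCore::Job::Info.new(id: "1235", status: :queued).gpu?  # => false
```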
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.18.1"
3
+ VERSION = "0.20.1"
4
4
  end
data/lib/ood_core.rb CHANGED
@@ -11,6 +11,7 @@ module OodCore
11
11
  require "ood_core/job/node_info"
12
12
  require "ood_core/job/script"
13
13
  require "ood_core/job/info"
14
+ require "ood_core/job/cluster_info"
14
15
  require "ood_core/job/status"
15
16
  require "ood_core/job/adapter"
16
17
  require "ood_core/job/factory"
data/ood_core.gemspec CHANGED
@@ -20,14 +20,15 @@ Gem::Specification.new do |spec|
20
20
  spec.bindir = "exe"
21
21
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
22
22
  spec.require_paths = ["lib"]
23
- spec.required_ruby_version = ">= 2.2.0"
23
+ spec.required_ruby_version = ">= 2.7.0"
24
24
 
25
25
  spec.add_runtime_dependency "ood_support", "~> 0.0.2"
26
26
  spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
27
+ spec.add_runtime_dependency "rexml", "~> 3.2"
27
28
  spec.add_development_dependency "bundler", "~> 2.1"
28
29
  spec.add_development_dependency "rake", "~> 13.0.1"
29
30
  spec.add_development_dependency "rspec", "~> 3.0"
30
31
  spec.add_development_dependency "pry", "~> 0.10"
31
32
  spec.add_development_dependency "timecop", "~> 0.8"
32
- spec.add_development_dependency "climate_control", "~> 0.2.0"
33
+ spec.add_development_dependency "climate_control", "~> 1.1.1"
33
34
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.1
4
+ version: 0.20.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2021-10-18 00:00:00.000000000 Z
13
+ date: 2022-07-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -46,6 +46,20 @@ dependencies:
46
46
  - - ">="
47
47
  - !ruby/object:Gem::Version
48
48
  version: 1.9.6
49
+ - !ruby/object:Gem::Dependency
50
+ name: rexml
51
+ requirement: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '3.2'
56
+ type: :runtime
57
+ prerelease: false
58
+ version_requirements: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: '3.2'
49
63
  - !ruby/object:Gem::Dependency
50
64
  name: bundler
51
65
  requirement: !ruby/object:Gem::Requirement
@@ -122,14 +136,14 @@ dependencies:
122
136
  requirements:
123
137
  - - "~>"
124
138
  - !ruby/object:Gem::Version
125
- version: 0.2.0
139
+ version: 1.1.1
126
140
  type: :development
127
141
  prerelease: false
128
142
  version_requirements: !ruby/object:Gem::Requirement
129
143
  requirements:
130
144
  - - "~>"
131
145
  - !ruby/object:Gem::Version
132
- version: 0.2.0
146
+ version: 1.1.1
133
147
  description: Open OnDemand core library that provides support for an HPC Center to
134
148
  globally define HPC services that web applications can then take advantage of.
135
149
  email:
@@ -186,12 +200,17 @@ files:
186
200
  - lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb
187
201
  - lib/ood_core/job/adapters/sge/qstat_xml_r_listener.rb
188
202
  - lib/ood_core/job/adapters/slurm.rb
203
+ - lib/ood_core/job/adapters/systemd.rb
204
+ - lib/ood_core/job/adapters/systemd/launcher.rb
205
+ - lib/ood_core/job/adapters/systemd/templates/email.erb.sh
206
+ - lib/ood_core/job/adapters/systemd/templates/script_wrapper.erb.sh
189
207
  - lib/ood_core/job/adapters/torque.rb
190
208
  - lib/ood_core/job/adapters/torque/attributes.rb
191
209
  - lib/ood_core/job/adapters/torque/batch.rb
192
210
  - lib/ood_core/job/adapters/torque/error.rb
193
211
  - lib/ood_core/job/adapters/torque/ffi.rb
194
212
  - lib/ood_core/job/array_ids.rb
213
+ - lib/ood_core/job/cluster_info.rb
195
214
  - lib/ood_core/job/factory.rb
196
215
  - lib/ood_core/job/info.rb
197
216
  - lib/ood_core/job/node_info.rb
@@ -215,7 +234,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
215
234
  requirements:
216
235
  - - ">="
217
236
  - !ruby/object:Gem::Version
218
- version: 2.2.0
237
+ version: 2.7.0
219
238
  required_rubygems_version: !ruby/object:Gem::Requirement
220
239
  requirements:
221
240
  - - ">="