ood_core 0.13.0 → 0.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +30 -0
- data/CHANGELOG.md +64 -1
- data/README.md +2 -2
- data/lib/ood_core/cluster.rb +11 -5
- data/lib/ood_core/job/adapters/ccq.rb +19 -12
- data/lib/ood_core/job/adapters/kubernetes.rb +193 -0
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +372 -0
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +299 -0
- data/lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb +9 -0
- data/lib/ood_core/job/adapters/kubernetes/resources.rb +82 -0
- data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb +188 -0
- data/lib/ood_core/job/adapters/linux_host/launcher.rb +25 -10
- data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh +3 -14
- data/lib/ood_core/job/adapters/slurm.rb +18 -1
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +2 -1
- metadata +32 -6
- data/.travis.yml +0 -9
@@ -16,7 +16,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
16
16
|
# from
|
17
17
|
class Error < StandardError; end
|
18
18
|
|
19
|
-
UNIT_SEPARATOR = "\x1F"
|
19
|
+
UNIT_SEPARATOR = ","
|
20
20
|
|
21
21
|
# @param debug Whether the adapter should be used in debug mode
|
22
22
|
# @param site_timeout [#to_i] A period after which the job should be killed or nil
|
@@ -80,12 +80,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
80
80
|
|
81
81
|
call(*cmd, stdin: kill_cmd)
|
82
82
|
rescue Error => e
|
83
|
-
|
84
|
-
# The tmux server not running is not an error
|
85
|
-
e.message.include?('failed to connect to server') ||
|
86
|
-
# The session not being found is not an error
|
87
|
-
e.message.include?("session not found: #{session_name_label}")
|
88
|
-
)
|
83
|
+
interpret_and_raise(e)
|
89
84
|
end
|
90
85
|
|
91
86
|
def list_remote_sessions(host: nil)
|
@@ -166,7 +161,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
166
161
|
'email_on_terminated' => script_email_on_event(script, 'terminated'),
|
167
162
|
'email_on_start' => script_email_on_event(script, 'started'),
|
168
163
|
'environment' => export_env(script),
|
169
|
-
'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
|
164
|
+
'error_path' => error_path(script),
|
170
165
|
'job_name' => script.job_name.to_s,
|
171
166
|
'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
|
172
167
|
'script_content' => content,
|
@@ -176,6 +171,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
176
171
|
'singularity_image' => singularity_image(script.native),
|
177
172
|
'ssh_hosts' => ssh_hosts,
|
178
173
|
'tmux_bin' => tmux_bin,
|
174
|
+
'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
|
179
175
|
}.each{
|
180
176
|
|key, value| bnd.local_variable_set(key, value)
|
181
177
|
}
|
@@ -263,8 +259,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
263
259
|
|session_hash| session_hash[:session_name].start_with?(session_name_label)
|
264
260
|
}
|
265
261
|
rescue Error => e
|
266
|
-
|
267
|
-
raise e unless e.message.include?('failed to connect to server')
|
262
|
+
interpret_and_raise(e)
|
268
263
|
[]
|
269
264
|
end
|
270
265
|
|
@@ -272,4 +267,24 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
|
|
272
267
|
return false if script.content.empty?
|
273
268
|
script.content.split("\n").first.start_with?('#!/')
|
274
269
|
end
|
270
|
+
|
271
|
+
def error_path(script)
|
272
|
+
return script.error_path.to_s if script.error_path
|
273
|
+
return script.output_path.to_s if script.output_path
|
274
|
+
|
275
|
+
'/dev/null'
|
276
|
+
end
|
277
|
+
|
278
|
+
# under some conditions tmux returns status code 1 but it's not an actual
|
279
|
+
# error. These are when the session is not found or there are no sessions
|
280
|
+
# at all.
|
281
|
+
def interpret_and_raise(error)
|
282
|
+
if error.message.include?('failed to connect to server') # no sessions in tmux 1.8
|
283
|
+
nil
|
284
|
+
elsif error.message.include?('no server running on') # no sessions in tmux 2.7+ message
|
285
|
+
nil
|
286
|
+
else
|
287
|
+
raise error
|
288
|
+
end
|
289
|
+
end
|
275
290
|
end
|
@@ -16,13 +16,9 @@ fi
|
|
16
16
|
echo $hostname
|
17
17
|
|
18
18
|
# Put the script into a temp file on localhost
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
<% else %>
|
23
|
-
singularity_tmp_file=$(mktemp)
|
24
|
-
tmux_tmp_file=$(mktemp)
|
25
|
-
<% end %>
|
19
|
+
singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
|
20
|
+
tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
|
21
|
+
|
26
22
|
|
27
23
|
# Create an executable to run in a tmux session
|
28
24
|
# The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
|
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
|
|
69
65
|
chmod +x "$singularity_tmp_file"
|
70
66
|
chmod +x "$tmux_tmp_file"
|
71
67
|
<%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
|
72
|
-
|
73
|
-
# Remove the file
|
74
|
-
<% if ! debug %>
|
75
|
-
# Wait 1 second to ensure that tmux session has started before the file is removed
|
76
|
-
sleep 1
|
77
|
-
rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
|
78
|
-
<% end %>
|
@@ -80,6 +80,9 @@ module OodCore
|
|
80
80
|
# from
|
81
81
|
class Error < StandardError; end
|
82
82
|
|
83
|
+
# An error indicating the slurm command timed out
|
84
|
+
class SlurmTimeoutError < Error; end
|
85
|
+
|
83
86
|
# @param cluster [#to_s, nil] the cluster name
|
84
87
|
# @param conf [#to_s, nil] path to the slurm conf
|
85
88
|
# @param bin [#to_s] path to slurm installation binaries
|
@@ -147,6 +150,9 @@ module OodCore
|
|
147
150
|
end
|
148
151
|
jobs
|
149
152
|
end
|
153
|
+
rescue SlurmTimeoutError
|
154
|
+
# TODO: could use a log entry here
|
155
|
+
return [{ id: id, state: 'undetermined' }]
|
150
156
|
end
|
151
157
|
|
152
158
|
def squeue_fields(attrs)
|
@@ -303,7 +309,18 @@ module OodCore
|
|
303
309
|
|
304
310
|
cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
|
305
311
|
o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
|
306
|
-
s.success? ? o : raise(Error, e)
|
312
|
+
s.success? ? interpret_and_raise(o, e) : raise(Error, e)
|
313
|
+
end
|
314
|
+
|
315
|
+
# Helper function to raise an error based on the contents of stderr.
|
316
|
+
# Slurm exits 0 even when the command fails, so we need to interpret stderr
|
317
|
+
# to see if the command was actually successful.
|
318
|
+
def interpret_and_raise(stdout, stderr)
|
319
|
+
return stdout if stderr.empty?
|
320
|
+
|
321
|
+
raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
|
322
|
+
|
323
|
+
stdout
|
307
324
|
end
|
308
325
|
|
309
326
|
def squeue_attrs_for_info_attrs(attrs)
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
|
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "ood_support", "~> 0.0.2"
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
|
-
spec.add_development_dependency "bundler", "~> 1
|
27
|
+
spec.add_development_dependency "bundler", "~> 2.1"
|
28
|
+
spec.add_runtime_dependency "activesupport", ">= 5.2", "< 6.0"
|
28
29
|
spec.add_development_dependency "rake", "~> 13.0.1"
|
29
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
30
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.0
|
4
|
+
version: 0.16.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -52,14 +52,34 @@ dependencies:
|
|
52
52
|
requirements:
|
53
53
|
- - "~>"
|
54
54
|
- !ruby/object:Gem::Version
|
55
|
-
version: '1
|
55
|
+
version: '2.1'
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
58
|
version_requirements: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '1
|
62
|
+
version: '2.1'
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: activesupport
|
65
|
+
requirement: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '5.2'
|
70
|
+
- - "<"
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '6.0'
|
73
|
+
type: :runtime
|
74
|
+
prerelease: false
|
75
|
+
version_requirements: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '5.2'
|
80
|
+
- - "<"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '6.0'
|
63
83
|
- !ruby/object:Gem::Dependency
|
64
84
|
name: rake
|
65
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,9 +160,9 @@ executables: []
|
|
140
160
|
extensions: []
|
141
161
|
extra_rdoc_files: []
|
142
162
|
files:
|
163
|
+
- ".github/workflows/test.yml"
|
143
164
|
- ".gitignore"
|
144
165
|
- ".rspec"
|
145
|
-
- ".travis.yml"
|
146
166
|
- CHANGELOG.md
|
147
167
|
- Gemfile
|
148
168
|
- LICENSE.txt
|
@@ -166,6 +186,12 @@ files:
|
|
166
186
|
- lib/ood_core/job/adapters/ccq.rb
|
167
187
|
- lib/ood_core/job/adapters/drmaa.rb
|
168
188
|
- lib/ood_core/job/adapters/helper.rb
|
189
|
+
- lib/ood_core/job/adapters/kubernetes.rb
|
190
|
+
- lib/ood_core/job/adapters/kubernetes/batch.rb
|
191
|
+
- lib/ood_core/job/adapters/kubernetes/helper.rb
|
192
|
+
- lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb
|
193
|
+
- lib/ood_core/job/adapters/kubernetes/resources.rb
|
194
|
+
- lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
|
169
195
|
- lib/ood_core/job/adapters/linux_host.rb
|
170
196
|
- lib/ood_core/job/adapters/linux_host/launcher.rb
|
171
197
|
- lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
|
@@ -216,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
216
242
|
- !ruby/object:Gem::Version
|
217
243
|
version: '0'
|
218
244
|
requirements: []
|
219
|
-
rubygems_version: 3.
|
245
|
+
rubygems_version: 3.1.2
|
220
246
|
signing_key:
|
221
247
|
specification_version: 4
|
222
248
|
summary: Open OnDemand core library
|