ood_core 0.9.3 → 0.11.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +2 -2
- data/CHANGELOG.md +52 -2
- data/lib/ood_core.rb +1 -0
- data/lib/ood_core/batch_connect/template.rb +44 -2
- data/lib/ood_core/cluster.rb +15 -0
- data/lib/ood_core/clusters.rb +22 -10
- data/lib/ood_core/invalid_cluster.rb +37 -0
- data/lib/ood_core/job/adapter.rb +35 -4
- data/lib/ood_core/job/adapters/drmaa.rb +1 -1
- data/lib/ood_core/job/adapters/linux_host.rb +245 -0
- data/lib/ood_core/job/adapters/linux_host/launcher.rb +274 -0
- data/lib/ood_core/job/adapters/linux_host/templates/email.erb.sh +9 -0
- data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh +64 -0
- data/lib/ood_core/job/adapters/lsf.rb +4 -0
- data/lib/ood_core/job/adapters/lsf/helper.rb +9 -3
- data/lib/ood_core/job/adapters/pbspro.rb +5 -0
- data/lib/ood_core/job/adapters/sge.rb +4 -0
- data/lib/ood_core/job/adapters/sge/batch.rb +1 -2
- data/lib/ood_core/job/adapters/sge/helper.rb +1 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb +19 -4
- data/lib/ood_core/job/adapters/slurm.rb +25 -2
- data/lib/ood_core/job/adapters/torque.rb +5 -0
- data/lib/ood_core/job/array_ids.rb +18 -53
- data/lib/ood_core/job/script.rb +11 -2
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +2 -1
- metadata +24 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4f452bc936a1246bb7ac53d3aeafe36c7f54ad23a2d754d916b5701c40343288
|
4
|
+
data.tar.gz: f30a99239692b568b30453a0c519cbf8adb977589c4dfa35132bd4a0f6019c17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eb1d20267c147d723bfdafb1d169d6f7efc8323f66a749959672f5df23dc74d8503453f5a9214bf7bf35e5db74156ac3e48cdf18b19fa256d8a53fd331df5491
|
7
|
+
data.tar.gz: 1672bfd5d571492d9c5d6e97e1b2f3eeeeb26bd5580bca6e12b5e6282f05be9300b5b58aecb7ec20ecf8b9513983e1df5d78968bc6a23bacfc38262ffb1e1110
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,50 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
6
6
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
|
+
## [0.11.4] - 2020-05-27
|
10
|
+
### Fixed
|
11
|
+
- Environment exports in SLURM while implementing [#158](https://github.com/OSC/ood_core/issues/158)
|
12
|
+
and [#109](https://github.com/OSC/ood_core/issues/109) in [#163](https://github.com/OSC/ood_core/pull/163)
|
13
|
+
|
14
|
+
## [0.11.3] - 2020-05-11
|
15
|
+
### Fixed
|
16
|
+
- LinuxHost Adapter to work with any login shell ([#188](https://github.com/OSC/ood_core/pull/188))
|
17
|
+
- LinuxHost Adapter needs to display long lines in pstree to successfully parse
|
18
|
+
output ([#188](https://github.com/OSC/ood_core/pull/188))
|
19
|
+
|
20
|
+
## [0.11.2] - 2020-04-23
|
21
|
+
### Fixed
|
22
|
+
- fix signature of `LinuxHost#info_where_owner`
|
23
|
+
|
24
|
+
## [0.11.1] - 2020-03-18
|
25
|
+
### Changed
|
26
|
+
- Only the version changed. Had to republish to rubygems.org
|
27
|
+
|
28
|
+
## [0.11.0] - 2020-03-18
|
29
|
+
### Added
|
30
|
+
- Added directive prefixes to each adapter (e.g. `#QSUB`) ([#161](https://github.com/OSC/ood_core/issues/161))
|
31
|
+
- LHA supports `submit_host` field in native ([#164](https://github.com/OSC/ood_core/issues/164))
|
32
|
+
- Cluster files can be yaml or yml extensions ([#171](https://github.com/OSC/ood_core/issues/171))
|
33
|
+
- Users can add a flag `OOD_JOB_NAME_ILLEGAL_CHARS` to sanitize job names ([#183](https://github.com/OSC/ood_core/issues/183))
|
34
|
+
|
35
|
+
### Changed
|
36
|
+
- Simplified job array parsing ([#144](https://github.com/OSC/ood_core/issues/144))
|
37
|
+
|
38
|
+
### Fixed
|
39
|
+
- Issue where environment variables were not properly exported to the job ([#158](https://github.com/OSC/ood_core/issues/158))
|
40
|
+
- Parsing bad cluster files ([#150](https://github.com/OSC/ood_core/issues/150) and [#178](https://github.com/OSC/ood_core/issues/178))
|
41
|
+
- netcat is no longer a hard dependency. Now lsof, python and bash can be used ([#153](https://github.com/OSC/ood_core/issues/153))
|
42
|
+
- GE crash when nil config file was given ([#175](https://github.com/OSC/ood_core/issues/175))
|
43
|
+
- GE sometimes reported incorrect core count ([#168](https://github.com/OSC/ood_core/issues/168))
|
44
|
+
|
45
|
+
|
46
|
+
## [0.10.0] - 2019-11-05
|
47
|
+
### Added
|
48
|
+
- Added an adapter for submitting work on Linux hosted systems without using a scheduler
|
49
|
+
|
50
|
+
### Fixed
|
51
|
+
- Fixed bug where an unreadable cluster config would cause crashes
|
52
|
+
|
9
53
|
## [0.9.3] - 2019-05-08
|
10
54
|
### Fixed
|
11
55
|
- Fixed bug relating to cluster comparison
|
@@ -27,7 +71,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
27
71
|
- Grid Engine adapter now starts scripts in the current directory like all other adapters
|
28
72
|
- Fixed issue where Slurm comment field might break job info parsing
|
29
73
|
- Fixed possible crash when comparing two clusters if the id of one of the clusters is nil
|
30
|
-
- Fixed bug with the live system test that impacted non-
|
74
|
+
- Fixed bug with the live system test that impacted non-LSF systems
|
31
75
|
- Fixed bug with Slurm adapter when submit time is not available
|
32
76
|
|
33
77
|
## [0.8.0] - 2019-01-29
|
@@ -189,7 +233,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
189
233
|
### Added
|
190
234
|
- Initial release!
|
191
235
|
|
192
|
-
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
|
236
|
+
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.11.4...HEAD
|
237
|
+
[0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
|
238
|
+
[0.11.3]: https://github.com/OSC/ood_core/compare/v0.11.2...v0.11.3
|
239
|
+
[0.11.2]: https://github.com/OSC/ood_core/compare/v0.11.1...v0.11.2
|
240
|
+
[0.11.1]: https://github.com/OSC/ood_core/compare/v0.11.0...v0.11.1
|
241
|
+
[0.11.0]: https://github.com/OSC/ood_core/compare/v0.10.0...v0.11.0
|
242
|
+
[0.10.0]: https://github.com/OSC/ood_core/compare/v0.9.3...v0.10.0
|
193
243
|
[0.9.3]: https://github.com/OSC/ood_core/compare/v0.9.2...v0.9.3
|
194
244
|
[0.9.2]: https://github.com/OSC/ood_core/compare/v0.9.1...v0.9.2
|
195
245
|
[0.9.1]: https://github.com/OSC/ood_core/compare/v0.9.0...v0.9.1
|
data/lib/ood_core.rb
CHANGED
@@ -117,11 +117,47 @@ module OodCore
|
|
117
117
|
}
|
118
118
|
export -f random_number
|
119
119
|
|
120
|
+
port_used_python() {
|
121
|
+
python -c "import socket; socket.socket().connect(('$1',$2))" >/dev/null 2>&1
|
122
|
+
}
|
123
|
+
|
124
|
+
port_used_python3() {
|
125
|
+
python3 -c "import socket; socket.socket().connect(('$1',$2))" >/dev/null 2>&1
|
126
|
+
}
|
127
|
+
|
128
|
+
port_used_nc(){
|
129
|
+
nc -w 2 "$1" "$2" < /dev/null > /dev/null 2>&1
|
130
|
+
}
|
131
|
+
|
132
|
+
port_used_lsof(){
|
133
|
+
lsof -i :"$2" >/dev/null 2>&1
|
134
|
+
}
|
135
|
+
|
136
|
+
port_used_bash(){
|
137
|
+
local bash_supported=$(strings /bin/bash 2>/dev/null | grep tcp)
|
138
|
+
if [ "$bash_supported" == "/dev/tcp/*/*" ]; then
|
139
|
+
(: < /dev/tcp/$1/$2) >/dev/null 2>&1
|
140
|
+
else
|
141
|
+
return 127
|
142
|
+
fi
|
143
|
+
}
|
144
|
+
|
120
145
|
# Check if port $1 is in use
|
121
146
|
port_used () {
|
122
147
|
local port="${1#*:}"
|
123
148
|
local host=$((expr "${1}" : '\\(.*\\):' || echo "localhost") | awk 'END{print $NF}')
|
124
|
-
|
149
|
+
local port_strategies=(port_used_nc port_used_lsof port_used_bash port_used_python port_used_python3)
|
150
|
+
|
151
|
+
for strategy in ${port_strategies[@]};
|
152
|
+
do
|
153
|
+
$strategy $host $port
|
154
|
+
status=$?
|
155
|
+
if [[ "$status" == "0" ]] || [[ "$status" == "1" ]]; then
|
156
|
+
return $status
|
157
|
+
fi
|
158
|
+
done
|
159
|
+
|
160
|
+
return 127
|
125
161
|
}
|
126
162
|
export -f port_used
|
127
163
|
|
@@ -143,8 +179,14 @@ module OodCore
|
|
143
179
|
local port="${1}"
|
144
180
|
local time="${2:-30}"
|
145
181
|
for ((i=1; i<=time*2; i++)); do
|
146
|
-
|
182
|
+
port_used "${port}"
|
183
|
+
port_status=$?
|
184
|
+
if [ "$port_status" == "0" ]; then
|
147
185
|
return 0
|
186
|
+
elif [ "$port_status" == "127" ]; then
|
187
|
+
echo "commands to find port were either not found or inaccessible."
|
188
|
+
echo "command options are lsof, nc, bash's /dev/tcp, or python (or python3) with socket lib."
|
189
|
+
return 127
|
148
190
|
fi
|
149
191
|
sleep 0.5
|
150
192
|
done
|
data/lib/ood_core/cluster.rb
CHANGED
@@ -28,6 +28,10 @@ module OodCore
|
|
28
28
|
# @return [Hash] the acls configuration
|
29
29
|
attr_reader :acls_config
|
30
30
|
|
31
|
+
# The errors encountered with configuring this cluster
|
32
|
+
# @return Array<String> the errors
|
33
|
+
attr_reader :errors
|
34
|
+
|
31
35
|
# @param cluster [#to_h] the cluster object
|
32
36
|
# @option cluster [#to_sym] :id The cluster id
|
33
37
|
# @option cluster [#to_h] :metadata ({}) The cluster's metadata
|
@@ -39,6 +43,8 @@ module OodCore
|
|
39
43
|
# against
|
40
44
|
# @option cluster [#to_h] :batch_connect ({}) Configuration for batch
|
41
45
|
# connect templates
|
46
|
+
# @option cluster [#to_a] :errors ([]) List of configuration errors
|
47
|
+
#
|
42
48
|
def initialize(cluster)
|
43
49
|
c = cluster.to_h.symbolize_keys
|
44
50
|
|
@@ -52,6 +58,9 @@ module OodCore
|
|
52
58
|
@custom_config = c.fetch(:custom, {}) .to_h.symbolize_keys
|
53
59
|
@acls_config = c.fetch(:acls, []) .map(&:to_h)
|
54
60
|
@batch_connect_config = c.fetch(:batch_connect, {}).to_h.symbolize_keys
|
61
|
+
|
62
|
+
# side effects from object creation and validation
|
63
|
+
@errors = c.fetch(:errors, []) .to_a
|
55
64
|
end
|
56
65
|
|
57
66
|
# Metadata that provides extra information about this cluster
|
@@ -159,6 +168,12 @@ module OodCore
|
|
159
168
|
}
|
160
169
|
end
|
161
170
|
|
171
|
+
# This cluster is always valid
|
172
|
+
# @return true
|
173
|
+
def valid?
|
174
|
+
return true
|
175
|
+
end
|
176
|
+
|
162
177
|
private
|
163
178
|
# Build acl adapter objects from array
|
164
179
|
def build_acls(ary)
|
data/lib/ood_core/clusters.rb
CHANGED
@@ -19,20 +19,32 @@ module OodCore
|
|
19
19
|
|
20
20
|
clusters = []
|
21
21
|
if config.file?
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
if config.readable?
|
23
|
+
CONFIG_VERSION.any? do |version|
|
24
|
+
begin
|
25
|
+
YAML.safe_load(config.read)&.fetch(version, {}).each do |k, v|
|
26
|
+
clusters << Cluster.new(send("parse_#{version}", id: k, cluster: v))
|
27
|
+
end
|
28
|
+
rescue Psych::SyntaxError => e
|
29
|
+
clusters << InvalidCluster.new(
|
30
|
+
id: config.basename(config.extname).to_s,
|
31
|
+
errors: [ e.message.to_s ]
|
32
|
+
)
|
33
|
+
end
|
25
34
|
end
|
26
|
-
!clusters.empty?
|
27
35
|
end
|
28
36
|
elsif config.directory?
|
29
|
-
Pathname.glob(config.join("*.yml")).each do |p|
|
37
|
+
Pathname.glob([config.join("*.yml"), config.join("*.yaml")]).select(&:file?).select(&:readable?).each do |p|
|
30
38
|
CONFIG_VERSION.any? do |version|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
39
|
+
begin
|
40
|
+
if cluster = YAML.safe_load(p.read)&.fetch(version, nil)
|
41
|
+
clusters << Cluster.new(send("parse_#{version}", id: p.basename(p.extname()).to_s, cluster: cluster))
|
42
|
+
end
|
43
|
+
rescue Psych::SyntaxError => e
|
44
|
+
clusters << InvalidCluster.new(
|
45
|
+
id: p.basename(p.extname).to_s,
|
46
|
+
errors: [ e.message.to_s ]
|
47
|
+
)
|
36
48
|
end
|
37
49
|
end
|
38
50
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module OodCore
|
2
|
+
# A special case of an OodCore::Cluster where something went awry in the
|
3
|
+
# creation and it's invalid for some reason. Users should only be able
|
4
|
+
# to rely on id and metadata.error_msg. All *allow? related functions
|
5
|
+
# return false, meaning nothing is allowed.
|
6
|
+
class InvalidCluster < Cluster
|
7
|
+
# Logins are not allowed
|
8
|
+
# @return false
|
9
|
+
def login_allow?
|
10
|
+
false
|
11
|
+
end
|
12
|
+
|
13
|
+
# Jobs are not allowed
|
14
|
+
# @return false
|
15
|
+
def job_allow?
|
16
|
+
false
|
17
|
+
end
|
18
|
+
|
19
|
+
# Custom features are not allowed
|
20
|
+
# @return false
|
21
|
+
def custom_allow?(_)
|
22
|
+
false
|
23
|
+
end
|
24
|
+
|
25
|
+
# This cluster is not allowed to be used
|
26
|
+
# @return false
|
27
|
+
def allow?
|
28
|
+
false
|
29
|
+
end
|
30
|
+
|
31
|
+
# This cluster is never valid
|
32
|
+
# @return false
|
33
|
+
def valid?
|
34
|
+
return false
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
data/lib/ood_core/job/adapter.rb
CHANGED
@@ -36,7 +36,7 @@ module OodCore
|
|
36
36
|
# Retrieve info for all jobs from the resource manager
|
37
37
|
# @abstract Subclass is expected to implement {#info_all}
|
38
38
|
# @raise [NotImplementedError] if subclass did not define {#info_all}
|
39
|
-
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
39
|
+
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
40
40
|
# This array specifies only attrs you want, in addition to id and status.
|
41
41
|
# If an array, the Info object that is returned to you is not guaranteed
|
42
42
|
# to have a value for any attr besides the ones specified and id and status.
|
@@ -51,7 +51,7 @@ module OodCore
|
|
51
51
|
# Retrieve info for all jobs for a given owner or owners from the
|
52
52
|
# resource manager
|
53
53
|
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
54
|
-
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
54
|
+
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
55
55
|
# This array specifies only attrs you want, in addition to id and status.
|
56
56
|
# If an array, the Info object that is returned to you is not guaranteed
|
57
57
|
# to have a value for any attr besides the ones specified and id and status.
|
@@ -69,7 +69,7 @@ module OodCore
|
|
69
69
|
end
|
70
70
|
|
71
71
|
# Iterate over each job Info object
|
72
|
-
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
72
|
+
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
73
73
|
# This array specifies only attrs you want, in addition to id and status.
|
74
74
|
# If an array, the Info object that is returned to you is not guaranteed
|
75
75
|
# to have a value for any attr besides the ones specified and id and status.
|
@@ -88,7 +88,7 @@ module OodCore
|
|
88
88
|
|
89
89
|
# Iterate over each job Info object
|
90
90
|
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
91
|
-
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
91
|
+
# @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
|
92
92
|
# This array specifies only attrs you want, in addition to id and status.
|
93
93
|
# If an array, the Info object that is returned to you is not guaranteed
|
94
94
|
# to have a value for any attr besides the ones specified and id and status.
|
@@ -157,6 +157,37 @@ module OodCore
|
|
157
157
|
def delete(id)
|
158
158
|
raise NotImplementedError, "subclass did not define #delete"
|
159
159
|
end
|
160
|
+
|
161
|
+
# Return the scheduler-specific directive prefix
|
162
|
+
#
|
163
|
+
# Examples of directive prefixes include #QSUB, #BSUB and allow placing what would
|
164
|
+
# otherwise be command line options inside the job launch script.
|
165
|
+
#
|
166
|
+
# The method should return nil if the adapter does not support prefixes
|
167
|
+
#
|
168
|
+
# @abstract Subclass is expected to implement {#directive_prefix}
|
169
|
+
# @raise [NotImplementedError] if subclass did not define {#directive_prefix}
|
170
|
+
# @return [String]
|
171
|
+
def directive_prefix
|
172
|
+
raise NotImplementedError, "subclass did not define #directive_prefix"
|
173
|
+
end
|
174
|
+
|
175
|
+
# Replace illegal chars in job name with a dash
|
176
|
+
#
|
177
|
+
# @return [String] job name with dashes replacing illegal chars
|
178
|
+
def sanitize_job_name(job_name)
|
179
|
+
# escape ^ and omit -
|
180
|
+
chars = job_name_illegal_chars.to_s.gsub("^", "\\^").gsub("-", "")
|
181
|
+
job_name.tr(chars, "-")
|
182
|
+
end
|
183
|
+
|
184
|
+
# Illegal chars that should not be used in a job name
|
185
|
+
# A dash is assumed to be legal in job names in all batch schedulers
|
186
|
+
#
|
187
|
+
# @return [String] string of chars
|
188
|
+
def job_name_illegal_chars
|
189
|
+
ENV["OOD_JOB_NAME_ILLEGAL_CHARS"].to_s
|
190
|
+
end
|
160
191
|
end
|
161
192
|
end
|
162
193
|
end
|
@@ -13,7 +13,7 @@
|
|
13
13
|
# The contents of this file are subject to the Sun Industry Standards
|
14
14
|
# Source License Version 1.2 (the "License"); You may not use this file
|
15
15
|
# except in compliance with the License. You may obtain a copy of the
|
16
|
-
# License at http://
|
16
|
+
# License at http://gridscheduler.sourceforge.net/Gridengine_SISSL_license.html
|
17
17
|
#
|
18
18
|
# Software provided under this License is provided on an "AS IS" basis,
|
19
19
|
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
|
@@ -0,0 +1,245 @@
|
|
1
|
+
require "ood_core/refinements/hash_extensions"
|
2
|
+
require "ood_core/refinements/array_extensions"
|
3
|
+
require "ood_core/job/adapters/helper"
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
module OodCore
|
7
|
+
module Job
|
8
|
+
class Factory
|
9
|
+
using Refinements::HashExtensions
|
10
|
+
|
11
|
+
# Build the LinuxHost adapter from a configuration
|
12
|
+
# @param config [#to_h] the configuration for job adapter
|
13
|
+
# @option config [Object] :contain (false) Pass `--contain` flag to Singularity; allows overriding bind mounts in singularity.conf
|
14
|
+
# @option config [Object] :debug (false) Use the adapter in a debug mode
|
15
|
+
# @option config [Object] :max_timeout (nil) The longest 'wall_clock' permissible
|
16
|
+
# @option config [Object] :singularity_bin ('/usr/bin/singularity') The path to the Singularity executable
|
17
|
+
# @option config [Object] :singularity_bindpath ('/etc,/media,/mnt,/opt,/srv,/usr,/var,/users') A comma delimited list of paths to bind between the host and the guest
|
18
|
+
# @option config [Object] :singularity_image The path to the Singularity image to use
|
19
|
+
# @option config [Object] :ssh_hosts (nil) The list of permissible hosts, defaults to :submit_host
|
20
|
+
# @option config [Object] :strict_host_checking (true) Set to false to disable strict host checking and updating the known_hosts file
|
21
|
+
# @option config [Object] :submit_host The SSH target to connect to, may be the head of a round-robin
|
22
|
+
# @option config [Object] :tmux_bin ('/usr/bin/tmux') The path to the Tmux executable
|
23
|
+
def self.build_linux_host(config)
|
24
|
+
c = config.to_h.symbolize_keys
|
25
|
+
contain = c.fetch(:contain, false)
|
26
|
+
debug = c.fetch(:debug, false)
|
27
|
+
max_timeout = c.fetch(:max_timeout, nil)
|
28
|
+
singularity_bin = c.fetch(:singularity_bin, '/usr/bin/singularity')
|
29
|
+
singularity_bindpath = c.fetch(:singularity_bindpath, '/etc,/media,/mnt,/opt,/srv,/usr,/var,/users')
|
30
|
+
singularity_image = c[:singularity_image]
|
31
|
+
ssh_hosts = c.fetch(:ssh_hosts, [c[:submit_host]])
|
32
|
+
strict_host_checking = c.fetch(:strict_host_checking, true)
|
33
|
+
submit_host = c[:submit_host]
|
34
|
+
tmux_bin = c.fetch(:tmux_bin, '/usr/bin/tmux')
|
35
|
+
|
36
|
+
Adapters::LinuxHost.new(
|
37
|
+
ssh_hosts: ssh_hosts,
|
38
|
+
launcher: Adapters::LinuxHost::Launcher.new(
|
39
|
+
contain: contain,
|
40
|
+
debug: debug,
|
41
|
+
max_timeout: max_timeout,
|
42
|
+
singularity_bin: singularity_bin,
|
43
|
+
singularity_bindpath: singularity_bindpath, # '/etc,/media,/mnt,/opt,/srv,/usr,/var,/users',
|
44
|
+
singularity_image: singularity_image,
|
45
|
+
ssh_hosts: ssh_hosts,
|
46
|
+
strict_host_checking: strict_host_checking,
|
47
|
+
submit_host: submit_host,
|
48
|
+
tmux_bin: tmux_bin,
|
49
|
+
)
|
50
|
+
)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
module Adapters
|
55
|
+
# An adapter object that describes the communication with a remote host
|
56
|
+
# for job management.
|
57
|
+
class LinuxHost < Adapter
|
58
|
+
using Refinements::ArrayExtensions
|
59
|
+
|
60
|
+
require "ood_core/job/adapters/linux_host/launcher"
|
61
|
+
|
62
|
+
def initialize(ssh_hosts:, launcher:)
|
63
|
+
@launcher = launcher
|
64
|
+
@ssh_hosts = Set.new(ssh_hosts)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Submit a job with the attributes defined in the job template instance
|
68
|
+
# @param script [Script] script object that describes the script and
|
69
|
+
# attributes for the submitted job
|
70
|
+
# @param after [#to_s, Array<#to_s>] No scheduling is available; setting raises JobAdapterError
|
71
|
+
# @param afterok [#to_s, Array<#to_s>] No scheduling is available; setting raises JobAdapterError
|
72
|
+
# @param afternotok [#to_s, Array<#to_s>] No scheduling is available; setting raises JobAdapterError
|
73
|
+
# @param afterany [#to_s, Array<#to_s>] No scheduling is available; setting raises JobAdapterError
|
74
|
+
# @raise [JobAdapterError] if something goes wrong submitting a job
|
75
|
+
# @return [String] the job id returned after successfully submitting a
|
76
|
+
# job
|
77
|
+
# @see Adapter#submit
|
78
|
+
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
|
79
|
+
unless (after.empty? && afterok.empty? && afternotok.empty? && afterany.empty?)
|
80
|
+
raise JobAdapterError, 'Scheduling subsequent jobs is not available.'
|
81
|
+
end
|
82
|
+
|
83
|
+
@launcher.start_remote_session(script)
|
84
|
+
rescue Launcher::Error => e
|
85
|
+
raise JobAdapterError, e.message
|
86
|
+
end
|
87
|
+
|
88
|
+
# Retrieve info for all jobs from the resource manager
|
89
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
90
|
+
# @return [Array<Info>] information describing submitted jobs
|
91
|
+
# @see Adapter#info_all
|
92
|
+
def info_all(attrs: nil, host: nil)
|
93
|
+
host_permitted?(host) if host
|
94
|
+
|
95
|
+
@launcher.list_remote_sessions(host: host).map{
|
96
|
+
|ls_output| ls_to_info(ls_output)
|
97
|
+
}
|
98
|
+
rescue Launcher::Error => e
|
99
|
+
raise JobAdapterError, e.message
|
100
|
+
end
|
101
|
+
|
102
|
+
# Retrieve info for all jobs for a given owner or owners from the
|
103
|
+
# resource manager
|
104
|
+
# Note: owner and attrs are present only to complete the interface and are ignored
|
105
|
+
# Note: since this API is used in production no errors or warnings are thrown / issued
|
106
|
+
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
107
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
108
|
+
# @return [Array<Info>] information describing submitted jobs
|
109
|
+
def info_where_owner(_, attrs: nil)
|
110
|
+
info_all
|
111
|
+
end
|
112
|
+
|
113
|
+
# Iterate over each job Info object
|
114
|
+
# @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
|
115
|
+
# @yield [Info] of each job to block
|
116
|
+
# @return [Enumerator] if no block given
|
117
|
+
def info_all_each(attrs: nil)
|
118
|
+
return to_enum(:info_all_each, attrs: attrs) unless block_given?
|
119
|
+
|
120
|
+
info_all(attrs: attrs).each do |job|
|
121
|
+
yield job
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Iterate over each job Info object
|
126
|
+
# @param owner [#to_s, Array<#to_s>] owner is present only to complete the interface and is ignored
|
127
|
+
# @param attrs [Array<symbol>] attrs is present only to complete the interface and is ignored
|
128
|
+
# @yield [Info] of each job to block
|
129
|
+
# @return [Enumerator] if no block given
|
130
|
+
def info_where_owner_each(owner, attrs: nil)
|
131
|
+
return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
|
132
|
+
|
133
|
+
info_where_owner(owner, attrs: attrs).each do |job|
|
134
|
+
yield job
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
# Whether the adapter supports job arrays
|
139
|
+
# @return [Boolean] - false
|
140
|
+
def supports_job_arrays?
|
141
|
+
false
|
142
|
+
end
|
143
|
+
|
144
|
+
# Retrieve job info from the SSH host
|
145
|
+
# @param id [#to_s] the id of the job
|
146
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
147
|
+
# @return [Info] information describing submitted job
|
148
|
+
# @see Adapter#info
|
149
|
+
def info(id)
|
150
|
+
_, host = parse_job_id(id)
|
151
|
+
job = info_all(host: host).select{|info| info.id == id}.first
|
152
|
+
(job) ? job : Info.new(id: id, status: :completed)
|
153
|
+
rescue Launcher::Error => e
|
154
|
+
raise JobAdapterError, e.message
|
155
|
+
end
|
156
|
+
|
157
|
+
# Retrieve job status from resource manager
|
158
|
+
# @note Optimized slightly over retrieving complete job information from server
|
159
|
+
# @abstract Subclass is expected to implement {#status}
|
160
|
+
# @raise [NotImplementedError] if subclass did not define {#status}
|
161
|
+
# @param id [#to_s] the id of the job
|
162
|
+
# @return [Status] status of job
|
163
|
+
def status(id)
|
164
|
+
_, host = parse_job_id(id)
|
165
|
+
job = info_all(host: host).select{|info| info.id == id}.first
|
166
|
+
|
167
|
+
Status.new(state: (job) ? :running : :completed)
|
168
|
+
rescue Launcher::Error => e
|
169
|
+
raise JobAdapterError, e.message
|
170
|
+
end
|
171
|
+
|
172
|
+
# Put the submitted job on hold
|
173
|
+
# @abstract Subclass is expected to implement {#hold}
|
174
|
+
# @raise [NotImplementedError] if subclass did not define {#hold}
|
175
|
+
# @param id [#to_s] the id of the job
|
176
|
+
# @return [void]
|
177
|
+
def hold(id)
|
178
|
+
# Consider sending SIGSTOP?
|
179
|
+
raise NotImplementedError, "subclass did not define #hold"
|
180
|
+
end
|
181
|
+
|
182
|
+
# Release the job that is on hold
|
183
|
+
# @abstract Subclass is expected to implement {#release}
|
184
|
+
# @raise [NotImplementedError] if subclass did not define {#release}
|
185
|
+
# @param id [#to_s] the id of the job
|
186
|
+
# @return [void]
|
187
|
+
def release(id)
|
188
|
+
# Consider sending SIGCONT
|
189
|
+
raise NotImplementedError, "subclass did not define #release"
|
190
|
+
end
|
191
|
+
|
192
|
+
# Delete the submitted job
|
193
|
+
# @abstract Subclass is expected to implement {#delete}
|
194
|
+
# @raise [NotImplementedError] if subclass did not define {#delete}
|
195
|
+
# @param id [#to_s] the id of the job
|
196
|
+
# @return [void]
|
197
|
+
def delete(id)
|
198
|
+
session_name, destination_host = parse_job_id(id)
|
199
|
+
@launcher.stop_remote_session(session_name, destination_host)
|
200
|
+
rescue Launcher::Error => e
|
201
|
+
raise JobAdapterError, e.message
|
202
|
+
end
|
203
|
+
|
204
|
+
def directive_prefix
|
205
|
+
nil
|
206
|
+
end
|
207
|
+
|
208
|
+
private
|
209
|
+
|
210
|
+
def host_permitted?(destination_host)
|
211
|
+
raise JobAdapterError, "Requested destination host (#{destination_host}) not permitted" unless @ssh_hosts.include?(destination_host)
|
212
|
+
end
|
213
|
+
|
214
|
+
def parse_job_id(id)
|
215
|
+
raise JobAdapterError, "#{id} is not a valid LinuxHost adapter id because it is missing the '@'." unless id.include?('@')
|
216
|
+
|
217
|
+
return id.split('@')
|
218
|
+
end
|
219
|
+
|
220
|
+
# Convert the returned Hash into an Info object
|
221
|
+
def ls_to_info(ls_output)
|
222
|
+
started = ls_output[:session_created].to_i
|
223
|
+
now = Time.now.to_i
|
224
|
+
ellapsed = now - started
|
225
|
+
Info.new(
|
226
|
+
accounting_id: nil,
|
227
|
+
allocated_nodes: [NodeInfo.new(name: ls_output[:destination_host], procs: 1)],
|
228
|
+
cpu_time: ellapsed,
|
229
|
+
dispatch_time: started,
|
230
|
+
id: ls_output[:id],
|
231
|
+
job_name: nil, # TODO
|
232
|
+
job_owner: Etc.getlogin,
|
233
|
+
native: ls_output,
|
234
|
+
procs: 1,
|
235
|
+
queue_name: "LinuxHost adapter for #{@submit_host}",
|
236
|
+
status: :running,
|
237
|
+
submission_time: ellapsed,
|
238
|
+
submit_host: @submit_host,
|
239
|
+
wallclock_time: ellapsed
|
240
|
+
)
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
end
|