ood_core 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.rspec +2 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +60 -0
- data/Rakefile +6 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/lib/ood_core.rb +34 -0
- data/lib/ood_core/acl/adapter.rb +17 -0
- data/lib/ood_core/acl/adapters/group.rb +59 -0
- data/lib/ood_core/acl/factory.rb +41 -0
- data/lib/ood_core/cluster.rb +143 -0
- data/lib/ood_core/clusters.rb +114 -0
- data/lib/ood_core/errors.rb +19 -0
- data/lib/ood_core/job/adapter.rb +89 -0
- data/lib/ood_core/job/adapters/lsf.rb +193 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +160 -0
- data/lib/ood_core/job/adapters/lsf/helper.rb +26 -0
- data/lib/ood_core/job/adapters/slurm.rb +470 -0
- data/lib/ood_core/job/adapters/torque.rb +274 -0
- data/lib/ood_core/job/factory.rb +41 -0
- data/lib/ood_core/job/info.rb +141 -0
- data/lib/ood_core/job/node_info.rb +47 -0
- data/lib/ood_core/job/node_request.rb +51 -0
- data/lib/ood_core/job/script.rb +235 -0
- data/lib/ood_core/job/status.rb +128 -0
- data/lib/ood_core/refinements/array_extensions.rb +22 -0
- data/lib/ood_core/refinements/hash_extensions.rb +25 -0
- data/lib/ood_core/version.rb +4 -0
- data/ood_core.gemspec +32 -0
- metadata +182 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
require "yaml"
require "pathname"  # Pathname is used below but was never required here

module OodCore
  # An enumerable that contains a list of {Cluster} objects
  class Clusters
    include Enumerable

    # The supported format versions of the configuration file, tried in order
    CONFIG_VERSION = ['v2', 'v1']

    class << self
      # Parse a configuration file or a set of configuration files in a
      # directory
      # @param path [#to_s] configuration file or directory path
      # @raise [ConfigurationNotFound] if path does not exist
      # @return [Clusters] the clusters parsed from config
      def load_file(path)
        config = Pathname.new(path.to_s).expand_path

        clusters = []
        if config.file?
          # Try each config version until one yields at least one cluster.
          CONFIG_VERSION.any? do |version|
            # NB: YAML.safe_load returns nil for an empty document, so fall
            # back to an empty hash instead of raising NoMethodError
            (YAML.safe_load(config.read) || {}).fetch(version, {}).each do |k, v|
              clusters << Cluster.new(send("parse_#{version}", id: k, cluster: v))
            end
            !clusters.empty?
          end
        elsif config.directory?
          # One cluster per "<id>.yml" file in the directory
          Pathname.glob(config.join("*.yml")).each do |p|
            CONFIG_VERSION.any? do |version|
              # Guard against empty YAML documents here as well
              if cluster = (YAML.safe_load(p.read) || {}).fetch(version, nil)
                clusters << Cluster.new(send("parse_#{version}", id: p.basename(".yml").to_s, cluster: cluster))
                true
              else
                false
              end
            end
          end
        else
          raise ConfigurationNotFound, "configuration file '#{config}' does not exist"
        end

        new clusters
      end

      private
        # Parse a list of clusters from a 'v1' config
        # NB: Makes minimum assumptions about config
        def parse_v1(id:, cluster:)
          c = {
            id: id,
            metadata: {},
            login: {},
            job: {},
            acls: [],
            custom: {}
          }

          c[:metadata][:title] = cluster["title"] if cluster.key?("title")
          c[:metadata][:url] = cluster["url"] if cluster.key?("url")
          c[:metadata][:private] = true if cluster["cluster"]["data"]["hpc_cluster"] == false

          if l = cluster["cluster"]["data"]["servers"]["login"]
            c[:login][:host] = l["data"]["host"]
          end

          if rm = cluster["cluster"]["data"]["servers"]["resource_mgr"]
            c[:job][:adapter] = "torque"
            c[:job][:host] = rm["data"]["host"]
            c[:job][:lib] = rm["data"]["lib"]
            c[:job][:bin] = rm["data"]["bin"]
            c[:job][:acls] = []
          end

          if v = cluster["validators"]
            if vc = v["cluster"]
              c[:acls] = vc.map do |h|
                {
                  adapter: "group",
                  groups: h["data"]["groups"],
                  type: h["data"]["allow"] ? "whitelist" : "blacklist"
                }
              end
            end
          end

          c
        end

        # Parse a list of clusters from a 'v2' config
        def parse_v2(id:, cluster:)
          cluster.merge(id: id)
        end
    end

    # @param clusters [Array<Cluster>] list of cluster objects
    def initialize(clusters = [])
      @clusters = clusters
    end

    # Find cluster in list using the id of the cluster
    # @param id [Object] id of cluster object
    # @return [Cluster, nil] cluster object if found
    def [](id)
      # relies on Cluster#== comparing against an id
      @clusters.detect { |cluster| cluster == id }
    end

    # For a block {|cluster| ...}
    # @yield [cluster] Gives the next cluster object in the list
    def each(&block)
      @clusters.each(&block)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module OodCore
  # Base class for every exception raised by {OodCore}; rescue this to
  # catch any library-specific failure
  Error = Class.new(StandardError)

  # Raised when the configuration file given by a path cannot be found
  ConfigurationNotFound = Class.new(Error)

  # Raised when the configuration omits the adapter to use
  AdapterNotSpecified = Class.new(Error)

  # Raised when the adapter named in the configuration cannot be located
  AdapterNotFound = Class.new(Error)

  # Raised when a job adapter hits an error talking to the resource manager
  JobAdapterError = Class.new(Error)

  # Raised when a job state is set to an invalid option
  UnknownStateAttribute = Class.new(Error)
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module OodCore
  module Job
    # Abstract interface for talking to a resource manager: submitting,
    # statusing, holding, releasing and deleting jobs. Concrete adapters
    # (Torque, Slurm, Lsf, ...) subclass this and override every method.
    # @abstract
    class Adapter
      # Submit a job described by the given script object.
      # @abstract Subclass is expected to implement {#submit}
      # @raise [NotImplementedError] if subclass did not define {#submit}
      # @example Submit job template to cluster
      #   solver_id = job_adapter.submit(solver_script)
      #   #=> "1234.server"
      # @example Submit job that depends on previous job
      #   post_id = job_adapter.submit(
      #     post_script,
      #     afterok: solver_id
      #   )
      #   #=> "1235.server"
      # @param script [Script] script object that describes the
      #   script and attributes for the submitted job
      # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
      #   at any point after dependent jobs have started execution
      # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
      #   execution only after dependent jobs have terminated with no errors
      # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
      #   execution only after dependent jobs have terminated with errors
      # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
      #   execution after dependent jobs have terminated
      # @return [String] the job id returned after successfully submitting a job
      def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
        raise NotImplementedError, "subclass did not define #submit"
      end

      # Retrieve information about every job known to the resource manager.
      # @abstract Subclass is expected to implement {#info_all}
      # @raise [NotImplementedError] if subclass did not define {#info_all}
      # @return [Array<Info>] information describing submitted jobs
      def info_all
        raise NotImplementedError, "subclass did not define #info_all"
      end

      # Retrieve information about a single job.
      # @abstract Subclass is expected to implement {#info}
      # @raise [NotImplementedError] if subclass did not define {#info}
      # @param id [#to_s] the id of the job
      # @return [Info] information describing submitted job
      def info(id)
        raise NotImplementedError, "subclass did not define #info"
      end

      # Retrieve just the status of a job.
      # @note Optimized slightly over retrieving complete job information from server
      # @abstract Subclass is expected to implement {#status}
      # @raise [NotImplementedError] if subclass did not define {#status}
      # @param id [#to_s] the id of the job
      # @return [Status] status of job
      def status(id)
        raise NotImplementedError, "subclass did not define #status"
      end

      # Place a submitted job on hold.
      # @abstract Subclass is expected to implement {#hold}
      # @raise [NotImplementedError] if subclass did not define {#hold}
      # @param id [#to_s] the id of the job
      # @return [void]
      def hold(id)
        raise NotImplementedError, "subclass did not define #hold"
      end

      # Release a held job back to the queue.
      # @abstract Subclass is expected to implement {#release}
      # @raise [NotImplementedError] if subclass did not define {#release}
      # @param id [#to_s] the id of the job
      # @return [void]
      def release(id)
        raise NotImplementedError, "subclass did not define #release"
      end

      # Remove a submitted job from the resource manager.
      # @abstract Subclass is expected to implement {#delete}
      # @raise [NotImplementedError] if subclass did not define {#delete}
      # @param id [#to_s] the id of the job
      # @return [void]
      def delete(id)
        raise NotImplementedError, "subclass did not define #delete"
      end
    end
  end
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
require "ood_core/refinements/hash_extensions"

module OodCore
  module Job
    class Factory
      using Refinements::HashExtensions

      # Build the Lsf adapter from a configuration
      # @param config [#to_h] the configuration for job adapter
      # @option config [#to_s] :bindir ('') Path to lsf client bin dir
      # @option config [#to_s] :libdir ('') Path to lsf client lib dir
      # @option config [#to_s] :envdir ('') Path to lsf client conf dir
      # @option config [#to_s] :serverdir ('') Path to lsf client etc dir
      # @return [Adapters::Lsf] the adapter wrapping a freshly-built Batch
      def self.build_lsf(config)
        batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
        Adapters::Lsf.new(batch: batch)
      end
    end

    module Adapters
      # Adapter that talks to an LSF batch server through the Batch helper
      # object (which shells out to bjobs/bsub/bstop/bresume/bkill)
      class Lsf < Adapter
        # @api private
        attr_reader :batch, :helper

        require "ood_core/job/adapters/lsf/batch"
        require "ood_core/job/adapters/lsf/helper"

        # Mapping of LSF state codes (bjobs STAT column) to OodCore job
        # state symbols; unknown codes fall back to :undetermined
        STATE_MAP = {
          'RUN' => :running,
          'PEND' => :queued,
          'DONE' => :completed,
          'EXIT' => :completed,

          'PSUSP' => :queued_held, # suspended before job started, resumable via bresume
          'USUSP' => :suspended, # suspended after job started, resumable via bresume
          'SSUSP' => :suspended,

          'WAIT' => :queued, # FIXME: not sure what else to do here
          'ZOMBI' => :undetermined,
          'UNKWN' => :undetermined
        }

        # @param opts [#to_h] the options defining this adapter
        # @option opts [Batch] :batch The Lsf batch object
        #
        # @api private
        # @see Factory.build_lsf
        def initialize(batch:)
          @batch = batch
          @helper = Lsf::Helper.new
        end

        # Submit a job with the attributes defined in the job template instance
        # @param script [Script] script object that describes the script and
        #   attributes for the submitted job
        # @param after [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution at any point after dependent jobs have started execution
        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with no errors
        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with errors
        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution after dependent jobs have terminated
        # @raise [JobAdapterError] if something goes wrong submitting a job
        # @return [String] the job id returned after successfully submitting a
        #   job
        # @see Adapter#submit
        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
          # ensure dependencies are array of ids
          # NOTE(review): dependency arrays are normalized here but never
          # passed to bsub yet (see TODO below) — dependencies are ignored
          after      = Array(after).map(&:to_s)
          afterok    = Array(afterok).map(&:to_s)
          afternotok = Array(afternotok).map(&:to_s)
          afterany   = Array(afterany).map(&:to_s)

          args = []
          # bsub flags: -P project/account, -cwd working dir, -J job name
          args += ["-P", script.accounting_id] unless script.accounting_id.nil?
          args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
          args += ["-J", script.job_name] unless script.job_name.nil?

          # TODO: dependencies

          env = {
            #TODO:
            #LSB_HOSTS?
            #LSB_MCPU_HOSTS?
            #SNDJOBS_TO?
            #
          }

          # Submit job
          batch.submit_string(script.content, args: args, env: env)

        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job info from the resource manager
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong getting job info
        # @return [Info] information describing submitted job
        # @see Adapter#info
        def info(id)
          # TODO: handle job arrays
          job = batch.get_job(id: id)
          # NOTE(review): returns nil when the job is not found, although
          # Adapter#info documents an Info return — confirm callers handle nil
          job ? info_for_batch_hash(job) : nil
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve info for all jobs from the resource manager
        # @raise [JobAdapterError] if something goes wrong getting job info
        # @return [Array<Info>] information describing submitted jobs
        # @see Adapter#info_all
        def info_all
          batch.get_jobs.map { |v| info_for_batch_hash(v) }
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job status from resource manager
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong getting job status
        # @return [Status] status of job
        # @see Adapter#status
        def status(id)
          job = batch.get_job(id: id)
          # a job bjobs no longer reports is treated as completed
          state = job ? get_state(job[:status]) : :completed
          Status.new(state: state)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Put the submitted job on hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong holding a job
        # @return [void]
        # @see Adapter#hold
        def hold(id)
          batch.hold_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Release the job that is on hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong releasing a job
        # @return [void]
        # @see Adapter#release
        def release(id)
          batch.release_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Delete the submitted job
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong deleting a job
        # @return [void]
        # @see Adapter#delete
        def delete(id)
          batch.delete_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        private
          # Determine state from LSF state code
          def get_state(st)
            STATE_MAP.fetch(st, :undetermined)
          end

          # Build an Info object from a bjobs hash produced by Batch;
          # fields bjobs -W does not supply (procs, wallclock, cpu time,
          # allocated nodes) are left nil/empty
          def info_for_batch_hash(v)
            Info.new(
              id: v[:id],
              status: get_state(v[:status]),
              allocated_nodes: [],
              submit_host: v[:from_host],
              job_name: v[:name],
              job_owner: v[:user],
              accounting_id: v[:project],
              procs: nil,
              queue_name: v[:queue],
              wallclock_time: nil,
              cpu_time: nil,
              submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
              dispatch_time: helper.parse_past_time(v[:start_time], ignore_errors: true),
              native: v
            )
          end
      end
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
require "open3"     # Open3.capture3 is used below but was never required here
require "pathname"  # Pathname is used below but was never required here

# Object used for simplified communication with a LSF batch server
#
# @api private
class OodCore::Job::Adapters::Lsf::Batch
  attr_reader :bindir, :libdir, :envdir, :serverdir

  # The root exception class that all LSF-specific exceptions inherit
  # from
  class Error < StandardError; end

  # @param bindir [#to_s] path to LSF installation binaries
  # @param envdir [#to_s] path to LSF conf dir
  # @param libdir [#to_s] path to LSF lib dir
  # @param serverdir [#to_s] path to LSF etc dir
  def initialize(bindir: "", envdir: "", libdir: "", serverdir: "", **_)
    @bindir = Pathname.new(bindir.to_s)

    @envdir = Pathname.new(envdir.to_s)
    @libdir = Pathname.new(libdir.to_s)
    @serverdir = Pathname.new(serverdir.to_s)
  end

  # Environment variables pointing LSF client commands at this
  # installation; empty/unset paths are omitted
  def default_env
    {
      "LSF_BINDIR" => bindir.to_s,
      "LSF_LIBDIR" => libdir.to_s,
      "LSF_ENVDIR" => envdir.to_s,
      "LSF_SERVERDIR" => serverdir.to_s
    }.reject {|k,v| v.nil? || v.empty? }
  end

  # Get a list of hashes detailing each of the jobs on the batch server
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Array<Hash>] list of details for jobs
  def get_jobs
    #TODO: split into get_all_jobs, get_my_jobs
    args = bjobs_default_args
    parse_bjobs_output(call("bjobs", *args))
  end

  # Get hash detailing the specified job
  # @param id [#to_s] the id of the job to check
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Hash, nil] details of specified job, or nil if not found
  def get_job(id:)
    args = bjobs_default_args
    args << id.to_s
    parse_bjobs_output(call("bjobs", *args)).first
  end

  # Arguments passed to every bjobs call: all users, all states,
  # wide unwrapped output with extra time fields
  def bjobs_default_args
    %w( -u all -a -w -W )
  end

  # status fields available from bjobs
  def fields
    %i(id user status queue from_host exec_host name submit_time
       project cpu_used mem swap pids start_time finish_time)
  end

  # Parse raw `bjobs` output into an array of field hashes
  # @param response [String, nil] raw stdout from bjobs
  # @raise [Error] if the header columns differ from what we expect
  # @return [Array<Hash>] one hash per job line
  def parse_bjobs_output(response)
    # nil check first: relying on the deprecated Object#=~ for nil is
    # fragile, and empty output would crash the header validation below
    return [] if response.nil? || response.empty? || response =~ /No job found/

    lines = response.split("\n")
    validate_bjobs_output_columns(lines.first.split)

    lines.drop(1).map{ |job|
      values = split_bjobs_output_line(job)

      # make a hash of { field: "value", etc. }
      Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
        # if the value == "-", replace it with nil
        o[k] = (v == "-" ? nil : v)
      }
    }
  end


  # Put a specified job on hold
  # @example Put job "1234" on hold
  #   my_batch.hold_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bstop` command exited unsuccessfully
  # @return [void]
  def hold_job(id)
    call("bstop", id.to_s)
  end

  # Release a specified job that is on hold
  # @example Release job "1234" from on hold
  #   my_batch.release_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bresume` command exited unsuccessfully
  # @return [void]
  def release_job(id)
    call("bresume", id.to_s)
  end

  # Delete a specified job from batch server
  # @example Delete job "1234"
  #   my_batch.delete_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bkill` command exited unsuccessfully
  # @return [void]
  def delete_job(id)
    call("bkill", id.to_s)
  end

  # Submit a script expanded as a string to the batch server
  # @param str [#to_s] script as a string
  # @param args [Array<#to_s>] arguments passed to `bsub` command
  # @param env [Hash{#to_s => #to_s}] environment variables set
  # @raise [Error] if `bsub` command exited unsuccessfully
  # @return [String, nil] the id of the job that was created, or nil if
  #   the id could not be parsed from the bsub output
  def submit_string(str, args: [], env: {})
    args = args.map(&:to_s)
    parse_bsub_output(call("bsub", *args, env: env, stdin: str.to_s))
  end

  # Extract the job id from bsub output of the form `Job <1234> is ...`
  # @return [String, nil] the job id, or nil when the output did not match
  def parse_bsub_output(response)
    if response =~ /Job <(.*)> /
      # prefer Regexp.last_match over the cryptic $1 global
      Regexp.last_match(1)
    else
      nil
    end
  end

  private
    # Call a forked Lsf command for a given cluster
    # @raise [Error] wrapping stderr when the command exits non-zero
    def call(cmd, *args, env: {}, stdin: "")
      cmd = bindir.join(cmd.to_s).to_s
      #TODO: args = ["-m", cluster] + args.map(&:to_s)
      env = default_env.merge(env.to_h)
      o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
      s.success? ? o : raise(Error, e)
    end

    # split a line of output from bjobs into field values
    def split_bjobs_output_line(line)
      values = line.strip.split

      if(values.count > 15)
        # FIXME: hack assumes 15 fields & only job name may have spaces
        # collapse >15 fields into 15, assumes 7th field is JOB_NAME
        values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
      end

      values
    end

    # verify the output from bjobs is parsable by this object
    # @raise [Error] when the header row does not match the 15 expected columns
    def validate_bjobs_output_columns(columns)
      expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
                    SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
      if columns != expected
        raise Error, "bjobs output in different format than expected: " \
                     "#{columns.inspect} instead of #{expected.inspect}"
      end
    end

end
|