ood_core 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.rspec +2 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +60 -0
- data/Rakefile +6 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/lib/ood_core.rb +34 -0
- data/lib/ood_core/acl/adapter.rb +17 -0
- data/lib/ood_core/acl/adapters/group.rb +59 -0
- data/lib/ood_core/acl/factory.rb +41 -0
- data/lib/ood_core/cluster.rb +143 -0
- data/lib/ood_core/clusters.rb +114 -0
- data/lib/ood_core/errors.rb +19 -0
- data/lib/ood_core/job/adapter.rb +89 -0
- data/lib/ood_core/job/adapters/lsf.rb +193 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +160 -0
- data/lib/ood_core/job/adapters/lsf/helper.rb +26 -0
- data/lib/ood_core/job/adapters/slurm.rb +470 -0
- data/lib/ood_core/job/adapters/torque.rb +274 -0
- data/lib/ood_core/job/factory.rb +41 -0
- data/lib/ood_core/job/info.rb +141 -0
- data/lib/ood_core/job/node_info.rb +47 -0
- data/lib/ood_core/job/node_request.rb +51 -0
- data/lib/ood_core/job/script.rb +235 -0
- data/lib/ood_core/job/status.rb +128 -0
- data/lib/ood_core/refinements/array_extensions.rb +22 -0
- data/lib/ood_core/refinements/hash_extensions.rb +25 -0
- data/lib/ood_core/version.rb +4 -0
- data/ood_core.gemspec +32 -0
- metadata +182 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
require "yaml"
require "pathname" # FIX: Pathname is used below but was never required

module OodCore
  # An enumerable that contains a list of {Cluster} objects
  class Clusters
    include Enumerable

    # The format versions of the configuration file that can be parsed, in
    # order of precedence (the first version that yields clusters wins)
    CONFIG_VERSION = ['v2', 'v1']

    class << self
      # Parse a configuration file or a set of configuration files in a
      # directory
      # @param path [#to_s] configuration file or directory path
      # @raise [ConfigurationNotFound] if path does not exist
      # @return [Clusters] the clusters parsed from config
      def load_file(path)
        config = Pathname.new(path.to_s).expand_path

        clusters = []
        if config.file?
          # Parse the YAML document once (not once per version); an empty
          # file parses to nil, which we treat as an empty config
          yaml = YAML.safe_load(config.read) || {}
          CONFIG_VERSION.any? do |version|
            yaml.fetch(version, {}).each do |k, v|
              clusters << Cluster.new(send("parse_#{version}", id: k, cluster: v))
            end
            # stop at the first version that produced clusters
            !clusters.empty?
          end
        elsif config.directory?
          Pathname.glob(config.join("*.yml")).each do |p|
            yaml = YAML.safe_load(p.read) || {}
            CONFIG_VERSION.any? do |version|
              if cluster = yaml.fetch(version, nil)
                # the file's basename (sans .yml) acts as the cluster id
                clusters << Cluster.new(send("parse_#{version}", id: p.basename(".yml").to_s, cluster: cluster))
                true
              else
                false
              end
            end
          end
        else
          raise ConfigurationNotFound, "configuration file '#{config}' does not exist"
        end

        new clusters
      end

      private
        # Parse a list of clusters from a 'v1' config
        # NB: Makes minimum assumptions about config
        def parse_v1(id:, cluster:)
          c = {
            id: id,
            metadata: {},
            login: {},
            job: {},
            acls: [],
            custom: {}
          }

          c[:metadata][:title] = cluster["title"] if cluster.key?("title")
          c[:metadata][:url] = cluster["url"] if cluster.key?("url")
          c[:metadata][:private] = true if cluster["cluster"]["data"]["hpc_cluster"] == false

          if l = cluster["cluster"]["data"]["servers"]["login"]
            c[:login][:host] = l["data"]["host"]
          end

          # a v1 resource manager entry always maps onto the torque adapter
          if rm = cluster["cluster"]["data"]["servers"]["resource_mgr"]
            c[:job][:adapter] = "torque"
            c[:job][:host] = rm["data"]["host"]
            c[:job][:lib] = rm["data"]["lib"]
            c[:job][:bin] = rm["data"]["bin"]
            c[:job][:acls] = []
          end

          if v = cluster["validators"]
            if vc = v["cluster"]
              c[:acls] = vc.map do |h|
                {
                  adapter: "group",
                  groups: h["data"]["groups"],
                  type: h["data"]["allow"] ? "whitelist" : "blacklist"
                }
              end
            end
          end

          c
        end

        # Parse a list of clusters from a 'v2' config
        def parse_v2(id:, cluster:)
          cluster.merge(id: id)
        end
    end

    # @param clusters [Array<Cluster>] list of cluster objects
    def initialize(clusters = [])
      @clusters = clusters
    end

    # Find cluster in list using the id of the cluster
    # @param id [Object] id of cluster object
    # @return [Cluster, nil] cluster object if found
    def [](id)
      # relies on Cluster#== comparing against an id
      @clusters.detect { |cluster| cluster == id }
    end

    # For a block {|cluster| ...}
    # @yield [cluster] Gives the next cluster object in the list
    def each(&block)
      @clusters.each(&block)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module OodCore
  # Base error class; every exception raised by {OodCore} inherits from this,
  # so callers can rescue the whole library with a single class
  class Error < StandardError
  end

  # Raised when the configuration file given to a loader does not exist
  class ConfigurationNotFound < Error
  end

  # Raised when a configuration fails to name which adapter to build
  class AdapterNotSpecified < Error
  end

  # Raised when the adapter named in a configuration cannot be located
  class AdapterNotFound < Error
  end

  # Raised when a job adapter hits an error talking to its resource manager
  class JobAdapterError < Error
  end

  # Raised when a job state is set to an invalid option
  class UnknownStateAttribute < Error
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module OodCore
  module Job
    # Abstract interface for communicating with a resource manager: each
    # concrete adapter implements submitting, querying, holding, releasing
    # and deleting jobs. Every method here raises {NotImplementedError}.
    # @abstract
    class Adapter
      # Submit a job described by the given script object
      # @abstract Subclass is expected to implement {#submit}
      # @raise [NotImplementedError] if subclass did not define {#submit}
      # @example Submit job template to cluster
      #   solver_id = job_adapter.submit(solver_script)
      #   #=> "1234.server"
      # @example Submit job that depends on previous job
      #   post_id = job_adapter.submit(
      #     post_script,
      #     afterok: solver_id
      #   )
      #   #=> "1235.server"
      # @param script [Script] script object that describes the
      #   script and attributes for the submitted job
      # @param after [#to_s, Array<#to_s>] dependencies that must have
      #   started execution before this job may be scheduled
      # @param afterok [#to_s, Array<#to_s>] dependencies that must have
      #   terminated without error before this job may be scheduled
      # @param afternotok [#to_s, Array<#to_s>] dependencies that must have
      #   terminated with errors before this job may be scheduled
      # @param afterany [#to_s, Array<#to_s>] dependencies that must have
      #   terminated (any way) before this job may be scheduled
      # @return [String] the job id returned after successfully submitting a job
      def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
        raise NotImplementedError, "subclass did not define #submit"
      end

      # Retrieve info for every job known to the resource manager
      # @abstract Subclass is expected to implement {#info_all}
      # @raise [NotImplementedError] if subclass did not define {#info_all}
      # @return [Array<Info>] information describing submitted jobs
      def info_all
        raise NotImplementedError, "subclass did not define #info_all"
      end

      # Retrieve info about a single job from the resource manager
      # @abstract Subclass is expected to implement {#info}
      # @raise [NotImplementedError] if subclass did not define {#info}
      # @param id [#to_s] the id of the job
      # @return [Info] information describing submitted job
      def info(id)
        raise NotImplementedError, "subclass did not define #info"
      end

      # Retrieve only the status of a job from the resource manager
      # @note Optimized slightly over retrieving complete job information from server
      # @abstract Subclass is expected to implement {#status}
      # @raise [NotImplementedError] if subclass did not define {#status}
      # @param id [#to_s] the id of the job
      # @return [Status] status of job
      def status(id)
        raise NotImplementedError, "subclass did not define #status"
      end

      # Place the given job on hold
      # @abstract Subclass is expected to implement {#hold}
      # @raise [NotImplementedError] if subclass did not define {#hold}
      # @param id [#to_s] the id of the job
      # @return [void]
      def hold(id)
        raise NotImplementedError, "subclass did not define #hold"
      end

      # Release a previously held job
      # @abstract Subclass is expected to implement {#release}
      # @raise [NotImplementedError] if subclass did not define {#release}
      # @param id [#to_s] the id of the job
      # @return [void]
      def release(id)
        raise NotImplementedError, "subclass did not define #release"
      end

      # Remove the given job from the resource manager
      # @abstract Subclass is expected to implement {#delete}
      # @raise [NotImplementedError] if subclass did not define {#delete}
      # @param id [#to_s] the id of the job
      # @return [void]
      def delete(id)
        raise NotImplementedError, "subclass did not define #delete"
      end
    end
  end
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
require "ood_core/refinements/hash_extensions"
|
2
|
+
|
3
|
+
module OodCore
  module Job
    class Factory
      using Refinements::HashExtensions

      # Build the Lsf adapter from a configuration
      # @param config [#to_h] the configuration for job adapter
      # @option config [#to_s] :bindir ('') Path to lsf client bin dir
      # @option config [#to_s] :libdir ('') Path to lsf client lib dir
      # @option config [#to_s] :envdir ('') Path to lsf client conf dir
      # @option config [#to_s] :serverdir ('') Path to lsf client etc dir
      def self.build_lsf(config)
        lsf_batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
        Adapters::Lsf.new(batch: lsf_batch)
      end
    end

    module Adapters
      # Adapter that drives an LSF batch server through its command-line
      # tools (bsub/bjobs/bstop/bresume/bkill)
      class Lsf < Adapter
        # @api private
        attr_reader :batch, :helper

        require "ood_core/job/adapters/lsf/batch"
        require "ood_core/job/adapters/lsf/helper"

        # Maps LSF status codes onto the generic adapter state symbols
        STATE_MAP = {
          'RUN' => :running,
          'PEND' => :queued,
          'DONE' => :completed,
          'EXIT' => :completed,

          'PSUSP' => :queued_held, # supsended before job started, resumable via bresume
          'USUSP' => :suspended, # suspended after job started, resumable via bresume
          'SSUSP' => :suspended,

          'WAIT' => :queued, # FIXME: not sure what else to do here
          'ZOMBI' => :undetermined,
          'UNKWN' => :undetermined
        }

        # @param opts [#to_h] the options defining this adapter
        # @option opts [Batch] :batch The Lsf batch object
        #
        # @api private
        # @see Factory.build_lsf
        def initialize(batch:)
          @batch = batch
          @helper = Lsf::Helper.new
        end

        # Submit a job with the attributes defined in the job template instance
        # @param script [Script] script object that describes the script and
        #   attributes for the submitted job
        # @param after [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution at any point after dependent jobs have started execution
        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with no errors
        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with errors
        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution after dependent jobs have terminated
        # @raise [JobAdapterError] if something goes wrong submitting a job
        # @return [String] the job id returned after successfully submitting a
        #   job
        # @see Adapter#submit
        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
          # normalize each dependency list into an array of id strings
          after      = Array(after).map(&:to_s)
          afterok    = Array(afterok).map(&:to_s)
          afternotok = Array(afternotok).map(&:to_s)
          afterany   = Array(afterany).map(&:to_s)

          # translate script attributes into bsub command-line flags
          bsub_args = []
          bsub_args += ["-P", script.accounting_id] unless script.accounting_id.nil?
          bsub_args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
          bsub_args += ["-J", script.job_name] unless script.job_name.nil?

          # TODO: dependencies

          env = {
            #TODO:
            #LSB_HOSTS?
            #LSB_MCPU_HOSTS?
            #SNDJOBS_TO?
            #
          }

          # Submit job
          batch.submit_string(script.content, args: bsub_args, env: env)

        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job info from the resource manager
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong getting job info
        # @return [Info] information describing submitted job
        # @see Adapter#info
        def info(id)
          # TODO: handle job arrays
          raw = batch.get_job(id: id)
          raw.nil? ? nil : info_for_batch_hash(raw)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve info for all jobs from the resource manager
        # @raise [JobAdapterError] if something goes wrong getting job info
        # @return [Array<Info>] information describing submitted jobs
        # @see Adapter#info_all
        def info_all
          batch.get_jobs.map { |job_hash| info_for_batch_hash(job_hash) }
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job status from resource manager
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong getting job status
        # @return [Status] status of job
        # @see Adapter#status
        def status(id)
          job_hash = batch.get_job(id: id)
          # a job absent from bjobs output is assumed to have completed
          state = job_hash ? get_state(job_hash[:status]) : :completed
          Status.new(state: state)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Put the submitted job on hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong holding a job
        # @return [void]
        # @see Adapter#hold
        def hold(id)
          batch.hold_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Release the job that is on hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong releasing a job
        # @return [void]
        # @see Adapter#release
        def release(id)
          batch.release_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Delete the submitted job
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong deleting a job
        # @return [void]
        # @see Adapter#delete
        def delete(id)
          batch.delete_job(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        private
          # Translate an LSF status code into a generic state symbol,
          # falling back to :undetermined for unknown codes
          def get_state(status_code)
            STATE_MAP.fetch(status_code, :undetermined)
          end

          # Convert one bjobs hash (from Batch#get_job/#get_jobs) into a
          # generic Info object; the raw hash is preserved under :native
          def info_for_batch_hash(job)
            Info.new(
              id: job[:id],
              status: get_state(job[:status]),
              allocated_nodes: [],
              submit_host: job[:from_host],
              job_name: job[:name],
              job_owner: job[:user],
              accounting_id: job[:project],
              procs: nil,
              queue_name: job[:queue],
              wallclock_time: nil,
              cpu_time: nil,
              submission_time: helper.parse_past_time(job[:submit_time], ignore_errors: true),
              dispatch_time: helper.parse_past_time(job[:start_time], ignore_errors: true),
              native: job
            )
          end
      end
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# FIX: this file calls Open3.capture3 and Pathname.new but never required
# their libraries, causing a NameError if it is loaded before anything else
# happens to require them
require "open3"
require "pathname"

# Object used for simplified communication with a LSF batch server
#
# @api private
class OodCore::Job::Adapters::Lsf::Batch
  attr_reader :bindir, :libdir, :envdir, :serverdir

  # The root exception class that all LSF-specific exceptions inherit
  # from
  class Error < StandardError; end

  # @param bindir [#to_s] path to LSF client bin directory
  # @param envdir [#to_s] path to LSF client conf directory
  # @param libdir [#to_s] path to LSF client lib directory
  # @param serverdir [#to_s] path to LSF client etc directory
  def initialize(bindir: "", envdir: "", libdir: "", serverdir: "", **_)
    @bindir = Pathname.new(bindir.to_s)

    @envdir = Pathname.new(envdir.to_s)
    @libdir = Pathname.new(libdir.to_s)
    @serverdir = Pathname.new(serverdir.to_s)
  end

  # Environment variables identifying the LSF installation to client
  # commands; blank entries are dropped so the caller's env wins
  def default_env
    {
      "LSF_BINDIR" => bindir.to_s,
      "LSF_LIBDIR" => libdir.to_s,
      "LSF_ENVDIR" => envdir.to_s,
      "LSF_SERVERDIR" => serverdir.to_s
    }.reject {|k,v| v.nil? || v.empty? }
  end

  # Get a list of hashes detailing each of the jobs on the batch server
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Array<Hash>] list of details for jobs
  def get_jobs
    #TODO: split into get_all_jobs, get_my_jobs
    args = bjobs_default_args
    parse_bjobs_output(call("bjobs", *args))
  end

  # Get hash detailing the specified job
  # @param id [#to_s] the id of the job to check
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Hash] details of specified job
  def get_job(id:)
    args = bjobs_default_args
    args << id.to_s
    parse_bjobs_output(call("bjobs", *args)).first
  end

  # Default flags: all users, all states, wide non-truncated output
  def bjobs_default_args
    %w( -u all -a -w -W )
  end

  # status fields available from bjobs
  def fields
    %i(id user status queue from_host exec_host name submit_time
      project cpu_used mem swap pids start_time finish_time)
  end

  # Parse raw `bjobs` output into an array of field hashes
  # @param response [String, nil] stdout of the bjobs command
  # @raise [Error] if the header columns differ from the expected format
  # @return [Array<Hash>] one hash per job line
  def parse_bjobs_output(response)
    return [] if response.nil? || response =~ /No job found/

    lines = response.split("\n")
    validate_bjobs_output_columns(lines.first.split)

    lines.drop(1).map{ |job|
      values = split_bjobs_output_line(job)

      # make a hash of { field: "value", etc. }
      Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
        # if the value == "-", replace it with nil
        o[k] = (v == "-" ? nil : v)
      }
    }
  end


  # Put a specified job on hold
  # @example Put job "1234" on hold
  #   my_batch.hold_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bstop` command exited unsuccessfully
  # @return [void]
  def hold_job(id)
    call("bstop", id.to_s)
  end

  # Release a specified job that is on hold
  # @example Release job "1234" from on hold
  #   my_batch.release_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bresume` command exited unsuccessfully
  # @return [void]
  def release_job(id)
    call("bresume", id.to_s)
  end

  # Delete a specified job from batch server
  # @example Delete job "1234"
  #   my_batch.delete_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bkill` command exited unsuccessfully
  # @return [void]
  def delete_job(id)
    call("bkill", id.to_s)
  end

  # Submit a script expanded as a string to the batch server
  # @param str [#to_s] script as a string
  # @param args [Array<#to_s>] arguments passed to `bsub` command
  # @param env [Hash{#to_s => #to_s}] environment variables set
  # @raise [Error] if `bsub` command exited unsuccessfully
  # @return [String] the id of the job that was created
  def submit_string(str, args: [], env: {})
    args = args.map(&:to_s)
    parse_bsub_output(call("bsub", *args, env: env, stdin: str.to_s))
  end

  # Extract the job id from bsub's "Job <id> is submitted..." response
  # @return [String, nil] the id, or nil if the response didn't match
  def parse_bsub_output(response)
    if response =~ /Job <(.*)> /
      $1
    else
      nil
    end
  end

  private
    # Call a forked Lsf command for a given cluster
    def call(cmd, *args, env: {}, stdin: "")
      cmd = bindir.join(cmd.to_s).to_s
      #TODO: args = ["-m", cluster] + args.map(&:to_s)
      env = default_env.merge(env.to_h)
      o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
      s.success? ? o : raise(Error, e)
    end

    # split a line of output from bjobs into field values
    def split_bjobs_output_line(line)
      values = line.strip.split

      if(values.count > 15)
        # FIXME: hack assumes 15 fields & only job name may have spaces
        # collapse >15 fields into 15, assumes 7th field is JOB_NAME
        values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
      end

      values
    end

    # verify the output from bjobs is parsable by this object
    def validate_bjobs_output_columns(columns)
      expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
        SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
      if columns != expected
        raise Error, "bjobs output in different format than expected: " \
          "#{columns.inspect} instead of #{expected.inspect}"
      end
    end

end
|