ood_core 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,114 @@
1
+ require "yaml"
2
+
3
require "pathname"

module OodCore
  # An enumerable that contains a list of {Cluster} objects
  class Clusters
    include Enumerable

    # Supported format versions of the configuration file, listed in order of
    # precedence (the first version that yields a cluster is the one used)
    CONFIG_VERSION = ['v2', 'v1'].freeze

    class << self
      # Parse a configuration file or a set of configuration files in a
      # directory
      #
      # A single file maps cluster ids to cluster configs underneath a version
      # key. In a directory every `*.yml` file describes one cluster whose id
      # is the file's basename. Files that are empty or hold no supported
      # version key contribute no clusters.
      # @param path [#to_s] configuration file or directory path
      # @raise [ConfigurationNotFound] if path does not exist
      # @return [Clusters] the clusters parsed from config
      def load_file(path)
        config = Pathname.new(path.to_s).expand_path

        clusters =
          if config.file?
            clusters_from_file(config)
          elsif config.directory?
            Pathname.glob(config.join("*.yml")).map { |file| cluster_from_file(file) }.compact
          else
            raise ConfigurationNotFound, "configuration file '#{config}' does not exist"
          end

        new clusters
      end

      private

      # Read a YAML document, treating anything that is not a mapping (e.g.
      # an empty file) as an empty configuration
      def read_config(file)
        data = YAML.safe_load(file.read)
        data.is_a?(Hash) ? data : {}
      end

      # Build every cluster listed in a multi-cluster configuration file
      def clusters_from_file(file)
        data = read_config(file)
        version = CONFIG_VERSION.find { |v| data[v].is_a?(Hash) && !data[v].empty? }
        return [] unless version

        data[version].map { |id, cluster| build_cluster(version, id: id, cluster: cluster) }
      end

      # Build the single cluster described by a file inside a config directory
      def cluster_from_file(file)
        data = read_config(file)
        version = CONFIG_VERSION.find { |v| data[v] }
        return unless version

        build_cluster(version, id: file.basename(".yml").to_s, cluster: data[version])
      end

      # Dispatch to the parser matching the config version
      def build_cluster(version, id:, cluster:)
        cluster = {} unless cluster.is_a?(Hash)
        Cluster.new(send("parse_#{version}", id: id, cluster: cluster))
      end

      # Walk nested hashes, giving nil as soon as a level is missing or is not
      # a hash
      def nested(hash, *keys)
        keys.reduce(hash) { |obj, key| obj.is_a?(Hash) ? obj[key] : nil }
      end

      # Parse a list of clusters from a 'v1' config
      # NB: Makes minimum assumptions about config, every nested section is
      # optional
      def parse_v1(id:, cluster:)
        c = {
          id: id,
          metadata: {},
          login: {},
          job: {},
          acls: [],
          custom: {}
        }

        c[:metadata][:title] = cluster["title"] if cluster.key?("title")
        c[:metadata][:url] = cluster["url"] if cluster.key?("url")
        c[:metadata][:private] = true if nested(cluster, "cluster", "data", "hpc_cluster") == false

        if (l = nested(cluster, "cluster", "data", "servers", "login"))
          c[:login][:host] = nested(l, "data", "host")
        end

        if (rm = nested(cluster, "cluster", "data", "servers", "resource_mgr"))
          c[:job][:adapter] = "torque"
          c[:job][:host] = nested(rm, "data", "host")
          c[:job][:lib] = nested(rm, "data", "lib")
          c[:job][:bin] = nested(rm, "data", "bin")
          c[:job][:acls] = []
        end

        vc = nested(cluster, "validators", "cluster")
        if vc.is_a?(Array)
          c[:acls] = vc.map do |h|
            {
              adapter: "group",
              groups: nested(h, "data", "groups"),
              type: nested(h, "data", "allow") ? "whitelist" : "blacklist"
            }
          end
        end

        c
      end

      # Parse a list of clusters from a 'v2' config
      def parse_v2(id:, cluster:)
        cluster.merge(id: id)
      end
    end

    # @param clusters [Array<Cluster>] list of cluster objects
    def initialize(clusters = [])
      @clusters = clusters
    end

    # Find cluster in list using the id of the cluster
    # @param id [Object] id of cluster object
    # @return [Cluster, nil] cluster object if found
    def [](id)
      @clusters.detect { |cluster| cluster == id }
    end

    # For a block {|cluster| ...}
    # @yield [cluster] Gives the next cluster object in the list
    def each(&block)
      @clusters.each(&block)
    end
  end
end
@@ -0,0 +1,19 @@
1
module OodCore
  # Root of the {OodCore} exception hierarchy; rescue this to catch any error
  # defined by the library
  class Error < StandardError
  end

  # The requested configuration file could not be located
  class ConfigurationNotFound < Error
  end

  # The configuration does not name an adapter
  class AdapterNotSpecified < Error
  end

  # The adapter named in the configuration could not be located
  class AdapterNotFound < Error
  end

  # A job adapter hit an error while talking to the resource manager
  class JobAdapterError < Error
  end

  # A job state was assigned a value that is not a valid option
  class UnknownStateAttribute < Error
  end
end
@@ -0,0 +1,89 @@
1
module OodCore
  module Job
    # Interface for talking to a resource manager: submitting jobs, querying
    # their info/status, and holding, releasing or deleting them. Every method
    # here raises until a subclass overrides it.
    # @abstract
    class Adapter
      # Submit a job described by a script object, optionally dependent on
      # other jobs
      # @abstract Subclass is expected to implement {#submit}
      # @raise [NotImplementedError] if subclass did not define {#submit}
      # @example Submit job template to cluster
      #   solver_id = job_adapter.submit(solver_script)
      #   #=> "1234.server"
      # @example Submit job that depends on previous job
      #   post_id = job_adapter.submit(
      #     post_script,
      #     afterok: solver_id
      #   )
      #   #=> "1235.server"
      # @param script [Script] script object that describes the
      #   script and attributes for the submitted job
      # @param after [#to_s, Array<#to_s>] jobs that must have started
      #   execution before this job may be scheduled
      # @param afterok [#to_s, Array<#to_s>] jobs that must have terminated
      #   with no errors before this job may be scheduled
      # @param afternotok [#to_s, Array<#to_s>] jobs that must have terminated
      #   with errors before this job may be scheduled
      # @param afterany [#to_s, Array<#to_s>] jobs that must have terminated
      #   before this job may be scheduled
      # @return [String] the job id returned after successfully submitting a job
      def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Fetch info for every job known to the resource manager
      # @abstract Subclass is expected to implement {#info_all}
      # @raise [NotImplementedError] if subclass did not define {#info_all}
      # @return [Array<Info>] information describing submitted jobs
      def info_all
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Fetch info for a single job from the resource manager
      # @abstract Subclass is expected to implement {#info}
      # @raise [NotImplementedError] if subclass did not define {#info}
      # @param id [#to_s] the id of the job
      # @return [Info] information describing submitted job
      def info(id)
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Fetch only the status of a single job from the resource manager
      # @note Optimized slightly over retrieving complete job information from server
      # @abstract Subclass is expected to implement {#status}
      # @raise [NotImplementedError] if subclass did not define {#status}
      # @param id [#to_s] the id of the job
      # @return [Status] status of job
      def status(id)
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Place a submitted job on hold
      # @abstract Subclass is expected to implement {#hold}
      # @raise [NotImplementedError] if subclass did not define {#hold}
      # @param id [#to_s] the id of the job
      # @return [void]
      def hold(id)
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Release a job that was placed on hold
      # @abstract Subclass is expected to implement {#release}
      # @raise [NotImplementedError] if subclass did not define {#release}
      # @param id [#to_s] the id of the job
      # @return [void]
      def release(id)
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end

      # Delete a submitted job
      # @abstract Subclass is expected to implement {#delete}
      # @raise [NotImplementedError] if subclass did not define {#delete}
      # @param id [#to_s] the id of the job
      # @return [void]
      def delete(id)
        raise NotImplementedError, "subclass did not define ##{__method__}"
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ module OodCore
4
+ module Job
5
class Factory
  using Refinements::HashExtensions

  # Construct an Lsf adapter from a configuration; the configuration is
  # converted to a hash with symbolized keys and handed to the Lsf batch
  # object the adapter wraps
  # @param config [#to_h] the configuration for job adapter
  # @option config [#to_s] :bindir ('') Path to lsf client bin dir
  # @option config [#to_s] :libdir ('') Path to lsf client lib dir
  # @option config [#to_s] :envdir ('') Path to lsf client conf dir
  # @option config [#to_s] :serverdir ('') Path to lsf client etc dir
  def self.build_lsf(config)
    options = config.to_h.symbolize_keys
    Adapters::Lsf.new(batch: Adapters::Lsf::Batch.new(options))
  end
end
19
+
20
+ module Adapters
21
# Job adapter that talks to an LSF resource manager through a {Batch} object
class Lsf < Adapter
  # @api private
  attr_reader :batch, :helper

  # These files reopen this class, so they are loaded after it is defined
  require "ood_core/job/adapters/lsf/batch"
  require "ood_core/job/adapters/lsf/helper"

  # Maps the status reported by the batch object to a job state
  STATE_MAP = {
    'RUN' => :running,
    'PEND' => :queued,
    'DONE' => :completed,
    'EXIT' => :completed,

    'PSUSP' => :queued_held, # suspended before job started, resumable via bresume
    'USUSP' => :suspended,   # suspended after job started, resumable via bresume
    'SSUSP' => :suspended,

    'WAIT' => :queued, # FIXME: not sure what else to do here
    'ZOMBI' => :undetermined,
    'UNKWN' => :undetermined
  }.freeze

  # Maps each dependency keyword of {#submit} to the condition name used in
  # the dependency expression given to `bsub -w`
  DEPENDENCY_CONDITIONS = {
    after: "started",
    afterok: "done",
    afternotok: "exit",
    afterany: "ended"
  }.freeze
  private_constant :DEPENDENCY_CONDITIONS

  # @param batch [Batch] The Lsf batch object
  #
  # @api private
  # @see Factory.build_lsf
  def initialize(batch:)
    @batch = batch
    @helper = Lsf::Helper.new
  end

  # Submit a job with the attributes defined in the job template instance
  # @param script [Script] script object that describes the script and
  #   attributes for the submitted job
  # @param after [#to_s, Array<#to_s>] this job may be scheduled for
  #   execution at any point after dependent jobs have started execution
  # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
  #   execution only after dependent jobs have terminated with no errors
  # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
  #   execution only after dependent jobs have terminated with errors
  # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
  #   execution after dependent jobs have terminated
  # @raise [JobAdapterError] if something goes wrong submitting a job
  # @return [String] the job id returned after successfully submitting a
  #   job
  # @see Adapter#submit
  def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
    args = []
    args += ["-P", script.accounting_id] unless script.accounting_id.nil?
    args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
    args += ["-J", script.job_name] unless script.job_name.nil?

    # All requested dependencies must hold, so they are and-ed together
    dependency = dependency_expression(
      after: after,
      afterok: afterok,
      afternotok: afternotok,
      afterany: afterany
    )
    args += ["-w", dependency] unless dependency.empty?

    # TODO: LSB_HOSTS? LSB_MCPU_HOSTS? SNDJOBS_TO?
    env = {}

    # Submit job
    batch.submit_string(script.content, args: args, env: env)
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Retrieve job info from the resource manager
  # @param id [#to_s] the id of the job
  # @raise [JobAdapterError] if something goes wrong getting job info
  # @return [Info, nil] information describing submitted job, or nil if
  #   the batch object reports no such job
  # @see Adapter#info
  def info(id)
    # TODO: handle job arrays
    job = batch.get_job(id: id)
    job ? info_for_batch_hash(job) : nil
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Retrieve info for all jobs from the resource manager
  # @raise [JobAdapterError] if something goes wrong getting job info
  # @return [Array<Info>] information describing submitted jobs
  # @see Adapter#info_all
  def info_all
    batch.get_jobs.map { |v| info_for_batch_hash(v) }
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Retrieve job status from resource manager; a job the batch object no
  # longer reports is treated as completed
  # @param id [#to_s] the id of the job
  # @raise [JobAdapterError] if something goes wrong getting job status
  # @return [Status] status of job
  # @see Adapter#status
  def status(id)
    job = batch.get_job(id: id)
    state = job ? get_state(job[:status]) : :completed
    Status.new(state: state)
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Put the submitted job on hold
  # @param id [#to_s] the id of the job
  # @raise [JobAdapterError] if something goes wrong holding a job
  # @return [void]
  # @see Adapter#hold
  def hold(id)
    batch.hold_job(id.to_s)
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Release the job that is on hold
  # @param id [#to_s] the id of the job
  # @raise [JobAdapterError] if something goes wrong releasing a job
  # @return [void]
  # @see Adapter#release
  def release(id)
    batch.release_job(id.to_s)
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  # Delete the submitted job
  # @param id [#to_s] the id of the job
  # @raise [JobAdapterError] if something goes wrong deleting a job
  # @return [void]
  # @see Adapter#delete
  def delete(id)
    batch.delete_job(id.to_s)
  rescue Batch::Error => e
    raise JobAdapterError, e.message
  end

  private

  # Determine state from LSF state code; unknown codes are :undetermined
  def get_state(st)
    STATE_MAP.fetch(st, :undetermined)
  end

  # Build the `bsub -w` dependency expression, e.g. "started(1) && done(2)";
  # returns an empty string when there are no dependencies
  def dependency_expression(**dependencies)
    DEPENDENCY_CONDITIONS.flat_map { |kind, condition|
      Array(dependencies[kind]).map { |id| "#{condition}(#{id})" }
    }.join(" && ")
  end

  # Convert one job hash produced by the batch object into an Info object
  def info_for_batch_hash(v)
    Info.new(
      id: v[:id],
      status: get_state(v[:status]),
      allocated_nodes: [],
      submit_host: v[:from_host],
      job_name: v[:name],
      job_owner: v[:user],
      accounting_id: v[:project],
      procs: nil,
      queue_name: v[:queue],
      wallclock_time: nil,
      cpu_time: nil,
      submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
      dispatch_time: helper.parse_past_time(v[:start_time], ignore_errors: true),
      native: v
    )
  end
end
191
+ end
192
+ end
193
+ end
@@ -0,0 +1,160 @@
1
require "open3"
require "pathname"

# Object used for simplified communication with a LSF batch server
#
# @api private
class OodCore::Job::Adapters::Lsf::Batch
  attr_reader :bindir, :libdir, :envdir, :serverdir

  # The root exception class that all LSF-specific exceptions inherit
  # from
  class Error < StandardError; end

  # @param bindir [#to_s] path to LSF client bin dir
  # @param envdir [#to_s] path to LSF client conf dir
  # @param libdir [#to_s] path to LSF client lib dir
  # @param serverdir [#to_s] path to LSF client etc dir
  def initialize(bindir: "", envdir: "", libdir: "", serverdir: "", **_)
    @bindir = Pathname.new(bindir.to_s)
    @envdir = Pathname.new(envdir.to_s)
    @libdir = Pathname.new(libdir.to_s)
    @serverdir = Pathname.new(serverdir.to_s)
  end

  # Environment variables set for every LSF command; directories that were
  # not configured are left out
  # @return [Hash{String => String}]
  def default_env
    {
      "LSF_BINDIR" => bindir.to_s,
      "LSF_LIBDIR" => libdir.to_s,
      "LSF_ENVDIR" => envdir.to_s,
      "LSF_SERVERDIR" => serverdir.to_s
    }.reject { |_, v| v.nil? || v.empty? }
  end

  # Get a list of hashes detailing each of the jobs on the batch server
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Array<Hash>] list of details for jobs
  def get_jobs
    # TODO: split into get_all_jobs, get_my_jobs
    parse_bjobs_output(call("bjobs", *bjobs_default_args))
  end

  # Get hash detailing the specified job
  # @param id [#to_s] the id of the job to check
  # @raise [Error] if `bjobs` command exited unsuccessfully
  # @return [Hash, nil] details of specified job, nil if none was listed
  def get_job(id:)
    args = bjobs_default_args + [id.to_s]
    parse_bjobs_output(call("bjobs", *args)).first
  end

  # Arguments given to every `bjobs` invocation
  # @return [Array<String>]
  def bjobs_default_args
    %w( -u all -a -w -W )
  end

  # status fields available from bjobs
  # @return [Array<Symbol>]
  def fields
    %i(id user status queue from_host exec_host name submit_time
       project cpu_used mem swap pids start_time finish_time)
  end

  # Turn the text printed by `bjobs` into one hash per job, keyed by
  # {#fields}; a value of "-" becomes nil
  # @param response [String, nil] output of the `bjobs` command
  # @raise [Error] if the header row does not list the expected columns
  # @return [Array<Hash>] list of details for jobs, empty if no job is listed
  def parse_bjobs_output(response)
    response = response.to_s
    # nothing on stdout (or an explicit notice) means there is nothing to parse
    return [] if response.strip.empty? || response =~ /No job found/

    lines = response.split("\n").reject { |line| line.strip.empty? }
    validate_bjobs_output_columns(lines.first.split)

    lines.drop(1).map do |line|
      values = split_bjobs_output_line(line).map { |v| v == "-" ? nil : v }
      Hash[fields.zip(values)]
    end
  end

  # Put a specified job on hold
  # @example Put job "1234" on hold
  #   my_batch.hold_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bstop` command exited unsuccessfully
  # @return [void]
  def hold_job(id)
    call("bstop", id.to_s)
  end

  # Release a specified job that is on hold
  # @example Release job "1234" from on hold
  #   my_batch.release_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bresume` command exited unsuccessfully
  # @return [void]
  def release_job(id)
    call("bresume", id.to_s)
  end

  # Delete a specified job from batch server
  # @example Delete job "1234"
  #   my_batch.delete_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `bkill` command exited unsuccessfully
  # @return [void]
  def delete_job(id)
    call("bkill", id.to_s)
  end

  # Submit a script expanded as a string to the batch server
  # @param str [#to_s] script as a string
  # @param args [Array<#to_s>] arguments passed to `bsub` command
  # @param env [Hash{#to_s => #to_s}] environment variables set
  # @raise [Error] if `bsub` command exited unsuccessfully
  # @return [String, nil] the id of the job that was created, nil if the
  #   output of `bsub` held no job id
  def submit_string(str, args: [], env: {})
    args = args.map(&:to_s)
    parse_bsub_output(call("bsub", *args, env: env, stdin: str.to_s))
  end

  # Extract the job id from the text printed by `bsub`
  # @param response [String, nil] output of the `bsub` command
  # @return [String, nil] the text between the first "Job <" and the next ">"
  def parse_bsub_output(response)
    response.to_s[/Job <([^>]+)>/, 1]
  end

  private

  # Run an LSF command from the configured bin dir, feeding it `stdin`
  # @raise [Error] with the command's stderr if it exited unsuccessfully
  # @return [String] the command's stdout
  def call(cmd, *args, env: {}, stdin: "")
    cmd = bindir.join(cmd.to_s).to_s
    # TODO: args = ["-m", cluster] + args.map(&:to_s)
    env = default_env.merge(env.to_h)
    o, e, s = Open3.capture3(env, cmd, *args.map(&:to_s), stdin_data: stdin.to_s)
    s.success? ? o : raise(Error, e)
  end

  # split a line of output from bjobs into field values
  def split_bjobs_output_line(line)
    values = line.strip.split

    if values.count > 15
      # FIXME: hack assumes 15 fields & only job name may have spaces
      # collapse >15 fields into 15, assumes 7th field is JOB_NAME
      values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
    end

    values
  end

  # verify the output from bjobs is parsable by this object
  def validate_bjobs_output_columns(columns)
    expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
                  SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
    return if columns == expected

    raise Error, "bjobs output in different format than expected: " \
      "#{columns.inspect} instead of #{expected.inspect}"
  end
end