ood_core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ require "yaml"
2
+
3
module OodCore
  # An enumerable that contains a list of {Cluster} objects
  class Clusters
    include Enumerable

    # The supported format versions of the configuration file, listed in
    # order of preference (the first version found in a file is used)
    CONFIG_VERSION = ['v2', 'v1'].freeze

    class << self
      # Parse a configuration file or a set of configuration files in a
      # directory
      #
      # A single file maps cluster ids to cluster configurations underneath
      # a version key. In a directory every `*.yml` file describes one
      # cluster whose id is the file's basename.
      # @param path [#to_s] configuration file or directory path
      # @raise [ConfigurationNotFound] if path does not exist
      # @return [Clusters] the clusters parsed from config
      def load_file(path)
        config = Pathname.new(path.to_s).expand_path

        clusters =
          if config.file?
            load_clusters_from_file(config)
          elsif config.directory?
            load_clusters_from_directory(config)
          else
            raise ConfigurationNotFound, "configuration file '#{config}' does not exist"
          end

        new clusters
      end

      private
        # Read a YAML file exactly once; an empty file or a document that is
        # not a mapping is treated as an empty configuration
        def read_config(file)
          data = YAML.safe_load(file.read)
          data.is_a?(Hash) ? data : {}
        end

        # Build the clusters listed in a single multi-cluster file using the
        # first supported version that actually defines clusters
        def load_clusters_from_file(file)
          data = read_config(file)
          version = CONFIG_VERSION.find { |v| data[v].is_a?(Hash) && !data[v].empty? }
          return [] unless version

          data[version].map { |id, cluster| build_cluster(version, id, cluster) }
        end

        # Build one cluster per `*.yml` file found in the directory; files
        # are visited in sorted order so the result is deterministic
        def load_clusters_from_directory(dir)
          Pathname.glob(dir.join("*.yml")).sort.each_with_object([]) do |file, clusters|
            data = read_config(file)
            version = CONFIG_VERSION.find { |v| data[v] }
            next unless version

            clusters << build_cluster(version, file.basename(".yml").to_s, data[version])
          end
        end

        # Dispatch to the version specific parser and wrap the result
        def build_cluster(version, id, cluster)
          Cluster.new(send("parse_#{version}", id: id, cluster: cluster))
        end

        # Parse a list of clusters from a 'v1' config
        # NB: Makes minimum assumptions about config, any missing section
        # is simply skipped
        def parse_v1(id:, cluster:)
          c = {
            id: id,
            metadata: {},
            login: {},
            job: {},
            acls: [],
            custom: {}
          }

          data    = (cluster["cluster"] || {})["data"] || {}
          servers = data["servers"] || {}

          c[:metadata][:title]   = cluster["title"] if cluster.key?("title")
          c[:metadata][:url]     = cluster["url"] if cluster.key?("url")
          c[:metadata][:private] = true if data["hpc_cluster"] == false

          if (login = servers["login"])
            c[:login][:host] = (login["data"] || {})["host"]
          end

          if (rm = servers["resource_mgr"])
            rm_data = rm["data"] || {}
            c[:job][:adapter] = "torque"
            c[:job][:host]    = rm_data["host"]
            c[:job][:lib]     = rm_data["lib"]
            c[:job][:bin]     = rm_data["bin"]
            c[:job][:acls]    = []
          end

          # A validator entry without "data" is malformed; it is left to
          # fail loudly rather than silently turning into a blacklist
          if (validators = (cluster["validators"] || {})["cluster"])
            c[:acls] = validators.map do |h|
              {
                adapter: "group",
                groups: h["data"]["groups"],
                type: h["data"]["allow"] ? "whitelist" : "blacklist"
              }
            end
          end

          c
        end

        # Parse a list of clusters from a 'v2' config
        def parse_v2(id:, cluster:)
          cluster.merge(id: id)
        end
    end

    # @param clusters [Array<Cluster>] list of cluster objects
    def initialize(clusters = [])
      @clusters = clusters
    end

    # Find cluster in list using the id of the cluster
    # @param id [Object] id of cluster object
    # @return [Cluster, nil] cluster object if found
    def [](id)
      @clusters.find { |cluster| cluster == id }
    end

    # For a block {|cluster| ...}
    # @yield [cluster] Gives the next cluster object in the list
    def each(&block)
      @clusters.each(&block)
    end
  end
end
@@ -0,0 +1,19 @@
1
module OodCore
  # Base class of every exception raised by {OodCore}
  class Error < StandardError
  end

  # The requested configuration file could not be found
  class ConfigurationNotFound < Error
  end

  # The configuration does not name an adapter
  class AdapterNotSpecified < Error
  end

  # The adapter named in the configuration could not be found
  class AdapterNotFound < Error
  end

  # A job adapter hit an error while talking to the resource manager
  class JobAdapterError < Error
  end

  # A job state was set to an option that is not valid
  class UnknownStateAttribute < Error
  end
end
@@ -0,0 +1,89 @@
1
module OodCore
  module Job
    # Interface for talking to a resource manager: submitting jobs, querying
    # their info/status, and holding, releasing or deleting them. Every
    # method here raises until a subclass overrides it.
    # @abstract
    class Adapter
      # Submit the job described by a script object
      # @abstract Subclass is expected to implement {#submit}
      # @raise [NotImplementedError] if subclass did not define {#submit}
      # @example Submit job template to cluster
      #   solver_id = job_adapter.submit(solver_script)
      #   #=> "1234.server"
      # @example Submit job that depends on previous job
      #   post_id = job_adapter.submit(
      #     post_script,
      #     afterok: solver_id
      #   )
      #   #=> "1235.server"
      # @param script [Script] the script together with the attributes of
      #   the job to submit
      # @param after [#to_s, Array<#to_s>] job ids that must have started
      #   execution before this job may be scheduled
      # @param afterok [#to_s, Array<#to_s>] job ids that must have
      #   terminated with no errors before this job may be scheduled
      # @param afternotok [#to_s, Array<#to_s>] job ids that must have
      #   terminated with errors before this job may be scheduled
      # @param afterany [#to_s, Array<#to_s>] job ids that must have
      #   terminated before this job may be scheduled
      # @return [String] the job id returned after successfully submitting a job
      def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
        raise NotImplementedError, "subclass did not define #submit"
      end

      # Fetch info for every job known to the resource manager
      # @abstract Subclass is expected to implement {#info_all}
      # @raise [NotImplementedError] if subclass did not define {#info_all}
      # @return [Array<Info>] information describing submitted jobs
      def info_all
        raise NotImplementedError, "subclass did not define #info_all"
      end

      # Fetch info for a single job from the resource manager
      # @abstract Subclass is expected to implement {#info}
      # @raise [NotImplementedError] if subclass did not define {#info}
      # @param id [#to_s] the id of the job
      # @return [Info] information describing submitted job
      def info(id)
        raise NotImplementedError, "subclass did not define #info"
      end

      # Fetch the status of a single job from the resource manager
      # @note Optimized slightly over retrieving complete job information from server
      # @abstract Subclass is expected to implement {#status}
      # @raise [NotImplementedError] if subclass did not define {#status}
      # @param id [#to_s] the id of the job
      # @return [Status] status of job
      def status(id)
        raise NotImplementedError, "subclass did not define #status"
      end

      # Place a submitted job on hold
      # @abstract Subclass is expected to implement {#hold}
      # @raise [NotImplementedError] if subclass did not define {#hold}
      # @param id [#to_s] the id of the job
      # @return [void]
      def hold(id)
        raise NotImplementedError, "subclass did not define #hold"
      end

      # Release a job that was placed on hold
      # @abstract Subclass is expected to implement {#release}
      # @raise [NotImplementedError] if subclass did not define {#release}
      # @param id [#to_s] the id of the job
      # @return [void]
      def release(id)
        raise NotImplementedError, "subclass did not define #release"
      end

      # Delete a submitted job
      # @abstract Subclass is expected to implement {#delete}
      # @raise [NotImplementedError] if subclass did not define {#delete}
      # @param id [#to_s] the id of the job
      # @return [void]
      def delete(id)
        raise NotImplementedError, "subclass did not define #delete"
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ module OodCore
4
+ module Job
5
+ class Factory
6
+ using Refinements::HashExtensions
7
+
8
# Build the Lsf adapter from a configuration
#
# The configuration is converted to a hash with symbol keys and handed to
# the batch object, which is then wrapped by the adapter.
# @param config [#to_h] the configuration for job adapter
# @option config [#to_s] :bindir ('') Path to lsf client bin dir
# @option config [#to_s] :libdir ('') Path to lsf client lib dir
# @option config [#to_s] :envdir ('') Path to lsf client conf dir
# @option config [#to_s] :serverdir ('') Path to lsf client etc dir
def self.build_lsf(config)
  options = config.to_h.symbolize_keys
  Adapters::Lsf.new(batch: Adapters::Lsf::Batch.new(options))
end
18
+ end
19
+
20
+ module Adapters
21
+ class Lsf < Adapter
22
+ # @api private
23
+ attr_reader :batch, :helper
24
+
25
+ require "ood_core/job/adapters/lsf/batch"
26
+ require "ood_core/job/adapters/lsf/helper"
27
+
28
# Maps the state codes reported by LSF onto the generic job states used
# by this library; frozen so the shared table cannot be altered at runtime
STATE_MAP = {
  'RUN' => :running,
  'PEND' => :queued,
  'DONE' => :completed,
  'EXIT' => :completed,

  'PSUSP' => :queued_held, # suspended before job started, resumable via bresume
  'USUSP' => :suspended, # suspended after job started, resumable via bresume
  'SSUSP' => :suspended,

  'WAIT' => :queued, # FIXME: not sure what else to do here
  'ZOMBI' => :undetermined,
  'UNKWN' => :undetermined
}.freeze
42
+
43
# Create the adapter around an Lsf batch object; a helper object is
# created alongside it
# @param batch [Batch] The Lsf batch object
#
# @api private
# @see Factory.build_lsf
def initialize(batch:)
  @helper = Lsf::Helper.new
  @batch = batch
end
52
+
53
# Submit a job with the attributes defined in the job template instance
#
# Job dependencies are translated into a single LSF dependency expression
# passed through `bsub -w`, with every condition required to hold.
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
#   execution at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a
#   job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # LSF dependency condition for each kind of dependency; Array() accepts
  # both a single id and a list of ids
  conditions = {
    "started" => after,
    "done"    => afterok,
    "exit"    => afternotok,
    "ended"   => afterany
  }.flat_map { |condition, ids| Array(ids).map { |id| "#{condition}(#{id})" } }

  args = []
  args += ["-P", script.accounting_id] unless script.accounting_id.nil?
  args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
  args += ["-J", script.job_name] unless script.job_name.nil?
  args += ["-w", conditions.join(" && ")] unless conditions.empty?

  # TODO: decide whether LSB_HOSTS, LSB_MCPU_HOSTS or SNDJOBS_TO need to
  # be set in the submission environment
  env = {}

  # Submit job
  batch.submit_string(script.content, args: args, env: env)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
96
+
97
# Retrieve job info from the resource manager
#
# A job the batch server no longer reports is described as completed,
# which matches what {#status} reports for the same situation and keeps
# the return value an {Info} object.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  # TODO: handle job arrays
  job = batch.get_job(id: id)
  job ? info_for_batch_hash(job) : Info.new(id: id, status: :completed)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
109
+
110
# Retrieve info for all jobs from the resource manager
#
# Each job hash reported by the batch object is converted to an {Info}.
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all
  jobs = batch.get_jobs
  jobs.map { |job| info_for_batch_hash(job) }
rescue Batch::Error => err
  raise JobAdapterError, err.message
end
119
+
120
# Retrieve job status from resource manager
#
# A job that the batch object does not report is considered completed.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  found = batch.get_job(id: id)
  return Status.new(state: :completed) unless found

  Status.new(state: get_state(found[:status]))
rescue Batch::Error => err
  raise JobAdapterError, err.message
end
132
+
133
# Put the submitted job on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  job_id = id.to_s
  batch.hold_job(job_id)
rescue Batch::Error => err
  raise JobAdapterError, err.message
end
143
+
144
# Release the job that is on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  job_id = id.to_s
  batch.release_job(job_id)
rescue Batch::Error => err
  raise JobAdapterError, err.message
end
154
+
155
# Delete the submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  job_id = id.to_s
  batch.delete_job(job_id)
rescue Batch::Error => err
  raise JobAdapterError, err.message
end
165
+
166
+ private
167
# Translate an LSF state code into a generic state; codes missing from
# STATE_MAP are reported as :undetermined
def get_state(st)
  STATE_MAP.fetch(st) { :undetermined }
end
171
+
172
# Convert a job hash produced by the batch object into an Info object.
# Values that are not read from the hash are passed as nil (or an empty
# list for the allocated nodes); the hash itself is kept as :native.
def info_for_batch_hash(v)
  attributes = {
    id: v[:id],
    status: get_state(v[:status]),
    allocated_nodes: [],
    submit_host: v[:from_host],
    job_name: v[:name],
    job_owner: v[:user],
    accounting_id: v[:project],
    procs: nil,
    queue_name: v[:queue],
    wallclock_time: nil,
    cpu_time: nil
  }
  # unparsable times are tolerated rather than raised
  attributes[:submission_time] = helper.parse_past_time(v[:submit_time], ignore_errors: true)
  attributes[:dispatch_time] = helper.parse_past_time(v[:start_time], ignore_errors: true)
  attributes[:native] = v

  Info.new(**attributes)
end
190
+ end
191
+ end
192
+ end
193
+ end
@@ -0,0 +1,160 @@
1
+ # Object used for simplified communication with a LSF batch server
2
+ #
3
+ # @api private
4
+ class OodCore::Job::Adapters::Lsf::Batch
5
+ attr_reader :bindir, :libdir, :envdir, :serverdir
6
+
7
# Base class of every exception raised by the LSF batch object
class Error < StandardError
end
10
+
11
# Each directory is stored as a Pathname; unrecognized keyword arguments
# are accepted and ignored
# @param bindir [#to_s] value stored as the bin directory path
# @param envdir [#to_s] value stored as the env directory path
# @param libdir [#to_s] value stored as the lib directory path
# @param serverdir [#to_s] value stored as the server directory path
def initialize(bindir: "", envdir: "", libdir: "", serverdir: "", **_)
  @bindir, @envdir, @libdir, @serverdir =
    [bindir, envdir, libdir, serverdir].map { |dir| Pathname.new(dir.to_s) }
end
19
+
20
# Environment variables derived from the configured directories; a
# directory whose path is empty contributes no variable
# @return [Hash{String => String}] environment for the LSF commands
def default_env
  {
    "LSF_BINDIR" => bindir,
    "LSF_LIBDIR" => libdir,
    "LSF_ENVDIR" => envdir,
    "LSF_SERVERDIR" => serverdir
  }.each_with_object({}) do |(name, dir), env|
    value = dir.to_s
    env[name] = value unless value.empty?
  end
end
28
+
29
# Get a list of hashes detailing each of the jobs on the batch server
# @raise [Error] if `bjobs` command exited unsuccessfully
# @return [Array<Hash>] list of details for jobs
def get_jobs
  #TODO: split into get_all_jobs, get_my_jobs
  output = call("bjobs", *bjobs_default_args)
  parse_bjobs_output(output)
end
37
+
38
# Get hash detailing the specified job
# @param id [#to_s] the id of the job to check
# @raise [Error] if `bjobs` command exited unsuccessfully
# @return [Hash, nil] details of specified job, nil when the parsed
#   output lists no job
def get_job(id:)
  args = bjobs_default_args + [id.to_s]
  jobs = parse_bjobs_output(call("bjobs", *args))
  jobs.first
end
47
+
48
# Arguments passed to every `bjobs` invocation; a new array is returned
# on each call
# @return [Array<String>] default `bjobs` arguments
def bjobs_default_args
  ["-u", "all", "-a", "-w", "-W"]
end
51
+
52
# Keys used for the values of a `bjobs` output row, listed in column order
# @return [Array<Symbol>] names of the job fields
def fields
  [
    :id, :user, :status, :queue, :from_host, :exec_host, :name,
    :submit_time, :project, :cpu_used, :mem, :swap, :pids,
    :start_time, :finish_time
  ]
end
57
+
58
# Parse the output of `bjobs` into one hash per job
#
# The first line must be the column header, which is validated; every
# following non-blank line is split into values keyed by {#fields}, with
# the placeholder "-" replaced by nil.
# @param response [String, nil] raw output of the `bjobs` command
# @raise [Error] if the column header is not in the expected format
# @return [Array<Hash>] list of details for jobs, empty when the response
#   is nil, blank or reports that no job was found
def parse_bjobs_output(response)
  # check nil/blank first: blank output would otherwise fail on the header
  return [] if response.nil? || response.strip.empty? || response =~ /No job found/

  header, *rows = response.split("\n")
  validate_bjobs_output_columns(header.split)

  rows.reject { |row| row.strip.empty? }.map do |row|
    values = split_bjobs_output_line(row).map { |value| value == "-" ? nil : value }

    # make a hash of { field: "value", etc. }
    Hash[fields.zip(values)]
  end
end
75
+
76
+
77
# Put a specified job on hold by running `bstop`
# @example Put job "1234" on hold
#   my_batch.hold_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `bstop` command exited unsuccessfully
# @return [void]
def hold_job(id)
  job_id = id.to_s
  call("bstop", job_id)
end
86
+
87
# Release a specified job that is on hold by running `bresume`
# @example Release job "1234" from on hold
#   my_batch.release_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `bresume` command exited unsuccessfully
# @return [void]
def release_job(id)
  job_id = id.to_s
  call("bresume", job_id)
end
96
+
97
# Delete a specified job from batch server by running `bkill`
# @example Delete job "1234"
#   my_batch.delete_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `bkill` command exited unsuccessfully
# @return [void]
def delete_job(id)
  job_id = id.to_s
  call("bkill", job_id)
end
106
+
107
# Submit a script expanded as a string to the batch server
#
# The script is fed to `bsub` on standard input and the job id is parsed
# from the command's output.
# @param str [#to_s] script as a string
# @param args [Array<#to_s>] arguments passed to `bsub` command
# @param env [Hash{#to_s => #to_s}] environment variables set
# @raise [Error] if `bsub` command exited unsuccessfully
# @return [String] the id of the job that was created
def submit_string(str, args: [], env: {})
  string_args = args.map { |arg| arg.to_s }
  output = call("bsub", *string_args, env: env, stdin: str.to_s)
  parse_bsub_output(output)
end
117
+
118
# Extract the job id from the output of `bsub`
#
# The id is the text between the first "Job <" and the next ">". The
# match stops at the first ">" so that later "<...>" groups on the same
# line (such as the queue name) are never swallowed into the id.
# @param response [#to_s] raw output of the `bsub` command
# @return [String, nil] the job id, or nil when the output holds none
def parse_bsub_output(response)
  response.to_s[/Job <([^>]+)>/, 1]
end
126
+
127
+ private
128
# Run an Lsf command located in the bin directory and return its output
#
# The given environment is merged over the default one and every
# argument is converted to a string before the command is executed.
# @param cmd [#to_s] name of the command
# @param args [Array<#to_s>] arguments for the command
# @param env [#to_h] extra environment variables
# @param stdin [#to_s] data written to the command's standard input
# @raise [Error] with the command's standard error if it did not exit
#   successfully
# @return [String] standard output of the command
def call(cmd, *args, env: {}, stdin: "")
  executable = bindir.join(cmd.to_s).to_s
  #TODO: args = ["-m", cluster] + args.map(&:to_s)
  environment = default_env.merge(env.to_h)
  string_args = args.map(&:to_s)
  stdout, stderr, status = Open3.capture3(environment, executable, *string_args, stdin_data: stdin.to_s)
  raise Error, stderr unless status.success?

  stdout
end
136
+
137
# Split a line of output from bjobs into field values
#
# FIXME: hack assumes 15 fields & only job name may have spaces. When a
# line has more than 15 tokens, the first 6 and the last 8 are kept as
# they are and everything in between is joined into the job name.
# @param line [String] one row of `bjobs` output
# @return [Array<String>] the values of the row
def split_bjobs_output_line(line)
  tokens = line.strip.split
  return tokens unless tokens.count > 15

  leading  = tokens.first(6)
  trailing = tokens.last(8)
  job_name = tokens[6..-9].join(" ")

  leading + [job_name] + trailing
end
149
+
150
# Verify the output from bjobs is parsable by this object by comparing
# its header columns with the ones this object expects
# @param columns [Array<String>] column names read from the header line
# @raise [Error] if the columns differ from the expected ones
# @return [nil]
def validate_bjobs_output_columns(columns)
  expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
    SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
  return if columns == expected

  raise Error, "bjobs output in different format than expected: " \
    "#{columns.inspect} instead of #{expected.inspect}"
end
159
+
160
+ end