ood_core 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,163 @@
1
+ require "ood_core/refinements/array_extensions"
2
+ require "ood_core/refinements/hash_extensions"
3
+
4
+ module OodCore
5
+ module Job
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ # Build the Sun Grid Engine adapter from a configuration
10
+ # @param config [#to_h] the configuration for job adapter
11
+ # @option config [Object] :cluster (nil) The cluster to communicate with
12
+ # @option config [Object] :conf (nil) Path to the SGE conf
13
+ # @option config [Object] :bin (nil) Path to SGE client binaries
14
+ # @option config [Object] :sge_root (nil) Path to SGE root, note that
15
+ # this may be nil, but must be set to use the DRMAA API, and there is a
16
+ # severe performance penalty calling Sge#info without using DRMAA.
17
+ # @option config [#to_h] :bin_overrides ({}) Optional overrides to SGE client executables
18
+ def self.build_sge(config)
19
+ batch = Adapters::Sge::Batch.new(config.to_h.symbolize_keys)
20
+ Adapters::Sge.new(batch: batch)
21
+ end
22
+ end
23
+
24
+ module Adapters
25
+ class Sge < Adapter
26
+ using Refinements::HashExtensions
27
+ using Refinements::ArrayExtensions
28
+
29
+ require "ood_core/job/adapters/sge/batch"
30
+ require "ood_core/job/adapters/sge/helper"
31
+
32
+ # The cluster of the Sun Grid Engine batch server
33
+ # @example UCLA's hoffman2 cluster
34
+ # my_batch.cluster #=> "hoffman2"
35
+ # @return [String, nil] the cluster name
36
+ attr_reader :cluster
37
+
38
+ # The path to the Sun Grid Engine configuration file
39
+ # @example For Sun Grid Engine 8.0.1
40
+ # my_batch.conf.to_s #=> "/u/systems/UGE8.0.1vm/h2.conf"
41
+ # @return [Pathname, nil] path to gridengine conf
42
+ attr_reader :conf
43
+
44
+ # The path to the Sun Grid Engine client installation binaries
45
+ # @example For Sun Grid Engine 8.0.1
46
+ # my_batch.bin.to_s #=> "/u/systems/UGE8.0.1vm/bin/lx-amd64/"
47
+ # @return [Pathname] path to SGE binaries
48
+ attr_reader :bin
49
+
50
+ # The root exception class that all Sun Grid Engine-specific exceptions inherit
51
+ # from
52
+ class Error < StandardError; end
53
+
54
+ # @param batch [Adapters::Sge::Batch]
55
+ def initialize(batch:)
56
+ @batch = batch
57
+ @helper = Sge::Helper.new
58
+ end
59
+
60
+ # Submit a job with the attributes defined in the job template instance
61
+ # @example Submit job template to cluster
62
+ # solver_id = job_adapter.submit(solver_script)
63
+ # #=> "1234.server"
64
+ # @example Submit job that depends on previous job
65
+ # post_id = job_adapter.submit(
66
+ # post_script,
67
+ # afterok: solver_id
68
+ # )
69
+ # #=> "1235.server"
70
+ # @param script [Script] script object that describes the
71
+ # script and attributes for the submitted job
72
+ # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
73
+ # at any point after dependent jobs have started execution
74
+ # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
75
+ # execution only after dependent jobs have terminated with no errors
76
+ # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
77
+ # execution only after dependent jobs have terminated with errors
78
+ # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
79
+ # execution after dependent jobs have terminated
80
+ # @raise [JobAdapterError] if something goes wrong submitting a job
81
+ # @return [String] the job id returned after successfully submitting a job
82
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
83
+ # SGE supports job dependencies on job completion
84
+ args = @helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
85
+
86
+ @batch.submit(script.content, args)
87
+ rescue Batch::Error => e
88
+ raise JobAdapterError, e.message
89
+ end
90
+
91
+ # Retrieve info for all jobs from the resource manager
92
+ # @return [Array<Info>] information describing submitted jobs
93
+ def info_all
94
+ @batch.get_all
95
+ rescue Batch::Error => e
96
+ raise JobAdapterError, e.message
97
+ end
98
+
99
+ # Retrieve info for all jobs for a given owner or owners from the
100
+ # resource manager
101
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
102
+ # @raise [JobAdapterError] if something goes wrong getting job info
103
+ # @return [Array<Info>] information describing submitted jobs
104
+ def info_where_owner(owner)
105
+ owner = Array.wrap(owner).map(&:to_s).join(',')
106
+ @batch.get_all(owner: owner)
107
+ rescue Batch::Error => e
108
+ raise JobAdapterError, e.message
109
+ end
110
+
111
+ # Retrieve job info from the resource manager
112
+ # @param id [#to_s] the id of the job
113
+ # @raise [JobAdapterError] if something goes wrong getting job info
114
+ # @return [Info] information describing submitted job
115
+ def info(id)
116
+ @batch.get_info_enqueued_job(id)
117
+ rescue Batch::Error => e
118
+ raise JobAdapterError, e.message
119
+ end
120
+
121
+ # Retrieve job status from resource manager
122
+ # @param id [#to_s] the id of the job
123
+ # @raise [JobAdapterError] if something goes wrong getting the status of a job
124
+ # @return [Status] status of job
125
+ def status(id)
126
+ info(id).status
127
+ rescue Batch::Error => e
128
+ raise JobAdapterError, e.message
129
+ end
130
+
131
+ # Put the submitted job on hold
132
+ # @param id [#to_s] the id of the job
133
+ # @raise [JobAdapterError] if something goes wrong holding a job
134
+ # @return [void]
135
+ def hold(id)
136
+ @batch.hold(id.to_s)
137
+ rescue Batch::Error => e
138
+ raise JobAdapterError, e.message
139
+ end
140
+
141
+ # Release the job that is on hold
142
+ # @param id [#to_s] the id of the job
143
+ # @raise [JobAdapterError] if something goes wrong releasing a job
144
+ # @return [void]
145
+ def release(id)
146
+ @batch.release(id.to_s)
147
+ rescue Batch::Error => e
148
+ raise JobAdapterError, e.message
149
+ end
150
+
151
+ # Delete the submitted job
152
+ # @param id [#to_s] the id of the job
153
+ # @raise [JobAdapterError] if something goes wrong deleting a job
154
+ # @return [void]
155
+ def delete(id)
156
+ @batch.delete(id.to_s)
157
+ rescue Batch::Error => e
158
+ raise JobAdapterError, e.message
159
+ end
160
+ end
161
+ end
162
+ end
163
+ end
@@ -1,5 +1,6 @@
1
1
  require "time"
2
2
  require "ood_core/refinements/hash_extensions"
3
+ require "ood_core/job/adapters/helper"
3
4
 
4
5
  module OodCore
5
6
  module Job
@@ -11,12 +12,14 @@ module OodCore
11
12
  # @option config [Object] :cluster (nil) The cluster to communicate with
12
13
  # @option config [Object] :conf (nil) Path to the slurm conf
13
14
  # @option config [Object] :bin (nil) Path to slurm client binaries
15
+ # @option config [#to_h] :bin_overrides ({}) Optional overrides to Slurm client executables
14
16
  def self.build_slurm(config)
15
17
  c = config.to_h.symbolize_keys
16
18
  cluster = c.fetch(:cluster, nil)
17
19
  conf = c.fetch(:conf, nil)
18
20
  bin = c.fetch(:bin, nil)
19
- slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin)
21
+ bin_overrides = c.fetch(:bin_overrides, {})
22
+ slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides)
20
23
  Adapters::Slurm.new(slurm: slurm)
21
24
  end
22
25
  end
@@ -48,6 +51,12 @@ module OodCore
48
51
  # @return [Pathname] path to slurm binaries
49
52
  attr_reader :bin
50
53
 
54
+ # Optional overrides for Slurm client executables
55
+ # @example
56
+ # {'sbatch' => '/usr/local/bin/sbatch'}
57
+ # @return [Hash<String, String>]
58
+ attr_reader :bin_overrides
59
+
51
60
  # The root exception class that all Slurm-specific exceptions inherit
52
61
  # from
53
62
  class Error < StandardError; end
@@ -55,10 +64,11 @@ module OodCore
55
64
  # @param cluster [#to_s, nil] the cluster name
56
65
  # @param conf [#to_s, nil] path to the slurm conf
57
66
  # @param bin [#to_s] path to slurm installation binaries
58
- def initialize(cluster: nil, bin: nil, conf: nil)
67
+ def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {})
59
68
  @cluster = cluster && cluster.to_s
60
69
  @conf = conf && Pathname.new(conf.to_s)
61
70
  @bin = Pathname.new(bin.to_s)
71
+ @bin_overrides = bin_overrides
62
72
  end
63
73
 
64
74
  # Get a list of hashes detailing each of the jobs on the batch server
@@ -140,7 +150,7 @@ module OodCore
140
150
  private
141
151
  # Call a forked Slurm command for a given cluster
142
152
  def call(cmd, *args, env: {}, stdin: "")
143
- cmd = bin.join(cmd.to_s).to_s
153
+ cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
144
154
  args = args.map(&:to_s)
145
155
  args += ["-M", cluster] if cluster
146
156
  env = env.to_h
@@ -150,11 +160,11 @@ module OodCore
150
160
  end
151
161
 
152
162
  # Fields requested from a formatted `squeue` call
163
+ # Note that the order of these fields is important
153
164
  def fields
154
165
  {
155
166
  account: "%a",
156
167
  job_id: "%A",
157
- gres: "%b",
158
168
  exec_host: "%B",
159
169
  min_cpus: "%c",
160
170
  cpus: "%C",
@@ -200,7 +210,8 @@ module OodCore
200
210
  nice: "%y",
201
211
  scheduled_nodes: "%Y",
202
212
  sockets_cores_threads: "%z",
203
- work_dir: "%Z"
213
+ work_dir: "%Z",
214
+ gres: "%b", # must come at the end to fix a bug with Slurm 18
204
215
  }
205
216
  end
206
217
  end
@@ -0,0 +1,109 @@
1
+ class OodCore::Job::Adapters::Torque
2
+ # Maintains a constant Hash of defined PBS attribute types
3
+ # Includes:
4
+ # Attribute names used by user commands
5
+ # Additional job and general attribute names
6
+ # Additional queue attribute names
7
+ # Additional server attribute names
8
+ # Additional node attribute names
9
+ ATTR = {
10
+ # Attribute names used by user commands
11
+ a: :Execution_Time,
12
+ c: :Checkpoint,
13
+ e: :Error_Path,
14
+ f: :fault_tolerant,
15
+ g: :group_list,
16
+ h: :Hold_Types,
17
+ j: :Join_Path,
18
+ k: :Keep_Files,
19
+ l: :Resource_List,
20
+ m: :Mail_Points,
21
+ o: :Output_Path,
22
+ p: :Priority,
23
+ q: :destination,
24
+ r: :Rerunable,
25
+ t: :job_array_request,
26
+ array_id: :job_array_id,
27
+ u: :User_List,
28
+ v: :Variable_List,
29
+ A: :Account_Name,
30
+ args: :job_arguments,
31
+ reservation_id: :reservation_id,
32
+ login_node_id: :login_node_id,
33
+ login_prop: :login_property,
34
+ external_nodes: :external_nodes,
35
+ multi_req_alps: :multi_req_alps,
36
+ M: :Mail_Users,
37
+ N: :Job_Name,
38
+ S: :Shell_Path_List,
39
+ depend: :depend,
40
+ inter: :interactive,
41
+ stagein: :stagein,
42
+ stageout: :stageout,
43
+ jobtype: :jobtype,
44
+ submit_host: :submit_host,
45
+ init_work_dir: :init_work_dir,
46
+
47
+ # Additional job and general attribute names
48
+ ctime: :ctime,
49
+ exechost: :exec_host,
50
+ execport: :exec_port,
51
+ mtime: :mtime,
52
+ qtime: :qtime,
53
+ session: :session_id,
54
+ euser: :euser,
55
+ egroup: :egroup,
56
+ hashname: :hashname,
57
+ hopcount: :hop_count,
58
+ security: :security,
59
+ sched_hint: :sched_hint,
60
+ substate: :substate,
61
+ name: :Job_Name,
62
+ owner: :Job_Owner,
63
+ used: :resources_used,
64
+ state: :job_state,
65
+ queue: :queue,
66
+ server: :server,
67
+ maxrun: :max_running,
68
+ maxreport: :max_report,
69
+ total: :total_jobs,
70
+ comment: :comment,
71
+ cookie: :cookie,
72
+ qrank: :queue_rank,
73
+ altid: :alt_id,
74
+ etime: :etime,
75
+ exitstat: :exit_status,
76
+ forwardx11: :forward_x11,
77
+ submit_args: :submit_args,
78
+ tokens: :tokens,
79
+ netcounter: :net_counter,
80
+ umask: :umask,
81
+ start_time: :start_time,
82
+ start_count: :start_count,
83
+ checkpoint_dir: :checkpoint_dir,
84
+ checkpoint_name: :checkpoint_name,
85
+ checkpoint_time: :checkpoint_time,
86
+ checkpoint_restart_status: :checkpoint_restart_status,
87
+ restart_name: :restart_name,
88
+ comp_time: :comp_time,
89
+ reported: :reported,
90
+ intcmd: :inter_cmd,
91
+ job_radix: :job_radix,
92
+ sister_list: :sister_list,
93
+ total_runtime: :total_runtime,
94
+ P: :proxy_user,
95
+ node_exclusive: :node_exclusive,
96
+ exec_gpus: :exec_gpus,
97
+ exec_mics: :exec_mics,
98
+ J: :job_id,
99
+ pagg: :pagg_id,
100
+ system_start_time: :system_start_time,
101
+ gpu_flags: :gpu_flags,
102
+
103
+ # Additional queue attribute names
104
+
105
+ # Additional server attribute names
106
+
107
+ # Additional node attribute names
108
+ }
109
+ end