ood_core 0.5.1 → 0.6.0

@@ -0,0 +1,163 @@
+ require "ood_core/refinements/array_extensions"
+ require "ood_core/refinements/hash_extensions"
+
+ module OodCore
+   module Job
+     class Factory
+       using Refinements::HashExtensions
+
+       # Build the Sun Grid Engine adapter from a configuration
+       # @param config [#to_h] the configuration for job adapter
+       # @option config [Object] :cluster (nil) The cluster to communicate with
+       # @option config [Object] :conf (nil) Path to the SGE conf
+       # @option config [Object] :bin (nil) Path to SGE client binaries
+       # @option config [Object] :sge_root (nil) Path to SGE root; note that
+       #   this may be nil, but must be set to use the DRMAA API, and there is a
+       #   severe performance penalty calling Sge#info without using DRMAA.
+       # @option config [#to_h] :bin_overrides ({}) Optional overrides to SGE client executables
+       def self.build_sge(config)
+         batch = Adapters::Sge::Batch.new(config.to_h.symbolize_keys)
+         Adapters::Sge.new(batch: batch)
+       end
+     end
+
+     module Adapters
+       class Sge < Adapter
+         using Refinements::HashExtensions
+         using Refinements::ArrayExtensions
+
+         require "ood_core/job/adapters/sge/batch"
+         require "ood_core/job/adapters/sge/helper"
+
+         # The cluster of the Sun Grid Engine batch server
+         # @example UCLA's hoffman2 cluster
+         #   my_batch.cluster #=> "hoffman2"
+         # @return [String, nil] the cluster name
+         attr_reader :cluster
+
+         # The path to the Sun Grid Engine configuration file
+         # @example For Sun Grid Engine 8.0.1
+         #   my_batch.conf.to_s #=> "/u/systems/UGE8.0.1vm/h2.conf"
+         # @return [Pathname, nil] path to gridengine conf
+         attr_reader :conf
+
+         # The path to the Sun Grid Engine client installation binaries
+         # @example For Sun Grid Engine 8.0.1
+         #   my_batch.bin.to_s #=> "/u/systems/UGE8.0.1vm/bin/lx-amd64/"
+         # @return [Pathname] path to SGE binaries
+         attr_reader :bin
+
+         # The root exception class that all Sun Grid Engine-specific
+         # exceptions inherit from
+         class Error < StandardError; end
+
+         # @param batch [Adapters::Sge::Batch]
+         def initialize(batch:)
+           @batch = batch
+           @helper = Sge::Helper.new
+         end
+
+         # Submit a job with the attributes defined in the job template instance
+         # @example Submit job template to cluster
+         #   solver_id = job_adapter.submit(solver_script)
+         #   #=> "1234.server"
+         # @example Submit job that depends on previous job
+         #   post_id = job_adapter.submit(
+         #     post_script,
+         #     afterok: solver_id
+         #   )
+         #   #=> "1235.server"
+         # @param script [Script] script object that describes the
+         #   script and attributes for the submitted job
+         # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
+         #   at any point after dependent jobs have started execution
+         # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with no errors
+         # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with errors
+         # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution after dependent jobs have terminated
+         # @raise [JobAdapterError] if something goes wrong submitting a job
+         # @return [String] the job id returned after successfully submitting a job
+         def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+           # SGE supports job dependencies on job completion
+           args = @helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
+
+           @batch.submit(script.content, args)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve info for all jobs from the resource manager
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Array<Info>] information describing submitted jobs
+         def info_all
+           @batch.get_all
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve info for all jobs for a given owner or owners from the
+         # resource manager
+         # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Array<Info>] information describing submitted jobs
+         def info_where_owner(owner)
+           owner = Array.wrap(owner).map(&:to_s).join(',')
+           @batch.get_all(owner: owner)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve job info from the resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Info] information describing submitted job
+         def info(id)
+           @batch.get_info_enqueued_job(id)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve job status from resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting the status of a job
+         # @return [Status] status of job
+         def status(id)
+           info(id).status
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Put the submitted job on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong holding a job
+         # @return [void]
+         def hold(id)
+           @batch.hold(id.to_s)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Release the job that is on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong releasing a job
+         # @return [void]
+         def release(id)
+           @batch.release(id.to_s)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Delete the submitted job
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong deleting a job
+         # @return [void]
+         def delete(id)
+           @batch.delete(id.to_s)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+       end
+     end
+   end
+ end
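
The new SGE adapter above is built through OodCore::Job::Factory. A minimal sketch of wiring it up from a configuration hash (the cluster name and paths below are hypothetical, echoing the doc examples):

    # Build an SGE job adapter; :sge_root is optional, but per the factory
    # docs it must be set to use the DRMAA API, and calling Sge#info without
    # DRMAA carries a severe performance penalty.
    sge = OodCore::Job::Factory.build_sge(
      cluster:  "hoffman2",
      bin:      "/u/systems/UGE8.0.1vm/bin/lx-amd64",
      conf:     "/u/systems/UGE8.0.1vm/h2.conf",
      sge_root: "/u/systems/UGE8.0.1vm"
    )
    job_id = sge.submit(script)  # script is an OodCore::Job::Script instance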
@@ -1,5 +1,6 @@
  require "time"
  require "ood_core/refinements/hash_extensions"
+ require "ood_core/job/adapters/helper"

  module OodCore
    module Job
@@ -11,12 +12,14 @@ module OodCore
        # @option config [Object] :cluster (nil) The cluster to communicate with
        # @option config [Object] :conf (nil) Path to the slurm conf
        # @option config [Object] :bin (nil) Path to slurm client binaries
+       # @option config [#to_h] :bin_overrides ({}) Optional overrides to Slurm client executables
        def self.build_slurm(config)
          c = config.to_h.symbolize_keys
          cluster = c.fetch(:cluster, nil)
          conf = c.fetch(:conf, nil)
          bin = c.fetch(:bin, nil)
-         slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin)
+         bin_overrides = c.fetch(:bin_overrides, {})
+         slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides)
          Adapters::Slurm.new(slurm: slurm)
        end
      end
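
A minimal sketch of the new :bin_overrides option in use (the cluster name and paths below are hypothetical); any command not listed still resolves against :bin:

    # Route individual Slurm client calls to non-default executables,
    # e.g. site-local wrapper scripts.
    slurm = OodCore::Job::Factory.build_slurm(
      cluster: "my_cluster",
      bin: "/usr/bin",
      bin_overrides: {
        "sbatch" => "/usr/local/bin/sbatch",
        "squeue" => "/opt/slurm/bin/squeue"
      }
    )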
@@ -48,6 +51,12 @@ module OodCore
            # @return [Pathname] path to slurm binaries
            attr_reader :bin

+           # Optional overrides for Slurm client executables
+           # @example
+           #   {'sbatch' => '/usr/local/bin/sbatch'}
+           # @return [Hash<String, String>]
+           attr_reader :bin_overrides
+
            # The root exception class that all Slurm-specific
            # exceptions inherit from
            class Error < StandardError; end
@@ -55,10 +64,11 @@ module OodCore
            # @param cluster [#to_s, nil] the cluster name
            # @param conf [#to_s, nil] path to the slurm conf
            # @param bin [#to_s] path to slurm installation binaries
+           # @param bin_overrides [#to_h] optional overrides to Slurm client executables
-           def initialize(cluster: nil, bin: nil, conf: nil)
+           def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {})
              @cluster = cluster && cluster.to_s
              @conf = conf && Pathname.new(conf.to_s)
              @bin = Pathname.new(bin.to_s)
+             @bin_overrides = bin_overrides
            end

            # Get a list of hashes detailing each of the jobs on the batch server
@@ -140,7 +150,7 @@ module OodCore
            private
              # Call a forked Slurm command for a given cluster
              def call(cmd, *args, env: {}, stdin: "")
-               cmd = bin.join(cmd.to_s).to_s
+               cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
                args = args.map(&:to_s)
                args += ["-M", cluster] if cluster
                env = env.to_h
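
The Helper module pulled in by the new require is not itself part of this diff; judging from the call site above, a plausible sketch of its lookup is (a hypothetical reconstruction, not the released implementation):

    require "pathname"

    module OodCore::Job::Adapters::Helper
      # Resolve a client command to the path that should be executed:
      # an entry in bin_overrides wins; otherwise join the command name
      # onto the adapter's bin directory.
      def self.bin_path(cmd, bin, bin_overrides)
        bin_overrides.fetch(cmd.to_s) { Pathname.new(bin.to_s).join(cmd.to_s).to_s }
      end
    end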
@@ -150,11 +160,11 @@ module OodCore
              end

              # Fields requested from a formatted `squeue` call
+             # Note that the order of these fields is important
              def fields
                {
                  account: "%a",
                  job_id: "%A",
-                 gres: "%b",
                  exec_host: "%B",
                  min_cpus: "%c",
                  cpus: "%C",
@@ -200,7 +210,8 @@ module OodCore
                  nice: "%y",
                  scheduled_nodes: "%Y",
                  sockets_cores_threads: "%z",
-                 work_dir: "%Z"
+                 work_dir: "%Z",
+                 gres: "%b", # must come at the end to fix a bug with Slurm 18
                }
              end
          end
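
The parsing side of `fields` is outside this diff, but the "order is important" note suggests the format codes are joined into a single squeue format string and each output row is split back by position, so a field whose expansion misbehaves (as %b apparently does under Slurm 18) can only corrupt fields that come after it. A hypothetical illustration of that positional coupling:

    # If records are built by zipping fields.keys against a split row,
    # moving gres to the end confines any %b damage to the last column.
    fields = { account: "%a", job_id: "%A", gres: "%b" }  # toy subset
    format = fields.values.join("|")                      # "%a|%A|%b"
    row    = "acct123|4567|gpu:2"                         # hypothetical squeue output
    info   = fields.keys.zip(row.split("|")).to_h
    #=> {:account=>"acct123", :job_id=>"4567", :gres=>"gpu:2"}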
@@ -0,0 +1,109 @@
+ class OodCore::Job::Adapters::Torque
+   # Maintains a constant Hash of defined PBS attribute types
+   # Includes:
+   #   Attribute names used by user commands
+   #   Additional job and general attribute names
+   #   Additional queue attribute names
+   #   Additional server attribute names
+   #   Additional node attribute names
+   ATTR = {
+     # Attribute names used by user commands
+     a: :Execution_Time,
+     c: :Checkpoint,
+     e: :Error_Path,
+     f: :fault_tolerant,
+     g: :group_list,
+     h: :Hold_Types,
+     j: :Join_Path,
+     k: :Keep_Files,
+     l: :Resource_List,
+     m: :Mail_Points,
+     o: :Output_Path,
+     p: :Priority,
+     q: :destination,
+     r: :Rerunable,
+     t: :job_array_request,
+     array_id: :job_array_id,
+     u: :User_List,
+     v: :Variable_List,
+     A: :Account_Name,
+     args: :job_arguments,
+     reservation_id: :reservation_id,
+     login_node_id: :login_node_id,
+     login_prop: :login_property,
+     external_nodes: :external_nodes,
+     multi_req_alps: :multi_req_alps,
+     M: :Mail_Users,
+     N: :Job_Name,
+     S: :Shell_Path_List,
+     depend: :depend,
+     inter: :interactive,
+     stagein: :stagein,
+     stageout: :stageout,
+     jobtype: :jobtype,
+     submit_host: :submit_host,
+     init_work_dir: :init_work_dir,
+
+     # Additional job and general attribute names
+     ctime: :ctime,
+     exechost: :exec_host,
+     execport: :exec_port,
+     mtime: :mtime,
+     qtime: :qtime,
+     session: :session_id,
+     euser: :euser,
+     egroup: :egroup,
+     hashname: :hashname,
+     hopcount: :hop_count,
+     security: :security,
+     sched_hint: :sched_hint,
+     substate: :substate,
+     name: :Job_Name,
+     owner: :Job_Owner,
+     used: :resources_used,
+     state: :job_state,
+     queue: :queue,
+     server: :server,
+     maxrun: :max_running,
+     maxreport: :max_report,
+     total: :total_jobs,
+     comment: :comment,
+     cookie: :cookie,
+     qrank: :queue_rank,
+     altid: :alt_id,
+     etime: :etime,
+     exitstat: :exit_status,
+     forwardx11: :forward_x11,
+     submit_args: :submit_args,
+     tokens: :tokens,
+     netcounter: :net_counter,
+     umask: :umask,
+     start_time: :start_time,
+     start_count: :start_count,
+     checkpoint_dir: :checkpoint_dir,
+     checkpoint_name: :checkpoint_name,
+     checkpoint_time: :checkpoint_time,
+     checkpoint_restart_status: :checkpoint_restart_status,
+     restart_name: :restart_name,
+     comp_time: :comp_time,
+     reported: :reported,
+     intcmd: :inter_cmd,
+     job_radix: :job_radix,
+     sister_list: :sister_list,
+     total_runtime: :total_runtime,
+     P: :proxy_user,
+     node_exclusive: :node_exclusive,
+     exec_gpus: :exec_gpus,
+     exec_mics: :exec_mics,
+     J: :job_id,
+     pagg: :pagg_id,
+     system_start_time: :system_start_time,
+     gpu_flags: :gpu_flags,
+
+     # Additional queue attribute names
+
+     # Additional server attribute names
+
+     # Additional node attribute names
+   }
+ end
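
In practice ATTR reads as a lookup table from short qsub-style option names to canonical PBS attribute names, for example:

    attrs = OodCore::Job::Adapters::Torque::ATTR
    attrs[:N]      #=> :Job_Name       (qsub -N)
    attrs[:l]      #=> :Resource_List  (qsub -l)
    attrs[:depend] #=> :depend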