ood_core 0.12.0 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb944d43beb0aced99e13efb2ef10bf33f9666c705c50ca5ae1727751de43073
4
- data.tar.gz: 6e3cd66160be3bbd63124d6f2ddc794bce4ae64e385977828faba5dfd28ff838
3
+ metadata.gz: 3296708d7bc47f3379a9e4a6c845d3f25c5ccefb599f4b92406d9dffdaef220b
4
+ data.tar.gz: b6af9e90b67bc9a7a52203808d849d8800336b30b09bdb8ed204526d01bc92e9
5
5
  SHA512:
6
- metadata.gz: 176e331a856c1e6958c444426d5c1b41aa881e90a69dca507b07f5463eb81355689e8391e0bf27823fc42a9484789f623ffd566b9d6c414c9cf741a7cafd1def
7
- data.tar.gz: 15481101ad3120d3e8457612f2b8a8be4f1e268b38538b18b710f27887836f7a47eac3bf2e89d8f73745ae96ae78e21cd5ba5afefb4161cf95f435d6f2fdf001
6
+ metadata.gz: 623ac6e6f8081d68a3e925d1150c9f20a0f613ccfb6837519d1b95d04533a72caa403c54327aad85dcea9c0694cc23941f40307d942623c095f53fed7fc32026
7
+ data.tar.gz: 0d785a9ade36b2f6f62f9ae55672091346aa4fb76bf358e6c00d4bc007623b8d1798813474665fc7b4d850d89e041fae5c2fefc9719fbe9f53a161a76127eaad
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
6
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
7
 
8
8
  ## [Unreleased]
9
+ ## [0.13.0] - 2020-08-10
10
+ ### Added
11
+ - CloudyCluster CCQ Adapter
12
+
9
13
  ## [0.12.0] - 2020-08-05
10
14
  ### Added
11
15
  - qos option to Slurm and Torque [#205](https://github.com/OSC/ood_core/pull/205)
@@ -243,7 +247,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
243
247
  ### Added
244
248
  - Initial release!
245
249
 
246
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.12.0...HEAD
250
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.13.0...HEAD
251
+ [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
247
252
  [0.12.0]: https://github.com/OSC/ood_core/compare/v0.11.4...v0.12.0
248
253
  [0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
249
254
  [0.11.3]: https://github.com/OSC/ood_core/compare/v0.11.2...v0.11.3
@@ -0,0 +1,267 @@
1
+ require "ood_core/job/adapters/helper"
2
+ require "tempfile"
3
+
4
+ module OodCore
5
+ module Job
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ # Build the Cloudy Cluster adapter from a configuration
10
+ # @param config [#to_h] the configuration for job adapter
11
+ # @option config [Object] :image (nil) The default VM image to use
12
+ # @option config [Object] :cloud (gcp) The cloud provider being used [gcp,aws]
13
+ # @option config [Object] :scheduler (nil) The name of the scheduler to use
14
+ # @option config [Object] :sge_root (nil) Path to SGE root (note that this adapter's initializer does not read this option)
15
+ # @option config [#to_h] :bin (nil) Path to CC client binaries
16
+ # @option config [#to_h] :bin_overrides ({}) Optional overrides to CC client executables
17
+ def self.build_ccq(config)
18
+ Adapters::CCQ.new(config.to_h.symbolize_keys)
19
+ end
20
+ end
21
+
22
+ module Adapters
23
+
24
+ class PromptError < StandardError; end
25
+
26
+ class CCQ < Adapter
27
+ using Refinements::ArrayExtensions
28
+
29
+ attr_reader :image, :cloud, :scheduler, :bin, :bin_overrides, :jobid_regex
30
+
31
+ def initialize(config)
32
+ @image = config.fetch(:image, nil)
33
+ @cloud = config.fetch(:cloud, gcp_provider)
34
+ @scheduler = config.fetch(:scheduler, nil)
35
+ @bin = config.fetch(:bin, '/opt/CloudyCluster/srv/CCQ')
36
+ @bin_overrides = config.fetch(:bin_overrides, {})
37
+ @jobid_regex = config.fetch(:jobid_regex, "job id is: (?<job_id>\\d+) you")
38
+ end
39
+
40
+ # Submit a job with the attributes defined in the job template instance
41
+ # @param script [Script] script object that describes the script and
42
+ # attributes for the submitted job
43
+ # @param after [#to_s, Array<#to_s>] not used
44
+ # @param afterok [#to_s, Array<#to_s>] not used
45
+ # @param afternotok [#to_s, Array<#to_s>] not used
46
+ # @param afterany [#to_s, Array<#to_s>] not used
47
+ # @return [String] the job id returned after successfully submitting a
48
+ # job
49
+ # @see Adapter#submit
50
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
51
+ script_file = make_script_file(script.content)
52
+ args = []
53
+
54
+ # cluster configuration args
55
+ args.concat ["-s", scheduler] unless scheduler.nil?
56
+ args.concat [image_arg, image] unless image.nil?
57
+
58
+ args.concat ["-o", script.output_path.to_s] unless script.output_path.nil?
59
+ args.concat ["-e", script.error_path.to_s] unless script.error_path.nil?
60
+ args.concat ["-tl", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
61
+ args.concat ["-js", script_file.path.to_s]
62
+
63
+ args.concat script.native if script.native
64
+
65
+ output = call("ccqsub", args: args)
66
+ parse_job_id_from_ccqsub(output)
67
+ ensure
68
+ script_file.close
69
+ end
70
+
71
+ # Retrieve info for all jobs from the resource manager
72
+ # @return [Array<Info>] information describing submitted jobs
73
+ def info_all(attrs: nil)
74
+ args = []
75
+ args.concat ["-s", scheduler] unless scheduler.nil?
76
+
77
+ stat_output = call("ccqstat", args: args)
78
+ info_from_ccqstat(stat_output)
79
+ end
80
+
81
+ # Retrieve job info from the resource manager
82
+ # @param id [#to_s] the id of the job
83
+ # @return [Info] information describing submitted job
84
+ def info(id)
85
+ args = []
86
+ args.concat ["-s", scheduler] unless scheduler.nil?
87
+ args.concat ["-ji", id]
88
+
89
+ stat_output = call("ccqstat", args: args)
90
+
91
+ # WARNING: the code path here differs from info_all because the output
92
+ # from ccqstat -ji $JOBID is much more data than just the 4
93
+ # columns that ccqstat gives.
94
+ info_from_ccqstat_extended(stat_output)
95
+ end
96
+
97
+ # Retrieve job status from resource manager
98
+ # @param id [#to_s] the id of the job
99
+ # @return [Status] status of job
100
+ # @see Adapter#status
101
+ def status(id)
102
+ info(id).status
103
+ end
104
+
105
+ # This adapter does not implement hold and will always raise
106
+ # an exception.
107
+ # @param id [#to_s] the id of the job
108
+ # @raise [NotImplementedError] always
109
+ # @return [void]
110
+ def hold(_)
111
+ raise NotImplementedError, "subclass did not define #hold"
112
+ end
113
+
114
+ # This adapter does not implement release and will always raise
115
+ # an exception.
116
+ # @param id [#to_s] the id of the job
117
+ # @raise [NotImplementedError] always
118
+ # @return [void]
119
+ def release(_)
120
+ raise NotImplementedError, "subclass did not define #release"
121
+ end
122
+
123
+ # Delete the submitted job
124
+ # @param id [#to_s] the id of the job
125
+ # @return [void]
126
+ def delete(id)
127
+ call("ccqdel", args: [id])
128
+ end
129
+
130
+ def directive_prefix
131
+ '#CC'
132
+ end
133
+
134
+ private
135
+
136
+ # Mapping of state codes
137
+ STATE_MAP =
138
+ {
139
+ 'Error' => :suspended, # not running, but infrastructure still possibly exists
140
+ 'CreatingCG' => :queued, # creating control group
141
+ 'Pending' => :queued, # in queue
142
+ 'Submitted' => :queued, #
143
+ 'Provisioning' => :queued, # node is being provisioned
144
+ 'Running' => :running, #
145
+ 'Completed' => :completed, #
146
+ }.freeze
147
+
148
+ def gcp_provider
149
+ 'gcp'
150
+ end
151
+
152
+ def aws_provider
153
+ 'aws'
154
+ end
155
+
156
+ def image_arg
157
+ if cloud == gcp_provider
158
+ '-gcpgi'
159
+ else
160
+ '-awsami'
161
+ end
162
+ end
163
+
164
+ def call(cmd, args: [], env: {}, stdin: "")
165
+ cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
166
+ args = args.map(&:to_s)
167
+ env = env.to_h
168
+ o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
169
+ s.success? ? o : interpret_and_raise(e, cmd)
170
+ end
171
+
172
+ # helper function to interpret an error the command had given and
173
+ # raise a different error.
174
+ def interpret_and_raise(error, command)
175
+ # a special case with CCQ that prompts the user for username & password
176
+ # so let's be helpful and tell the user what to do.
177
+ if error.end_with?("EOFError: EOF when reading a line\n")
178
+ raise(
179
+ PromptError,
180
+ "The #{command} command was prompted. You need to generate the certificate " +
181
+ "manually in a shell by running 'ccqstat'\nand entering your username/password"
182
+ )
183
+ else
184
+ raise(JobAdapterError, e.message)
185
+ end
186
+ end
187
+
188
+ # Convert seconds to duration
189
+ def seconds_to_duration(seconds)
190
+ format("%02d:%02d:%02d", seconds / 3600, seconds / 60 % 60, seconds % 60)
191
+ end
192
+
193
+ # helper to make a script file. We can't pipe it into ccq so we have to
194
+ # write a file.
195
+ def make_script_file(content)
196
+ file = Tempfile.new(tmp_file_name)
197
+ file.write(content.to_s)
198
+ file.flush
199
+ file
200
+ end
201
+
202
+ def tmp_file_name
203
+ 'ccq_ood_script_'
204
+ end
205
+
206
+ def parse_job_id_from_ccqsub(output)
207
+ match_data = /#{jobid_regex}/.match(output)
208
+ # match_data could be nil, OR re-configured jobid_regex could be looking for a different named group
209
+ job_id = match_data&.named_captures&.fetch('job_id', nil)
210
+ throw JobAdapterError.new "Could not extract job id out of ccqsub output '#{output}'" if job_id.nil?
211
+ job_id
212
+ end
213
+
214
+ # parse an OodCore::Job::Info object from extended ccqstat output
215
+ def info_from_ccqstat_extended(data)
216
+ raw = extended_data_to_hash(data)
217
+ data_hash = { native: raw }
218
+ data_hash[:status] = get_state(raw['status'])
219
+ data_hash[:id] = raw['name']
220
+ data_hash[:job_name] = raw['jobName']
221
+ data_hash[:job_owner] = raw['userName']
222
+ data_hash[:submit_host] = raw['submitHostInstanceId']
223
+ data_hash[:dispatch_time] = raw['startTime'].to_i
224
+ data_hash[:submission_time] = raw['dateSubmitted'].to_i
225
+ data_hash[:queue_name] = raw['criteriaPriority']
226
+
227
+ Info.new(data_hash)
228
+ end
229
+
230
+ # extended data is just lines of 'key: value' pairs, so parse
231
+ # it and stick it all in a hash.
232
+ def extended_data_to_hash(data)
233
+ Hash[data.to_s.scan(/(\w+): (\S+)/)]
234
+ end
235
+
236
+ def info_from_ccqstat(data)
237
+ infos = []
238
+
239
+ data.to_s.each_line do |line|
240
+ words = line.split(/\s/).reject(&:empty?)
241
+ next if !words.empty? && words[0] == "Id" # just skip the header
242
+
243
+ infos << Info.new(line_to_hash(words)) if words.size == 5
244
+ end
245
+
246
+ infos
247
+ end
248
+
249
+ def line_to_hash(words)
250
+ return unless words.size == 5
251
+
252
+ data_hash = {}
253
+ data_hash[:id] = words[0]
254
+ data_hash[:job_name] = words[1]
255
+ data_hash[:job_owner] = words[2]
256
+ data_hash[:status] = get_state(words[4])
257
+
258
+ data_hash
259
+ end
260
+
261
+ def get_state(state)
262
+ STATE_MAP.fetch(state, :undetermined)
263
+ end
264
+ end
265
+ end
266
+ end
267
+ end
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.12.0"
3
+ VERSION = "0.13.0"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2020-08-05 00:00:00.000000000 Z
13
+ date: 2020-08-10 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -163,6 +163,7 @@ files:
163
163
  - lib/ood_core/errors.rb
164
164
  - lib/ood_core/invalid_cluster.rb
165
165
  - lib/ood_core/job/adapter.rb
166
+ - lib/ood_core/job/adapters/ccq.rb
166
167
  - lib/ood_core/job/adapters/drmaa.rb
167
168
  - lib/ood_core/job/adapters/helper.rb
168
169
  - lib/ood_core/job/adapters/linux_host.rb