ruby-cute 0.12 → 0.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitlab-ci.yml +55 -0
- data/README.md +12 -1
- data/bin/cute +2 -1
- data/bin/grd +701 -0
- data/debian/.gitattributes +3 -0
- data/debian/changelog +108 -0
- data/debian/control +22 -5
- data/debian/ruby-cute.docs +1 -1
- data/debian/rules +3 -11
- data/debian/watch +2 -2
- data/examples/distem-bootstrap +16 -22
- data/examples/g5k-tutorial.md +25 -18
- data/examples/g5k_exp_virt.rb +1 -1
- data/lib/cute/bash.rb +7 -7
- data/lib/cute/configparser.rb +14 -12
- data/lib/cute/execute.rb +6 -4
- data/lib/cute/extensions.rb +3 -4
- data/lib/cute/g5k_api.rb +62 -38
- data/lib/cute/net-ssh-exec3.rb +10 -7
- data/lib/cute/net-ssh.rb +2 -2
- data/lib/cute/net.rb +5 -7
- data/lib/cute/synchronization.rb +1 -3
- data/lib/cute/taktuk.rb +4 -5
- data/lib/cute/version.rb +1 -1
- data/ruby-cute.gemspec +6 -3
- data/spec/g5k_api_check_spec.rb +1 -1
- data/spec/g5k_api_spec.rb +14 -22
- data/spec/spec_helper.rb +10 -6
- data/test/test_execute.rb +0 -0
- metadata +55 -11
- data/debian/compat +0 -1
data/bin/grd
ADDED
@@ -0,0 +1,701 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# For help, see 'grd -h'
|
3
|
+
#
|
4
|
+
#
|
5
|
+
# Grid5000 bugs that would help improve grd:
|
6
|
+
# Bug 11547 - l'API OAR génère des fichiers temporaires oarapi.subscript.* dans les homes
|
7
|
+
# Bug 13929 - Stocker la sortie des admissions rules dans un événement
|
8
|
+
# Bug 13867 - l'API de kadeploy donne les logs de manière non idempotente
|
9
|
+
# Random issues with the API that cause error 500:
|
10
|
+
# - Bug 13928 - erreur 500 sur l'API lors d'un DELETE
|
11
|
+
# - Bug 13742 - erreur 500 sur l'API pour créer un job
|
12
|
+
# - Bug 13766 - erreur 500 api
|
13
|
+
# - Bug 11697 - api-proxy/g5k-api: erreur "AH01102: error reading status line from remote server localhost:8000"
|
14
|
+
#
|
15
|
+
# Test cases:
|
16
|
+
# grd bs -s ncy
|
17
|
+
# grd bs -s ncy -c
|
18
|
+
# grd bs -s ncy -q production -l {grappe}/nodes=1+{"type='kavlan'"}/vlan=1 -w 0:10 --armor
|
19
|
+
# grd bs -s ncy -q production -l {grappe}/nodes=1+{"type='kavlan'"}/vlan=1 -w 0:10 --armor -d
|
20
|
+
# echo 'hostname ; pwd' > setup-script
|
21
|
+
# grd bs -s ncy -f setup-script -T
|
22
|
+
# grd l
|
23
|
+
|
24
|
+
gem 'ruby-cute', '>= 0.12'
|
25
|
+
$:.unshift File.expand_path("../../lib", File.realpath(__FILE__))
|
26
|
+
|
27
|
+
require 'cute'
|
28
|
+
require 'optparse'
|
29
|
+
require 'pp'
|
30
|
+
require 'net/ssh/gateway'
|
31
|
+
require 'net/ssh'
|
32
|
+
require 'net/scp'
|
33
|
+
require 'peach'
|
34
|
+
|
35
|
+
# Initialise the global state shared by every grd sub-command.
#
# Globals set here:
#   $logger   - Logger writing to STDOUT (DEBUG when $debug or the GRD_DEBUG
#               environment variable is set, INFO otherwise)
#   $login    - user name, read from ~/.grid5000_api.yml, or from `whoami`
#               when no such file exists and we run inside Grid'5000
#   $password - password read from ~/.grid5000_api.yml (only set when the file exists)
#   $g5k      - Cute::G5K::API client, sharing $logger
#
# Prints an error and exits with status 1 when there is no configuration file
# and the host is outside Grid'5000.
def cute_init
  $logger = Logger.new(STDOUT)
  STDOUT.sync = true
  STDERR.sync = true
  conf = ENV['HOME'] + '/.grid5000_api.yml'
  if File.exist?(conf)
    # Best effort: an unreadable or malformed file yields empty credentials here.
    yconf = begin
      YAML.load(IO.read(conf))
    rescue StandardError
      {}
    end
    # An empty file (or one that is not a mapping) loads as false/nil/scalar;
    # indexing it would raise, so fall back to an empty hash.
    yconf = {} unless yconf.is_a?(Hash)
    $login = yconf['username']
    $password = yconf['password']
    $g5k = Cute::G5K::API.new(:conf_file => conf, :debug => true)
  elsif g5k_internal?
    $login = `whoami`.chomp
    $g5k = Cute::G5K::API.new(:debug => true)
  else
    puts "ERROR: no .grid5000_api.yml found, and outside Grid'5000. Need API credentials."
    exit(1)
  end
  $g5k.rest.user_agent += " grd"
  $debug = true if ENV['GRD_DEBUG']
  $logger.level = $debug ? Logger::DEBUG : Logger::INFO
  # Compact format: first letter of the severity, timestamp, message.
  $logger.formatter = proc { |severity, datetime, _progname, msg| "%s, [%s] %s\n" % [severity[0..0], datetime.strftime("%Y-%m-%d %H:%M:%S"), msg ] }
  $g5k.logger = $logger
end
|
62
|
+
|
63
|
+
# Run the block with $logger raised to WARN, then put it back to INFO.
# Does not touch the logger when $debug is set. The level is restored even
# when the block raises (e.g. Interrupt), so later INFO messages are not lost.
def with_quiet_logger
  return yield if $debug

  $logger.level = Logger::WARN
  begin
    yield
  ensure
    $logger.level = Logger::INFO
  end
end

# Provision (deploy) the environment o[:env] on the resources of +job+.
#
# site - site name, only used to build the live-log URL
# job  - job hash as returned by the API client
# o    - options hash; nothing is done unless o[:env] is set
#
# The public keys found in ~/.ssh/*.pub are passed to the deployment, and the
# job's VLAN (if any) is passed as :vlan_id. If the deployment reports an error,
# or any node ends in a state other than 'OK', the job is released and the
# process exits with status 1.
def do_provisioning(site, job, o)
  return unless o[:env]

  keys = Dir.glob(ENV['HOME'] + '/.ssh/*.pub').map { |f| IO.read(f) }.join("\n")
  started = Time.now
  d = { :env => o[:env], :keys => keys }
  vlan = get_job_vlan(job)
  d[:vlan_id] = vlan if vlan

  job = with_quiet_logger { $g5k.deploy(job, d) }
  duid = job['deploy'].last['uid']
  $logger.info "Started provisioning of environment #{o[:env]}. Live log: https://api.grid5000.fr/3.0/sites/#{site}/internal/kadeployapi/deployment/#{duid}/logs"
  job = with_quiet_logger { $g5k.wait_for_deploy(job) }

  deploy = job['deploy'].last
  # 'result' is iterated as [key, value] pairs; each value carries a 'state'.
  failed = deploy['status'] == 'error' || deploy['result'].to_a.any? { |e| e[1]['state'] != 'OK' }
  if failed
    $logger.error "Provisioning failed. Terminating resources reservation and exiting."
    $g5k.release(job)
    exit(1)
  end
  $logger.info "Provisioning completed after #{(Time.now - started).to_i}s"
end
|
92
|
+
|
93
|
+
# Return the first entry of job['resources_by_type']['vlans'], or nil when the
# job has no 'resources_by_type', no 'vlans' entry, or an empty 'vlans' list.
def get_job_vlan(job)
  vlans = (job['resources_by_type'] || {})['vlans']
  return nil if vlans.nil? || vlans.empty?

  vlans.first
end
|
100
|
+
|
101
|
+
# Build the name of +node+ inside KaVLAN +vlan+ by appending "-kavlan-<vlan>"
# to the first label of the host name.
#
#   nodename_in_vlan('gros-1.nancy.grid5000.fr', 4)
#   #=> 'gros-1-kavlan-4.nancy.grid5000.fr'
#
# Only the first label is rewritten; the rest of the name is kept untouched,
# so short (unqualified) names are accepted as well.
def nodename_in_vlan(node, vlan)
  host, domain = node.split('.', 2)
  vlan_name = "#{host}-kavlan-#{vlan}"
  domain.nil? || domain.empty? ? vlan_name : "#{vlan_name}.#{domain}"
end
|
109
|
+
|
110
|
+
# Log the nodes assigned to +job+, its VLAN when it has one, and the folded
# nodeset when more than one node is assigned.
def show_nodes(job)
  assigned = job['assigned_nodes']
  $logger.info "Nodes: #{assigned.join(' ')}"

  vlan = get_job_vlan(job)
  $logger.info "VLAN: #{vlan}" if vlan

  return unless assigned.length > 1

  $logger.info "Nodeset: #{nodeset(assigned)}"
end
|
120
|
+
|
121
|
+
# Process-wide memo for values fetched from the API.
$cache = {}

# Return the site identifiers reported by $g5k.site_uids.
# The list is fetched once, frozen, and served from $cache afterwards.
def get_api_sites
  $cache['sites'] ||= $g5k.site_uids.freeze
end
|
129
|
+
|
130
|
+
# Resolve a (possibly abbreviated) site name to one of the names returned by
# get_api_sites.
#
# 'ncy' and 'ny' are aliases for 'nancy', and 'ns' for 'nantes'. Otherwise +s+
# must be an exact site name or an unambiguous prefix of one. An exact name
# always wins, even if it is also a prefix of another site.
#
# Raises RuntimeError when no site matches, or when the prefix matches several
# sites (the message lists the candidates).
def resolve_site(s)
  s = 'nancy' if s == 'ncy' || s == 'ny'
  s = 'nantes' if s == 'ns'
  sites = get_api_sites
  return s if sites.include?(s)

  candidates = sites.select { |e| e.start_with?(s) }.sort
  raise "Invalid site: #{s}" if candidates.empty?
  raise "Invalid site: #{s} (matches: #{candidates.join(' ')})" if candidates.length > 1

  candidates.first
end
|
143
|
+
|
144
|
+
# Fold a list of node names into a compact nodeset by piping the
# comma-separated names to the external `nodeset -f` command.
#
# nodes - Array of node names (nil or empty yields '')
#
# The command is spawned directly, without a shell, so node names are never
# interpreted as shell syntax. When the `nodeset` executable is not installed,
# the plain comma-separated list is returned instead.
def nodeset(nodes)
  return '' if nodes.nil? || nodes.empty?

  list = nodes.join(',')
  IO.popen(['nodeset', '-f'], 'r+') do |io|
    io.puts list
    io.close_write
    io.read.chomp
  end
rescue Errno::ENOENT
  list
end
|
147
|
+
|
148
|
+
# True when the fully qualified name reported by `hostname --fqdn` ends in
# ".grid5000.fr", false otherwise.
def g5k_internal?
  fqdn = `hostname --fqdn`.chomp
  !(fqdn =~ /\.grid5000\.fr$/).nil?
end
|
151
|
+
|
152
|
+
# Command dispatcher. The first command-line argument selects the sub-command:
#   bootstrap / bs : reserve resources, optionally provision an environment,
#                    run a script on the first node and/or connect to it
#   inner          : run inside a reservation as its command (see the --detach
#                    handling in bootstrap) to provision and run the script
#   list / l       : list reservations
#   terminate / t  : terminate reservations
# Anything else prints the usage and exits with status 1.
if ['bootstrap', 'bs'].include?(ARGV[0])
  ARGV.shift
  o = {}
  o[:type] = []
  OptionParser.new do |opts|
    opts.banner = "usage: grd bootstrap [options]"
    opts.separator " Reserve, provision, configure and connect to resources (alias: bs)"
    opts.separator ""
    opts.separator "# Options:"
    opts.separator "## (A) Select which resources to reserve, and for how long:"
    opts.on("-s", "--site SITE", "Site on which resources will be requested") do |d|
      o[:site] = d
    end
    opts.on("-l", "--resources DESCRIPTION", "description of the requested resources (using the syntax for oarsub -l). default: nodes=1") do |d|
      o[:resources] = d
    end
    opts.on("-w", "--walltime DURATION", "maximum duration of the reservation (using the syntax for oarsub -l)") do |d|
      o[:walltime] = d
    end
    opts.separator ""
    opts.separator "## (B) Choose when to reserve resources (default: ASAP, and wait interactively):"
    opts.on("-d", "--detach", "Reserve resources ASAP, but do not wait interactively (batch/submission mode)") do
      o[:detach] = true
    end
    opts.on("-r", "--reservation DATE", "Resources at a specified date and time. Do not wait for the reservation to start.") do |d|
      o[:reservation] = d
      o[:detach] = true
    end
    opts.separator ""
    opts.separator "## (C) Set the environment (system image) to provision. If none specified, the 'standard' environment will be used"
    opts.on("-e", "--environment ENV", "Kadeploy environment to provision") do |d|
      o[:env] = d
    end
    opts.separator ""
    opts.separator "## (D) Run a file (typically a script) on the first node"
    opts.on("-f", "--script FILE", "script") do |d|
      o[:script] = d
    end
    opts.on("-a", "--script-arg ARG", "argument to pass to the script (can be specified multiple times)") do |d|
      o[:script_arg] ||= []
      o[:script_arg] << d
    end
    opts.on("-T", "--terminate-after-script", "Terminate the reservation after the script execution") do
      o[:terminate_after_script] = true
    end
    opts.separator ""
    opts.separator "## (E) Connect interactively to the first node (incompatible with '--reservation' and '--detach')."
    opts.on("-c", "--connect") do
      o[:connect] = true
    end
    opts.separator ""
    opts.separator "## Advanced options:"
    opts.separator "### Related to reservation"

    opts.on("-t", "--type JOB_TYPE", "OAR job type (can be specified multiple times)") do |d|
      o[:type] << d
    end
    opts.on("-q", "--queue QUEUE", "OAR queue") do |d|
      o[:queue] = d
    end
    opts.on("-p", "--project PROJECT", "OAR project") do |d|
      o[:project] = d
    end
    opts.on("-n", "--name NAME", "OAR name") do |d|
      o[:name] = d
    end
    opts.separator ""
    opts.separator "### Related to post-deployment configuration"
    opts.on("", "--armor", "Fetch and run g5k-armor-node.py (see https://www.grid5000.fr/w/Armored_Node_for_Sensitive_Data). This implies '--environment debian11-x64-big'") do
      o[:armor] = true
    end
    opts.separator ""
    opts.separator "# Notes:"
    opts.separator "* All options are optional except '--site' if running from outside Grid'5000."
    opts.separator "* In steps (D) and (E), the connection is done as 'root' if an environment was provisioned (and as the normal user otherwise)."
    opts.separator ""
    opts.separator "# Examples:"
    opts.separator "## Basic usage: reserve one node on the current site, as soon as possible, and for the default walltime, and wait until it is available"
    opts.separator " grd bs"
    opts.separator ""
    opts.separator "## Reserve, provision, execute a script, and connect to the node"
    opts.separator " grd bs -s ly -l nodes=3 -w 0:10 -e debian11-x64-min -f setup-script -c"
    opts.separator ""
    opts.separator "Examples for -l / --resources:"
    opts.separator ' nodes=3 {"gpu_count>0"}/nodes=1 {"cluster=\'gros\'"}/nodes=1 {nova}/nodes=3 (see https://www.grid5000.fr/w/OAR_Syntax_simplification)'
    opts.separator ""
    opts.separator "Examples for -r / --reservation:"
    opts.separator ' "2022-03-30 19:30:05" "2022-03-30 19" "19" "2022-03-30 19,2022-04-02 04" "13,15"'
  end.parse!

  cute_init

  # Default to the local site when running from inside Grid'5000.
  unless o[:site]
    if `hostname --fqdn`.chomp =~ /.*\.([^.]+)\.grid5000\.fr$/
      o[:site] = $1
    else
      $logger.error("Running outside Grid'5000: the target site must be specified using '-s SITE'.")
      exit(1)
    end
  end
  site = resolve_site(o[:site])

  # Build the reservation request.
  jo = {}
  jo[:site] = site
  jo[:resources] = o[:resources] || 'nodes=1'
  jo[:walltime] = o[:walltime] if o[:walltime] # else use OAR's default
  jo[:reservation] = o[:reservation] if o[:reservation]
  jo[:queue] = o[:queue] if o[:queue] # else use OAR's default
  jo[:project] = o[:project] if o[:project] # else use OAR's default
  o[:name] ||= 'grd'
  jo[:name] = o[:name]
  jo[:wait] = false
  if o[:env] && !o[:type].include?('deploy')
    o[:type] << 'deploy'
  end
  # jo[:type] shares the array with o[:type], so the 'deploy' type added by
  # the --armor handling below is part of the request as well.
  jo[:type] = o[:type]

  if o[:armor]
    if o[:script]
      $logger.error("--armor and --script are incompatible. Exiting.")
      exit(1)
    end
    if o[:env] && o[:env] != 'debian11-x64-big'
      $logger.error("--armor and --environment are incompatible. Exiting.")
      exit(1)
    end
    o[:env] = 'debian11-x64-big'
    o[:type] << 'deploy' unless o[:type].include?('deploy')
    # --armor is implemented as a generated setup script.
    tmp = `mktemp /tmp/armor.XXXXXX`.chomp
    File.open(tmp, "w") do |fd|
      fd.puts <<-EOF
#!/bin/bash -xe
wget https://gitlab.inria.fr/grid5000/g5k-armor/-/raw/master/g5k-armor-node.py
chmod a+rx g5k-armor-node.py
./g5k-armor-node.py
      EOF
    end
    o[:script] = tmp
  end

  if o[:detach]
    if o[:connect]
      $logger.error("--connect and --detach (or --reservation) are incompatible. Exiting.")
      exit(1)
    end
    # In detached mode the job itself runs 'grd inner', which performs the
    # provisioning and script execution from inside the reservation.
    jo[:cmd] = 'grd inner'
    if o[:env]
      jo[:cmd] += " -e '#{o[:env]}'"
    end
    if o[:script]
      # upload script to frontend
      ssh = nil
      gateway = nil
      fnode = "#{site}.grid5000.fr"
      if g5k_internal?
        ssh = Net::SSH.start(fnode, $login)
      else
        gateway = Net::SSH::Gateway.new('access.grid5000.fr', $login)
        ssh = gateway.ssh(fnode, $login)
      end
      tmpfile = ssh.exec3!("mkdir -p .cache/grd ; mktemp .cache/grd/script.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
      ssh.scp.upload!(o[:script], tmpfile)
      ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })
      ssh.close
      ssh.shutdown!
      unless g5k_internal?
        gateway.shutdown!
      end
      # NOTE(review): arguments are single-quoted without escaping; an argument
      # containing a single quote will break the generated command line.
      jo[:cmd] += " -f '#{tmpfile}'"
      (o[:script_arg] || []).each do |a|
        jo[:cmd] += " -a '#{a}'"
      end
      if o[:terminate_after_script]
        jo[:cmd] += " --terminate-after-script"
      end
    end
    if jo[:cmd] == 'grd inner'
      # since we have nothing specific to do, we just sleep instead of calling 'grd inner'.
      jo[:cmd] = "sleep infinity"
    end
  end

  begin
    job = $g5k.reserve(jo)
  rescue Cute::G5K::BadRequest => e
    $logger.error("Reservation failed with error 400 (Bad Request)")
    if e.inner_message
      $logger.error("Error message from reservation system:")
      e.inner_message.each_line { |l| puts " " + l }
    end
    exit(1)
  end

  $logger.info("OAR job (reservation) ID: #{job['uid']}")

  begin
    ts = Time.now
    # Poll the job until it runs (or, in detached mode, until it is scheduled).
    loop do
      tries = 0
      begin
        job = $g5k.get_job(site, job['uid'])
      rescue Cute::G5K::RequestFailed, Cute::G5K::BadRequest => e
        $logger.info "Fetching reservation status failed due to API error: #{e.message}"
        tries += 1
        if tries < 5
          $logger.info "Retrying.."
          sleep 5
          retry
        else
          $logger.info "Too many attempts, exiting. The job might still be running."
          exit(1)
        end
      end
      break if job['state'] == 'running'
      t = job['scheduled_at']
      unless t.nil?
        t = Time.at(t)
        secs = [ t - Time.now, 0 ].max.to_i
        $logger.info "Reservation #{job['uid']} should be available at #{t} (in #{secs} s)"
        break if o[:detach]
      end
      Kernel.sleep(2)
    end

    if o[:detach]
      $logger.info "Your reservation will run in the background. Follow its status:"
      $logger.info " Using the API: https://api.grid5000.fr/3.0/sites/#{site}/jobs/#{job['uid']}?pretty"
      $logger.info " Using the Gantt: https://intranet.grid5000.fr/oar/#{site}/drawgantt-svg/"
      $logger.info " When it will be running, using its output files:"
      $logger.info " #{site}:/home/#{$login}/OAR.#{o[:name]}.#{job['uid']}.stdout"
      $logger.info " #{site}:/home/#{$login}/OAR.#{o[:name]}.#{job['uid']}.stderr"
      $logger.info "Exiting."
      exit(0)
    end

    $logger.info "Resources are available after #{(Time.now - ts).to_i}s"
    show_nodes(job)
    # deployment
    do_provisioning(site, job, o)

  rescue Interrupt
    $logger.info "Interrupted. Releasing resources."
    $g5k.release(job)
    exit(1)
  end

  # execute script
  tlogin = o[:env] ? 'root' : $login
  fnode = job['assigned_nodes'].first
  # if in VLAN, adjust node name
  if get_job_vlan(job)
    fnode = nodename_in_vlan(fnode, get_job_vlan(job))
  end
  if o[:script]
    ssh = nil
    gateway = nil
    if g5k_internal?
      ssh = Net::SSH.start(fnode, tlogin)
    else
      gateway = Net::SSH::Gateway.new('access.grid5000.fr', $login)
      ssh = gateway.ssh(fnode, tlogin)
    end
    if tlogin == 'root'
      # We use a file in /root to avoid issues when unmounting /tmp in the script
      tmpfile = ssh.exec3!("mkdir -p /root/.cache/grd && mktemp /root/.cache/grd/grd.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
    else
      tmpfile = ssh.exec3!("mktemp /tmp/grd.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
    end
    ssh.scp.upload!(o[:script], tmpfile)
    ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })
    args = (o[:script_arg] || []).join(' ')
    $logger.info "Running script on #{fnode} ..."
    ssh.exec3!("#{tmpfile} #{args}", { :ignore_error => true })
    $logger.info "Script finished."
    ssh.close
    ssh.shutdown!
    unless g5k_internal?
      gateway.shutdown!
    end
  end

  if o[:terminate_after_script]
    $logger.info("Releasing resources.")
    $g5k.release(job)

  else
    jh = g5k_internal? ? "" : "-J #{$login}@access.grid5000.fr "
    cmd = "ssh -o StrictHostKeyChecking=accept-new #{jh}#{tlogin}@#{fnode}"
    $logger.info "Connect to first node using:"
    $logger.info " #{cmd}"

    if o[:connect]
      puts
      system(cmd)
      puts
      s = nil
      loop do
        print "Connection to node terminated. Terminate resources reservation? (Y/N) "
        # Read from the terminal explicitly (Kernel#gets would read files named
        # in ARGV); end of input counts as "N" so resources are kept.
        s = ($stdin.gets || 'N').chomp.upcase
        break if s == "Y" || s == "N"
      end
      if s == 'Y'
        $logger.info("Releasing resources.")
        $g5k.release(job)
      end
    end
  end

elsif ['inner'].include?(ARGV[0])
  ARGV.shift
  o = {}
  o[:type] = []
  OptionParser.new do |opts|
    opts.banner = "usage: grd inner [options]"
    opts.separator " Do the actions that need to be performed inside a reservation"
    opts.on("-e", "--environment ENV", "Kadeploy environment to provision") do |d|
      o[:env] = d
    end
    opts.on("-f", "--script FILE", "script") do |d|
      o[:script] = d
    end
    opts.on("-a", "--script-arg ARG", "argument to pass to the script (can be specified multiple times)") do |d|
      o[:script_arg] ||= []
      o[:script_arg] << d
    end
    opts.on("-T", "--terminate-after-script", "Terminate the reservation after the script execution (default: keep resources until end of walltime)") do
      o[:terminate_after_script] = true
    end
  end.parse!

  # Initialise first: the error path below needs $logger.
  cute_init

  if `hostname --fqdn`.chomp =~ /.*\.([^.]+)\.grid5000\.fr$/
    o[:site] = $1
    site = $1
  else
    $logger.error("ERROR: could not determine site.")
    exit(1)
  end

  jobid = ENV['OAR_JOB_ID'].to_i

  $logger.info "Arguments: #{ARGV.inspect}"
  $logger.info "OAR job ID: #{jobid}"
  # we use OAR_NODEFILE to avoid an API request that would not work from a node
  nodes = IO.readlines(ENV['OAR_NODEFILE']).map { |l| l.chomp }.uniq
  $logger.info "Nodes: #{nodes.join(' ')}"
  if nodes.length > 1
    $logger.info "Nodeset: #{nodeset(nodes)}"
  end

  if o[:env]
    job = $g5k.get_job(site, jobid) # check if this works from node
    do_provisioning(site, job, o)
  end

  if o[:script]
    args = (o[:script_arg] || []).join(' ')
    if o[:env]
      # we need to copy the script to the node using ssh
      fnode = nodes.first
      # if in VLAN, adjust node name
      if get_job_vlan(job)
        fnode = nodename_in_vlan(fnode, get_job_vlan(job))
      end
      tlogin = 'root'
      ssh = Net::SSH.start(fnode, tlogin)
      # We use a file in /root to avoid issues when unmounting /tmp in the script
      tmpfile = ssh.exec3!("mkdir -p /root/.cache/grd && mktemp /root/.cache/grd/grd.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
      ssh.scp.upload!(o[:script], tmpfile)
      ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })
      $logger.info "Running script on #{fnode} ..."
      ssh.exec3!("#{tmpfile} #{args}", { :ignore_error => true })
      $logger.info "Script finished."
      ssh.close
      ssh.shutdown!
    else
      # we are already on the node: run the script locally. There is no SSH
      # session nor remote temporary file in this branch.
      system("chmod u+x #{o[:script]}") or raise
      $logger.info "Running script ..."
      # As in the SSH case, a failing script does not abort grd.
      unless system("#{o[:script]} #{args}")
        $logger.warn "Script exited with an error."
      end
      $logger.info "Script finished."
    end
  end

  if o[:terminate_after_script]
    $logger.info("Terminating resources reservation.")
  else
    $logger.info("Waiting until end of reservation.")
    sleep() # sleep until end of reservation
  end

elsif ['list', 'l'].include?(ARGV[0])
  cute_init

  ARGV.shift
  o = {}
  OptionParser.new do |opts|
    opts.banner = "usage: grd list [options]"
    opts.separator " List reservations"
    opts.on("-s", "--site SITE", "Only list reservations on the specified site") do |d|
      o[:site] = resolve_site(d)
    end
    opts.on("-a", "--all", "List all reservations, not just the current user's") do
      o[:all] = true
    end
    opts.on("-r", "--raw", "Raw output (suitable for scripts)") do
      o[:raw] = true
    end
  end.parse!

  if o[:site]
    sites = [ o[:site] ]
  else
    sites = get_api_sites.sort
  end
  # terminal-table is optional: without it, fall back to tab-separated output.
  begin
    require 'terminal-table'
  rescue LoadError
    STDERR.puts "'terminal-table' library not found, using raw mode"
    o[:raw] = true
  end
  # a[0] is the header row, the following rows are one per job.
  a = []
  r = ['site', 'id']
  if o[:all]
    r += ['project', 'user']
  end
  r += ['queue', 'state', 'start', 'end', 'duration', 'name', 'types', 'nodes', 'count' ]
  a << r

  login = o[:all] ? nil : $login
  m = Mutex.new
  # Sites are queried concurrently (peach); rows are appended under the mutex.
  sites.peach do |cursite|
    # FIXME implement pagination (see https://intranet.grid5000.fr/bugzilla/show_bug.cgi?id=13882 about missing doc)
    jobs = $g5k.get_jobs(cursite, login, %w{waiting launching running hold error terminated} - %w{error terminated}, true, false)
    jobs.each do |j|
      sa = if j['started_at'] && j['started_at'] > 0
             Time.at(j['started_at'])
           elsif j['scheduled_at'] && j['scheduled_at'] > 0
             Time.at(j['scheduled_at'])
           else
             nil
           end
      sa_s = sa.nil? ? '' : sa.strftime("%Y-%m-%d %H:%M:%S")
      endt = sa ? sa + j['walltime'] : nil
      endt_s = endt.nil? ? '' : endt.strftime("%Y-%m-%d %H:%M:%S")

      # convert walltime to d/h/m/s
      t = j['walltime']
      mm, ss = t.divmod(60) #=> [4515, 21]
      hh, mm = mm.divmod(60) #=> [75, 15]
      dd, hh = hh.divmod(24) #=> [3, 3]
      walltime_s = "%dd%2dh%2dm%2ds" % [dd, hh, mm, ss]
      nodes = nodeset(j['assigned_nodes'])
      nodes_count = j['assigned_nodes'].uniq.length
      # we hide the default monitor type to avoid useless noise
      # (the pattern is a regular expression, so it has to be matched, not compared)
      types = j['types'].reject { |type| type =~ /\Amonitor=prom_.*default_metrics\z/ }.join(',')
      m.synchronize do
        r = [ cursite, j['uid'] ]
        if o[:all]
          r += [ j['project'], j['user'] ]
        end
        r += [ j['queue'], j['state'], sa_s, endt_s, walltime_s, j['name'], types, nodes, nodes_count ]
        a << r
      end
    end
  end
  # Keep the header first, then order the rows by site and job id.
  a = [a[0]] + a[1..-1].sort { |a1, a2| [a1[0], a1[1]] <=> [a2[0], a2[1]] }
  if o[:raw]
    a.each do |l|
      puts l.join("\t")
    end
  else
    table = Terminal::Table.new
    table.headings = a[0]
    table.rows = a[1..-1]
    puts table
  end

elsif ['terminate', 't'].include?(ARGV[0])
  cute_init
  ARGV.shift
  o = {}
  OptionParser.new do |opts|
    opts.banner = "usage: grd terminate [options]"
    opts.separator " Terminate reservations"
    opts.on("-s", "--site SITE", "Only terminate reservations on the specified site") do |d|
      o[:site] = resolve_site(d)
    end
    opts.on("-j", "--job JOB_ID", "Only terminate the specified job/reservation (implies -y)") do |j|
      o[:job] = j
      o[:yes] = true
    end
    opts.on("-y", "--yes", "Do not ask for confirmation") do
      o[:yes] = true
    end
  end.parse!

  if o[:site]
    sites = [ o[:site] ]
  else
    sites = get_api_sites
  end
  sites.each do |current_site|
    jobs = $g5k.get_jobs(current_site, $login, %w{waiting launching running hold error terminated} - %w{error terminated})
    jobs.each do |j|
      next if o[:job] && j['uid'].to_i != o[:job].to_i
      if j['started_at'] && j['started_at'] > 0
        sa = Time.at(j['started_at'])
      else
        sa = ''
      end
      sig = "site=#{current_site} id=#{j['uid']} state=#{j['state']} started_at=#{sa} nodes=#{nodeset(j['assigned_nodes'])}"
      if o[:yes]
        puts "Terminating #{sig}"
        $g5k.release(j)
      else
        print "Terminate #{sig} ? (Y/N) "
        s = nil
        loop do
          # Read from the terminal explicitly; end of input counts as "N".
          s = ($stdin.gets || 'N').chomp.upcase
          break if s == "Y" || s == "N"
          print "Terminate? (Y/N) "
        end
        if s == 'Y'
          $g5k.release(j)
        end
      end
    end
  end

else
  puts <<-EOF
usage: grd <command> [options]

Available commands:
bootstrap Reserve, provision, configure and connect to resources (alias: bs)
list List resources reservations (alias: l)
terminate Terminate resources reservation(s) (alias: t)

Use 'grd <command> --help' for details.
  EOF
  exit(1)
end
|
701
|
+
|