ruby-cute 0.12 → 0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitlab-ci.yml +55 -0
- data/README.md +12 -1
- data/bin/cute +2 -1
- data/bin/grd +701 -0
- data/debian/.gitattributes +3 -0
- data/debian/changelog +108 -0
- data/debian/control +22 -5
- data/debian/ruby-cute.docs +1 -1
- data/debian/rules +3 -11
- data/debian/watch +2 -2
- data/examples/distem-bootstrap +16 -22
- data/examples/g5k-tutorial.md +25 -18
- data/examples/g5k_exp_virt.rb +1 -1
- data/lib/cute/bash.rb +7 -7
- data/lib/cute/configparser.rb +14 -12
- data/lib/cute/execute.rb +6 -4
- data/lib/cute/extensions.rb +3 -4
- data/lib/cute/g5k_api.rb +62 -38
- data/lib/cute/net-ssh-exec3.rb +10 -7
- data/lib/cute/net-ssh.rb +2 -2
- data/lib/cute/net.rb +5 -7
- data/lib/cute/synchronization.rb +1 -3
- data/lib/cute/taktuk.rb +4 -5
- data/lib/cute/version.rb +1 -1
- data/ruby-cute.gemspec +6 -3
- data/spec/g5k_api_check_spec.rb +1 -1
- data/spec/g5k_api_spec.rb +14 -22
- data/spec/spec_helper.rb +10 -6
- data/test/test_execute.rb +0 -0
- metadata +55 -11
- data/debian/compat +0 -1
data/bin/grd
ADDED
@@ -0,0 +1,701 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# For help, see 'grd -h'
|
3
|
+
#
|
4
|
+
#
|
5
|
+
# Grid5000 bugs that would help improve grd:
|
6
|
+
# Bug 11547 - l'API OAR génère des fichiers temporaires oarapi.subscript.* dans les homes
|
7
|
+
# Bug 13929 - Stocker la sortie des admissions rules dans un événement
|
8
|
+
# Bug 13867 - l'API de kadeploy donne les logs de manière non idempotente
|
9
|
+
# Random issues with the API that cause error 500:
|
10
|
+
# - Bug 13928 - erreur 500 sur l'API lors d'un DELETE
|
11
|
+
# - Bug 13742 - erreur 500 sur l'API pour créer un job
|
12
|
+
# - Bug 13766 - erreur 500 api
|
13
|
+
# - Bug 11697 - api-proxy/g5k-api: erreur "AH01102: error reading status line from remote server localhost:8000"
|
14
|
+
#
|
15
|
+
# Test cases:
|
16
|
+
# grd bs -s ncy
|
17
|
+
# grd bs -s ncy -c
|
18
|
+
# grd bs -s ncy -q production -l {grappe}/nodes=1+{"type='kavlan'"}/vlan=1 -w 0:10 --armor
|
19
|
+
# grd bs -s ncy -q production -l {grappe}/nodes=1+{"type='kavlan'"}/vlan=1 -w 0:10 --armor -d
|
20
|
+
# echo 'hostname ; pwd' > setup-script
|
21
|
+
# grd bs -s ncy -f setup-script -T
|
22
|
+
# grd l
|
23
|
+
|
24
|
+
gem 'ruby-cute', '>= 0.12'
|
25
|
+
$:.unshift File.expand_path("../../lib", File.realpath(__FILE__))
|
26
|
+
|
27
|
+
require 'cute'
|
28
|
+
require 'optparse'
|
29
|
+
require 'pp'
|
30
|
+
require 'net/ssh/gateway'
|
31
|
+
require 'net/ssh'
|
32
|
+
require 'net/scp'
|
33
|
+
require 'peach'
|
34
|
+
|
35
|
+
# Sets up the global state shared by every grd command: the logger
# ($logger), the Grid'5000 API client ($g5k) and the user login ($login,
# plus $password when a credentials file is present).
#
# Credentials come from ~/.grid5000_api.yml when that file exists. Otherwise,
# when g5k_internal? is true, the local user name is used. In any other case
# an error is printed and the process exits with status 1.
#
# Debug logging is enabled when $debug is already set or when the GRD_DEBUG
# environment variable is defined.
def cute_init
  STDOUT.sync = true
  STDERR.sync = true
  $logger = Logger.new(STDOUT)

  conf = ENV['HOME'] + '/.grid5000_api.yml'
  if File::exist?(conf)
    # A file that cannot be read or parsed yields an empty configuration.
    yconf = YAML::load(IO::read(conf)) rescue {}
    $login = yconf['username']
    $password = yconf['password']
    $g5k = Cute::G5K::API.new(:conf_file => conf, :debug => true)
  elsif g5k_internal?
    $login = `whoami`.chomp
    $g5k = Cute::G5K::API.new(:debug => true)
  else
    puts "ERROR: no .grid5000_api.yml found, and outside Grid'5000. Need API credentials."
    exit(1)
  end

  # Identify this tool in the requests sent to the API.
  $g5k.rest.user_agent += " grd"

  $debug = true if ENV['GRD_DEBUG']
  $logger.level = $debug ? Logger::DEBUG : Logger::INFO
  # One-letter severity, timestamp without sub-second precision, then message.
  $logger.formatter = proc do |severity, datetime, _progname, msg|
    format("%s, [%s] %s\n", severity[0..0], datetime.strftime("%Y-%m-%d %H:%M:%S"), msg)
  end
  $g5k.logger = $logger
end
|
62
|
+
|
63
|
+
# Provisions (deploys) the environment o[:env] on the nodes of +job+ and
# waits for the deployment to finish. Does nothing when o[:env] is not set.
#
# site - site name, only used to build the URL of the live deployment log
# job  - job description as returned by the API client
# o    - options hash; only o[:env] is read
#
# When the deployment fails, the job is released and the process exits with
# status 1.
def do_provisioning(site, job, o)
  return unless o[:env]

  started = Time::now
  public_keys = Dir::glob(ENV['HOME'] + '/.ssh/*.pub').map { |path| IO::read(path) }.join("\n")
  deploy_opts = { :env => o[:env], :keys => public_keys }
  vlan = get_job_vlan(job)
  deploy_opts[:vlan_id] = vlan if vlan

  # The API client is chatty at INFO level: lower the verbosity around its
  # calls, except in debug mode where the level is left untouched.
  set_level = lambda { |level| $logger.level = level unless $debug }

  set_level.call(Logger::WARN)
  job = $g5k.deploy(job, deploy_opts)
  set_level.call(Logger::INFO)

  duid = job['deploy'].last['uid']
  $logger.info "Started provisioning of environment #{o[:env]}. Live log: https://api.grid5000.fr/3.0/sites/#{site}/internal/kadeployapi/deployment/#{duid}/logs"

  set_level.call(Logger::WARN)
  job = $g5k.wait_for_deploy(job)
  set_level.call(Logger::INFO)

  deploy = job['deploy'].last
  # Failure is either a global error status or any node whose state is not OK.
  failed = deploy['status'] == 'error' || deploy['result'].to_a.any? { |entry| entry[1]['state'] != 'OK' }
  if failed
    $logger.error "Provisioning failed. Terminating resources reservation and exiting."
    $g5k.release(job)
    exit(1)
  end
  $logger.info "Provisioning completed after #{(Time::now - started).to_i}s"
end
|
92
|
+
|
93
|
+
# Returns the first VLAN listed in job['resources_by_type']['vlans'], or nil
# when that list is absent or empty.
def get_job_vlan(job)
  vlans = job['resources_by_type']['vlans']
  return nil unless vlans and vlans.length > 0

  vlans.first
end
|
100
|
+
|
101
|
+
# Builds the name under which +node+ is addressed inside VLAN +vlan+:
# "-kavlan-<vlan>" is appended to the first label of the name, and the
# second label plus the third and fourth labels are kept as they are.
#
#   nodename_in_vlan('grisou-1.nancy.grid5000.fr', 5)
#   #=> "grisou-1-kavlan-5.nancy.grid5000.fr"
def nodename_in_vlan(node, vlan)
  labels = node.split('.')
  format('%s-kavlan-%s.%s.%s', labels[0], vlan, labels[1], labels[2..3].join('.'))
end
|
109
|
+
|
110
|
+
# Logs the nodes assigned to +job+, its VLAN when it has one, and the folded
# nodeset notation when more than one node is assigned.
def show_nodes(job)
  nodes = job['assigned_nodes']
  $logger.info "Nodes: #{nodes.join(' ')}"

  vlan = get_job_vlan(job)
  $logger.info "VLAN: #{vlan}" if vlan

  $logger.info "Nodeset: #{nodeset(nodes)}" if nodes.length > 1
end
|
120
|
+
|
121
|
+
# Process-wide cache of API answers, keyed by a short name.
$cache = {}

# Returns the site identifiers known to the API as a frozen list. The API is
# queried on the first call only; later calls return the cached list.
def get_api_sites
  $cache['sites'] ||= $g5k.site_uids.freeze
end
|
129
|
+
|
130
|
+
# Expands a site abbreviation into a full site name.
#
# 'ncy' and 'ny' stand for 'nancy', 'ns' for 'nantes'. Any other value must
# be a prefix of exactly one of the names returned by get_api_sites.
#
# Raises RuntimeError when no site or more than one site matches.
def resolve_site(s)
  s = 'nancy' if %w[ncy ny].include?(s)
  s = 'nantes' if s == 'ns'

  candidates = get_api_sites.select { |name| name.start_with?(s) }.sort
  case candidates.length
  when 0 then raise "Invalid site: #{s}"
  when 1 then candidates.first
  else raise "Invalid site: #{s} (matches: #{candidates.join(' ')})"
  end
end
|
143
|
+
|
144
|
+
# Folds a list of node names into a compact notation by piping the
# comma-separated names to the external `nodeset -f` command, and returns
# that command's output without the trailing newline.
def nodeset(nodes)
  folded = %x(echo #{nodes.join(',')} | nodeset -f)
  folded.chomp
end
|
147
|
+
|
148
|
+
# Tells whether the fully qualified name of the local host, as reported by
# `hostname --fqdn`, ends with ".grid5000.fr". Always returns true or false.
def g5k_internal?
  fqdn = `hostname --fqdn`.chomp
  !(fqdn =~ /\.grid5000\.fr$/).nil?
end
|
151
|
+
|
152
|
+
if ['bootstrap', 'bs'].include?(ARGV[0])
|
153
|
+
# Drop the sub-command name so that OptionParser only sees its options.
ARGV.shift
# Options given on the command line; :type accumulates the OAR job types.
o = { :type => [] }
OptionParser.new do |opts|
  opts.banner = "usage: grd bootstrap [options]"
  opts.separator " Reserve, provision, configure and connect to resources (alias: bs)"
  opts.separator ""
  opts.separator "# Options:"
  opts.separator "## (A) Select which resources to reserve, and for how long:"
  opts.on("-s", "--site SITE", "Site on which resources will be requested") { |v| o[:site] = v }
  opts.on("-l", "--resources DESCRIPTION", "description of the requested resources (using the syntax for oarsub -l). default: nodes=1") { |v| o[:resources] = v }
  opts.on("-w", "--walltime DURATION", "maximum duration of the reservation (using the syntax for oarsub -l)") { |v| o[:walltime] = v }
  opts.separator ""
  opts.separator "## (B) Choose when to reserve resources (default: ASAP, and wait interactively):"
  opts.on("-d", "--detach", "Reserve resources ASAP, but do not wait interactively (batch/submission mode)") { o[:detach] = true }
  # A reservation in the future is never waited for, hence the implied detach.
  opts.on("-r", "--reservation DATE", "Resources at a specified date and time. Do not wait for the reservation to start.") do |v|
    o.update(:reservation => v, :detach => true)
  end
  opts.separator ""
  opts.separator "## (C) Set the environment (system image) to provision. If none specified, the 'standard' environment will be used"
  opts.on("-e", "--environment ENV", "Kadeploy environment to provision") { |v| o[:env] = v }
  opts.separator ""
  opts.separator "## (D) Run a file (typically a script) on the first node"
  opts.on("-f", "--script FILE", "script") { |v| o[:script] = v }
  opts.on("-a", "--script-arg ARG", "argument to pass to the script (can be specified multiple times)") { |v| (o[:script_arg] ||= []) << v }
  opts.on("-T", "--terminate-after-script", "Terminate the reservation after the script execution") { o[:terminate_after_script] = true }
  opts.separator ""
  opts.separator "## (E) Connect interactively to the first node (incompatible with '--reservation' and '--detach')."
  opts.on("-c", "--connect") { o[:connect] = true }
  opts.separator ""
  opts.separator "## Advanced options:"
  opts.separator "### Related to reservation"

  opts.on("-t", "--type JOB_TYPE", "OAR job type (can be specified multiple times)") { |v| o[:type] << v }
  opts.on("-q", "--queue QUEUE", "OAR queue") { |v| o[:queue] = v }
  opts.on("-p", "--project PROJECT", "OAR project") { |v| o[:project] = v }
  opts.on("-n", "--name NAME", "OAR name") { |v| o[:name] = v }
  opts.separator ""
  opts.separator "### Related to post-deployment configuration"
  opts.on("", "--armor", "Fetch and run g5k-armor-node.py (see https://www.grid5000.fr/w/Armored_Node_for_Sensitive_Data). This implies '--environment debian11-x64-big'") { o[:armor] = true }
  opts.separator ""
  opts.separator "# Notes:"
  opts.separator "* All options are optional except '--site' if running from outside Grid'5000."
  opts.separator "* In steps (D) and (E), the connection is done as 'root' if an environment was provisioned (and as the normal user otherwise)."
  opts.separator ""
  opts.separator "# Examples:"
  opts.separator "## Basic usage: reserve one node on the current site, as soon as possible, and for the default walltime, and wait until it is available"
  opts.separator " grd bs"
  opts.separator ""
  opts.separator "## Reserve, provision, execute a script, and connect to the node"
  opts.separator " grd bs -s ly -l nodes=3 -w 0:10 -e debian11-x64-min -f setup-script -c"
  opts.separator ""
  opts.separator "Examples for -l / --resources:"
  opts.separator ' nodes=3 {"gpu_count>0"}/nodes=1 {"cluster=\'gros\'"}/nodes=1 {nova}/nodes=3 (see https://www.grid5000.fr/w/OAR_Syntax_simplification)'
  opts.separator ""
  opts.separator "Examples for -r / --reservation:"
  opts.separator ' "2022-03-30 19:30:05" "2022-03-30 19" "19" "2022-03-30 19,2022-04-02 04" "13,15"'
end.parse!
|
241
|
+
|
242
|
+
cute_init

# Without an explicit site, fall back to the site of the local machine, taken
# from the label that precedes ".grid5000.fr" in its fully qualified name.
unless o[:site]
  fqdn_match = /.*\.([^.]+)\.grid5000\.fr$/.match(`hostname --fqdn`.chomp)
  if fqdn_match.nil?
    $logger.error("Running outside Grid'5000: the target site must be specified using '-s SITE'.")
    exit(1)
  end
  o[:site] = fqdn_match[1]
end
site = resolve_site(o[:site])

# Job description handed to the reservation call. Settings that were not
# given are left out so that OAR's defaults apply.
jo = { :site => site, :resources => o[:resources] || 'nodes=1' }
[:walltime, :reservation, :queue, :project].each { |key| jo[key] = o[key] if o[key] }
o[:name] ||= 'grd'
jo[:name] = o[:name]
jo[:wait] = false
# Provisioning an environment requires a job of type 'deploy'.
o[:type] << 'deploy' if o[:env] and not o[:type].include?('deploy')
jo[:type] = o[:type]

if o[:armor]
  conflict =
    if o[:script]
      "--armor and --script are incompatible. Exiting."
    elsif o[:env] and o[:env] != 'debian11-x64-big'
      "--armor and --environment are incompatible. Exiting."
    end
  if conflict
    $logger.error(conflict)
    exit(1)
  end

  o[:env] = 'debian11-x64-big'
  o[:type] << 'deploy' unless o[:type].include?('deploy')
  # --armor is implemented as a generated setup script, run like a --script file.
  tmp = `mktemp /tmp/armor.XXXXXX`.chomp
  File::open(tmp, "w") do |script|
    script.puts <<-EOF
#!/bin/bash -xe
wget https://gitlab.inria.fr/grid5000/g5k-armor/-/raw/master/g5k-armor-node.py
chmod a+rx g5k-armor-node.py
./g5k-armor-node.py
    EOF
  end
  o[:script] = tmp
end
|
290
|
+
|
291
|
+
# Detached mode: nothing is done interactively. The job itself runs
# 'grd inner' with the relevant options, or just sleeps when there is nothing
# to provision or execute.
if o[:detach]
  if o[:connect]
    $logger.error("--connect and --detach (or --reservation) are incompatible. Exiting.")
    exit(1)
  end

  inner_args = ''
  inner_args += " -e '#{o[:env]}'" if o[:env]

  if o[:script]
    # Upload the script to the site frontend, from where the job will read it.
    frontend = "#{site}.grid5000.fr"
    gateway = nil
    ssh =
      if g5k_internal?
        Net::SSH.start(frontend, $login)
      else
        gateway = Net::SSH::Gateway.new('access.grid5000.fr', $login)
        gateway.ssh(frontend, $login)
      end
    tmpfile = ssh.exec3!("mkdir -p .cache/grd ; mktemp .cache/grd/script.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
    ssh.scp.upload!(o[:script], tmpfile)
    ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })
    ssh.close
    ssh.shutdown!
    gateway.shutdown! unless g5k_internal?

    inner_args += " -f '#{tmpfile}'"
    (o[:script_arg] || []).each { |arg| inner_args += " -a '#{arg}'" }
    inner_args += " --terminate-after-script" if o[:terminate_after_script]
  end

  # With nothing specific to do, just sleep instead of calling 'grd inner'.
  jo[:cmd] = inner_args.empty? ? "sleep infinity" : 'grd inner' + inner_args
end
|
332
|
+
|
333
|
+
# Submit the reservation. A rejected request (HTTP 400) is reported together
# with the message of the reservation system, when there is one.
begin
  job = $g5k.reserve(jo)
rescue Cute::G5K::BadRequest => e
  $logger.error("Reservation failed with error 400 (Bad Request)")
  if e.inner_message
    $logger.error("Error message from reservation system:")
    e.inner_message.each_line { |line| puts " " + line }
  end
  exit(1)
end

$logger.info("OAR job (reservation) ID: #{job['uid']}")

# Poll the job until it runs (or, in detached mode, until its start time is
# known), then provision it. Ctrl-C at any point releases the reservation.
begin
  ts = Time::now
  loop do
    tries = 0
    begin
      job = $g5k.get_job(site, job['uid'])
    rescue Cute::G5K::RequestFailed, Cute::G5K::BadRequest => e
      $logger.info "Fetching reservation status failed due to API error: #{e.message}"
      tries += 1
      if tries >= 5
        $logger.info "Too many attempts, exiting. The job might still be running."
        exit(1)
      end
      $logger.info "Retrying.."
      sleep 5
      retry
    end
    break if job['state'] == 'running'

    scheduled = job['scheduled_at']
    unless scheduled.nil?
      scheduled = Time.at(scheduled)
      secs = [ scheduled - Time.now, 0 ].max.to_i
      $logger.info "Reservation #{job['uid']} should be available at #{scheduled} (in #{secs} s)"
      break if o[:detach]
    end
    Kernel.sleep(2)
  end

  if o[:detach]
    $logger.info "Your reservation will run in the background. Follow its status:"
    $logger.info " Using the API: https://api.grid5000.fr/3.0/sites/#{site}/jobs/#{job['uid']}?pretty"
    $logger.info " Using the Gantt: https://intranet.grid5000.fr/oar/#{site}/drawgantt-svg/"
    $logger.info " When it will be running, using its output files:"
    $logger.info " #{site}:/home/#{$login}/OAR.#{o[:name]}.#{job['uid']}.stdout"
    $logger.info " #{site}:/home/#{$login}/OAR.#{o[:name]}.#{job['uid']}.stderr"
    $logger.info "Exiting."
    exit(0)
  end

  $logger.info "Resources are available after #{(Time::now - ts).to_i}s"
  show_nodes(job)
  # deployment
  do_provisioning(site, job, o)

rescue Interrupt
  $logger.info "Interrupted. Releasing resources."
  $g5k.release(job)
  exit(1)
end
|
397
|
+
|
398
|
+
# Run the user script on the first node. The connection is made as root when
# an environment was provisioned, and as the regular user otherwise.
tlogin = o[:env] ? 'root' : $login
fnode = job['assigned_nodes'].first
# Inside a VLAN the node is reachable under its VLAN-specific name.
vlan = get_job_vlan(job)
fnode = nodename_in_vlan(fnode, vlan) if vlan

if o[:script]
  gateway = nil
  ssh =
    if g5k_internal?
      Net::SSH.start(fnode, tlogin)
    else
      gateway = Net::SSH::Gateway.new('access.grid5000.fr', $login)
      gateway.ssh(fnode, tlogin)
    end

  # As root, a file in /root is used to avoid issues when the script unmounts /tmp.
  mktemp_cmd =
    if tlogin == 'root'
      "mkdir -p /root/.cache/grd && mktemp /root/.cache/grd/grd.XXXXXX"
    else
      "mktemp /tmp/grd.XXXXXX"
    end
  tmpfile = ssh.exec3!(mktemp_cmd, { :no_log => true, :no_output => true })[:stdout].chomp
  ssh.scp.upload!(o[:script], tmpfile)
  ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })

  args = (o[:script_arg] || []).join(' ')
  $logger.info "Running script on #{fnode} ..."
  ssh.exec3!("#{tmpfile} #{args}", { :ignore_error => true })
  $logger.info "Script finished."

  ssh.close
  ssh.shutdown!
  gateway.shutdown! unless g5k_internal?
end
|
432
|
+
|
433
|
+
# Final step of 'grd bootstrap': either release the resources right away
# (--terminate-after-script), or tell the user how to reach the first node
# and, with --connect, open an interactive session on it.
if o[:terminate_after_script]
  $logger.info("Releasing resources.")
  $g5k.release(job)

else
  # From outside Grid'5000, the connection goes through the access machine.
  jh = g5k_internal? ? "" : "-J #{$login}@access.grid5000.fr "
  cmd = "ssh -o StrictHostKeyChecking=accept-new #{jh}#{tlogin}@#{fnode}"
  $logger.info "Connect to first node using:"
  $logger.info " #{cmd}"

  if o[:connect]
    puts
    system(cmd)
    puts
    s = nil
    loop do
      print "Connection to node terminated. Terminate resources reservation? (Y/N) "
      # Read from $stdin explicitly: Kernel#gets would read from files named
      # by leftover command-line arguments. At end of input nobody can
      # answer, so the reservation is kept (same as answering "N").
      answer = $stdin.gets
      s = answer.nil? ? "N" : answer.chomp.upcase
      break if s == "Y" or s == "N"
    end
    if s == 'Y'
      $logger.info("Releasing resources.")
      $g5k.release(job)
    end
  end
end
|
459
|
+
|
460
|
+
elsif ['inner'].include?(ARGV[0])
|
461
|
+
# Drop the sub-command name so that OptionParser only sees its options.
ARGV.shift
# Options given on the command line. :type is initialised like in the
# bootstrap command, although no option of this command fills it.
o = { :type => [] }
OptionParser.new do |opts|
  opts.banner = "usage: grd inner [options]"
  opts.separator " Do the actions that need to be performed inside a reservation"
  opts.on("-e", "--environment ENV", "Kadeploy environment to provision") { |v| o[:env] = v }
  opts.on("-f", "--script FILE", "script") { |v| o[:script] = v }
  opts.on("-a", "--script-arg ARG", "argument to pass to the script (can be specified multiple times)") { |v| (o[:script_arg] ||= []) << v }
  opts.on("-T", "--terminate-after-script", "Terminate the reservation after the script execution (default: keep resources until end of walltime)") { o[:terminate_after_script] = true }
end.parse!
|
481
|
+
|
482
|
+
|
483
|
+
# Body of 'grd inner': runs inside a reservation (as the job's command) to
# provision the nodes and/or run the user script, then either ends the job or
# sleeps until the end of the walltime.

# Initialise first: $logger is needed to report a site detection failure.
cute_init

# The site is the label that precedes ".grid5000.fr" in the local host name.
if `hostname --fqdn`.chomp =~ /.*\.([^.]+)\.grid5000\.fr$/
  o[:site] = $1
  site = $1
else
  $logger.error("ERROR: could not determine site.")
  exit(1)
end

jobid = ENV['OAR_JOB_ID'].to_i

$logger.info "Arguments: #{ARGV.inspect}"
$logger.info "OAR job ID: #{jobid}"
# we use OAR_NODEFILE to avoid an API request that would not work from a node
nodes = IO::readlines(ENV['OAR_NODEFILE']).map { |l| l.chomp }.uniq
$logger.info "Nodes: #{nodes.join(' ')}"
if nodes.length > 1
  $logger.info "Nodeset: #{nodeset(nodes)}"
end

if o[:env]
  job = $g5k.get_job(site, jobid) # check if this works from node
  do_provisioning(site, job, o)
end

if o[:script]
  args = (o[:script_arg] || []).join(' ')
  if o[:env]
    # we need to copy the script to the node using ssh
    fnode = nodes.first
    # if in VLAN, adjust node name
    if get_job_vlan(job)
      fnode = nodename_in_vlan(fnode, get_job_vlan(job))
    end
    tlogin = 'root'
    ssh = Net::SSH.start(fnode, tlogin)
    # We use a file in /root to avoid issues when unmounting /tmp in the script
    tmpfile = ssh.exec3!("mkdir -p /root/.cache/grd && mktemp /root/.cache/grd/grd.XXXXXX", { :no_log => true, :no_output => true })[:stdout].chomp
    ssh.scp.upload!(o[:script], tmpfile)
    ssh.exec3!("chmod u+x #{tmpfile}", { :no_log => true, :no_output => true })
    $logger.info "Running script on #{fnode} ..."
    ssh.exec3!("#{tmpfile} #{args}", { :ignore_error => true })
    $logger.info "Script finished."
    ssh.close
    ssh.shutdown!
  else
    # we are already on the node: run the script locally. No SSH session or
    # remote temporary file exists on this path.
    system("chmod u+x #{o[:script]}") or raise "Could not make #{o[:script]} executable"
    $logger.info "Running script ..."
    system("#{o[:script]} #{args}") or raise "Script #{o[:script]} failed"
    $logger.info "Script finished."
  end
end

if o[:terminate_after_script]
  # Returning from this command ends the job, and with it the reservation.
  $logger.info("Terminating resources reservation.")
else
  $logger.info("Waiting until end of reservation.")
  sleep() # sleep until end of reservation
end
|
544
|
+
|
545
|
+
elsif ['list', 'l'].include?(ARGV[0])
|
546
|
+
# Body of 'grd list': prints the reservations that are neither in error nor
# terminated, for one site or for all of them, as a table or as tab-separated
# lines (--raw).
cute_init

ARGV.shift
o = {}
OptionParser.new do |opts|
  opts.banner = "usage: grd list [options]"
  opts.separator " List reservations"
  opts.on("-s", "--site SITE", "Only list reservations on the specified site") do |d|
    o[:site] = resolve_site(d)
  end
  opts.on("-a", "--all", "List all reservations, not just the current user's") do
    o[:all] = true
  end
  opts.on("-r", "--raw", "Raw output (suitable for scripts)") do
    o[:raw] = true
  end
end.parse!

sites = o[:site] ? [ o[:site] ] : get_api_sites.sort

# The table renderer is optional: fall back to raw output without it.
begin
  require 'terminal-table'
rescue LoadError
  STDERR.puts "'terminal-table' library not found, using raw mode"
  o[:raw] = true
end

header = ['site', 'id']
header += ['project', 'user'] if o[:all]
header += ['queue', 'state', 'start', 'end', 'duration', 'name', 'types', 'nodes', 'count' ]
rows = []

login = o[:all] ? nil : $login
m = Mutex::new
# Sites are queried in parallel. Every per-job value below is declared
# block-local (after the ';'), so that concurrent threads never share a
# variable: names such as 't' or 'nodes' are also top-level locals of this
# script, and a plain assignment inside the block would write to those.
sites.peach do |cursite; jobs|
  # FIXME implement pagination (see https://intranet.grid5000.fr/bugzilla/show_bug.cgi?id=13882 about missing doc)
  jobs = $g5k.get_jobs(cursite, login, %w{waiting launching running hold error terminated} - %w{error terminated}, true, false)
  jobs.each do |j; sa, sa_s, endt, endt_s, mm, ss, hh, dd, walltime_s, job_nodes, nodes_count, types, row|
    # Start time: the actual one when known, else the scheduled one.
    sa = if j['started_at'] and j['started_at'] > 0
           Time::at(j['started_at'])
         elsif j['scheduled_at'] and j['scheduled_at'] > 0
           Time::at(j['scheduled_at'])
         end
    sa_s = sa.nil? ? '' : sa.strftime("%Y-%m-%d %H:%M:%S")
    endt = sa ? sa + j['walltime'] : nil
    endt_s = endt.nil? ? '' : endt.strftime("%Y-%m-%d %H:%M:%S")

    # convert walltime to d/h/m/s
    mm, ss = j['walltime'].divmod(60)
    hh, mm = mm.divmod(60)
    dd, hh = hh.divmod(24)
    walltime_s = "%dd%2dh%2dm%2ds" % [dd, hh, mm, ss]
    job_nodes = nodeset(j['assigned_nodes'])
    nodes_count = j['assigned_nodes'].uniq.length
    # we hide the default monitor type to avoid useless noise
    types = j['types'].reject { |type| type == 'monitor=prom_.*default_metrics' }.join(',')

    row = [ cursite, j['uid'] ]
    row += [ j['project'], j['user'] ] if o[:all]
    row += [ j['queue'], j['state'], sa_s, endt_s, walltime_s, j['name'], types, job_nodes, nodes_count ]
    # Only the shared result list needs the lock.
    m.synchronize { rows << row }
  end
end

# Threads finish in any order: sort the rows by site, then by job id.
rows = rows.sort { |a1, a2| [a1[0], a1[1]] <=> [a2[0], a2[1]] }
if o[:raw]
  ([header] + rows).each do |l|
    puts l.join("\t")
  end
else
  table = Terminal::Table.new
  table.headings = header
  table.rows = rows
  puts table
end
|
635
|
+
|
636
|
+
elsif ['terminate', 't'].include?(ARGV[0])
|
637
|
+
# Body of 'grd terminate': releases the current user's reservations that are
# neither in error nor terminated, optionally restricted to one site (-s) or
# one job (-j). Confirmation is asked for each job unless -y or -j is given.
cute_init
ARGV.shift
o = {}
OptionParser.new do |opts|
  opts.banner = "usage: grd terminate [options]"
  opts.separator " Terminate reservations"
  opts.on("-s", "--site SITE", "Only terminate reservations on the specified site") do |d|
    o[:site] = resolve_site(d)
  end
  opts.on("-j", "--job JOB_ID", "Only terminate the specified job/reservation (implies -y)") do |j|
    o[:job] = j
    o[:yes] = true
  end
  opts.on("-y", "--yes", "Do not ask for confirmation") do
    o[:yes] = true
  end
end.parse!

sites = o[:site] ? [ o[:site] ] : get_api_sites
sites.each do |current_site|
  jobs = $g5k.get_jobs(current_site, $login, %w{waiting launching running hold error terminated} - %w{error terminated})
  jobs.each do |j|
    next if o[:job] and j['uid'].to_i != o[:job].to_i

    sa = (j['started_at'] && j['started_at'] > 0) ? Time::at(j['started_at']) : ''
    sig = "site=#{current_site} id=#{j['uid']} state=#{j['state']} started_at=#{sa} nodes=#{nodeset(j['assigned_nodes'])}"
    if o[:yes]
      puts "Terminating #{sig}"
      $g5k.release(j)
      next
    end

    print "Terminate #{sig} ? (Y/N) "
    s = nil
    loop do
      # Read from $stdin explicitly: Kernel#gets would read from files named
      # by leftover command-line arguments. At end of input nobody can
      # confirm, so the job is kept (same as answering "N").
      answer = $stdin.gets
      s = answer.nil? ? "N" : answer.chomp.upcase
      break if s == "Y" or s == "N"
      print "Terminate? (Y/N) "
    end
    $g5k.release(j) if s == 'Y'
  end
end
|
687
|
+
|
688
|
+
else
|
689
|
+
# Unknown or missing command: print the top-level help on standard output
# and exit with a failure status.
$stdout.puts(<<-EOF)
usage: grd <command> [options]

Available commands:
 bootstrap Reserve, provision, configure and connect to resources (alias: bs)
 list List resources reservations (alias: l)
 terminate Terminate resources reservation(s) (alias: t)

Use 'grd <command> --help' for details.
EOF
exit 1
|
700
|
+
end
|
701
|
+
|