xpflow 0.1b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/xpflow +96 -0
- data/lib/colorado.rb +198 -0
- data/lib/json/add/core.rb +243 -0
- data/lib/json/add/rails.rb +8 -0
- data/lib/json/common.rb +423 -0
- data/lib/json/editor.rb +1369 -0
- data/lib/json/ext.rb +28 -0
- data/lib/json/pure/generator.rb +442 -0
- data/lib/json/pure/parser.rb +320 -0
- data/lib/json/pure.rb +15 -0
- data/lib/json/version.rb +8 -0
- data/lib/json.rb +62 -0
- data/lib/mime/types.rb +881 -0
- data/lib/mime-types.rb +3 -0
- data/lib/restclient/abstract_response.rb +106 -0
- data/lib/restclient/exceptions.rb +193 -0
- data/lib/restclient/net_http_ext.rb +55 -0
- data/lib/restclient/payload.rb +235 -0
- data/lib/restclient/raw_response.rb +34 -0
- data/lib/restclient/request.rb +316 -0
- data/lib/restclient/resource.rb +169 -0
- data/lib/restclient/response.rb +24 -0
- data/lib/restclient.rb +174 -0
- data/lib/xpflow/bash.rb +341 -0
- data/lib/xpflow/bundle.rb +113 -0
- data/lib/xpflow/cmdline.rb +249 -0
- data/lib/xpflow/collection.rb +122 -0
- data/lib/xpflow/concurrency.rb +79 -0
- data/lib/xpflow/data.rb +393 -0
- data/lib/xpflow/dsl.rb +816 -0
- data/lib/xpflow/engine.rb +574 -0
- data/lib/xpflow/ensemble.rb +135 -0
- data/lib/xpflow/events.rb +56 -0
- data/lib/xpflow/experiment.rb +65 -0
- data/lib/xpflow/exts/facter.rb +30 -0
- data/lib/xpflow/exts/g5k.rb +931 -0
- data/lib/xpflow/exts/g5k_use.rb +50 -0
- data/lib/xpflow/exts/gui.rb +140 -0
- data/lib/xpflow/exts/model.rb +155 -0
- data/lib/xpflow/graph.rb +1603 -0
- data/lib/xpflow/graph_xpflow.rb +251 -0
- data/lib/xpflow/import.rb +196 -0
- data/lib/xpflow/library.rb +349 -0
- data/lib/xpflow/logging.rb +153 -0
- data/lib/xpflow/manager.rb +147 -0
- data/lib/xpflow/nodes.rb +1250 -0
- data/lib/xpflow/runs.rb +773 -0
- data/lib/xpflow/runtime.rb +125 -0
- data/lib/xpflow/scope.rb +168 -0
- data/lib/xpflow/ssh.rb +186 -0
- data/lib/xpflow/stat.rb +50 -0
- data/lib/xpflow/stdlib.rb +381 -0
- data/lib/xpflow/structs.rb +369 -0
- data/lib/xpflow/taktuk.rb +193 -0
- data/lib/xpflow/templates/ssh-config.basic +14 -0
- data/lib/xpflow/templates/ssh-config.inria +18 -0
- data/lib/xpflow/templates/ssh-config.proxy +13 -0
- data/lib/xpflow/templates/taktuk +6590 -0
- data/lib/xpflow/templates/utils/batch +4 -0
- data/lib/xpflow/templates/utils/bootstrap +12 -0
- data/lib/xpflow/templates/utils/hostname +3 -0
- data/lib/xpflow/templates/utils/ping +3 -0
- data/lib/xpflow/templates/utils/rsync +12 -0
- data/lib/xpflow/templates/utils/scp +17 -0
- data/lib/xpflow/templates/utils/scp_many +8 -0
- data/lib/xpflow/templates/utils/ssh +3 -0
- data/lib/xpflow/templates/utils/ssh-interactive +4 -0
- data/lib/xpflow/templates/utils/taktuk +19 -0
- data/lib/xpflow/threads.rb +187 -0
- data/lib/xpflow/utils.rb +569 -0
- data/lib/xpflow/visual.rb +230 -0
- data/lib/xpflow/with_g5k.rb +7 -0
- data/lib/xpflow.rb +349 -0
- metadata +135 -0
@@ -0,0 +1,931 @@
|
|
1
|
+
#
|
2
|
+
# name: XPFlow::G5K::Library
|
3
|
+
#
|
4
|
+
|
5
|
+
require 'tempfile'
|
6
|
+
require 'etc'
|
7
|
+
require 'json'
|
8
|
+
require 'xpflow'
|
9
|
+
require 'restclient'
|
10
|
+
require 'pp'
|
11
|
+
require 'digest'
|
12
|
+
require 'date'
|
13
|
+
require 'cgi'
|
14
|
+
require 'shellwords'
|
15
|
+
|
16
|
+
module XPFlow; module G5K
|
17
|
+
|
18
|
+
# Path of the generated, per-user SSH configuration used for all
# Grid'5000 connections made by this library.
SSH_CONFIG = "/tmp/.xpflow_ssh_config_#{Etc.getlogin}"

# (Re)writes the shared SSH config file: quiet, no host-key checks,
# agent forwarding, a "g5k" alias for the Nancy access machine and a
# "*.g5k" pattern that proxies through it with netcat.
# The file is truncated and rewritten under an exclusive flock.
def self.install_ssh_config_file(user)
  # TODO: this has to be fixed
  # race conditions are possible
  config_lines = [
    "LogLevel quiet\n",
    "StrictHostKeyChecking no\n",
    "UserKnownHostsFile /dev/null\n",
    "ForwardAgent yes\n",
    "Host g5k\n",
    " Hostname access.nancy.grid5000.fr\n",
    " User #{user}\n\n",
    "Host *.g5k\n",
    " User #{user}\n",
    " ProxyCommand ssh -F #{SSH_CONFIG} g5k \"nc -q 0 `basename %h .g5k` %p\"\n\n",
  ]
  File.open(SSH_CONFIG, File::WRONLY | File::CREAT, 0600) do |file|
    file.flock(File::LOCK_EX)
    file.truncate(0)
    config_lines.each { |line| file.write(line) }
  end
end
|
39
|
+
|
40
|
+
# Returns the user's public SSH key (~/.ssh/id_rsa.pub) with
# surrounding whitespace stripped.
#
# Raises RuntimeError if the key file does not exist.
def self.get_ssh_key
  name = File.expand_path('~/.ssh/id_rsa.pub')
  # File.exists? was deprecated for years and removed in Ruby 3.2;
  # File.exist? is the supported spelling.
  raise 'SSH key not present' unless File.exist?(name)
  return IO::read(name).strip
end
|
45
|
+
|
46
|
+
# Builds G5KRest clients from the globally configured credentials
# ($g5k_user / $g5k_pass). Client creation is serialized with a mutex
# so concurrent threads can safely share one factory.
class G5KRestFactory

  def initialize
    @mutex = Mutex.new
  end

  # Credentials come from global state set up by the :g5k library.
  def get_credentials
    [ $g5k_user, $g5k_pass ]
  end

  # Returns a fresh, authenticated G5KRest client.
  def connect
    @mutex.synchronize { G5KRest.new(*get_credentials()) }
  end

end
|
64
|
+
|
65
|
+
# Array subclass used as the array container for parsed Grid'5000
# JSON (see G5KJson.parse). Adds small helpers used throughout the
# library; `select` is pinned to Array's own implementation via the
# old_select alias.
class G5KArray < Array

  alias old_select select

  # The collection itself (mirrors the API objects' .list).
  def list
    self
  end

  # 'uid' attribute of every element.
  def uids
    map { |item| item['uid'] }
  end

  def select(&block)
    old_select(&block)
  end

  # Compact printable form: the __repr__ of each element.
  def __repr__
    map { |item| item.__repr__ }.to_s
  end

end
|
86
|
+
|
87
|
+
# Hash subclass representing a parsed Grid'5000 JSON object.
# Provides accessors for the common API fields ('items', 'links',
# 'uid', 'types') and hypermedia navigation helpers.
class G5KJson < Hash

  # The 'items' collection of a paginated response.
  def items
    self['items']
  end

  # href of the first link whose 'rel' equals r.
  # Raises NoMethodError (on nil) if no such link exists.
  def rel(r)
    self['links'].detect { |entry| entry['rel'] == r }['href']
  end

  def rel_self
    rel('self')
  end

  def rel_parent
    rel('parent')
  end

  # href of the first link with the given 'title'.
  def link(title)
    self['links'].detect { |entry| entry['title'] == title }['href']
  end

  def uid
    self['uid']
  end

  # Parses a JSON document, materializing objects as G5KJson and
  # arrays as G5KArray.
  def self.parse(s)
    JSON.parse(s, :object_class => G5KJson, :array_class => G5KArray)
  end

  # Compact printable form: the uid when present, otherwise a
  # hash of each value's __repr__.
  def __repr__
    identifier = self['uid']
    return identifier unless identifier.nil?
    Hash[self.map { |key, value| [key, value.__repr__] }].to_s
  end

  # Re-fetches this object from the API via its 'self' link.
  def refresh(g5k)
    g5k.get_json_raw(rel_self)
  end

  # :deploy when the job carries the 'deploy' type, :normal otherwise.
  def job_type
    self['types'].include?('deploy') ? :deploy : :normal
  end

end
|
132
|
+
|
133
|
+
class G5KRest

  # Basic Grid5000 Rest Interface
  #
  # Thin wrapper over RestClient for the Grid'5000 API at
  # api.grid5000.fr. All JSON responses are parsed into
  # G5KJson / G5KArray containers via G5KJson.parse.

  attr_reader :user

  # Builds a client from the globally configured credentials.
  def self.from_config
    G5KRestFactory.new.connect
  end

  # user/pass: API credentials; both are percent-encoded into the
  # endpoint URL (basic auth in the URL). Raises if either is nil.
  def initialize(user, pass)
    @user = user
    @pass = pass
    raise "You forgot to use :g5k library!" if (user.nil? or pass.nil?)
    user_escaped = CGI.escape(user)
    pass_escaped = CGI.escape(pass)
    @endpoint = "https://#{user_escaped}:#{pass_escaped}@api.grid5000.fr"
    # 15 s request timeout for every call made through this resource
    @api = RestClient::Resource.new(@endpoint, :timeout => 15)
  end

  # RestClient sub-resource for a path (leading '/' stripped so the
  # path is always relative to the endpoint).
  def resource(path)
    path = path[1..-1] if path.start_with?('/')
    return @api[path]
  end

  # DELETE on a path; 500s are re-raised unchanged (callers match on
  # the response body, e.g. 'already killed').
  def delete_json_raw(path)
    begin
      return resource(path).delete()
    rescue RestClient::InternalServerError => e
      raise
    end
  end

  # POST a Ruby object as JSON; returns the parsed JSON response.
  def post_json_raw(path, json)
    r = resource(path).post(json.to_json,
      :content_type => "application/json", :accept => "application/json")
    return G5KJson.parse(r)
  end

  # GET a path and parse the JSON response. Request timeouts are
  # retried up to 3 times (1 s pause between attempts), then re-raised.
  def get_json_raw(path)
    maxfails = 3
    fails = 0
    while true
      begin
        r = resource(path).get()
        return G5KJson.parse(r)
      rescue RestClient::RequestTimeout
        fails += 1
        raise if fails > maxfails
        Kernel.sleep(1.0)
      end
    end
  end


  # GET under the stable API version prefix ("sid/").
  def get_json(resource)
    return get_json_raw("sid/#{resource}")
  end

  # POST under the stable API version prefix ("sid/").
  def post_json(resource, json)
    begin
      return post_json_raw("sid/#{resource}", json)
    rescue => e
      raise
    end
  end

  # GET a collection resource and return its 'items'.
  def get_items(resource)
    return get_json(resource).items
  end


  # All sites, minus the two explicitly excluded below.
  def get_sites
    sites = get_items('sites').list
    # TODO: temporary fix for two sites
    sites = sites.select { |x| ! [ "Bordeaux", "Reims" ].include?(x['name']) }
    return sites
  end

  # Per-node status records for a site.
  def get_site_status(site)
    return get_items("sites/#{site}/status").list
  end

  # Running jobs on a site, optionally filtered by owner uid.
  def get_jobs(site, uid = nil)
    filter = uid.nil? ? "" : "&user_uid=#{uid}"
    resource = "sites/#{site}/jobs/?state=running#{filter}"
    return get_items(resource).list
  end

  # A single job by id.
  def get_job(site, jid)
    resource = "sites/#{site}/jobs/#{jid}"
    return get_json(resource)
  end

  def get_clusters(site)
    return get_items("sites/#{site}/clusters").list
  end

  # Network switches of a site, each annotated with a 'nodes' key
  # listing the FQDNs of nodes wired to it. Switches with no node
  # linecard (e.g. InfiniBand) are filtered out at the end.
  def get_switches(site)
    items = get_items("sites/#{site}/network_equipments")
    items = items.select { |x| x['kind'] == 'switch' }
    # extract nodes connected to those switches
    items.each { |switch|
      conns = switch['linecards'].detect { |c| c['kind'] == 'node' }
      next if conns.nil? # IB switches for example
      nodes = conns['ports'] \
        .select { |x| x != {} } \
        .map { |x| x['uid'] } \
        .map { |x| "#{x}.#{site}.grid5000.fr"}
      switch['nodes'] = nodes
    }
    return items.select { |it| it.key?('nodes') }
  end

  # A single switch by uid; raises if not found.
  def get_switch(site, name)
    s = get_switches(site).detect { |x| x.uid == name }
    raise "Unknown switch '#{name}'" if s.nil?
    return s
  end

  # Fetches the object behind one of obj's hypermedia links.
  def follow_link(obj, rel)
    return get_json_raw(obj.link(rel))
  end

  def follow_parent(obj)
    return get_json_raw(obj.rel_parent)
  end

  # Maps node FQDN -> 'system_state' for every node of a site.
  # NOTE(review): map is used purely for its side effect here;
  # each would express the intent better.
  def get_nodes_status(site)
    nodes = {}
    get_site_status(site).map do |node|
      name = node['node_uid']
      name = "#{name}.#{site}.grid5000.fr" unless name.end_with?('.fr')
      status = node['system_state']
      nodes[name] = status
    end
    return nodes
  end

end
|
273
|
+
|
274
|
+
# Process-wide factory; Library#g5k uses it to obtain REST clients.
Factory = G5KRestFactory.new
|
275
|
+
|
276
|
+
class Library < ActivityLibrary
|
277
|
+
|
278
|
+
attr_accessor :logging
|
279
|
+
attr_accessor :proxy
|
280
|
+
|
281
|
+
activities :reserve, :reserve_nodes, :release, :nodes, :switches, :switch,
|
282
|
+
:nodes_of_switch, :sites, :jobs, :wait_for_job, :nodes_available,
|
283
|
+
:release_all, :my_jobs, :wait_for_reservation, :nodes_available?,
|
284
|
+
:deploy, :execute, :copy, :bash, :dist_keys, :execute_frontend,
|
285
|
+
:bash_frontend, :distribute, :retrieve, :kavlan, :vlan_nodes,
|
286
|
+
:vlan_bash, :pick_reservation,
|
287
|
+
:node_site, :nodes_sites, :run_script, :rsync,
|
288
|
+
:version, :job,
|
289
|
+
:clean, :dist_ssh_keys
|
290
|
+
|
291
|
+
def initialize
|
292
|
+
super
|
293
|
+
@cache = Cache.new
|
294
|
+
G5K.install_ssh_config_file(g5k.user)
|
295
|
+
inject_library('__core__', CoreLibrary.new)
|
296
|
+
end
|
297
|
+
|
298
|
+
# Version string of this G5K activity library.
def version
  "0.1"
end
|
301
|
+
|
302
|
+
def inside_g5k
|
303
|
+
# checks if we are inside Grid5000
|
304
|
+
@cache.fetch(:inside_g5k) do
|
305
|
+
`hostname`.strip.end_with?('grid5000.fr')
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
def g5k
|
310
|
+
Factory.connect
|
311
|
+
end
|
312
|
+
|
313
|
+
def nodes_with_site(nodes)
|
314
|
+
# maps each node to its site
|
315
|
+
ss = site_uids()
|
316
|
+
h = {}
|
317
|
+
nodes.each do |n|
|
318
|
+
s = ss.detect { |x| n.include?(x) }
|
319
|
+
raise "Could not map node '#{n}' to its site" if s.nil?
|
320
|
+
h[n] = s
|
321
|
+
end
|
322
|
+
return h
|
323
|
+
end
|
324
|
+
|
325
|
+
def nodes_sites(nodes)
|
326
|
+
# returns a set of sites the given nodes are at
|
327
|
+
return nodes_with_site(nodes).values.uniq
|
328
|
+
end
|
329
|
+
|
330
|
+
def node_site(node)
|
331
|
+
return nodes_sites([ node ]).first
|
332
|
+
end
|
333
|
+
|
334
|
+
def nodes_status(nodes)
|
335
|
+
# maps nodes to their statuses
|
336
|
+
status = {}
|
337
|
+
nodes_sites(nodes).each do |site|
|
338
|
+
st = g5k.get_nodes_status(site)
|
339
|
+
st = st.select { |k, v| nodes.include?(k) }
|
340
|
+
st = Hash[st]
|
341
|
+
status = status.merge(st)
|
342
|
+
end
|
343
|
+
# beware: it is not guaranteed that every node will have its status!
|
344
|
+
return status
|
345
|
+
end
|
346
|
+
|
347
|
+
def nodes_available(nodes, opts = {})
|
348
|
+
ignore_dead = opts[:ignore_dead]
|
349
|
+
status = nodes_status(nodes)
|
350
|
+
avail = status.select do |k, v|
|
351
|
+
(v == 'free') or (ignore_dead and v == 'unknown')
|
352
|
+
end
|
353
|
+
return Hash[avail].keys
|
354
|
+
end
|
355
|
+
|
356
|
+
def nodes_available?(nodes, opts = {})
|
357
|
+
avail = nodes_available(nodes, opts)
|
358
|
+
unavail = nodes - avail
|
359
|
+
r = unavail.empty?
|
360
|
+
r.inject_method(:availability) do
|
361
|
+
1.0 - (unavail.length.to_f / nodes.length.to_f)
|
362
|
+
end
|
363
|
+
r.inject_method(:total) { nodes.length }
|
364
|
+
return r
|
365
|
+
end
|
366
|
+
|
367
|
+
def filter_dead_nodes(nodes)
|
368
|
+
# remove dead or unknown nodes
|
369
|
+
dead = []
|
370
|
+
nodes_status(nodes).each do |node, status|
|
371
|
+
dead.push(node) if status == 'unknown'
|
372
|
+
end
|
373
|
+
return nodes - dead
|
374
|
+
end
|
375
|
+
|
376
|
+
# Converts a human time spec into an ISO8601 timestamp string.
# "now" maps to the current time; any other spec is parsed together
# with the local timezone abbreviation (as reported by `date +%Z`).
def parse_time(spec)
  spec = spec.strip
  return DateTime.now.to_s if spec == "now"
  zone = `date +%Z`.strip
  DateTime.parse("#{spec} #{zone}").to_s
end
|
382
|
+
|
383
|
+
# Translates subnet-reservation options into an OAR resource fragment.
# Recognized options (first match wins, :slash has priority):
#   :slash    => N  -> "slash_N=1"        (one /N subnet)
#   :slash_22 => c  -> "slash_22=c"       (c /22 subnets)
#   :slash_18 => c  -> "slash_18=c"       (c /18 subnets)
# Returns nil when no subnet option is present.
def handle_slash(opts)
  return "slash_#{opts[:slash].to_i}=1" if opts[:slash]
  predefined = { :slash_22 => 22, :slash_18 => 18 }
  requested = predefined.select { |label, bits| opts.key?(label) }
  return nil if requested.empty?
  label, bits = requested.first
  "slash_#{bits}=#{opts[label].to_i}"
end
|
399
|
+
|
400
|
+
def pick_reservation(opts = {})
|
401
|
+
site = opts[:site]
|
402
|
+
jobs = site.nil? ? my_all_jobs() : jobs(site)
|
403
|
+
# pp jobs
|
404
|
+
jobs = jobs.select { |x| x['state'] == 'running' }
|
405
|
+
jobs = jobs.select { |x| x['user_uid'] == g5k.user } # WEIRD!
|
406
|
+
raise "No reservations available" if jobs.empty?
|
407
|
+
raise "Too many reservation meeting the criteria." if jobs.length > 1
|
408
|
+
job = jobs.first
|
409
|
+
info "Found reservation with ID = #{job["uid"]}"
|
410
|
+
j = g5k.get_json_raw(job.rel_self)
|
411
|
+
j = wait_for_job(j)
|
412
|
+
return j
|
413
|
+
end
|
414
|
+
|
415
|
+
def job(site, jid)
|
416
|
+
j = g5k.get_job(site, jid.to_i)
|
417
|
+
j = wait_for_job(j)
|
418
|
+
return j
|
419
|
+
end
|
420
|
+
|
421
|
+
# Reserves nodes via OAR, the high-level way.
#
# Options (all via the opts hash):
#   :nodes  - node count (default 1) OR an explicit array of hostnames
#   :time   - walltime (default '01:00:00')
#   :site   - target site (mandatory)
#   :at     - start time spec ("now" or a parseable date); nil = ASAP
#   :type   - :normal (allow_classic_ssh) or :deploy
#   :keep   - when true, do NOT auto-release the job at engine shutdown
#   :name   - job name (default 'xpflow job')
#   :cmd    - command to run; defaults to "sleep <walltime in seconds>"
#   :async  - when true, return without waiting for the job to run
#   :ignore_dead - drop dead/unknown nodes from an explicit node list
#   :vlan   - when true, also reserve one kavlan VLAN
#   :cluster, :slash/:slash_22/:slash_18 - resource constraints
#
# Returns the job object (running, unless :async).
def reserve_nodes(opts)
  # helper for making the reservations the easy way
  nodes = opts.fetch(:nodes, 1)
  time = opts.fetch(:time, '01:00:00')
  at = opts[:at]
  slash = handle_slash(opts)
  site = opts[:site]
  type = opts.fetch(:type, :normal)
  keep = opts[:keep]
  name = opts.fetch(:name, 'xpflow job')
  command = opts[:cmd]
  async = opts[:async]
  ignore_dead = opts[:ignore_dead]
  props = nil
  vlan = opts[:vlan]
  cluster = opts[:cluster]

  raise 'At least nodes, time and site must be given' \
    if [nodes, time, site].any? { |x| x.nil? }

  secs = Timespan.to_secs(time)
  time = Timespan.to_time(time)

  # An explicit node list is turned into an OAR 'host in (...)'
  # property and a plain node count.
  if nodes.is_a?(Array)
    all_nodes = nodes
    nodes = filter_dead_nodes(nodes) if ignore_dead
    removed_nodes = all_nodes - nodes
    info "Ignored nodes #{removed_nodes}." unless removed_nodes.empty?
    hosts = nodes.map { |n| "'#{n}'" }.sort.join(',')
    props = "host in (#{hosts})"
    nodes = nodes.length
  end

  raise 'Nodes must be an integer.' unless nodes.is_a?(Integer)
  site = site.__repr__
  raise 'Type must be either :deploy or :normal' \
    unless (type.respond_to?(:to_sym) && [ :normal, :deploy ].include?(type.to_sym))
  command = "sleep #{secs}" if command.nil?
  type = type.to_sym

  # Assemble the OAR resource string; constraint prefixes are
  # prepended in reverse order of priority.
  resources = "/nodes=#{nodes},walltime=#{time}"
  resources = "{cluster='#{cluster}'}" + resources unless cluster.nil?
  resources = "{type='kavlan'}/vlan=1+" + resources if vlan == true
  resources = "#{slash}+" + resources unless slash.nil?

  payload = {
    'resources' => resources,
    'name' => name,
    'command' => command
  }

  info "Reserving resources: #{resources} (type: #{type}) (in #{site})"

  payload['properties'] = props unless props.nil?
  if type == :deploy
    payload['types'] = [ 'deploy' ]
  else
    payload['types'] = [ 'allow_classic_ssh' ]
  end

  # Advance reservation, if a start time was requested.
  unless at.nil?
    dt = parse_time(at)
    payload['reservation'] = dt
    info "Starting this reservation at #{dt}"
  end

  begin
    r = g5k.post_json("sites/#{site}/jobs", payload)
  rescue => e
    raise
  end

  # it may be a different thread that releases reservations
  # therefore we need to dereference proxy which
  # in fact uses Thread.current and is local to the thread...

  engine = proxy.engine

  # Auto-release the job when the engine finishes, unless :keep.
  engine.on_finish do
    engine.verbose("Releasing job at #{r.rel_self}")
    release(r)
  end if keep != true

  job = g5k.get_json_raw(r.rel_self)
  job = wait_for_job(job) if async != true
  return job
end
|
508
|
+
|
509
|
+
# Logs a message through the injected @logging callable when one is
# set; otherwise through the engine's :g5k log channel.
def info(msg)
  return @logging.call(msg) if @logging
  proxy.engine.log(msg, :g5k)
end
|
516
|
+
|
517
|
+
# Blocks until the job reaches the 'running' state, refreshing and
# polling every 5 seconds. Logs the scheduled start time when the API
# reports one. Raises if the job goes to 'finishing', and gives up via
# Timeout after 36000 s (10 hours). NOTE: the original comment said
# "timeouts after 10 seconds" — the code clearly uses 36000 s.
#
# Returns the (refreshed) running job object.
def wait_for_job(job)
  jid = job.__repr__
  info "Waiting for reservation #{jid}"
  Timeout.timeout(36000) do
    while true
      job = job.refresh(g5k)
      t = job['scheduled_at']
      if !t.nil?
        t = Time.at(t)
        secs = [ t - Time.now, 0 ].max.to_i
        info "Reservation #{jid} should be available at #{t} (#{secs} s)"
      end
      break if job['state'] == 'running'
      raise "Job is finishing." if job['state'] == 'finishing'
      Kernel.sleep(5)
    end
  end
  info "Reservation #{jid} ready"
  return job
end
|
539
|
+
|
540
|
+
def release_all(site)
|
541
|
+
# releases all jobs on a site
|
542
|
+
site = site.__repr__
|
543
|
+
Timeout.check(20) do
|
544
|
+
jobs = my_jobs(site)
|
545
|
+
pass if jobs.length == 0
|
546
|
+
begin
|
547
|
+
jobs.each { |j| release(j) }
|
548
|
+
rescue RestClient::InternalServerError => e
|
549
|
+
raise unless e.response.include?('already killed')
|
550
|
+
end
|
551
|
+
end
|
552
|
+
end
|
553
|
+
|
554
|
+
def release(r)
|
555
|
+
begin
|
556
|
+
return g5k.delete_json_raw(r.rel_self)
|
557
|
+
rescue RestClient::InternalServerError => e
|
558
|
+
raise unless e.response.include?('already killed')
|
559
|
+
end
|
560
|
+
end
|
561
|
+
|
562
|
+
# Placeholder — generic reservation is not supported; callers should
# use reserve_nodes instead.
def reserve(opts)
  raise 'not implemented'
end
|
565
|
+
|
566
|
+
def sites
|
567
|
+
@cache.fetch(:sites) do
|
568
|
+
g5k.get_sites
|
569
|
+
end
|
570
|
+
end
|
571
|
+
|
572
|
+
def site_uids
|
573
|
+
return sites.uids
|
574
|
+
end
|
575
|
+
|
576
|
+
# Nodes of a job-like object: deployment results carry them under
# 'nodes', plain jobs under 'assigned_nodes'.
def nodes(r)
  key = r.key?('nodes') ? 'nodes' : 'assigned_nodes'
  r[key]
end
|
580
|
+
|
581
|
+
def vlan_nodes(r)
|
582
|
+
vlan = kavlan(r)
|
583
|
+
return vlan[:hosts]
|
584
|
+
end
|
585
|
+
|
586
|
+
def jobs(site)
|
587
|
+
name = site.__repr__
|
588
|
+
return g5k.get_jobs(name)
|
589
|
+
end
|
590
|
+
|
591
|
+
def my_jobs(site)
|
592
|
+
name = site.__repr__
|
593
|
+
return g5k.get_jobs(name, g5k.user)
|
594
|
+
end
|
595
|
+
|
596
|
+
def my_all_jobs
|
597
|
+
ss = sites()
|
598
|
+
return ss.map { |s| my_jobs(s) }.reduce(:+)
|
599
|
+
end
|
600
|
+
|
601
|
+
def switches(site)
|
602
|
+
name = site.__repr__
|
603
|
+
return g5k.get_switches(name)
|
604
|
+
end
|
605
|
+
|
606
|
+
# Looks up a single switch (by name) on a site.
# Fix: the normalized site name (site.__repr__) was computed but the
# raw site object was passed to get_switch, unlike the sibling
# switches() — non-String site arguments would build a wrong URL.
def switch(site, sw)
  name = site.__repr__
  return g5k.get_switch(name, sw)
end
|
610
|
+
|
611
|
+
def wait_for_reservation(opts = {})
|
612
|
+
site = opts.fetch(:site, :any).__repr__
|
613
|
+
timeout = opts.fetch(:timeout, Infinity)
|
614
|
+
name = opts[:name]
|
615
|
+
|
616
|
+
timeout = Timespan.to_secs(timeout)
|
617
|
+
places = sites.uids
|
618
|
+
places = places.select { |uid| uid == site } if site != 'any'
|
619
|
+
raise "No '#{site}' site" if places.empty?
|
620
|
+
job = nil
|
621
|
+
Timeout.check(timeout) do
|
622
|
+
js = places.map { |p| my_jobs(p) }.reduce(:+)
|
623
|
+
js = js.select { |j| j['name'] == name } unless name.nil?
|
624
|
+
job = js.first
|
625
|
+
pass unless job.nil?
|
626
|
+
end
|
627
|
+
job = wait_for_job(job)
|
628
|
+
return job
|
629
|
+
end
|
630
|
+
|
631
|
+
# Queries kavlan on the job's site frontend for the VLAN attached to
# this job. Returns { :uid => <vlan id>, :hosts => [<hostnames>] },
# or nil when kavlan reports 'no vlan found'; any other kavlan failure
# is re-raised.
def kavlan(job)
  jid = job['uid']
  site = g5k.follow_parent(job).uid
  begin
    # Bash DSL block executed on the site frontend.
    info = bash_frontend(site) do
      uid = run "kavlan -V -j #{jid}"
      list = run "kavlan -l -j #{jid}"
      { :uid => uid.to_i, :hosts => list.lines.map { |x| x.strip } }
    end
  rescue Bash::StatusError => e
    # kavlan exits non-zero when the job simply has no VLAN;
    # distinguish that benign case by its output.
    raise e if e.output.strip != 'no vlan found'
    return nil
  end
  return info
end
|
646
|
+
|
647
|
+
|
648
|
+
def get_ssh_key_for_site(site)
|
649
|
+
ssh_key = bash_frontend(site) do
|
650
|
+
name = expand_path '~/.ssh/id_rsa.pub'
|
651
|
+
(exists name) ? (contents name).strip : nil
|
652
|
+
end
|
653
|
+
return ssh_key
|
654
|
+
end
|
655
|
+
|
656
|
+
# Deploys an environment on all nodes of a (deployment-type) job via
# the kadeploy REST endpoint, installing the local SSH key plus (when
# available) the site frontend's key on the deployed nodes. Blocks
# until the deployment terminates and raises unless every node
# reports state 'OK'.
#
# opts:
#   :env - environment name to deploy (mandatory)
#
# Returns the final deployment object.
def deploy(job, opts = {})
  # TODO: make sure this is deployment job
  # TODO: this is deprecated

  nodes = job['assigned_nodes']
  env = opts[:env]

  site = g5k.follow_parent(job).uid

  keys = [ G5K.get_ssh_key() ]

  frontend_ssh_key = get_ssh_key_for_site(site)

  keys.push(frontend_ssh_key) unless frontend_ssh_key.nil?

  info "Deploying #{keys.length} SSH keys"

  raise "Environment must be given" if env.nil?

  payload = {
    'nodes' => nodes,
    'environment' => env,
    'key' => keys.join("\n") + "\n",
  }

  # If the job has a kavlan VLAN, deploy into it.
  vlan = kavlan(job)

  if !vlan.nil?
    payload['vlan'] = vlan[:uid]
    info "Found VLAN with uid = #{vlan[:uid]}"
  end

  info "Creating deployment"
  # puts payload.inspect

  begin
    r = g5k.post_json("sites/#{site}/deployments", payload)
  rescue => e
    raise e
  end

  info "Entering waiting loop"

  # Poll (project Timeout.check DSL; `pass` exits the loop) until the
  # deployment reaches 'terminated'.
  Timeout.check(Infinity) do
    r = r.refresh(g5k)
    pass if r['status'] == 'terminated'
    info "Waiting for deployment to finish (state = #{r['status']})."
  end

  # Every node must report state == 'OK'.
  ok = r['result'].map { |node, info| info }.all? { |x| x['state'] == 'OK' }

  raise "Deployment (at least partially) failed" unless ok

  return r

end
|
712
|
+
|
713
|
+
# Locates a node among the user's running jobs.
# First infers the site from the node name (substring match against
# site uids), then scans the user's jobs (all sites when the site is
# unknown) for an assigned node whose name starts with `node`.
#
# Returns [job, full_node_name, site]; each element is nil when the
# node was not found.
def find_node(node)
  j, n, site = nil, nil, nil
  # 1. Try to find a site.
  info "Looking for node #{node}..."
  # info "Sites considered: #{site_uids.inspect}"
  site = site_uids.detect { |s| node.include?(s) }
  # info "Site is #{site}"
  jobs = (site.nil?) ? my_all_jobs() : my_jobs(site)
  # info "Jobs considered: #{jobs.inspect}"
  jobs = jobs.map { |x| x.refresh(g5k) }
  for job in jobs do
    # NOTE(review): the block parameter shadows the outer `n`; the
    # detect result is still assigned to the outer variable.
    n = job['assigned_nodes'].detect { |n| n.start_with?(node) }
    if n.nil? == false
      j = job
      site = g5k.follow_parent(job).uid if site.nil?
      break
    end
  end
  return j, n, site
end
|
733
|
+
|
734
|
+
# Builds the ssh command line that runs `cmd` on node 'n' of 'job'
# at 'site', picking the route based on where we run (inside/outside
# Grid'5000) and the job type (deploy jobs allow direct root ssh,
# normal jobs go through oarsh).
def _ssh(site, job, n, cmd)
  # connects to node 'n', on site 'site', being a part of 'job'
  cmd = Shellwords.escape(cmd)
  # NOTE(review): cmd2 escapes the already-escaped cmd — presumably
  # intentional for the routes that traverse two shell layers
  # (proxy ssh + oarsh/ssh); confirm before changing.
  cmd2 = Shellwords.escape(cmd)
  bashc = "OAR_JOB_ID=#{job.uid} oarsh #{n} -- #{cmd2}"
  if inside_g5k
    if job.job_type == :deploy
      return "ssh root@#{n} -- #{cmd}"
    else
      # TODO: this can be simplified if we are
      # running on 'site'
      return "ssh -F #{SSH_CONFIG} #{site}.grid5000.fr -- #{bashc}"
    end
  else
    # Outside the grid: hop through the *.g5k proxy from SSH_CONFIG.
    proxy = "ssh -F #{SSH_CONFIG} #{site}.g5k"
    if job.job_type == :deploy
      return "#{proxy} -- ssh root@#{n} -- #{cmd2}"
    else
      return "#{proxy} -- #{bashc}"
    end
  end
end
|
756
|
+
|
757
|
+
def _ssh_deploy(site, node, cmd)
|
758
|
+
gw = _ssh_gw(site, node)
|
759
|
+
return "#{gw} -- #{cmd}"
|
760
|
+
end
|
761
|
+
|
762
|
+
def _ssh_gw(site, node)
|
763
|
+
if inside_g5k
|
764
|
+
return "ssh root@#{node}"
|
765
|
+
else
|
766
|
+
proxy = "ssh -F #{SSH_CONFIG} #{site}.g5k"
|
767
|
+
return "#{proxy} -- ssh root@#{node}"
|
768
|
+
end
|
769
|
+
end
|
770
|
+
|
771
|
+
def _frontend(site, cmd)
|
772
|
+
if inside_g5k
|
773
|
+
ssh = "ssh -F #{SSH_CONFIG} #{site}.grid5000.fr"
|
774
|
+
else
|
775
|
+
ssh = "ssh -F #{SSH_CONFIG} #{site}.g5k"
|
776
|
+
end
|
777
|
+
cmd = Shellwords.escape(cmd)
|
778
|
+
return "#{ssh} -- #{cmd}"
|
779
|
+
end
|
780
|
+
|
781
|
+
def execute(node, cmd, prefix = '', postfix = '')
|
782
|
+
if node.include?("kavlan")
|
783
|
+
ssh = _ssh_deploy('nancy', node, 'bash') # TODO: how does it work?
|
784
|
+
ssh = ssh.gsub("bash", "") # TODO: OMG
|
785
|
+
prog = "#{prefix}#{ssh} #{cmd} #{postfix}"
|
786
|
+
else
|
787
|
+
job, n, site = find_node(node)
|
788
|
+
raise "Node '#{node}' not found" if job.nil?
|
789
|
+
prog = "#{prefix}#{_ssh(site, job, n, cmd)}#{postfix}"
|
790
|
+
end
|
791
|
+
info "Running command: #{prog}"
|
792
|
+
return `#{prog}`
|
793
|
+
end
|
794
|
+
|
795
|
+
def execute_frontend(site, cmd, prefix = '')
|
796
|
+
bash_frontend(site) do
|
797
|
+
run(cmd)
|
798
|
+
end
|
799
|
+
end
|
800
|
+
|
801
|
+
# Opens a remote Bash DSL session on the given node, routing kavlan
# nodes through vlan_bash.
# Fix: the kavlan branch referenced an undefined local `debug`
# (guaranteed NameError at runtime); forward `opts`, which is what
# vlan_bash accepts.
def bash(node, opts = {}, &block)
  return vlan_bash(node, opts, &block) if node.include?("kavlan")
  job, n, site = find_node(node)
  raise "Node #{node} not found" if job.nil?
  ssh = _ssh(site, job, n, 'bash')
  info "Running bash via: #{ssh}"
  return Bash.bash(ssh, opts, &block)
end
|
809
|
+
|
810
|
+
def vlan_bash(node, opts = {}, &block)
|
811
|
+
ssh = _ssh_deploy('nancy', node, 'bash') # TODO: how does it work?
|
812
|
+
info "Running vlan bash via: #{ssh}"
|
813
|
+
return Bash.bash(ssh, opts, &block)
|
814
|
+
end
|
815
|
+
|
816
|
+
def bash_local(&block)
|
817
|
+
return Bash.bash(&block)
|
818
|
+
end
|
819
|
+
|
820
|
+
def bash_frontend(site, opts = {}, &block)
|
821
|
+
proxy = _frontend(site, 'bash')
|
822
|
+
info "Running bash via: #{proxy}"
|
823
|
+
return Bash.bash(proxy, opts, &block)
|
824
|
+
end
|
825
|
+
|
826
|
+
# Copies a local file to `path` on a remote node by piping it through
# `tee` over the node's ssh route. When the remote path is an existing
# directory, the file keeps its basename inside it.
#
# Fixes:
# - The info/execute strings contained the literal text "#(unknown)"
#   (an extraction artifact) where the `filename` interpolation
#   clearly belongs — "cat #(unknown)" would even comment out the
#   rest of the shell command. Restored "#{filename}".
# - File.exists? (removed in Ruby 3.2) replaced by File.exist?.
def copy(filename, node, path)
  raise 'File does not exist!' unless File.exist?(filename)
  base = File.basename(filename)
  # Resolve the destination on the remote side (Bash DSL block).
  bash(node) do
    path = expand_path(path) # get rid of ~
    if exists(path)
      type = get_type(path)
      if type == :dir
        path = File.join(path, base)
      elsif type == :file
        # pass
      else
        raise 'Unknown file type.'
      end
    else
      # the path does not exist
    end
  end
  info "Copying file #{filename} to #{node}:#{path}"
  return execute(node, "tee #{path}", "cat #{filename} | ") # FIX THIS
end
|
847
|
+
|
848
|
+
def retrieve(node, path, dir = '.')
|
849
|
+
files = bash(node) do
|
850
|
+
glob(path)
|
851
|
+
end
|
852
|
+
files.each do |f|
|
853
|
+
base = File.basename(f)
|
854
|
+
dest = File.join(dir, base)
|
855
|
+
execute(node, "cat #{f}", "", " > #{dest}")
|
856
|
+
end
|
857
|
+
end
|
858
|
+
|
859
|
+
def dist_keys(master, slaves)
|
860
|
+
# generates and distributes SSH keys so that
|
861
|
+
# master can connect password-lessly
|
862
|
+
# if public key is already present, it won't be
|
863
|
+
# recreated
|
864
|
+
label = "#{master} && #{slaves}"
|
865
|
+
if @cache.get(label)
|
866
|
+
info "SSH keys already distributed."
|
867
|
+
return
|
868
|
+
end
|
869
|
+
priv = '~/.ssh/id_rsa'
|
870
|
+
pub = "#{priv}.pub"
|
871
|
+
key = bash(master) do
|
872
|
+
trunc '~/.ssh/config'
|
873
|
+
append_line '~/.ssh/config', 'Host *'
|
874
|
+
append_line '~/.ssh/config', 'StrictHostKeyChecking no'
|
875
|
+
if !exists(priv)
|
876
|
+
run("ssh-keygen -N '' -q -f #{priv}")
|
877
|
+
end
|
878
|
+
run("ssh-keygen -y -f #{priv}").strip
|
879
|
+
end
|
880
|
+
info "The key is: #{key}"
|
881
|
+
(slaves + [ master ]).uniq.each do |node|
|
882
|
+
bash(node) do
|
883
|
+
make_dirs '~/.ssh'
|
884
|
+
append_line '~/.ssh/authorized_keys', key
|
885
|
+
end
|
886
|
+
end
|
887
|
+
@cache.set(label, true)
|
888
|
+
info "Keys distributed."
|
889
|
+
end
|
890
|
+
|
891
|
+
def dist_ssh_keys(nodes)
|
892
|
+
master = nodes.first
|
893
|
+
rest = nodes.tail
|
894
|
+
dist_keys(master, rest)
|
895
|
+
return master
|
896
|
+
end
|
897
|
+
|
898
|
+
def clean
|
899
|
+
proxy.engine.inline_process :"g5k-clean" do
|
900
|
+
sites = run :"sites"
|
901
|
+
forall sites do |s|
|
902
|
+
log "Cleaning ", s
|
903
|
+
jobs = run :"my_jobs", s
|
904
|
+
log "Cleaning ", s, " from ", (length_of jobs), " jobs..."
|
905
|
+
run :"release_all", s
|
906
|
+
log "Done with #{name_of s}"
|
907
|
+
end
|
908
|
+
end
|
909
|
+
return true
|
910
|
+
end
|
911
|
+
|
912
|
+
def run_script(node, name)
|
913
|
+
# TODO
|
914
|
+
tmp = '/tmp/script-xpflow.sh'
|
915
|
+
info "Pushing script #{name} to the node #{node}"
|
916
|
+
copy($files[name], node, tmp) # TODO
|
917
|
+
return execute(node, "bash -e #{tmp}")
|
918
|
+
end
|
919
|
+
|
920
|
+
def rsync(node, name, where)
|
921
|
+
info "Pushing '#{name}' to the node #{node}"
|
922
|
+
rsh = _ssh_gw('sophia', node) # TODO: how does it work?
|
923
|
+
dir = $dirs[name]
|
924
|
+
cmd = "rsync --delete --numeric-ids --archive --bwlimit=100000 --rsh '#{rsh}' #{dir} :#{where}"
|
925
|
+
info "Running: #{cmd}"
|
926
|
+
return `#{cmd}`
|
927
|
+
end
|
928
|
+
|
929
|
+
end
|
930
|
+
|
931
|
+
end; end
|