xpflow 0.1b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/bin/xpflow +96 -0
  2. data/lib/colorado.rb +198 -0
  3. data/lib/json/add/core.rb +243 -0
  4. data/lib/json/add/rails.rb +8 -0
  5. data/lib/json/common.rb +423 -0
  6. data/lib/json/editor.rb +1369 -0
  7. data/lib/json/ext.rb +28 -0
  8. data/lib/json/pure/generator.rb +442 -0
  9. data/lib/json/pure/parser.rb +320 -0
  10. data/lib/json/pure.rb +15 -0
  11. data/lib/json/version.rb +8 -0
  12. data/lib/json.rb +62 -0
  13. data/lib/mime/types.rb +881 -0
  14. data/lib/mime-types.rb +3 -0
  15. data/lib/restclient/abstract_response.rb +106 -0
  16. data/lib/restclient/exceptions.rb +193 -0
  17. data/lib/restclient/net_http_ext.rb +55 -0
  18. data/lib/restclient/payload.rb +235 -0
  19. data/lib/restclient/raw_response.rb +34 -0
  20. data/lib/restclient/request.rb +316 -0
  21. data/lib/restclient/resource.rb +169 -0
  22. data/lib/restclient/response.rb +24 -0
  23. data/lib/restclient.rb +174 -0
  24. data/lib/xpflow/bash.rb +341 -0
  25. data/lib/xpflow/bundle.rb +113 -0
  26. data/lib/xpflow/cmdline.rb +249 -0
  27. data/lib/xpflow/collection.rb +122 -0
  28. data/lib/xpflow/concurrency.rb +79 -0
  29. data/lib/xpflow/data.rb +393 -0
  30. data/lib/xpflow/dsl.rb +816 -0
  31. data/lib/xpflow/engine.rb +574 -0
  32. data/lib/xpflow/ensemble.rb +135 -0
  33. data/lib/xpflow/events.rb +56 -0
  34. data/lib/xpflow/experiment.rb +65 -0
  35. data/lib/xpflow/exts/facter.rb +30 -0
  36. data/lib/xpflow/exts/g5k.rb +931 -0
  37. data/lib/xpflow/exts/g5k_use.rb +50 -0
  38. data/lib/xpflow/exts/gui.rb +140 -0
  39. data/lib/xpflow/exts/model.rb +155 -0
  40. data/lib/xpflow/graph.rb +1603 -0
  41. data/lib/xpflow/graph_xpflow.rb +251 -0
  42. data/lib/xpflow/import.rb +196 -0
  43. data/lib/xpflow/library.rb +349 -0
  44. data/lib/xpflow/logging.rb +153 -0
  45. data/lib/xpflow/manager.rb +147 -0
  46. data/lib/xpflow/nodes.rb +1250 -0
  47. data/lib/xpflow/runs.rb +773 -0
  48. data/lib/xpflow/runtime.rb +125 -0
  49. data/lib/xpflow/scope.rb +168 -0
  50. data/lib/xpflow/ssh.rb +186 -0
  51. data/lib/xpflow/stat.rb +50 -0
  52. data/lib/xpflow/stdlib.rb +381 -0
  53. data/lib/xpflow/structs.rb +369 -0
  54. data/lib/xpflow/taktuk.rb +193 -0
  55. data/lib/xpflow/templates/ssh-config.basic +14 -0
  56. data/lib/xpflow/templates/ssh-config.inria +18 -0
  57. data/lib/xpflow/templates/ssh-config.proxy +13 -0
  58. data/lib/xpflow/templates/taktuk +6590 -0
  59. data/lib/xpflow/templates/utils/batch +4 -0
  60. data/lib/xpflow/templates/utils/bootstrap +12 -0
  61. data/lib/xpflow/templates/utils/hostname +3 -0
  62. data/lib/xpflow/templates/utils/ping +3 -0
  63. data/lib/xpflow/templates/utils/rsync +12 -0
  64. data/lib/xpflow/templates/utils/scp +17 -0
  65. data/lib/xpflow/templates/utils/scp_many +8 -0
  66. data/lib/xpflow/templates/utils/ssh +3 -0
  67. data/lib/xpflow/templates/utils/ssh-interactive +4 -0
  68. data/lib/xpflow/templates/utils/taktuk +19 -0
  69. data/lib/xpflow/threads.rb +187 -0
  70. data/lib/xpflow/utils.rb +569 -0
  71. data/lib/xpflow/visual.rb +230 -0
  72. data/lib/xpflow/with_g5k.rb +7 -0
  73. data/lib/xpflow.rb +349 -0
  74. metadata +135 -0
@@ -0,0 +1,931 @@
1
+ #
2
+ # name: XPFlow::G5K::Library
3
+ #
4
+
5
+ require 'tempfile'
6
+ require 'etc'
7
+ require 'json'
8
+ require 'xpflow'
9
+ require 'restclient'
10
+ require 'pp'
11
+ require 'digest'
12
+ require 'date'
13
+ require 'cgi'
14
+ require 'shellwords'
15
+
16
+ module XPFlow; module G5K
17
+
18
# Per-user path of the generated SSH client configuration.
# Etc.getlogin returns nil when the process has no login name available
# (cron jobs, some containers); without a fallback every such user would
# share the very same file name, so the numeric uid is used instead.
SSH_CONFIG = "/tmp/.xpflow_ssh_config_#{Etc.getlogin || Process.uid}"
19
+
20
# Installs the SSH client configuration (at SSH_CONFIG) that routes
# connections to Grid'5000 through the access.nancy gateway.
#
# The configuration is written to a private staging file created next to
# SSH_CONFIG and then renamed over it. A rename replaces the file in one
# step, so an ssh process started concurrently never reads a truncated or
# half-written configuration (the race the previous truncate-and-write
# implementation was exposed to).
#
# user - login used for the gateway and for every "*.g5k" host.
#
# The return value is not meaningful.
def self.install_ssh_config_file(user)
  config = [
    "LogLevel quiet\n",
    "StrictHostKeyChecking no\n",
    "UserKnownHostsFile /dev/null\n",
    "ForwardAgent yes\n",
    "Host g5k\n",
    " Hostname access.nancy.grid5000.fr\n",
    " User #{user}\n\n",
    "Host *.g5k\n",
    " User #{user}\n",
    " ProxyCommand ssh -F #{SSH_CONFIG} g5k \"nc -q 0 `basename %h .g5k` %p\"\n\n"
  ].join

  # unique per process/call so concurrent installers never share a staging file
  staging = "#{SSH_CONFIG}.#{Process.pid}.#{rand(1 << 32)}"
  begin
    File.open(staging, File::WRONLY | File::CREAT | File::EXCL, 0600) do |f|
      f.write(config)
    end
    File.rename(staging, SSH_CONFIG)
  ensure
    # only still present when writing or renaming failed
    File.unlink(staging) if File.exist?(staging)
  end
end
39
+
40
# Returns the content of the current user's public RSA key
# (~/.ssh/id_rsa.pub) with surrounding whitespace removed.
#
# Raises RuntimeError ('SSH key not present') when the file is missing.
def self.get_ssh_key
  name = File.expand_path('~/.ssh/id_rsa.pub')
  # File.exists? was deprecated and removed in Ruby 3.2; exist? works everywhere
  raise 'SSH key not present' unless File.exist?(name)
  return IO::read(name).strip
end
45
+
46
# Builds G5KRest clients from the credentials stored in the
# $g5k_user / $g5k_pass globals.
class G5KRestFactory

  def initialize
    # serializes client construction across threads
    @mutex = Mutex.new
  end

  # Returns the [user, password] pair read from the globals.
  def get_credentials
    [ $g5k_user, $g5k_pass ]
  end

  # Creates a new G5KRest client while holding the factory mutex.
  def connect
    @mutex.synchronize { G5KRest.new(*get_credentials) }
  end

end
64
+
65
# Array flavour used for JSON arrays returned by the API
# (see the :array_class option passed to JSON.parse in G5KJson.parse).
class G5KArray < Array

  # keep the stock implementation reachable under its old name
  alias old_select select

  # Returns the collection itself.
  def list
    self
  end

  # Returns the 'uid' entry of every element.
  def uids
    map { |entry| entry['uid'] }
  end

  # Delegates to the aliased stock select.
  def select(&block)
    old_select(&block)
  end

  # String form of the elements' own __repr__ values.
  def __repr__
    map { |entry| entry.__repr__ }.to_s
  end

end
86
+
87
# Hash flavour used for JSON objects returned by the API, with helpers
# for the 'items', 'links', 'uid' and 'types' entries.
class G5KJson < Hash

  # Returns the 'items' entry.
  def items
    return self['items']
  end

  # Returns the href of the link whose 'rel' equals r.
  # Raises RuntimeError when there is no such link (previously this
  # surfaced as a NoMethodError on nil).
  def rel(r)
    entry = self['links'].detect { |x| x['rel'] == r }
    raise "No link with rel '#{r}'" if entry.nil?
    return entry['href']
  end

  def rel_self
    return rel('self')
  end

  def rel_parent
    return rel('parent')
  end

  # Returns the href of the link whose 'title' equals title.
  # Raises RuntimeError when there is no such link.
  def link(title)
    entry = self['links'].detect { |x| x['title'] == title }
    raise "No link titled '#{title}'" if entry.nil?
    return entry['href']
  end

  def uid
    return self['uid']
  end

  # Parses a JSON document, materializing objects as G5KJson and
  # arrays as G5KArray.
  def self.parse(s)
    return JSON.parse(s, :object_class => G5KJson, :array_class => G5KArray)
  end

  # The uid when present, otherwise a string rendering of the hash
  # built from every value's own __repr__.
  def __repr__
    return self['uid'] unless self['uid'].nil?
    return Hash[self.map { |k, v| [k, v.__repr__ ] }].to_s
  end

  # Re-fetches this object through its 'self' link using the given client.
  def refresh(g5k)
    return g5k.get_json_raw(rel_self)
  end

  # :deploy when the 'types' entry includes 'deploy', :normal otherwise.
  def job_type
    return (self['types'].include?('deploy') ? :deploy : :normal)
  end

end
132
+
133
# Basic Grid5000 REST interface built on RestClient::Resource.
class G5KRest

  attr_reader :user

  # Builds a client through a fresh G5KRestFactory.
  def self.from_config
    G5KRestFactory.new.connect
  end

  # user, pass - API credentials; both are URL-escaped and embedded in
  # the endpoint URL. Raises RuntimeError when either one is nil.
  def initialize(user, pass)
    @user = user
    @pass = pass
    raise "You forgot to use :g5k library!" if user.nil? || pass.nil?
    user_escaped = CGI.escape(user)
    pass_escaped = CGI.escape(pass)
    @endpoint = "https://#{user_escaped}:#{pass_escaped}@api.grid5000.fr"
    @api = RestClient::Resource.new(@endpoint, :timeout => 15)
  end

  # Returns the sub-resource for path; a single leading '/' is dropped.
  def resource(path)
    path = path[1..-1] if path.start_with?('/')
    return @api[path]
  end

  # Issues a DELETE on path and returns the raw response.
  def delete_json_raw(path)
    return resource(path).delete()
  end

  # POSTs json (serialized with to_json) and returns the parsed reply.
  def post_json_raw(path, json)
    r = resource(path).post(json.to_json,
      :content_type => "application/json", :accept => "application/json")
    return G5KJson.parse(r)
  end

  # GETs path and returns the parsed reply. A request timeout is retried
  # after a one second pause; the fourth consecutive timeout is re-raised.
  def get_json_raw(path)
    maxfails = 3
    fails = 0
    begin
      return G5KJson.parse(resource(path).get())
    rescue RestClient::RequestTimeout
      fails += 1
      raise if fails > maxfails
      Kernel.sleep(1.0)
      retry
    end
  end

  # Like get_json_raw, relative to the 'sid/' prefix.
  def get_json(resource)
    return get_json_raw("sid/#{resource}")
  end

  # Like post_json_raw, relative to the 'sid/' prefix.
  def post_json(resource, json)
    return post_json_raw("sid/#{resource}", json)
  end

  # Returns the 'items' of the resource.
  def get_items(resource)
    return get_json(resource).items
  end

  def get_sites
    sites = get_items('sites').list
    # TODO: temporary fix for two sites
    sites = sites.select { |x| ! [ "Bordeaux", "Reims" ].include?(x['name']) }
    return sites
  end

  def get_site_status(site)
    return get_items("sites/#{site}/status").list
  end

  # Running jobs on a site, optionally restricted to one user.
  def get_jobs(site, uid = nil)
    # the uid ends up in a query string, so it has to be URL-escaped
    filter = uid.nil? ? "" : "&user_uid=#{CGI.escape(uid.to_s)}"
    resource = "sites/#{site}/jobs/?state=running#{filter}"
    return get_items(resource).list
  end

  def get_job(site, jid)
    resource = "sites/#{site}/jobs/#{jid}"
    return get_json(resource)
  end

  def get_clusters(site)
    return get_items("sites/#{site}/clusters").list
  end

  # Returns the switches of a site that have a 'node' linecard; each one
  # gets a 'nodes' entry listing the attached hosts as
  # "<uid>.<site>.grid5000.fr".
  def get_switches(site)
    items = get_items("sites/#{site}/network_equipments")
    items = items.select { |x| x['kind'] == 'switch' }
    # extract nodes connected to those switches
    items.each do |switch|
      conns = switch['linecards'].detect { |c| c['kind'] == 'node' }
      next if conns.nil? # IB switches for example
      switch['nodes'] = conns['ports'] \
        .select { |x| x != {} } \
        .map { |x| "#{x['uid']}.#{site}.grid5000.fr" }
    end
    return items.select { |it| it.key?('nodes') }
  end

  # Returns the switch with the given uid; raises when it is unknown.
  def get_switch(site, name)
    s = get_switches(site).detect { |x| x.uid == name }
    raise "Unknown switch '#{name}'" if s.nil?
    return s
  end

  def follow_link(obj, rel)
    return get_json_raw(obj.link(rel))
  end

  def follow_parent(obj)
    return get_json_raw(obj.rel_parent)
  end

  # Maps fully qualified node names to their 'system_state'.
  def get_nodes_status(site)
    nodes = {}
    get_site_status(site).each do |node|
      name = node['node_uid']
      name = "#{name}.#{site}.grid5000.fr" unless name.end_with?('.fr')
      nodes[name] = node['system_state']
    end
    return nodes
  end

end
273
+
274
+ Factory = G5KRestFactory.new
275
+
276
+ class Library < ActivityLibrary
277
+
278
+ attr_accessor :logging
279
+ attr_accessor :proxy
280
+
281
+ activities :reserve, :reserve_nodes, :release, :nodes, :switches, :switch,
282
+ :nodes_of_switch, :sites, :jobs, :wait_for_job, :nodes_available,
283
+ :release_all, :my_jobs, :wait_for_reservation, :nodes_available?,
284
+ :deploy, :execute, :copy, :bash, :dist_keys, :execute_frontend,
285
+ :bash_frontend, :distribute, :retrieve, :kavlan, :vlan_nodes,
286
+ :vlan_bash, :pick_reservation,
287
+ :node_site, :nodes_sites, :run_script, :rsync,
288
+ :version, :job,
289
+ :clean, :dist_ssh_keys
290
+
291
# Sets up the library: creates the cache, installs the SSH configuration
# for the API user and injects a CoreLibrary under the name '__core__'.
def initialize
  super
  @cache = Cache.new
  api_user = g5k.user
  G5K.install_ssh_config_file(api_user)
  inject_library('__core__', CoreLibrary.new)
end
297
+
298
# Version string of this library.
def version
  '0.1'
end
301
+
302
# True when the local hostname (as printed by `hostname`) ends with
# 'grid5000.fr'. The answer is obtained through @cache.fetch(:inside_g5k).
def inside_g5k
  @cache.fetch(:inside_g5k) { `hostname`.strip.end_with?('grid5000.fr') }
end
308
+
309
# Returns an API client obtained from the shared Factory.
def g5k
  return Factory.connect
end
312
+
313
# Maps each node name to the first site uid contained in that name.
# Raises RuntimeError for a node that matches no known site.
def nodes_with_site(nodes)
  known_sites = site_uids()
  pairs = nodes.map do |node|
    owner = known_sites.detect { |uid| node.include?(uid) }
    raise "Could not map node '#{node}' to its site" if owner.nil?
    [ node, owner ]
  end
  Hash[pairs]
end
324
+
325
# Returns the distinct sites the given nodes belong to.
def nodes_sites(nodes)
  site_by_node = nodes_with_site(nodes)
  site_by_node.values.uniq
end
329
+
330
# Returns the site of a single node.
def node_site(node)
  owners = nodes_sites([ node ])
  owners.first
end
333
+
334
# Maps the given nodes to their status, querying every involved site once.
# Beware: it is not guaranteed that every node will have its status!
def nodes_status(nodes)
  nodes_sites(nodes).inject({}) do |collected, site|
    relevant = g5k.get_nodes_status(site).select { |name, _state| nodes.include?(name) }
    collected.merge(Hash[relevant])
  end
end
346
+
347
# Returns the nodes whose status is 'free'; with :ignore_dead set, nodes
# in the 'unknown' state are reported as available too.
def nodes_available(nodes, opts = {})
  accept_unknown = opts[:ignore_dead]
  usable = nodes_status(nodes).select do |_name, state|
    state == 'free' || (accept_unknown && state == 'unknown')
  end
  Hash[usable].keys
end
355
+
356
# Returns true when every given node is available (see nodes_available).
# Two extra methods are attached to the result via inject_method:
# :availability (fraction of available nodes) and :total (node count).
def nodes_available?(nodes, opts = {})
  missing = nodes - nodes_available(nodes, opts)
  verdict = missing.empty?
  verdict.inject_method(:availability) do
    1.0 - (missing.length.to_f / nodes.length.to_f)
  end
  verdict.inject_method(:total) { nodes.length }
  verdict
end
366
+
367
# Returns the given nodes without those whose status is 'unknown'.
def filter_dead_nodes(nodes)
  states = nodes_status(nodes)
  nodes.reject { |node| states[node] == 'unknown' }
end
375
+
376
# Converts a time specification to a DateTime string. "now" yields the
# current time; anything else is parsed together with the local timezone
# abbreviation printed by `date +%Z`.
def parse_time(spec)
  cleaned = spec.strip
  if cleaned == "now"
    DateTime.now.to_s
  else
    zone = `date +%Z`.strip
    DateTime.parse("#{cleaned} #{zone}").to_s
  end
end
382
+
383
# Builds the "slash_<bits>=<count>" resource fragment.
# An explicit :slash option wins and always requests one subnet; otherwise
# the first of :slash_22 / :slash_18 present in opts supplies the count.
# Returns nil when none of these options is given.
def handle_slash(opts)
  explicit = opts[:slash]
  return "slash_#{explicit.to_i}=1" if explicit

  known = { :slash_22 => 22, :slash_18 => 18 }
  label, bits = known.detect { |name, _bits| opts.key?(name) }
  return nil if label.nil?
  "slash_#{bits}=#{opts[label].to_i}"
end
399
+
400
# Finds the single running reservation of the API user (on opts[:site],
# or on all sites when no site is given) and waits until it is ready.
# Raises when there is no such reservation or more than one.
def pick_reservation(opts = {})
  site = opts[:site]
  candidates = site.nil? ? my_all_jobs() : jobs(site)
  candidates = candidates.select { |x| x['state'] == 'running' }
  candidates = candidates.select { |x| x['user_uid'] == g5k.user } # WEIRD!
  raise "No reservations available" if candidates.empty?
  raise "Too many reservation meeting the criteria." if candidates.length > 1
  chosen = candidates.first
  info "Found reservation with ID = #{chosen["uid"]}"
  wait_for_job(g5k.get_json_raw(chosen.rel_self))
end
414
+
415
# Fetches job `jid` (converted with to_i) on `site` and waits for it
# to be running.
def job(site, jid)
  wait_for_job(g5k.get_job(site, jid.to_i))
end
420
+
421
# Helper for making reservations the easy way.
#
# Recognized options (all read from opts): :nodes (count or list of host
# names, default 1), :time (walltime, default '01:00:00'), :site
# (mandatory), :type (:normal or :deploy), :name, :cmd, :at, :cluster,
# :vlan, :slash / :slash_22 / :slash_18, :ignore_dead, :keep and :async.
#
# Unless :keep is true the job is released when the engine finishes;
# unless :async is true the call waits for the job to be running.
# Returns the job description.
def reserve_nodes(opts)
  nodes = opts.fetch(:nodes, 1)
  time = opts.fetch(:time, '01:00:00')
  at = opts[:at]
  slash = handle_slash(opts)
  site = opts[:site]
  type = opts.fetch(:type, :normal)
  keep = opts[:keep]
  name = opts.fetch(:name, 'xpflow job')
  command = opts[:cmd]
  async = opts[:async]
  ignore_dead = opts[:ignore_dead]
  vlan = opts[:vlan]
  cluster = opts[:cluster]
  props = nil

  if [ nodes, time, site ].any? { |x| x.nil? }
    raise 'At least nodes, time and site must be given'
  end

  secs = Timespan.to_secs(time)
  time = Timespan.to_time(time)

  if nodes.is_a?(Array)
    # an explicit host list becomes a "host in (...)" property filter
    requested = nodes
    nodes = filter_dead_nodes(nodes) if ignore_dead
    skipped = requested - nodes
    info "Ignored nodes #{skipped}." unless skipped.empty?
    quoted = nodes.map { |n| "'#{n}'" }.sort.join(',')
    props = "host in (#{quoted})"
    nodes = nodes.length
  end

  raise 'Nodes must be an integer.' unless nodes.is_a?(Integer)
  site = site.__repr__
  unless type.respond_to?(:to_sym) && [ :normal, :deploy ].include?(type.to_sym)
    raise 'Type must be either :deploy or :normal'
  end
  command = "sleep #{secs}" if command.nil?
  type = type.to_sym

  # each optional part is prepended, so the last one applied comes first
  resources = "/nodes=#{nodes},walltime=#{time}"
  resources = "{cluster='#{cluster}'}" + resources unless cluster.nil?
  resources = "{type='kavlan'}/vlan=1+" + resources if vlan == true
  resources = "#{slash}+" + resources unless slash.nil?

  payload = { 'resources' => resources, 'name' => name, 'command' => command }

  info "Reserving resources: #{resources} (type: #{type}) (in #{site})"

  payload['properties'] = props unless props.nil?
  payload['types'] = (type == :deploy) ? [ 'deploy' ] : [ 'allow_classic_ssh' ]

  unless at.nil?
    start = parse_time(at)
    payload['reservation'] = start
    info "Starting this reservation at #{start}"
  end

  r = g5k.post_json("sites/#{site}/jobs", payload)

  # it may be a different thread that releases reservations
  # therefore we need to dereference proxy which
  # in fact uses Thread.current and is local to the thread...
  engine = proxy.engine

  unless keep == true
    engine.on_finish do
      engine.verbose("Releasing job at #{r.rel_self}")
      release(r)
    end
  end

  reserved = g5k.get_json_raw(r.rel_self)
  reserved = wait_for_job(reserved) unless async == true
  reserved
end
508
+
509
# Logs a message: through the @logging callable when one is set,
# otherwise through the engine log under the :g5k label.
def info(msg)
  return @logging.call(msg) if @logging
  proxy.engine.log(msg, :g5k)
end
516
+
517
# Polls the job every 5 seconds until it is in the 'running' state and
# returns the refreshed job description.
#
# Gives up (Timeout::Error) after 36000 seconds, i.e. 10 hours.
# Raises RuntimeError as soon as the job reports a state from which it
# can no longer start ('finishing', 'terminated' or 'error') instead of
# polling such a job until the timeout expires.
def wait_for_job(job)
  jid = job.__repr__
  info "Waiting for reservation #{jid}"
  Timeout.timeout(36000) do
    while true
      job = job.refresh(g5k)
      t = job['scheduled_at']
      if !t.nil?
        t = Time.at(t)
        secs = [ t - Time.now, 0 ].max.to_i
        info "Reservation #{jid} should be available at #{t} (#{secs} s)"
      end
      state = job['state']
      break if state == 'running'
      raise "Job is #{state}." if [ 'finishing', 'terminated', 'error' ].include?(state)
      Kernel.sleep(5)
    end
  end
  info "Reservation #{jid} ready"
  return job
end
539
+
540
# Releases all of the user's jobs on a site.
# NOTE(review): relies on the project's Timeout.check / pass helpers;
# presumably the block is retried until pass is called or 20 seconds
# elapse - confirm against their definition.
def release_all(site)
  site_name = site.__repr__
  Timeout.check(20) do
    pending = my_jobs(site_name)
    pass if pending.length == 0
    begin
      pending.each { |j| release(j) }
    rescue RestClient::InternalServerError => err
      # a job that is already gone is not an error
      raise unless err.response.include?('already killed')
    end
  end
end
553
+
554
# Deletes the job behind r's 'self' link and returns the raw response.
# An internal server error whose response mentions 'already killed' is
# swallowed (nil is returned); any other one is re-raised.
def release(r)
  g5k.delete_json_raw(r.rel_self)
rescue RestClient::InternalServerError => err
  raise unless err.response.include?('already killed')
end
561
+
562
# Placeholder: always raises RuntimeError ('not implemented').
def reserve(opts)
  raise RuntimeError, 'not implemented'
end
565
+
566
# Returns the list of sites, obtained through @cache.fetch(:sites).
def sites
  @cache.fetch(:sites) { g5k.get_sites }
end
571
+
572
# Returns the uids of all sites.
def site_uids
  all_sites = sites
  all_sites.uids
end
575
+
576
# Returns the node list of r: its 'nodes' entry when that key exists,
# otherwise its 'assigned_nodes' entry.
def nodes(r)
  r.key?('nodes') ? r['nodes'] : r['assigned_nodes']
end
580
+
581
# Returns the :hosts entry of the VLAN description of job r (see kavlan).
def vlan_nodes(r)
  kavlan(r)[:hosts]
end
585
+
586
# Returns the running jobs of all users on a site.
def jobs(site)
  site_name = site.__repr__
  g5k.get_jobs(site_name)
end
590
+
591
# Returns the running jobs of the API user on a site.
def my_jobs(site)
  site_name = site.__repr__
  g5k.get_jobs(site_name, g5k.user)
end
595
+
596
# Returns the API user's running jobs from every site as one flat list.
# An empty site list yields [] (reduce(:+) without a seed returned nil
# there, which broke callers that go on to select/map the result).
def my_all_jobs
  return sites().inject([]) { |all, s| all + my_jobs(s) }
end
600
+
601
# Returns the switches of a site.
def switches(site)
  site_name = site.__repr__
  g5k.get_switches(site_name)
end
605
+
606
# Returns the switch `sw` of a site.
# The site is converted with __repr__ first (as in `switches`), so that
# a site object and a plain site name are both accepted; the converted
# name used to be computed but the raw object was passed on instead.
def switch(site, sw)
  name = site.__repr__
  return g5k.get_switch(name, sw)
end
610
+
611
# Waits until the API user has a job (optionally with the given :name)
# on the given :site, or on any site by default, then waits for that job
# to be running and returns it.
# NOTE(review): relies on the project's Timeout.check / pass helpers;
# presumably the block is retried until pass is called or :timeout
# elapses - confirm against their definition.
def wait_for_reservation(opts = {})
  wanted_site = opts.fetch(:site, :any).__repr__
  limit = opts.fetch(:timeout, Infinity)
  wanted_name = opts[:name]

  limit = Timespan.to_secs(limit)
  candidates = sites.uids
  candidates = candidates.select { |uid| uid == wanted_site } if wanted_site != 'any'
  raise "No '#{wanted_site}' site" if candidates.empty?
  found = nil
  Timeout.check(limit) do
    mine = candidates.map { |p| my_jobs(p) }.reduce(:+)
    mine = mine.select { |j| j['name'] == wanted_name } unless wanted_name.nil?
    found = mine.first
    pass unless found.nil?
  end
  wait_for_job(found)
end
630
+
631
# Queries the site frontend for the VLAN attached to a job.
# Returns { :uid => <vlan id>, :hosts => [<host>, ...] }, or nil when
# the kavlan command fails with the output 'no vlan found'.
# Any other Bash::StatusError is re-raised.
def kavlan(job)
  jid = job['uid']
  site = g5k.follow_parent(job).uid
  begin
    bash_frontend(site) do
      vlan_id = run "kavlan -V -j #{jid}"
      members = run "kavlan -l -j #{jid}"
      { :uid => vlan_id.to_i, :hosts => members.lines.map { |x| x.strip } }
    end
  rescue Bash::StatusError => err
    raise err if err.output.strip != 'no vlan found'
    nil
  end
end
646
+
647
+
648
# Returns the stripped content of ~/.ssh/id_rsa.pub as seen from the
# site frontend, or nil when that file does not exist there.
def get_ssh_key_for_site(site)
  bash_frontend(site) do
    pubkey = expand_path '~/.ssh/id_rsa.pub'
    if exists pubkey
      (contents pubkey).strip
    else
      nil
    end
  end
end
655
+
656
# Deploys the environment opts[:env] on the nodes assigned to a job and
# blocks until the deployment reports the 'terminated' status.
#
# The local public key and, when present, the frontend's public key are
# installed on the nodes; the job's VLAN (if any) is passed along.
#
# Raises RuntimeError when no environment is given (checked up front,
# before any remote call is made) or when at least one node did not end
# in the 'OK' state. Returns the final deployment description.
#
# TODO: make sure this is deployment job
# TODO: this is deprecated
def deploy(job, opts = {})
  env = opts[:env]
  raise "Environment must be given" if env.nil?

  nodes = job['assigned_nodes']
  site = g5k.follow_parent(job).uid

  keys = [ G5K.get_ssh_key() ]
  frontend_ssh_key = get_ssh_key_for_site(site)
  keys.push(frontend_ssh_key) unless frontend_ssh_key.nil?

  info "Deploying #{keys.length} SSH keys"

  payload = {
    'nodes' => nodes,
    'environment' => env,
    'key' => keys.join("\n") + "\n",
  }

  vlan = kavlan(job)
  if !vlan.nil?
    payload['vlan'] = vlan[:uid]
    info "Found VLAN with uid = #{vlan[:uid]}"
  end

  info "Creating deployment"
  r = g5k.post_json("sites/#{site}/deployments", payload)

  info "Entering waiting loop"
  Timeout.check(Infinity) do
    r = r.refresh(g5k)
    pass if r['status'] == 'terminated'
    info "Waiting for deployment to finish (state = #{r['status']})."
  end

  # block parameter renamed so it no longer shadows the info method
  ok = r['result'].map { |_node, outcome| outcome }.all? { |x| x['state'] == 'OK' }
  raise "Deployment (at least partially) failed" unless ok

  return r
end
712
+
713
# Looks for the user's job that holds a node whose name starts with
# `node`. Returns [job, full node name, site]; job and node name are nil
# when nothing matches.
# The site is guessed from the node name when it contains a site uid
# (only that site's jobs are searched then); otherwise all of the user's
# jobs are searched and the site is taken from the matching job's parent.
def find_node(node)
  info "Looking for node #{node}..."
  site = site_uids.detect { |s| node.include?(s) }
  candidates = site.nil? ? my_all_jobs() : my_jobs(site)
  candidates = candidates.map { |x| x.refresh(g5k) }
  candidates.each do |job|
    full_name = job['assigned_nodes'].detect { |name| name.start_with?(node) }
    next if full_name.nil?
    site = g5k.follow_parent(job).uid if site.nil?
    return job, full_name, site
  end
  return nil, nil, site
end
733
+
734
# Builds the shell command that runs `cmd` on node `n` of `job` at `site`.
# Deploy jobs are reached as root over plain ssh, other jobs through
# oarsh with OAR_JOB_ID set. From outside Grid'5000 everything is
# tunnelled through "<site>.g5k". The command is shell-escaped once when
# it is run directly (deploy job, inside Grid'5000) and twice otherwise.
def _ssh(site, job, n, cmd)
  once = Shellwords.escape(cmd)
  twice = Shellwords.escape(once)
  via_oar = "OAR_JOB_ID=#{job.uid} oarsh #{n} -- #{twice}"
  if inside_g5k
    return "ssh root@#{n} -- #{once}" if job.job_type == :deploy
    # TODO: this can be simplified if we are
    # running on 'site'
    "ssh -F #{SSH_CONFIG} #{site}.grid5000.fr -- #{via_oar}"
  else
    gateway = "ssh -F #{SSH_CONFIG} #{site}.g5k"
    remote = (job.job_type == :deploy) ? "ssh root@#{n} -- #{twice}" : via_oar
    "#{gateway} -- #{remote}"
  end
end
756
+
757
# Command line that runs `cmd` (left unescaped) as root on a deployed node.
def _ssh_deploy(site, node, cmd)
  "#{_ssh_gw(site, node)} -- #{cmd}"
end
761
+
762
# ssh invocation reaching `node` as root: direct from inside Grid'5000,
# through the "<site>.g5k" gateway otherwise.
def _ssh_gw(site, node)
  return "ssh root@#{node}" if inside_g5k
  gateway = "ssh -F #{SSH_CONFIG} #{site}.g5k"
  "#{gateway} -- ssh root@#{node}"
end
770
+
771
# Command line that runs the shell-escaped `cmd` on the site frontend,
# addressed as "<site>.grid5000.fr" from inside Grid'5000 and as
# "<site>.g5k" from outside.
def _frontend(site, cmd)
  gateway = if inside_g5k
    "ssh -F #{SSH_CONFIG} #{site}.grid5000.fr"
  else
    "ssh -F #{SSH_CONFIG} #{site}.g5k"
  end
  "#{gateway} -- #{Shellwords.escape(cmd)}"
end
780
+
781
# Runs `cmd` on a node through a local shell and returns its output.
# `prefix` and `postfix` are placed verbatim before and after the
# generated ssh command line (used for local pipes and redirections).
#
# Nodes whose name contains "kavlan" are reached through the 'nancy'
# gateway as root (TODO: how does it work?). The gateway command is
# taken from _ssh_gw directly; it used to be derived by deleting every
# "bash" substring from a generated command, which also mangled node
# names or configuration paths containing "bash".
#
# Raises RuntimeError when a regular node belongs to none of the
# user's jobs.
def execute(node, cmd, prefix = '', postfix = '')
  if node.include?("kavlan")
    gateway = _ssh_gw('nancy', node)
    prog = "#{prefix}#{gateway} -- #{cmd} #{postfix}"
  else
    job, n, site = find_node(node)
    raise "Node '#{node}' not found" if job.nil?
    prog = "#{prefix}#{_ssh(site, job, n, cmd)}#{postfix}"
  end
  info "Running command: #{prog}"
  return `#{prog}`
end
794
+
795
# Runs `cmd` in a bash session on the site frontend and returns the
# session's result. The `prefix` argument is accepted but not used.
def execute_frontend(site, cmd, prefix = '')
  bash_frontend(site) { run(cmd) }
end
800
+
801
# Opens a bash session on a node and runs the block in it.
# Nodes whose name contains "kavlan" are delegated to vlan_bash with the
# same options (an undefined `debug` variable used to be passed there,
# which raised NameError). Other nodes must belong to one of the user's
# jobs, otherwise RuntimeError is raised.
def bash(node, opts = {}, &block)
  return vlan_bash(node, opts, &block) if node.include?("kavlan")
  job, n, site = find_node(node)
  raise "Node #{node} not found" if job.nil?
  ssh = _ssh(site, job, n, 'bash')
  info "Running bash via: #{ssh}"
  return Bash.bash(ssh, opts, &block)
end
809
+
810
# Opens a bash session on a node reached as root through the 'nancy'
# gateway (TODO: how does it work?) and runs the block in it.
def vlan_bash(node, opts = {}, &block)
  gateway = _ssh_deploy('nancy', node, 'bash')
  info "Running vlan bash via: #{gateway}"
  Bash.bash(gateway, opts, &block)
end
815
+
816
# Hands the block to Bash.bash without any connection arguments.
def bash_local(&block)
  Bash.bash(&block)
end
819
+
820
# Opens a bash session on the site frontend and runs the block in it.
def bash_frontend(site, opts = {}, &block)
  gateway = _frontend(site, 'bash')
  info "Running bash via: #{gateway}"
  Bash.bash(gateway, opts, &block)
end
825
+
826
# Copies a local file to `path` on a node by piping it into a remote
# `tee`. When `path` is an existing directory on the node the file keeps
# its base name inside it; an existing regular file is overwritten.
#
# Raises RuntimeError when the local file does not exist or when the
# remote path is neither a file nor a directory.
# Returns the output of the transfer command.
def copy(filename, node, path)
  # File.exists? was removed in Ruby 3.2
  raise 'File does not exist!' unless File.exist?(filename)
  base = File.basename(filename)
  bash(node) do
    path = expand_path(path) # get rid of ~
    if exists(path)
      type = get_type(path)
      if type == :dir
        path = File.join(path, base)
      elsif type != :file
        raise 'Unknown file type.'
      end
    end
  end
  info "Copying file #{filename} to #{node}:#{path}"
  # the prefix is interpreted by the local shell, so the file name must be
  # escaped to survive spaces and shell metacharacters
  return execute(node, "tee #{path}", "cat #{Shellwords.escape(filename)} | ") # FIX THIS
end
847
+
848
# Downloads every file matching the glob `path` on a node into the local
# directory `dir`, keeping base names. Returns the list of remote files.
def retrieve(node, path, dir = '.')
  files = bash(node) do
    glob(path)
  end
  files.each do |f|
    dest = File.join(dir, File.basename(f))
    # the redirection target is interpreted by the local shell and its
    # name comes from the remote side, so it has to be escaped
    execute(node, "cat #{f}", "", " > #{Shellwords.escape(dest)}")
  end
end
858
+
859
# Generates and distributes SSH keys so that master can connect
# password-lessly to the slaves (and to itself). An existing private key
# on the master is reused. The work is remembered in @cache per
# master/slaves combination and skipped on repeated calls.
def dist_keys(master, slaves)
  cache_key = "#{master} && #{slaves}"
  if @cache.get(cache_key)
    info "SSH keys already distributed."
    return
  end
  priv = '~/.ssh/id_rsa'
  public_key = bash(master) do
    # start from a fresh client config that disables host key checking
    trunc '~/.ssh/config'
    append_line '~/.ssh/config', 'Host *'
    append_line '~/.ssh/config', 'StrictHostKeyChecking no'
    run("ssh-keygen -N '' -q -f #{priv}") unless exists(priv)
    # derive the public key from the private one
    run("ssh-keygen -y -f #{priv}").strip
  end
  info "The key is: #{public_key}"
  (slaves + [ master ]).uniq.each do |node|
    bash(node) do
      make_dirs '~/.ssh'
      append_line '~/.ssh/authorized_keys', public_key
    end
  end
  @cache.set(cache_key, true)
  info "Keys distributed."
end
890
+
891
# Distributes SSH keys using the first node as the master and the
# remaining ones as slaves. Returns the master.
def dist_ssh_keys(nodes)
  head = nodes.first
  others = nodes.tail
  dist_keys(head, others)
  head
end
897
+
898
# Runs an inline engine process named "g5k-clean" that, for every site,
# logs progress and runs the my_jobs and release_all activities.
# Always returns true.
def clean
  proxy.engine.inline_process :"g5k-clean" do
    all_sites = run :sites
    forall all_sites do |site|
      log "Cleaning ", site
      pending = run :my_jobs, site
      log "Cleaning ", site, " from ", (length_of pending), " jobs..."
      run :release_all, site
      log "Done with #{name_of site}"
    end
  end
  true
end
911
+
912
# Copies the script registered under `name` in $files to
# /tmp/script-xpflow.sh on the node and runs it with `bash -e`.
# Returns the output of the execution. (TODO in the original: the fixed
# remote path and the $files lookup are provisional.)
def run_script(node, name)
  remote_path = '/tmp/script-xpflow.sh'
  info "Pushing script #{name} to the node #{node}"
  copy($files[name], node, remote_path)
  execute(node, "bash -e #{remote_path}")
end
919
+
920
# Pushes the directory registered under `name` in $dirs to `where` on
# the node with rsync, using the 'sophia' gateway command as the remote
# shell (TODO: how does it work?). Returns rsync's output.
def rsync(node, name, where)
  info "Pushing '#{name}' to the node #{node}"
  transport = _ssh_gw('sophia', node)
  source_dir = $dirs[name]
  command = "rsync --delete --numeric-ids --archive --bwlimit=100000 --rsh '#{transport}' #{source_dir} :#{where}"
  info "Running: #{command}"
  `#{command}`
end
928
+
929
+ end
930
+
931
+ end; end