pwrake 2.1.3 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -13,43 +13,24 @@ module Pwrake
13
13
  def initialize
14
14
  @selector = NBIO::Selector.new
15
15
  @hostinfo_by_taskname = {}
16
- @option = Option.new
17
16
  @hdl_set = []
18
17
  @channel_by_hostid = {}
19
18
  @channels = []
20
19
  @hostinfo_by_id = {}
21
- @n_retry = @option["RETRY"]
22
- init_logger
20
+ # init
21
+ @option = Option.new
22
+ Log.set_logger(@option)
23
+ TaskWrapper.init_task_logger(@option)
24
+ # moved from Option#init
25
+ @option.put_log
26
+ if @option['LOG_DIR'] && @option['GC_LOG_FILE']
27
+ GC::Profiler.enable
28
+ end
23
29
  end
24
30
 
25
31
  attr_reader :task_queue
26
32
  attr_reader :option
27
- attr_reader :logger
28
-
29
- def init_logger
30
- if logdir = @option['LOG_DIR']
31
- ::FileUtils.mkdir_p(logdir)
32
- logfile = File.join(logdir,@option['LOG_FILE'])
33
- @logger = Logger.new(logfile)
34
- else
35
- if @option['DEBUG']
36
- @logger = Logger.new($stderr)
37
- else
38
- @logger = Logger.new(File::NULL)
39
- end
40
- end
41
-
42
- if @option['DEBUG']
43
- @logger.level = Logger::DEBUG
44
- else
45
- @logger.level = Logger::INFO
46
- end
47
- end
48
-
49
- def init(hosts=nil)
50
- @option.init
51
- TaskWrapper.init_task_logger(@option)
52
- end
33
+ attr_reader :thread
53
34
 
54
35
  def setup_branch_handler(sub_host)
55
36
  ior,w0 = IO.pipe
@@ -83,13 +64,14 @@ module Pwrake
83
64
  case @killed
84
65
  when 0
85
66
  # log writing failed. can't be called from trap context
67
+ $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid} ##{@killed})"
86
68
  if Rake.application.options.debug
87
- $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid}"+
88
- " thread=#{Thread.current} ##{@killed})"
69
+ $stderr.puts "in master thread #{Thread.current}:"
89
70
  $stderr.puts caller
90
- else
91
- $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid}"+
92
- " ##{@killed})"
71
+ if @thread
72
+ $stderr.puts "in branch thread #{@thread}:"
73
+ $stderr.puts @thread.backtrace.join("\n")
74
+ end
93
75
  end
94
76
  $stderr.puts "Exiting..."
95
77
  @no_more_run = true
@@ -111,33 +93,37 @@ module Pwrake
111
93
  @option.host_map.each do |sub_host, wk_hosts|
112
94
  @hdl_set << hdl = setup_branch_handler(sub_host)
113
95
  Fiber.new do
114
- hdl.put_line "host_list_begin"
115
- wk_hosts.each do |host_info|
116
- name = host_info.name
117
- ncore = host_info.ncore
118
- host_id = host_info.id
119
- Log.debug "connecting #{name} ncore=#{ncore} id=#{host_id}"
120
- hdl.put_line "host:#{host_id} #{name} #{ncore}"
121
- @channel_by_hostid[host_id] = hdl
122
- @hostinfo_by_id[host_id] = host_info
123
- end
124
- hdl.put_line "host_list_end"
125
- while s = hdl.get_line
126
- case s
127
- when /^ncore:done$/
128
- break
129
- when /^ncore:(\d+):(\d+)$/
130
- id, ncore = $1.to_i, $2.to_i
131
- Log.debug "worker_id=#{id} ncore=#{ncore}"
132
- @hostinfo_by_id[id].set_ncore(ncore)
133
- sum_ncore += ncore
134
- when /^exited$/
135
- raise RuntimeError,"Unexpected branch exit"
136
- else
137
- msg = "#{hdl.host}:#{s.inspect}"
138
- raise RuntimeError,"invalid return: #{msg}"
96
+ hdl.put_line "host_list_begin"
97
+ wk_hosts.each do |host_info|
98
+ name = host_info.name
99
+ ncore = host_info.ncore
100
+ host_id = host_info.id
101
+ Log.debug "connecting #{name} ncore=#{ncore} id=#{host_id}"
102
+ hdl.put_line "host:#{host_id} #{name} #{ncore}"
103
+ @channel_by_hostid[host_id] = hdl
104
+ @hostinfo_by_id[host_id] = host_info
105
+ end
106
+ hdl.put_line "host_list_end"
107
+ while s = hdl.get_line
108
+ case s
109
+ when /^ncore:done$/
110
+ break
111
+ when /^ncore:(\d+):(\d+)$/
112
+ id, ncore = $1.to_i, $2.to_i
113
+ Log.debug "worker_id=#{id} ncore=#{ncore}"
114
+ @hostinfo_by_id[id].set_ncore(ncore)
115
+ sum_ncore += ncore
116
+ when /^ip:(\d+):(\S+)$/
117
+ id, ipa = $1.to_i, $2
118
+ Log.debug "worker_id=#{id} ip=#{ipa}"
119
+ @hostinfo_by_id[id].set_ip(ipa)
120
+ when /^exited$/
121
+ raise RuntimeError,"Unexpected branch exit"
122
+ else
123
+ msg = "#{hdl.host}:#{s.inspect}"
124
+ raise RuntimeError,"invalid return: #{msg}"
125
+ end
139
126
  end
140
- end
141
127
  end.resume
142
128
  end
143
129
  @selector.run
@@ -146,7 +132,7 @@ module Pwrake
146
132
  @option.total_cores = sum_ncore
147
133
  @hostinfo_by_id.each do |id,host|
148
134
  if ncore = @hostinfo_by_id[id].idle_cores
149
- Log.info "#{host} id=#{id} ncore=#{ncore}"
135
+ Log.info "#{host.name} id=#{id} ncore=#{ncore}"
150
136
  else
151
137
  @hostinfo_by_id.delete(id)
152
138
  end
@@ -226,6 +212,7 @@ module Pwrake
226
212
  @branch_setup_thread.join
227
213
  send_task_to_idle_core
228
214
  #
215
+ n_retry = @option["RETRY"]
229
216
  create_fiber(@hdl_set) do |hdl|
230
217
  while s = hdl.get_line
231
218
  Log.debug "Master:recv #{s.inspect} from branch[#{hdl.host}]"
@@ -249,7 +236,7 @@ module Pwrake
249
236
  if host_info
250
237
  continuous_fail = host_info.task_result(tw.status)
251
238
  Log.debug "task=#{tw.name} continuous_fail=#{continuous_fail}"
252
- if continuous_fail > @n_retry && @hostinfo_by_id.size > 1
239
+ if continuous_fail > n_retry && @hostinfo_by_id.size > 1
253
240
  # retire this host
254
241
  drop_host(host_info)
255
242
  Log.warn("retired host:#{host_info.name} due to continuous fail")
@@ -313,6 +300,7 @@ module Pwrake
313
300
  s = "#{hid}:#{tw.task_id}:#{tw.name}"
314
301
  @channel_by_hostid[hid].put_line(s)
315
302
  tw.exec_host = host_info.name
303
+ tw.exec_host_id = hid
316
304
  else
317
305
  tw.status = "end"
318
306
  task_end(tw,host_info) # @idle_cores.increase(..
@@ -339,8 +327,7 @@ module Pwrake
339
327
  j = i
340
328
  while tw = pool.deq()
341
329
  Log.debug "postproc##{j} deq=#{tw.name}"
342
- loc = postproc.run(tw)
343
- tw.postprocess(loc)
330
+ tw.postprocess(postproc)
344
331
  pool.count_down
345
332
  @hostinfo_by_taskname.delete(tw.name)
346
333
  tw.retry_or_subsequent unless @exited
@@ -9,14 +9,6 @@ module Pwrake
9
9
  @role.option
10
10
  end
11
11
 
12
- def logger
13
- @role.logger
14
- end
15
-
16
- def task_logger
17
- @role.task_logger
18
- end
19
-
20
12
  def task_queue
21
13
  @role.task_queue
22
14
  end
@@ -27,7 +19,6 @@ module Pwrake
27
19
  init("pwrake") # <- parse options here
28
20
  @role = @master = Master.new
29
21
  t = Time.now
30
- @master.init
31
22
  @master.setup_branches
32
23
  load_rakefile
33
24
  begin
@@ -39,6 +30,10 @@ module Pwrake
39
30
  rescue Exception => e
40
31
  # Exit with error message
41
32
  m = Log.bt(e)
33
+ if @master.thread
34
+ m += "\nIn branch thread #{@master.thread}:\n "
35
+ m += @master.thread.backtrace.join("\n ")
36
+ end
42
37
  Log.fatal m
43
38
  $stderr.puts m
44
39
  @master.signal_trap("INT")
@@ -0,0 +1,76 @@
1
+ begin
2
+ require 'rake'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'rake'
6
+ end
7
+
8
+ require "pwrake/version"
9
+ require "pwrake/master/master_application"
10
+ require "pwrake/task/task_manager"
11
+ require "pwrake/task/task_algorithm"
12
+ require "pwrake/branch/branch_application"
13
+
14
+ class Rake::Application
15
+ include Pwrake::BranchApplication
16
+ prepend Pwrake::MasterApplication
17
+ prepend Pwrake::TaskManager
18
+ end
19
+
20
+ class Rake::Task
21
+ include Pwrake::TaskAlgorithm
22
+ end
23
+
24
+ module Pwrake
25
+
26
+ class CommunicatorSet
27
+
28
+ def init_hosts
29
+ comm_size = MPipe::Comm.size
30
+ @ipaddr_to_rank = {}
31
+ @rank_to_ipaddr = Array.new(comm_size)
32
+ # read ip addresses
33
+ (1..comm_size-1).each do |rank|
34
+ io = MPipe.new(rank)
35
+ sz, = io.read(4).unpack("V")
36
+ s = io.read(sz)
37
+ v = s.split('|')
38
+ v.each{|a| @ipaddr_to_rank[a] = rank}
39
+ @rank_to_ipaddr[rank] = v
40
+ end
41
+ Log.debug "@ipaddr_to_rank="+@ipaddr_to_rank.inspect
42
+ end
43
+
44
+ attr_reader :ipaddr_to_rank
45
+ attr_reader :rank_to_ipaddr
46
+ end
47
+
48
+ class Communicator
49
+
50
+ def setup_pipe(worker_code)
51
+ ipa = IPSocket.getaddress(@host)
52
+ if %w[127.0.0.1 ::1].include?(ipa)
53
+ ipa = IPSocket.getaddress(Socket.gethostname)
54
+ end
55
+ @rank = @set.ipaddr_to_rank[ipa]
56
+ if @rank.nil?
57
+ raise RuntimeError,"no rank for #{@host}"
58
+ end
59
+ mp = MPipe.new(@rank)
60
+ @ior = mp
61
+ @ioe,w1 = IO.pipe
62
+ @iow = mp
63
+ @pid = nil
64
+ w1.close
65
+ @ipaddr = @set.rank_to_ipaddr[@rank]
66
+ end
67
+ end
68
+
69
+ end
70
+
71
+ Pwrake::Branch.io_class = MPipe
72
+
73
+ # does NOT exit when writing to broken pipe
74
+ Signal.trap(:PIPE, "IGNORE")
75
+
76
+ Rake.application.run
@@ -0,0 +1,42 @@
1
+ require "parallel/processor_count.rb"
2
+ require "pwrake/nbio"
3
+ require "pwrake/branch/fiber_queue"
4
+ require "pwrake/worker/writer"
5
+ require "pwrake/worker/log_executor"
6
+ require "pwrake/worker/executor"
7
+ require "pwrake/worker/invoker"
8
+ require "pwrake/worker/shared_directory"
9
+ require "pwrake/worker/gfarm_directory"
10
+
11
+ require "thread"
12
+ require "fileutils"
13
+ require "timeout"
14
+ require "socket"
15
+
16
+ module Pwrake
17
+
18
+ class Invoker
19
+
20
+ def get_io
21
+ # get IP addresses
22
+ v = Socket.getifaddrs
23
+ v = v.select{|a| a.addr.ip? && (a.flags & Socket::IFF_MULTICAST != 0)}
24
+ v = v.map{|a| a.addr.ip_address}
25
+ s = v.join('|')
26
+ # write IP addresses
27
+ iow = MPipe.new(0)
28
+ iow.write([s.size].pack("V"))
29
+ iow.write(s)
30
+ iow.flush
31
+ # returns IO, $stdin, $stdout
32
+ [MPipe, MPipe.new(0), MPipe.new(0)]
33
+ end
34
+
35
+ def send_ipaddr
36
+ # do nothing
37
+ end
38
+
39
+ end
40
+ end
41
+
42
+ require "pwrake/worker/worker_main"
@@ -1,5 +1,7 @@
1
1
  require "fiber"
2
2
 
3
+ $debug=false
4
+
3
5
  module Pwrake
4
6
  module NBIO
5
7
 
@@ -8,10 +10,11 @@ module NBIO
8
10
 
9
11
  class Selector
10
12
 
11
- def initialize
13
+ def initialize(io_class=IO)
12
14
  @reader = {}
13
15
  @writer = {}
14
16
  @running = false
17
+ @io_class = io_class
15
18
  end
16
19
 
17
20
  attr_reader :reader, :writer
@@ -52,9 +55,12 @@ module NBIO
52
55
  @running = true
53
56
  init_heartbeat if timeout
54
57
  while @running && !empty?
55
- if $debug
56
- Log.debug "Selector#run: "+caller[0..1].join(", ")+
57
- " @reader.size=#{@reader.size} @writer.size=#{@writer.size}"
58
+ if $debug && defined? Log
59
+ rd_insp = @reader.map{|k,v|
60
+ "%s=>%s,%s" % [k.inspect,v.class.inspect,v.waiter.inspect]
61
+ }.join(",")
62
+ Log.debug "Selector#run:\n "+caller[0..1].join("\n ")+
63
+ "\n @reader={#{rd_insp}}\n @writer.size=#{@writer.size}"
58
64
  $stderr.puts "Selector#run: "+caller[0]
59
65
  end
60
66
  run_select(timeout)
@@ -67,7 +73,7 @@ module NBIO
67
73
  private
68
74
  def run_select(timeout)
69
75
  to = (timeout) ? timeout*0.75 : nil
70
- r, w, = IO.select(@reader.keys,@writer.keys,[],to)
76
+ r, w, = @io_class.select(@reader.keys,@writer.keys,[],to)
71
77
  check_heartbeat(r,timeout) if timeout
72
78
  r.each{|io| @reader[io].call} if r
73
79
  w.each{|io| @writer[io].call} if w
@@ -76,7 +82,7 @@ module NBIO
76
82
  @reader.keys.each do |io|
77
83
  if io.closed?
78
84
  m = "#{em} io=#{io}"
79
- Log.error(m)
85
+ Log.error(m) if defined? Log
80
86
  $stderr.puts m
81
87
  hdl = @reader.delete(io)
82
88
  hdl.error(e)
@@ -85,13 +91,12 @@ module NBIO
85
91
  @writer.keys.each do |io|
86
92
  if io.closed?
87
93
  m = "#{em} io=#{io}"
88
- Log.error(m)
94
+ Log.error(m) if defined? Log
89
95
  $stderr.puts m
90
96
  hdl = @writer.delete(io)
91
97
  hdl.error(e)
92
98
  end
93
99
  end
94
- #raise e
95
100
  end
96
101
 
97
102
  def init_heartbeat
@@ -169,6 +174,12 @@ module NBIO
169
174
  flush unless buffered
170
175
  end
171
176
 
177
+ alias print :write
178
+
179
+ def puts(s)
180
+ write(s+"\n")
181
+ end
182
+
172
183
  def flush
173
184
  until @pool.empty?
174
185
  len = _write(@pool[0])
@@ -221,10 +232,11 @@ module NBIO
221
232
  @io = io
222
233
  @waiter = []
223
234
  @buf = ''
235
+ @eof = false
224
236
  @sep = "\n"
225
237
  @chunk_size = 8192
226
238
  end
227
- attr_reader :io
239
+ attr_reader :io, :waiter
228
240
  attr_accessor :check_timeout
229
241
 
230
242
  # call from Selector#run
@@ -240,19 +252,26 @@ module NBIO
240
252
  @buf.slice!(0, index+@sep.bytesize)
241
253
  rescue EOFError => e
242
254
  if @buf.empty?
243
- #return nil
244
255
  raise e
245
256
  else
246
257
  buf = @buf; @buf = ''
247
258
  return buf
248
259
  end
249
- #rescue IO::WaitReadable
250
260
  end
251
261
 
252
262
  # call from Reader#_read and FiberReaderQueue#deq
253
263
  def select_io
254
- @selector.add_reader(self) if @waiter.empty?
264
+ if @waiter.empty?
265
+ @selector.add_reader(self)
266
+ else
267
+ if @selector.reader[@io] != self
268
+ raise RuntimeError, "access from multiple Fiber"
269
+ end
270
+ end
255
271
  @waiter.push(Fiber.current)
272
+ if $debug && defined? Log
273
+ Log.debug("Reader#select_io: #{Fiber.current.inspect}\n "+caller.join("\n "))
274
+ end
256
275
  Fiber.yield
257
276
  ensure
258
277
  @waiter.delete(Fiber.current)
@@ -271,11 +290,16 @@ module NBIO
271
290
  @halting = false
272
291
  end
273
292
 
293
+ def eof?
294
+ @eof && @buf.empty?
295
+ end
296
+
274
297
  # from Bartender
275
298
 
276
299
  def _read(sz)
277
300
  @io.read_nonblock(sz)
278
301
  rescue EOFError
302
+ @eof = true
279
303
  nil
280
304
  rescue IO::WaitReadable
281
305
  return nil if @halting
@@ -313,6 +337,7 @@ module NBIO
313
337
  end
314
338
 
315
339
  alias get_line :readln
340
+ alias gets :readln
316
341
 
317
342
  end
318
343
 
@@ -368,7 +393,6 @@ module NBIO
368
393
  rescue EOFError
369
394
  halt
370
395
  rescue IO::WaitReadable
371
- #p IO::WaitReadable
372
396
  end
373
397
 
374
398
  def error(e)
@@ -378,6 +402,7 @@ module NBIO
378
402
  end
379
403
 
380
404
  def halt
405
+ Log.debug("Handler.halt") if defined? Log
381
406
  @queue.each{|q| q.halt}
382
407
  @default_queue.halt
383
408
  end
@@ -430,6 +455,7 @@ module NBIO
430
455
  @writer = writer
431
456
  @reader = reader
432
457
  @host = hostname
458
+ @exited = false
433
459
  end
434
460
  attr_reader :reader, :writer, :host
435
461
 
@@ -442,8 +468,7 @@ module NBIO
442
468
  end
443
469
 
444
470
  def put_kill(sig="INT")
445
- #@writer.put_line("kill:#{sig}")
446
- @writer.io.puts("kill:#{sig}")
471
+ @writer.io.write("kill:#{sig}\n")
447
472
  @writer.io.flush
448
473
  end
449
474
 
@@ -452,24 +477,31 @@ module NBIO
452
477
  end
453
478
 
454
479
  def exit
455
- exit_msg = "exited"
456
480
  iow = @writer.io
457
- Log.debug "Handler#exit iow=#{iow.inspect}"
458
- return if iow.closed?
481
+ Log.debug "Handler#exit iow=#{iow.inspect}" if defined? Log
482
+ if @exited
483
+ Log.debug "Handler<##{object_id}#exit multiple called" if defined? Log
484
+ bt = caller.join("\n ")
485
+ Log.debug "Handler#exit bt=\n #{bt}" if defined? Log
486
+ return
487
+ end
488
+ @exited = true
489
+ exit_msg = "exited"
490
+ #return if iow.closed?
459
491
  @writer.put_line "exit"
460
- while line = @reader.get_line
461
- # here might receive "retire:0" from branch...
462
- line.chomp!
463
- Log.debug "Handler#exit: #{line} host=#{@host}"
464
- return if line == exit_msg
492
+ Log.debug "Handler#exit: end: @writer.put_line \"exit\"" if defined? Log
493
+ #
494
+ if @reader.class == Reader # MultiReader not work
495
+ while line = @reader.get_line
496
+ # here might receive "retire:0" from branch...
497
+ line.chomp!
498
+ Log.debug "Handler#exit: #{line} host=#{@host}" if defined? Log
499
+ return if line == exit_msg
500
+ end
465
501
  end
466
502
  rescue Errno::EPIPE => e
467
- if Rake.application.options.debug
468
- #$stderr.puts "Errno::EPIPE in #{self.class}#exit iow=#{iow.inspect}"
469
- #$stderr.puts e.backtrace.join("\n")
470
- end
471
503
  Log.error "Errno::EPIPE in #{self.class}.exit iow=#{iow.inspect}\n"+
472
- e.backtrace.join("\n")
504
+ e.backtrace.join("\n") if defined? Log
473
505
  end
474
506
 
475
507
  def halt
@@ -496,38 +528,4 @@ module NBIO
496
528
  end
497
529
 
498
530
  end
499
-
500
- #------------------------------------------------------------------
501
-
502
- if __FILE__ == $0
503
- iosel = NBIO::Selector.new
504
-
505
- io = 5.times.map do
506
- IO.pipe
507
- end
508
-
509
- io.each do |ior,iow|
510
- rd = NBIO::MultiReader.new(iosel,ior,1)
511
- Fiber.new do
512
- while s = rd.get_line(0)
513
- puts s
514
- end
515
- puts "fiber end"
516
- end.resume
517
- end
518
-
519
- io.each do |ior,iow|
520
- wr = NBIO::Writer.new(iosel,iow)
521
- Fiber.new do
522
- 2000.times do |i|
523
- wr.put_line("test str#{i}"+"-"*80,0)
524
- #iow.puts "0:test str#{i}"+"-"*80
525
- end
526
- #iow.print "hage"
527
- iow.close
528
- end.resume
529
- end
530
-
531
- iosel.run
532
- end
533
531
  end