pwrake 2.1.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,43 +13,24 @@ module Pwrake
13
13
  def initialize
14
14
  @selector = NBIO::Selector.new
15
15
  @hostinfo_by_taskname = {}
16
- @option = Option.new
17
16
  @hdl_set = []
18
17
  @channel_by_hostid = {}
19
18
  @channels = []
20
19
  @hostinfo_by_id = {}
21
- @n_retry = @option["RETRY"]
22
- init_logger
20
+ # init
21
+ @option = Option.new
22
+ Log.set_logger(@option)
23
+ TaskWrapper.init_task_logger(@option)
24
+ # moved from Option#init
25
+ @option.put_log
26
+ if @option['LOG_DIR'] && @option['GC_LOG_FILE']
27
+ GC::Profiler.enable
28
+ end
23
29
  end
24
30
 
25
31
  attr_reader :task_queue
26
32
  attr_reader :option
27
- attr_reader :logger
28
-
29
- def init_logger
30
- if logdir = @option['LOG_DIR']
31
- ::FileUtils.mkdir_p(logdir)
32
- logfile = File.join(logdir,@option['LOG_FILE'])
33
- @logger = Logger.new(logfile)
34
- else
35
- if @option['DEBUG']
36
- @logger = Logger.new($stderr)
37
- else
38
- @logger = Logger.new(File::NULL)
39
- end
40
- end
41
-
42
- if @option['DEBUG']
43
- @logger.level = Logger::DEBUG
44
- else
45
- @logger.level = Logger::INFO
46
- end
47
- end
48
-
49
- def init(hosts=nil)
50
- @option.init
51
- TaskWrapper.init_task_logger(@option)
52
- end
33
+ attr_reader :thread
53
34
 
54
35
  def setup_branch_handler(sub_host)
55
36
  ior,w0 = IO.pipe
@@ -83,13 +64,14 @@ module Pwrake
83
64
  case @killed
84
65
  when 0
85
66
  # log writing failed. can't be called from trap context
67
+ $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid} ##{@killed})"
86
68
  if Rake.application.options.debug
87
- $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid}"+
88
- " thread=#{Thread.current} ##{@killed})"
69
+ $stderr.puts "in master thread #{Thread.current}:"
89
70
  $stderr.puts caller
90
- else
91
- $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid}"+
92
- " ##{@killed})"
71
+ if @thread
72
+ $stderr.puts "in branch thread #{@thread}:"
73
+ $stderr.puts @thread.backtrace.join("\n")
74
+ end
93
75
  end
94
76
  $stderr.puts "Exiting..."
95
77
  @no_more_run = true
@@ -111,33 +93,37 @@ module Pwrake
111
93
  @option.host_map.each do |sub_host, wk_hosts|
112
94
  @hdl_set << hdl = setup_branch_handler(sub_host)
113
95
  Fiber.new do
114
- hdl.put_line "host_list_begin"
115
- wk_hosts.each do |host_info|
116
- name = host_info.name
117
- ncore = host_info.ncore
118
- host_id = host_info.id
119
- Log.debug "connecting #{name} ncore=#{ncore} id=#{host_id}"
120
- hdl.put_line "host:#{host_id} #{name} #{ncore}"
121
- @channel_by_hostid[host_id] = hdl
122
- @hostinfo_by_id[host_id] = host_info
123
- end
124
- hdl.put_line "host_list_end"
125
- while s = hdl.get_line
126
- case s
127
- when /^ncore:done$/
128
- break
129
- when /^ncore:(\d+):(\d+)$/
130
- id, ncore = $1.to_i, $2.to_i
131
- Log.debug "worker_id=#{id} ncore=#{ncore}"
132
- @hostinfo_by_id[id].set_ncore(ncore)
133
- sum_ncore += ncore
134
- when /^exited$/
135
- raise RuntimeError,"Unexpected branch exit"
136
- else
137
- msg = "#{hdl.host}:#{s.inspect}"
138
- raise RuntimeError,"invalid return: #{msg}"
96
+ hdl.put_line "host_list_begin"
97
+ wk_hosts.each do |host_info|
98
+ name = host_info.name
99
+ ncore = host_info.ncore
100
+ host_id = host_info.id
101
+ Log.debug "connecting #{name} ncore=#{ncore} id=#{host_id}"
102
+ hdl.put_line "host:#{host_id} #{name} #{ncore}"
103
+ @channel_by_hostid[host_id] = hdl
104
+ @hostinfo_by_id[host_id] = host_info
105
+ end
106
+ hdl.put_line "host_list_end"
107
+ while s = hdl.get_line
108
+ case s
109
+ when /^ncore:done$/
110
+ break
111
+ when /^ncore:(\d+):(\d+)$/
112
+ id, ncore = $1.to_i, $2.to_i
113
+ Log.debug "worker_id=#{id} ncore=#{ncore}"
114
+ @hostinfo_by_id[id].set_ncore(ncore)
115
+ sum_ncore += ncore
116
+ when /^ip:(\d+):(\S+)$/
117
+ id, ipa = $1.to_i, $2
118
+ Log.debug "worker_id=#{id} ip=#{ipa}"
119
+ @hostinfo_by_id[id].set_ip(ipa)
120
+ when /^exited$/
121
+ raise RuntimeError,"Unexpected branch exit"
122
+ else
123
+ msg = "#{hdl.host}:#{s.inspect}"
124
+ raise RuntimeError,"invalid return: #{msg}"
125
+ end
139
126
  end
140
- end
141
127
  end.resume
142
128
  end
143
129
  @selector.run
@@ -146,7 +132,7 @@ module Pwrake
146
132
  @option.total_cores = sum_ncore
147
133
  @hostinfo_by_id.each do |id,host|
148
134
  if ncore = @hostinfo_by_id[id].idle_cores
149
- Log.info "#{host} id=#{id} ncore=#{ncore}"
135
+ Log.info "#{host.name} id=#{id} ncore=#{ncore}"
150
136
  else
151
137
  @hostinfo_by_id.delete(id)
152
138
  end
@@ -226,6 +212,7 @@ module Pwrake
226
212
  @branch_setup_thread.join
227
213
  send_task_to_idle_core
228
214
  #
215
+ n_retry = @option["RETRY"]
229
216
  create_fiber(@hdl_set) do |hdl|
230
217
  while s = hdl.get_line
231
218
  Log.debug "Master:recv #{s.inspect} from branch[#{hdl.host}]"
@@ -249,7 +236,7 @@ module Pwrake
249
236
  if host_info
250
237
  continuous_fail = host_info.task_result(tw.status)
251
238
  Log.debug "task=#{tw.name} continuous_fail=#{continuous_fail}"
252
- if continuous_fail > @n_retry && @hostinfo_by_id.size > 1
239
+ if continuous_fail > n_retry && @hostinfo_by_id.size > 1
253
240
  # retire this host
254
241
  drop_host(host_info)
255
242
  Log.warn("retired host:#{host_info.name} due to continuous fail")
@@ -313,6 +300,7 @@ module Pwrake
313
300
  s = "#{hid}:#{tw.task_id}:#{tw.name}"
314
301
  @channel_by_hostid[hid].put_line(s)
315
302
  tw.exec_host = host_info.name
303
+ tw.exec_host_id = hid
316
304
  else
317
305
  tw.status = "end"
318
306
  task_end(tw,host_info) # @idle_cores.increase(..
@@ -339,8 +327,7 @@ module Pwrake
339
327
  j = i
340
328
  while tw = pool.deq()
341
329
  Log.debug "postproc##{j} deq=#{tw.name}"
342
- loc = postproc.run(tw)
343
- tw.postprocess(loc)
330
+ tw.postprocess(postproc)
344
331
  pool.count_down
345
332
  @hostinfo_by_taskname.delete(tw.name)
346
333
  tw.retry_or_subsequent unless @exited
@@ -9,14 +9,6 @@ module Pwrake
9
9
  @role.option
10
10
  end
11
11
 
12
- def logger
13
- @role.logger
14
- end
15
-
16
- def task_logger
17
- @role.task_logger
18
- end
19
-
20
12
  def task_queue
21
13
  @role.task_queue
22
14
  end
@@ -27,7 +19,6 @@ module Pwrake
27
19
  init("pwrake") # <- parse options here
28
20
  @role = @master = Master.new
29
21
  t = Time.now
30
- @master.init
31
22
  @master.setup_branches
32
23
  load_rakefile
33
24
  begin
@@ -39,6 +30,10 @@ module Pwrake
39
30
  rescue Exception => e
40
31
  # Exit with error message
41
32
  m = Log.bt(e)
33
+ if @master.thread
34
+ m += "\nIn branch thread #{@master.thread}:\n "
35
+ m += @master.thread.backtrace.join("\n ")
36
+ end
42
37
  Log.fatal m
43
38
  $stderr.puts m
44
39
  @master.signal_trap("INT")
@@ -0,0 +1,76 @@
1
+ begin
2
+ require 'rake'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'rake'
6
+ end
7
+
8
+ require "pwrake/version"
9
+ require "pwrake/master/master_application"
10
+ require "pwrake/task/task_manager"
11
+ require "pwrake/task/task_algorithm"
12
+ require "pwrake/branch/branch_application"
13
+
14
+ class Rake::Application
15
+ include Pwrake::BranchApplication
16
+ prepend Pwrake::MasterApplication
17
+ prepend Pwrake::TaskManager
18
+ end
19
+
20
+ class Rake::Task
21
+ include Pwrake::TaskAlgorithm
22
+ end
23
+
24
+ module Pwrake
25
+
26
+ class CommunicatorSet
27
+
28
+ def init_hosts
29
+ comm_size = MPipe::Comm.size
30
+ @ipaddr_to_rank = {}
31
+ @rank_to_ipaddr = Array.new(comm_size)
32
+ # read ip addresses
33
+ (1..comm_size-1).each do |rank|
34
+ io = MPipe.new(rank)
35
+ sz, = io.read(4).unpack("V")
36
+ s = io.read(sz)
37
+ v = s.split('|')
38
+ v.each{|a| @ipaddr_to_rank[a] = rank}
39
+ @rank_to_ipaddr[rank] = v
40
+ end
41
+ Log.debug "@ipaddr_to_rank="+@ipaddr_to_rank.inspect
42
+ end
43
+
44
+ attr_reader :ipaddr_to_rank
45
+ attr_reader :rank_to_ipaddr
46
+ end
47
+
48
+ class Communicator
49
+
50
+ def setup_pipe(worker_code)
51
+ ipa = IPSocket.getaddress(@host)
52
+ if %w[127.0.0.1 ::1].include?(ipa)
53
+ ipa = IPSocket.getaddress(Socket.gethostname)
54
+ end
55
+ @rank = @set.ipaddr_to_rank[ipa]
56
+ if @rank.nil?
57
+ raise RuntimeError,"no rank for #{@host}"
58
+ end
59
+ mp = MPipe.new(@rank)
60
+ @ior = mp
61
+ @ioe,w1 = IO.pipe
62
+ @iow = mp
63
+ @pid = nil
64
+ w1.close
65
+ @ipaddr = @set.rank_to_ipaddr[@rank]
66
+ end
67
+ end
68
+
69
+ end
70
+
71
+ Pwrake::Branch.io_class = MPipe
72
+
73
+ # does NOT exit when writing to broken pipe
74
+ Signal.trap(:PIPE, "IGNORE")
75
+
76
+ Rake.application.run
@@ -0,0 +1,42 @@
1
+ require "parallel/processor_count.rb"
2
+ require "pwrake/nbio"
3
+ require "pwrake/branch/fiber_queue"
4
+ require "pwrake/worker/writer"
5
+ require "pwrake/worker/log_executor"
6
+ require "pwrake/worker/executor"
7
+ require "pwrake/worker/invoker"
8
+ require "pwrake/worker/shared_directory"
9
+ require "pwrake/worker/gfarm_directory"
10
+
11
+ require "thread"
12
+ require "fileutils"
13
+ require "timeout"
14
+ require "socket"
15
+
16
+ module Pwrake
17
+
18
+ class Invoker
19
+
20
+ def get_io
21
+ # get IP addresses
22
+ v = Socket.getifaddrs
23
+ v = v.select{|a| a.addr.ip? && (a.flags & Socket::IFF_MULTICAST != 0)}
24
+ v = v.map{|a| a.addr.ip_address}
25
+ s = v.join('|')
26
+ # write IP addresses
27
+ iow = MPipe.new(0)
28
+ iow.write([s.size].pack("V"))
29
+ iow.write(s)
30
+ iow.flush
31
+ # returns IO, $stdin, $stdout
32
+ [MPipe, MPipe.new(0), MPipe.new(0)]
33
+ end
34
+
35
+ def send_ipaddr
36
+ # do nothing
37
+ end
38
+
39
+ end
40
+ end
41
+
42
+ require "pwrake/worker/worker_main"
@@ -1,5 +1,7 @@
1
1
  require "fiber"
2
2
 
3
+ $debug=false
4
+
3
5
  module Pwrake
4
6
  module NBIO
5
7
 
@@ -8,10 +10,11 @@ module NBIO
8
10
 
9
11
  class Selector
10
12
 
11
- def initialize
13
+ def initialize(io_class=IO)
12
14
  @reader = {}
13
15
  @writer = {}
14
16
  @running = false
17
+ @io_class = io_class
15
18
  end
16
19
 
17
20
  attr_reader :reader, :writer
@@ -52,9 +55,12 @@ module NBIO
52
55
  @running = true
53
56
  init_heartbeat if timeout
54
57
  while @running && !empty?
55
- if $debug
56
- Log.debug "Selector#run: "+caller[0..1].join(", ")+
57
- " @reader.size=#{@reader.size} @writer.size=#{@writer.size}"
58
+ if $debug && defined? Log
59
+ rd_insp = @reader.map{|k,v|
60
+ "%s=>%s,%s" % [k.inspect,v.class.inspect,v.waiter.inspect]
61
+ }.join(",")
62
+ Log.debug "Selector#run:\n "+caller[0..1].join("\n ")+
63
+ "\n @reader={#{rd_insp}}\n @writer.size=#{@writer.size}"
58
64
  $stderr.puts "Selector#run: "+caller[0]
59
65
  end
60
66
  run_select(timeout)
@@ -67,7 +73,7 @@ module NBIO
67
73
  private
68
74
  def run_select(timeout)
69
75
  to = (timeout) ? timeout*0.75 : nil
70
- r, w, = IO.select(@reader.keys,@writer.keys,[],to)
76
+ r, w, = @io_class.select(@reader.keys,@writer.keys,[],to)
71
77
  check_heartbeat(r,timeout) if timeout
72
78
  r.each{|io| @reader[io].call} if r
73
79
  w.each{|io| @writer[io].call} if w
@@ -76,7 +82,7 @@ module NBIO
76
82
  @reader.keys.each do |io|
77
83
  if io.closed?
78
84
  m = "#{em} io=#{io}"
79
- Log.error(m)
85
+ Log.error(m) if defined? Log
80
86
  $stderr.puts m
81
87
  hdl = @reader.delete(io)
82
88
  hdl.error(e)
@@ -85,13 +91,12 @@ module NBIO
85
91
  @writer.keys.each do |io|
86
92
  if io.closed?
87
93
  m = "#{em} io=#{io}"
88
- Log.error(m)
94
+ Log.error(m) if defined? Log
89
95
  $stderr.puts m
90
96
  hdl = @writer.delete(io)
91
97
  hdl.error(e)
92
98
  end
93
99
  end
94
- #raise e
95
100
  end
96
101
 
97
102
  def init_heartbeat
@@ -169,6 +174,12 @@ module NBIO
169
174
  flush unless buffered
170
175
  end
171
176
 
177
+ alias print :write
178
+
179
+ def puts(s)
180
+ write(s+"\n")
181
+ end
182
+
172
183
  def flush
173
184
  until @pool.empty?
174
185
  len = _write(@pool[0])
@@ -221,10 +232,11 @@ module NBIO
221
232
  @io = io
222
233
  @waiter = []
223
234
  @buf = ''
235
+ @eof = false
224
236
  @sep = "\n"
225
237
  @chunk_size = 8192
226
238
  end
227
- attr_reader :io
239
+ attr_reader :io, :waiter
228
240
  attr_accessor :check_timeout
229
241
 
230
242
  # call from Selector#run
@@ -240,19 +252,26 @@ module NBIO
240
252
  @buf.slice!(0, index+@sep.bytesize)
241
253
  rescue EOFError => e
242
254
  if @buf.empty?
243
- #return nil
244
255
  raise e
245
256
  else
246
257
  buf = @buf; @buf = ''
247
258
  return buf
248
259
  end
249
- #rescue IO::WaitReadable
250
260
  end
251
261
 
252
262
  # call from Reader#_read and FiberReaderQueue#deq
253
263
  def select_io
254
- @selector.add_reader(self) if @waiter.empty?
264
+ if @waiter.empty?
265
+ @selector.add_reader(self)
266
+ else
267
+ if @selector.reader[@io] != self
268
+ raise RuntimeError, "access from multiple Fiber"
269
+ end
270
+ end
255
271
  @waiter.push(Fiber.current)
272
+ if $debug && defined? Log
273
+ Log.debug("Reader#select_io: #{Fiber.current.inspect}\n "+caller.join("\n "))
274
+ end
256
275
  Fiber.yield
257
276
  ensure
258
277
  @waiter.delete(Fiber.current)
@@ -271,11 +290,16 @@ module NBIO
271
290
  @halting = false
272
291
  end
273
292
 
293
+ def eof?
294
+ @eof && @buf.empty?
295
+ end
296
+
274
297
  # from Bartender
275
298
 
276
299
  def _read(sz)
277
300
  @io.read_nonblock(sz)
278
301
  rescue EOFError
302
+ @eof = true
279
303
  nil
280
304
  rescue IO::WaitReadable
281
305
  return nil if @halting
@@ -313,6 +337,7 @@ module NBIO
313
337
  end
314
338
 
315
339
  alias get_line :readln
340
+ alias gets :readln
316
341
 
317
342
  end
318
343
 
@@ -368,7 +393,6 @@ module NBIO
368
393
  rescue EOFError
369
394
  halt
370
395
  rescue IO::WaitReadable
371
- #p IO::WaitReadable
372
396
  end
373
397
 
374
398
  def error(e)
@@ -378,6 +402,7 @@ module NBIO
378
402
  end
379
403
 
380
404
  def halt
405
+ Log.debug("Handler.halt") if defined? Log
381
406
  @queue.each{|q| q.halt}
382
407
  @default_queue.halt
383
408
  end
@@ -430,6 +455,7 @@ module NBIO
430
455
  @writer = writer
431
456
  @reader = reader
432
457
  @host = hostname
458
+ @exited = false
433
459
  end
434
460
  attr_reader :reader, :writer, :host
435
461
 
@@ -442,8 +468,7 @@ module NBIO
442
468
  end
443
469
 
444
470
  def put_kill(sig="INT")
445
- #@writer.put_line("kill:#{sig}")
446
- @writer.io.puts("kill:#{sig}")
471
+ @writer.io.write("kill:#{sig}\n")
447
472
  @writer.io.flush
448
473
  end
449
474
 
@@ -452,24 +477,31 @@ module NBIO
452
477
  end
453
478
 
454
479
  def exit
455
- exit_msg = "exited"
456
480
  iow = @writer.io
457
- Log.debug "Handler#exit iow=#{iow.inspect}"
458
- return if iow.closed?
481
+ Log.debug "Handler#exit iow=#{iow.inspect}" if defined? Log
482
+ if @exited
483
+ Log.debug "Handler<##{object_id}#exit multiple called" if defined? Log
484
+ bt = caller.join("\n ")
485
+ Log.debug "Handler#exit bt=\n #{bt}" if defined? Log
486
+ return
487
+ end
488
+ @exited = true
489
+ exit_msg = "exited"
490
+ #return if iow.closed?
459
491
  @writer.put_line "exit"
460
- while line = @reader.get_line
461
- # here might receive "retire:0" from branch...
462
- line.chomp!
463
- Log.debug "Handler#exit: #{line} host=#{@host}"
464
- return if line == exit_msg
492
+ Log.debug "Handler#exit: end: @writer.put_line \"exit\"" if defined? Log
493
+ #
494
+ if @reader.class == Reader # MultiReader not work
495
+ while line = @reader.get_line
496
+ # here might receive "retire:0" from branch...
497
+ line.chomp!
498
+ Log.debug "Handler#exit: #{line} host=#{@host}" if defined? Log
499
+ return if line == exit_msg
500
+ end
465
501
  end
466
502
  rescue Errno::EPIPE => e
467
- if Rake.application.options.debug
468
- #$stderr.puts "Errno::EPIPE in #{self.class}#exit iow=#{iow.inspect}"
469
- #$stderr.puts e.backtrace.join("\n")
470
- end
471
503
  Log.error "Errno::EPIPE in #{self.class}.exit iow=#{iow.inspect}\n"+
472
- e.backtrace.join("\n")
504
+ e.backtrace.join("\n") if defined? Log
473
505
  end
474
506
 
475
507
  def halt
@@ -496,38 +528,4 @@ module NBIO
496
528
  end
497
529
 
498
530
  end
499
-
500
- #------------------------------------------------------------------
501
-
502
- if __FILE__ == $0
503
- iosel = NBIO::Selector.new
504
-
505
- io = 5.times.map do
506
- IO.pipe
507
- end
508
-
509
- io.each do |ior,iow|
510
- rd = NBIO::MultiReader.new(iosel,ior,1)
511
- Fiber.new do
512
- while s = rd.get_line(0)
513
- puts s
514
- end
515
- puts "fiber end"
516
- end.resume
517
- end
518
-
519
- io.each do |ior,iow|
520
- wr = NBIO::Writer.new(iosel,iow)
521
- Fiber.new do
522
- 2000.times do |i|
523
- wr.put_line("test str#{i}"+"-"*80,0)
524
- #iow.puts "0:test str#{i}"+"-"*80
525
- end
526
- #iow.print "hage"
527
- iow.close
528
- end.resume
529
- end
530
-
531
- iosel.run
532
- end
533
531
  end