pwrake 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +22 -9
  3. data/bin/gfwhere-pipe +33 -9
  4. data/bin/pwrake +5 -2
  5. data/bin/pwrake_branch +5 -3
  6. data/lib/pwrake/branch/branch.rb +95 -86
  7. data/lib/pwrake/branch/branch_application.rb +4 -0
  8. data/lib/pwrake/branch/communicator.rb +173 -0
  9. data/lib/pwrake/branch/communicator_set.rb +100 -0
  10. data/lib/pwrake/branch/fiber_queue.rb +10 -0
  11. data/lib/pwrake/branch/shell.rb +68 -24
  12. data/lib/pwrake/branch/shell_profiler.rb +2 -0
  13. data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
  14. data/lib/pwrake/logger.rb +5 -0
  15. data/lib/pwrake/master/master.rb +190 -87
  16. data/lib/pwrake/master/master_application.rb +8 -0
  17. data/lib/pwrake/nbio.rb +525 -0
  18. data/lib/pwrake/option/host_map.rb +36 -4
  19. data/lib/pwrake/option/option.rb +7 -1
  20. data/lib/pwrake/option/option_filesystem.rb +13 -3
  21. data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
  22. data/lib/pwrake/queue/queue_array.rb +31 -11
  23. data/lib/pwrake/queue/task_queue.rb +15 -18
  24. data/lib/pwrake/report/report.rb +2 -0
  25. data/lib/pwrake/task/task_algorithm.rb +4 -1
  26. data/lib/pwrake/task/task_manager.rb +2 -0
  27. data/lib/pwrake/task/task_property.rb +1 -0
  28. data/lib/pwrake/task/task_wrapper.rb +40 -21
  29. data/lib/pwrake/version.rb +1 -1
  30. data/lib/pwrake/worker/invoker.rb +4 -29
  31. data/pwrake.gemspec +3 -2
  32. metadata +24 -12
  33. data/lib/pwrake/branch.rb +0 -22
  34. data/lib/pwrake/branch/worker_communicator.rb +0 -104
  35. data/lib/pwrake/iomux/channel.rb +0 -70
  36. data/lib/pwrake/iomux/handler.rb +0 -124
  37. data/lib/pwrake/iomux/handler_set.rb +0 -35
  38. data/lib/pwrake/iomux/runner.rb +0 -62
  39. data/lib/pwrake/master.rb +0 -30
@@ -1,3 +1,6 @@
1
+ require "pwrake/logger"
2
+ require "pwrake/branch/branch"
3
+
1
4
  module Pwrake
2
5
 
3
6
  # The TaskManager module is a mixin for managing tasks.
@@ -36,6 +39,7 @@ module Pwrake
36
39
  def run_branch_in_thread(r,w,opts)
37
40
  #standard_exception_handling do
38
41
  @branch = Branch.new(opts,r,w)
42
+ @branch.init_logger
39
43
  begin
40
44
  @branch.run
41
45
  rescue => e
@@ -0,0 +1,173 @@
1
+ module Pwrake
2
+
3
# One logical channel to a worker: lines are sent through the shared
# writer tagged with this channel's id, and received from the queue that
# was handed to this channel at construction time.
class CommChannel

  attr_reader :host, :id

  # host   - name of the worker host this channel talks to
  # id     - channel id passed along with every line written
  # queue  - object answering #deq and #halt (source of incoming lines)
  # writer - object answering #put_line(line, id) and #halt
  # ios    - IO objects closed by the fault-injection hook in #put_line
  def initialize(host,id,queue,writer,ios=[])
    @host, @id = host, id
    @queue, @writer = queue, writer
    @ios = ios
  end

  # Sends one line to the worker.  When the global $cause_fault is set,
  # it is cleared and every IO in @ios is closed first, so the write
  # that follows runs against closed descriptors.
  # NOTE(review): $cause_fault looks like a test-only hook -- confirm.
  def put_line(s)
    if $cause_fault
      $cause_fault = nil
      Log.warn("closing writer io caller=\n#{caller.join("\n")}")
      @ios.each(&:close)
    end
    @writer.put_line(s,@id)
  end

  # Returns the next item taken from the incoming queue.
  def get_line
    @queue.deq
  end

  # Halts the incoming queue first, then the writer.
  def halt
    @queue.halt
    @writer.halt
  end
end
33
+
34
# Connection to a single worker host.  #connect spawns the worker process
# (directly for local hosts, through ssh otherwise), attaches NBIO
# readers/writers to its pipes and performs the "ncore" handshake.
# #dropout tears the connection down and removes this communicator from
# the owning set.
class Communicator

  # Raised when the worker cannot be started or sends unexpected input.
  class ConnectError < IOError; end

  attr_reader :id, :host, :ncore, :channel
  attr_reader :reader, :writer, :handler
  attr_reader :shells

  # set      - owning set; must answer #selector and #delete(communicator)
  # id       - identifier of this communicator within the set
  # host     - worker host name
  # ncore    - requested core count (may be nil); replaced by the value
  #            reported by the worker during #connect
  # selector - object whose #heartbeat is called from #common_line
  # option   - option hash; :ssh_option is read here, and the whole hash
  #            is marshalled to the worker
  def initialize(set,id,host,ncore,selector,option)
    @set = set
    @id = id
    @host = host
    @ncore = @ncore_given = ncore
    @selector = selector
    @option = option
    @shells = {}
  end

  def inspect
    "#<#{self.class} @id=#{@id},@host=#{@host},@ncore=#{@ncore}>"
  end

  # Allocates a new queue on the multiplexed reader and wraps it, together
  # with the shared writer, in a CommChannel.
  def new_channel
    i,q = @reader.new_queue
    CommChannel.new(@host,i,q,@writer,[@ior,@iow,@ioe])
  end

  # Starts the worker process, sends it +worker_code+ followed by the
  # marshalled core count and options, then reads lines until the worker
  # reports "ncore:<n>".
  #
  # Returns false once the core count has been received, or when
  # #common_line asks to stop.  Any StandardError (including
  # ConnectError) is handed to #dropout instead of propagating.
  def connect(worker_code)
    # ARGF.read(n) counts bytes, so the bootstrap must be sized with
    # bytesize; String#size would undercount multibyte worker code.
    rb_cmd = "ruby -e 'eval ARGF.read(#{worker_code.bytesize})'"
    if ['localhost','localhost.localdomain','127.0.0.1'].include? @host
      cmd = rb_cmd
    else
      cmd = "ssh -x -T #{@option[:ssh_option]} #{@host} \"#{rb_cmd}\""
    end
    #
    @ior,w0 = IO.pipe
    @ioe,w1 = IO.pipe
    r2,@iow = IO.pipe
    begin
      @pid = Kernel.spawn(cmd,:pgroup=>true,:out=>w0,:err=>w1,:in=>r2)
    ensure
      # The child-side ends are useless in this process; close them even
      # when spawn raises so the descriptors are not leaked.
      [w0,w1,r2].each{|io| io.close}
    end
    sel = @set.selector
    @reader = NBIO::MultiReader.new(sel,@ior)
    @rd_err = NBIO::Reader.new(sel,@ioe)
    @writer = NBIO::Writer.new(sel,@iow)
    @handler = NBIO::Handler.new(@reader,@writer,@host)
    #
    @writer.write(worker_code)
    @writer.write(Marshal.dump(@ncore))
    @writer.write(Marshal.dump(@option))
    # read ncore
    while s = @reader.get_line
      if /^ncore:(.*)$/ =~ s
        a = $1
        Log.debug "ncore=#{a} @#{@host}"
        if /^(\d+)$/ =~ a
          @ncore = $1.to_i
          return false
        else
          raise ConnectError, "invalid for ncore: #{a.inspect}"
        end
      else
        return false if !common_line(s)
      end
    end
    raise ConnectError, "fail to connect #{cmd.inspect}"
  rescue => e
    dropout(e)
  end

  # Handles input that is not specific to one protocol state.
  #
  # Returns true when reading should continue, false after "exited" or
  # after an Exception object was received (which also triggers #dropout).
  # Raises ConnectError for anything that is neither String nor Exception.
  def common_line(s)
    x = "Communicator#common_line(id=#{@id},host=#{@host})"
    case s
    when /^heartbeat$/
      Log.debug "#{x}: #{s.inspect}"
      @selector.heartbeat(@reader.io)
    when /^exited$/
      Log.debug "#{x}: #{s.inspect}"
      return false
    when /^log:(.*)$/
      Log.info "#{x}: log>#{$1}"
    when String
      Log.warn "#{x}: out>#{s.inspect}"
    when Exception
      Log.warn "#{x}: err>#{s.class}: #{s.message}"
      dropout(s)
      return false
    else
      raise ConnectError, "#{x}: invalid for read: #{s.inspect}"
    end
    true
  end

  # Asks every shell registered in @shells to finish its task queue.
  def finish_shells
    @shells.keys.each{|sh| sh.finish_task_q}
  end

  # Shuts this communicator down: finishes the shells, tells the handler
  # to exit, collects whatever the worker wrote to its error stream, logs
  # it together with +exc+ (if given) and finally removes this
  # communicator from the owning set.
  def dropout(exc=nil)
    # Error output
    err_out = []
    begin
      finish_shells
      # @handler and @rd_err only exist once #connect got past spawn;
      # without the guards a failed spawn would log a spurious
      # NoMethodError here in addition to the real cause.
      @handler.exit if @handler
      if @rd_err
        while s = @rd_err.get_line
          err_out << s
        end
      end
    rescue => e
      m = Log.bt(e)
      #$stderr.puts m
      Log.error(m)
    end
    # Error output
    if !err_out.empty?
      $stderr.puts err_out.join("\n")
      Log.error((["process error output:"]+err_out).join("\n "))
    end
    # Exception
    if exc
      m = Log.bt(exc)
      #$stderr.puts m
      Log.error m
    end
  ensure
    @set.delete(self)
  end

  # Closes the worker's stdin and prints whatever remains on its stdout
  # and stderr pipes, reading each until EOF.
  def finish
    @iow.close
    while s=@ior.gets
      puts "out=#{s.chomp}"
    end
    while s=@ioe.gets
      puts "err=#{s.chomp}"
    end
  end

end
173
+ end
@@ -0,0 +1,100 @@
1
+ require "forwardable"
2
+ require "pwrake/branch/communicator"
3
+
4
+ module Pwrake
5
# Collection of Communicator objects keyed by id.  Builds the
# communicators from the host list sent by the master, drives the shared
# selector, and keeps track of hosts that were removed while it ran.
class CommunicatorSet

  extend Forwardable

  # master_rd - reader answering #get_line, used to receive the host list
  # selector  - selector shared by all communicators; must answer #run
  # option    - option hash; when :heartbeat is set, the selector is run
  #             with a timeout of that value plus 15
  def initialize(master_rd,selector,option)
    @master_rd = master_rd
    @selector = selector
    @option = option
    @communicators = {}
    @initial_communicators = []
    # Must exist before #run is ever called: #delete appends to it and is
    # reachable (through Communicator#dropout) before the first #run.
    @error_host = []
    if hb = @option[:heartbeat]
      @heartbeat_timeout = hb + 15
    end
  end

  attr_reader :selector

  def_delegators :@communicators, :each, :each_value, :values, :size

  # Reads the host list from the master inside a Fiber and creates one
  # Communicator per "host:<id> <hostname> <ncore>" line, then runs the
  # selector.  The list must start with "host_list_begin" and ends at
  # "host_list_end"; any other line raises RuntimeError.
  def create_communicators
    Fiber.new do
      s = @master_rd.get_line
      # to_s keeps a nil (nothing received) from failing with
      # NoMethodError before the descriptive error below is raised.
      first = s.to_s.chomp
      if first != "host_list_begin"
        raise "Branch#setup_worker: recv=#{first} expected=host_list_begin"
      end

      while s = @master_rd.get_line
        s.chomp!
        break if s == "host_list_end"
        if /^host:(\d+) (\S+) ([+-]?\d+)?$/ =~ s
          id, host, ncore = $1,$2,$3
          ncore &&= ncore.to_i
          @communicators[id] = Communicator.new(self,id,host,ncore,@selector,@option)
        else
          raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
        end
      end
    end.resume
    @selector.run(@heartbeat_timeout)
    @initial_communicators = @communicators.dup
  end

  # Registers +comm+ under its id.
  def add(comm)
    @communicators[comm.id] = comm
  end

  # Removes +comm+ and records its host as retired.
  def delete(comm)
    @communicators.delete(comm.id)
    @error_host << comm.host
  end

  # Drops the communicator with the given id, if there is one.
  def drop(id)
    comm = @communicators[id]
    Log.debug "drop:id=#{id} comm=#{comm.inspect} @communicators.keys=#{@communicators.keys}"
    comm.dropout if comm
  end

  # Drops every communicator.  Iterates over a snapshot of the keys
  # because dropout removes entries from @communicators.
  def drop_all
    Log.debug "drop_all"
    @communicators.keys.each do |id|
      @communicators[id].dropout
    end
  end

  def finish_shells
    Log.debug "finish_shells"
    @communicators.keys.each do |id|
      @communicators[id].finish_shells
    end
  end

  # Runs the selector and logs which hosts were retired when the number
  # of communicators changed while it ran.  +message+ names the phase for
  # the log line.
  def run(message)
    @error_host = []
    n1 = @communicators.size
    @selector.run(@heartbeat_timeout)
    n2 = @communicators.size
    if n1 != n2
      Log.info "# of communicators: #{n1}->#{n2} during #{message.inspect}"
      Log.info "retired hosts=[#{@error_host.join(',')}]"
    end
  end

  # Handlers of all current communicators.
  def handler_set
    @communicators.each_value.map{|comm| comm.handler}
  end

  def kill(sig)
    NBIO::Handler.kill(handler_set,sig)
  end

  def exit
    NBIO::Handler.exit(handler_set)
  end

end
100
+ end
@@ -2,6 +2,9 @@ require 'fiber'
2
2
 
3
3
  module Pwrake
4
4
 
5
# Error raised by FiberQueue, e.g. when enq is called on a queue that
# has already been finished.
class FiberQueueError < StandardError; end
7
+
5
8
  class FiberQueue
6
9
 
7
10
  def initialize
@@ -11,6 +14,9 @@ module Pwrake
11
14
  end
12
15
 
13
16
  def enq(x)
17
+ if @finished
18
+ raise FiberQueueError,"cannot enq to already finished queue"
19
+ end
14
20
  @q.push(x)
15
21
  f = @waiter.shift
16
22
  f.resume if f
@@ -25,6 +31,10 @@ module Pwrake
25
31
  return @q.shift
26
32
  end
27
33
 
34
# Takes the item at the head of the queue without suspending the
# calling fiber; the result is whatever @q.shift returns
# (presumably nil for an empty queue -- @q looks like an Array).
def deq_nonblock
  head = @q.shift
  head
end
37
+
28
38
  def finish
29
39
  @finished = true
30
40
  while f = @waiter.shift
@@ -1,3 +1,5 @@
1
+ require 'pwrake/branch/shell_profiler'
2
+
1
3
  module Pwrake
2
4
 
3
5
  class DummyMutex
@@ -20,15 +22,16 @@ module Pwrake
20
22
  BY_FIBER[Fiber.current]
21
23
  end
22
24
 
23
- def initialize(chan,task_q,opt={})
25
+ def initialize(chan,comm,task_q,opt={})
24
26
  @chan = chan
25
- @host = chan.handler.host
27
+ @id = chan.id
28
+ @host = chan.host
29
+ @comm = comm
26
30
  @task_q = task_q
27
31
  @lock = DummyMutex.new
28
- @id = chan.id
29
- #
30
32
  @option = opt
31
33
  @work_dir = @option[:work_dir] || Dir.pwd
34
+ @comm.shells[self] = true
32
35
  end
33
36
 
34
37
  attr_reader :id, :host, :status, :profile
@@ -36,11 +39,12 @@ module Pwrake
36
39
  def open
37
40
  if @opened
38
41
  Log.warn "already opened: host=#{@host} id=#{@id}"
42
+ return
39
43
  end
44
+ @opened = true
40
45
  _puts("open")
41
46
  if (s = _gets) == "open"
42
47
  OPEN_LIST[__id__] = self
43
- @opened = true
44
48
  true
45
49
  else
46
50
  Log.error("Shell#open failed: recieve #{s.inspect}")
@@ -48,19 +52,24 @@ module Pwrake
48
52
  end
49
53
  end
50
54
 
51
- def close
55
+ def exit
52
56
  if !@opened
53
- Log.warn "already closed: host=#{@host} id=#{@id}"
57
+ Log.debug "already exited: host=#{@host} id=#{@id}"
58
+ return
54
59
  end
60
+ @opened = false
55
61
  _puts("exit")
56
62
  if (s = _gets) == "exit"
57
63
  OPEN_LIST.delete(__id__)
58
- @opened = false
64
+ Log.debug("Shell#exit: recieve #{s.inspect}")
59
65
  true
60
66
  else
61
- Log.warn("Shell#close failed: recieve #{s.inspect}")
67
+ Log.debug("Shell#exit: recieve #{s.inspect}")
62
68
  false
63
69
  end
70
+ rescue IOError,Errno::EPIPE => e
71
+ Log.debug("Shell#exit: #{Log.bt(e)}")
72
+ false
64
73
  end
65
74
 
66
75
  def set_current_task(task_id,task_name)
@@ -106,20 +115,17 @@ module Pwrake
106
115
 
107
116
  def _gets
108
117
  s = @chan.get_line
109
- #Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
110
- if s.nil?
111
- begin
112
- raise
113
- rescue => e
114
- Log.debug e
115
- end
118
+ Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
119
+ case s
120
+ when Exception
121
+ @chan.halt
122
+ Log.error Log.bt(s)
116
123
  end
117
124
  s
118
125
  end
119
126
 
120
127
  def _system(cmd)
121
128
  @cmd = cmd
122
- #raise "@chan is closed" if @chan.closed?
123
129
  @lock.synchronize do
124
130
  _puts(cmd)
125
131
  status = io_read_loop{}
@@ -129,11 +135,10 @@ module Pwrake
129
135
 
130
136
  def _backquote(cmd)
131
137
  @cmd = cmd
132
- #raise "@chan is closed" if @chan.closed?
133
138
  a = []
134
139
  @lock.synchronize do
135
140
  _puts(cmd)
136
- status = io_read_loop{|x| a << x}
141
+ @status = io_read_loop{|x| a << x}
137
142
  end
138
143
  a.join("\n")
139
144
  end
@@ -141,9 +146,9 @@ module Pwrake
141
146
  def _execute(cmd,quote=nil,&block)
142
147
  @cmd = cmd
143
148
  if !@opened
144
- raise "closed"
149
+ raise "non opened"
145
150
  end
146
- status = nil
151
+ @status = nil
147
152
  start_time = Time.now
148
153
  begin
149
154
  _puts(cmd)
@@ -184,6 +189,18 @@ module Pwrake
184
189
  end
185
190
  return status
186
191
  end
192
+ when "exit"
193
+ msg = "Shell#io_read_loop: exit"
194
+ $stderr.puts(msg)
195
+ Log.error(msg)
196
+ @chan.halt
197
+ return "exit"
198
+ when IOError
199
+ @chan.halt
200
+ return "ioerror"
201
+ when NBIO::TimeoutError
202
+ @chan.halt
203
+ return "timeout"
187
204
  end
188
205
  msg = "Shell#io_read_loop: Invalid result: #{s.inspect}"
189
206
  $stderr.puts(msg)
@@ -193,7 +210,8 @@ module Pwrake
193
210
 
194
211
  public
195
212
 
196
- def create_fiber(hdl)
213
+ def create_fiber(master_w)
214
+ @master_w = master_w
197
215
  if !@opened
198
216
  Log.warn "not opened: host=#{@host} id=#{@id}"
199
217
  end
@@ -218,13 +236,39 @@ module Pwrake
218
236
  Rake.application.display_error_message(e)
219
237
  Log.error e
220
238
  result = "taskfail:#{@id}:#{task.name}"
239
+ ensure
240
+ master_w.put_line result
221
241
  end
222
- hdl.put_line result
223
242
  end
224
- ensure
225
243
  Log.debug "shell id=#{@id} fiber end"
244
+ master_w.put_line "retire:#{@comm.id}"
245
+ @comm.shells.delete(self)
246
+ exit
247
+ if @comm.shells.empty?
248
+ @comm.dropout
249
+ end
250
+ @chan.halt
251
+ rescue => e
252
+ m = Log.bt(e)
253
+ #$stderr.puts m
254
+ Log.error(m)
255
+ end
256
+ end
257
+ end
258
+
259
# Finishes the task queue and reports every task still waiting in it to
# the master as failed ("taskfail:<shell id>:<task name>"), then halts
# the channel.
#
# Queue entries are expected to look like "<task id>:<task name>";
# anything else raises RuntimeError.
def finish_task_q
  @task_q.finish
  #Log.debug "finish_task_q: @task_q=#{@task_q.inspect}"
  while task_str = @task_q.deq_nonblock
    if /^(\d+):(.*)$/ =~ task_str
      task_name = $2
    else
      raise RuntimeError, "invalid task_str: #{task_str}"
    end
    # Build the line once: the log statement used to interpolate an
    # undefined local, which raised NameError after the first task and
    # left the remaining tasks unreported and the channel not halted.
    result = "taskfail:#{@id}:#{task_name}"
    @master_w.put_line result
    Log.warn "unexecuted task: #{result}"
  end
  @chan.halt
end
229
273
 
230
274
  end