pwrake 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +22 -9
  3. data/bin/gfwhere-pipe +33 -9
  4. data/bin/pwrake +5 -2
  5. data/bin/pwrake_branch +5 -3
  6. data/lib/pwrake/branch/branch.rb +95 -86
  7. data/lib/pwrake/branch/branch_application.rb +4 -0
  8. data/lib/pwrake/branch/communicator.rb +173 -0
  9. data/lib/pwrake/branch/communicator_set.rb +100 -0
  10. data/lib/pwrake/branch/fiber_queue.rb +10 -0
  11. data/lib/pwrake/branch/shell.rb +68 -24
  12. data/lib/pwrake/branch/shell_profiler.rb +2 -0
  13. data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
  14. data/lib/pwrake/logger.rb +5 -0
  15. data/lib/pwrake/master/master.rb +190 -87
  16. data/lib/pwrake/master/master_application.rb +8 -0
  17. data/lib/pwrake/nbio.rb +525 -0
  18. data/lib/pwrake/option/host_map.rb +36 -4
  19. data/lib/pwrake/option/option.rb +7 -1
  20. data/lib/pwrake/option/option_filesystem.rb +13 -3
  21. data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
  22. data/lib/pwrake/queue/queue_array.rb +31 -11
  23. data/lib/pwrake/queue/task_queue.rb +15 -18
  24. data/lib/pwrake/report/report.rb +2 -0
  25. data/lib/pwrake/task/task_algorithm.rb +4 -1
  26. data/lib/pwrake/task/task_manager.rb +2 -0
  27. data/lib/pwrake/task/task_property.rb +1 -0
  28. data/lib/pwrake/task/task_wrapper.rb +40 -21
  29. data/lib/pwrake/version.rb +1 -1
  30. data/lib/pwrake/worker/invoker.rb +4 -29
  31. data/pwrake.gemspec +3 -2
  32. metadata +24 -12
  33. data/lib/pwrake/branch.rb +0 -22
  34. data/lib/pwrake/branch/worker_communicator.rb +0 -104
  35. data/lib/pwrake/iomux/channel.rb +0 -70
  36. data/lib/pwrake/iomux/handler.rb +0 -124
  37. data/lib/pwrake/iomux/handler_set.rb +0 -35
  38. data/lib/pwrake/iomux/runner.rb +0 -62
  39. data/lib/pwrake/master.rb +0 -30
@@ -1,3 +1,6 @@
1
+ require "pwrake/logger"
2
+ require "pwrake/branch/branch"
3
+
1
4
  module Pwrake
2
5
 
3
6
  # The TaskManager module is a mixin for managing tasks.
@@ -36,6 +39,7 @@ module Pwrake
36
39
  def run_branch_in_thread(r,w,opts)
37
40
  #standard_exception_handling do
38
41
  @branch = Branch.new(opts,r,w)
42
+ @branch.init_logger
39
43
  begin
40
44
  @branch.run
41
45
  rescue => e
@@ -0,0 +1,173 @@
1
+ module Pwrake
2
+
3
# One logical channel on top of a shared worker connection.
#
# Lines written through the channel are handed to the shared writer together
# with the channel id; lines read come from the queue given to the channel.
class CommChannel

  attr_reader :host, :id

  # host   - value returned by #host
  # id     - channel id passed to the writer with every line
  # queue  - object answering #deq and #halt
  # writer - object answering #put_line(line, id) and #halt
  # ios    - IO objects closed by the fault-injection hook in #put_line
  def initialize(host, id, queue, writer, ios=[])
    @host, @id = host, id
    @queue, @writer = queue, writer
    @ios = ios
  end

  # Sends +s+ through the writer, tagged with this channel's id.
  #
  # When the global $cause_fault is set, it is cleared and every IO in @ios is
  # closed first (logged with the call stack), so the write below hits closed
  # IOs.
  def put_line(s)
    if $cause_fault
      $cause_fault = nil
      Log.warn("closing writer io caller=\n#{caller.join("\n")}")
      @ios.each(&:close)
    end
    @writer.put_line(s, @id)
  end

  # Returns the next item dequeued from this channel's queue.
  def get_line
    @queue.deq
  end

  # Halts the queue first, then the writer.
  def halt
    @queue.halt
    @writer.halt
  end
end
33
+
34
# Connection from a branch to one worker process, started locally or through
# ssh, together with the shells that run on top of it.
class Communicator

  # Raised when the worker handshake yields unexpected data or ends early.
  class ConnectError < IOError; end

  # NOTE(review): @channel is not assigned anywhere in this class, so
  # #channel returns nil unless it is set from outside.
  attr_reader :id, :host, :ncore, :channel
  attr_reader :reader, :writer, :handler
  attr_reader :shells

  # set      - owning set; must answer #selector and #delete(communicator)
  # id       - identifier of this communicator
  # host     - host name of the worker
  # ncore    - requested core count (replaced by the worker's answer in #connect)
  # selector - object answering #heartbeat(io)
  # option   - option hash; :ssh_option is read in #connect
  def initialize(set,id,host,ncore,selector,option)
    @set = set
    @id = id
    @host = host
    @ncore = @ncore_given = ncore
    @selector = selector
    @option = option
    @shells = {}
  end

  def inspect
    "#<#{self.class} @id=#{@id},@host=#{@host},@ncore=#{@ncore}>"
  end

  # Creates a CommChannel bound to a fresh queue obtained from the reader.
  def new_channel
    i,q = @reader.new_queue
    CommChannel.new(@host,i,q,@writer,[@ior,@iow,@ioe])
  end

  # Spawns the worker (directly for local hosts, through ssh otherwise),
  # sends it +worker_code+ followed by the marshalled ncore and option, and
  # reads lines until the worker reports "ncore:<n>".
  #
  # Returns false once the ncore line has been accepted or #common_line
  # reported a terminal line. Any StandardError raised on the way, including
  # ConnectError, is passed to #dropout, whose value is returned.
  def connect(worker_code)
    # ARGF.read(length) counts bytes, so announce the byte length of the code;
    # the character count would truncate code containing multibyte characters.
    rb_cmd = "ruby -e 'eval ARGF.read(#{worker_code.bytesize})'"
    # NOTE(review): @host and :ssh_option are interpolated unescaped into a
    # shell command line; presumably they come from trusted configuration.
    if ['localhost','localhost.localdomain','127.0.0.1'].include? @host
      cmd = rb_cmd
    else
      cmd = "ssh -x -T #{@option[:ssh_option]} #{@host} \"#{rb_cmd}\""
    end
    # one pipe per standard stream of the worker process
    @ior,w0 = IO.pipe
    @ioe,w1 = IO.pipe
    r2,@iow = IO.pipe
    @pid = Kernel.spawn(cmd,:pgroup=>true,:out=>w0,:err=>w1,:in=>r2)
    # the child owns these ends now
    w0.close
    w1.close
    r2.close
    sel = @set.selector
    @reader = NBIO::MultiReader.new(sel,@ior)
    @rd_err = NBIO::Reader.new(sel,@ioe)
    @writer = NBIO::Writer.new(sel,@iow)
    @handler = NBIO::Handler.new(@reader,@writer,@host)
    # bootstrap: code first, then the two marshalled arguments
    @writer.write(worker_code)
    @writer.write(Marshal.dump(@ncore))
    @writer.write(Marshal.dump(@option))
    # read ncore
    while s = @reader.get_line
      if /^ncore:(.*)$/ =~ s
        a = $1
        Log.debug "ncore=#{a} @#{@host}"
        if /^(\d+)$/ =~ a
          @ncore = $1.to_i
          return false
        else
          raise ConnectError, "invalid for ncore: #{a.inspect}"
        end
      else
        return false if !common_line(s)
      end
    end
    raise ConnectError, "fail to connect #{cmd.inspect}"
  rescue => e
    dropout(e)
  end

  # Handles a line that is not specific to the caller's protocol state.
  #
  # Returns true when the line was consumed and reading may go on, false for
  # "exited" and for an Exception value (which also triggers #dropout).
  # Raises ConnectError for anything that is neither a String nor an Exception.
  def common_line(s)
    x = "Communicator#common_line(id=#{@id},host=#{@host})"
    case s
    when /^heartbeat$/
      Log.debug "#{x}: #{s.inspect}"
      @selector.heartbeat(@reader.io)
    when /^exited$/
      Log.debug "#{x}: #{s.inspect}"
      return false
    when /^log:(.*)$/
      Log.info "#{x}: log>#{$1}"
    when String
      Log.warn "#{x}: out>#{s.inspect}"
    when Exception
      Log.warn "#{x}: err>#{s.class}: #{s.message}"
      dropout(s)
      return false
    else
      raise ConnectError, "#{x}: invalid for read: #{s.inspect}"
    end
    true
  end

  # Finishes the task queue of every shell registered on this communicator.
  def finish_shells
    @shells.keys.each{|sh| sh.finish_task_q}
  end

  # Retires this communicator: finishes its shells, asks the handler to exit,
  # collects what the worker wrote to its error stream, logs that output and
  # +exc+ (when given), and finally removes itself from the owning set.
  def dropout(exc=nil)
    err_out = []
    begin
      finish_shells
      # @handler and @rd_err are only set once #connect got past spawn; a
      # dropout before that point has nothing to shut down or drain.
      @handler.exit if @handler
      if @rd_err
        while s = @rd_err.get_line
          err_out << s
        end
      end
    rescue => e
      Log.error(Log.bt(e))
    end
    # Error output
    if !err_out.empty?
      $stderr.puts err_out.join("\n")
      Log.error((["process error output:"]+err_out).join("\n "))
    end
    # Exception
    Log.error(Log.bt(exc)) if exc
  ensure
    @set.delete(self)
  end

  # Closes the worker's input and prints whatever is left on its output and
  # error pipes, prefixed with "out=" and "err=".
  def finish
    @iow.close
    while s=@ior.gets
      puts "out=#{s.chomp}"
    end
    while s=@ioe.gets
      puts "err=#{s.chomp}"
    end
  end

end
173
+ end
@@ -0,0 +1,100 @@
1
+ require "forwardable"
2
+ require "pwrake/branch/communicator"
3
+
4
+ module Pwrake
5
# The communicators of one branch, keyed by communicator id.
class CommunicatorSet

  extend Forwardable

  # master_rd - reader answering #get_line, delivering the host list
  # selector  - object answering #run(timeout)
  # option    - option hash; :heartbeat (when set) plus 15 becomes the
  #             timeout handed to the selector
  def initialize(master_rd,selector,option)
    @master_rd = master_rd
    @selector = selector
    @option = option
    @communicators = {}
    @initial_communicators = []
    # #delete records the host of every removed communicator; it can be
    # reached (via #drop, #drop_all or Communicator#dropout) before the first
    # #run, so the list has to exist from the start.
    @error_host = []
    if hb = @option[:heartbeat]
      @heartbeat_timeout = hb + 15
    end
  end

  attr_reader :selector

  def_delegators :@communicators, :each, :each_value, :values, :size

  # Reads the host list sent by the master ("host_list_begin", then one
  # "host:<id> <hostname> <ncore>" line per worker, then "host_list_end")
  # inside a fiber, creating one Communicator per host line, and runs the
  # selector. Raises RuntimeError on any line that does not fit the protocol.
  def create_communicators
    Fiber.new do
      s = @master_rd.get_line
      if s.chomp != "host_list_begin"
        raise "Branch#setup_worker: recv=#{s.chomp} expected=host_list_begin"
      end

      while s = @master_rd.get_line
        s.chomp!
        break if s == "host_list_end"
        if /^host:(\d+) (\S+) ([+-]?\d+)?$/ =~ s
          id, host, ncore = $1,$2,$3
          # ncore is optional in the host line; keep nil when it is absent
          ncore &&= ncore.to_i
          @communicators[id] = Communicator.new(self,id,host,ncore,@selector,@option)
        else
          raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
        end
      end
    end.resume
    @selector.run(@heartbeat_timeout)
    @initial_communicators = @communicators.dup
  end

  # Registers +comm+ under its id.
  def add(comm)
    @communicators[comm.id] = comm
  end

  # Unregisters +comm+ and remembers its host as retired.
  def delete(comm)
    @communicators.delete(comm.id)
    @error_host << comm.host
  end

  # Drops the communicator registered under +id+, if there is one.
  def drop(id)
    comm = @communicators[id]
    Log.debug "drop:id=#{id} comm=#{comm.inspect} @communicators.keys=#{@communicators.keys}"
    comm.dropout if comm
  end

  # Drops every communicator. Iterates over a snapshot of the keys because
  # dropout removes entries from the hash.
  def drop_all
    Log.debug "drop_all"
    @communicators.keys.each do |id|
      @communicators[id].dropout
    end
  end

  def finish_shells
    Log.debug "finish_shells"
    @communicators.keys.each do |id|
      @communicators[id].finish_shells
    end
  end

  # Runs the selector and logs which hosts were retired while it ran.
  # +message+ only labels the log entry.
  def run(message)
    @error_host = []
    n1 = @communicators.size
    @selector.run(@heartbeat_timeout)
    n2 = @communicators.size
    if n1 != n2
      Log.info "# of communicators: #{n1}->#{n2} during #{message.inspect}"
      Log.info "retired hosts=[#{@error_host.join(',')}]"
    end
  end

  # Handlers of all communicators currently registered.
  def handler_set
    @communicators.each_value.map{|comm| comm.handler}
  end

  def kill(sig)
    NBIO::Handler.kill(handler_set,sig)
  end

  def exit
    NBIO::Handler.exit(handler_set)
  end

end
100
+ end
@@ -2,6 +2,9 @@ require 'fiber'
2
2
 
3
3
  module Pwrake
4
4
 
5
# Error raised by FiberQueue, e.g. on enq to an already finished queue.
class FiberQueueError < StandardError; end
7
+
5
8
  class FiberQueue
6
9
 
7
10
  def initialize
@@ -11,6 +14,9 @@ module Pwrake
11
14
  end
12
15
 
13
16
  def enq(x)
17
+ if @finished
18
+ raise FiberQueueError,"cannot enq to already finished queue"
19
+ end
14
20
  @q.push(x)
15
21
  f = @waiter.shift
16
22
  f.resume if f
@@ -25,6 +31,10 @@ module Pwrake
25
31
  return @q.shift
26
32
  end
27
33
 
34
# Takes the head element off @q and returns it immediately, without
# registering the caller as a waiter; the result is whatever @q.shift
# yields (nil for an empty Array).
def deq_nonblock
  return @q.shift
end
37
+
28
38
  def finish
29
39
  @finished = true
30
40
  while f = @waiter.shift
@@ -1,3 +1,5 @@
1
+ require 'pwrake/branch/shell_profiler'
2
+
1
3
  module Pwrake
2
4
 
3
5
  class DummyMutex
@@ -20,15 +22,16 @@ module Pwrake
20
22
  BY_FIBER[Fiber.current]
21
23
  end
22
24
 
23
- def initialize(chan,task_q,opt={})
25
+ def initialize(chan,comm,task_q,opt={})
24
26
  @chan = chan
25
- @host = chan.handler.host
27
+ @id = chan.id
28
+ @host = chan.host
29
+ @comm = comm
26
30
  @task_q = task_q
27
31
  @lock = DummyMutex.new
28
- @id = chan.id
29
- #
30
32
  @option = opt
31
33
  @work_dir = @option[:work_dir] || Dir.pwd
34
+ @comm.shells[self] = true
32
35
  end
33
36
 
34
37
  attr_reader :id, :host, :status, :profile
@@ -36,11 +39,12 @@ module Pwrake
36
39
  def open
37
40
  if @opened
38
41
  Log.warn "already opened: host=#{@host} id=#{@id}"
42
+ return
39
43
  end
44
+ @opened = true
40
45
  _puts("open")
41
46
  if (s = _gets) == "open"
42
47
  OPEN_LIST[__id__] = self
43
- @opened = true
44
48
  true
45
49
  else
46
50
  Log.error("Shell#open failed: recieve #{s.inspect}")
@@ -48,19 +52,24 @@ module Pwrake
48
52
  end
49
53
  end
50
54
 
51
- def close
55
+ def exit
52
56
  if !@opened
53
- Log.warn "already closed: host=#{@host} id=#{@id}"
57
+ Log.debug "already exited: host=#{@host} id=#{@id}"
58
+ return
54
59
  end
60
+ @opened = false
55
61
  _puts("exit")
56
62
  if (s = _gets) == "exit"
57
63
  OPEN_LIST.delete(__id__)
58
- @opened = false
64
+ Log.debug("Shell#exit: recieve #{s.inspect}")
59
65
  true
60
66
  else
61
- Log.warn("Shell#close failed: recieve #{s.inspect}")
67
+ Log.debug("Shell#exit: recieve #{s.inspect}")
62
68
  false
63
69
  end
70
+ rescue IOError,Errno::EPIPE => e
71
+ Log.debug("Shell#exit: #{Log.bt(e)}")
72
+ false
64
73
  end
65
74
 
66
75
  def set_current_task(task_id,task_name)
@@ -106,20 +115,17 @@ module Pwrake
106
115
 
107
116
  def _gets
108
117
  s = @chan.get_line
109
- #Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
110
- if s.nil?
111
- begin
112
- raise
113
- rescue => e
114
- Log.debug e
115
- end
118
+ Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
119
+ case s
120
+ when Exception
121
+ @chan.halt
122
+ Log.error Log.bt(s)
116
123
  end
117
124
  s
118
125
  end
119
126
 
120
127
  def _system(cmd)
121
128
  @cmd = cmd
122
- #raise "@chan is closed" if @chan.closed?
123
129
  @lock.synchronize do
124
130
  _puts(cmd)
125
131
  status = io_read_loop{}
@@ -129,11 +135,10 @@ module Pwrake
129
135
 
130
136
  def _backquote(cmd)
131
137
  @cmd = cmd
132
- #raise "@chan is closed" if @chan.closed?
133
138
  a = []
134
139
  @lock.synchronize do
135
140
  _puts(cmd)
136
- status = io_read_loop{|x| a << x}
141
+ @status = io_read_loop{|x| a << x}
137
142
  end
138
143
  a.join("\n")
139
144
  end
@@ -141,9 +146,9 @@ module Pwrake
141
146
  def _execute(cmd,quote=nil,&block)
142
147
  @cmd = cmd
143
148
  if !@opened
144
- raise "closed"
149
+ raise "non opened"
145
150
  end
146
- status = nil
151
+ @status = nil
147
152
  start_time = Time.now
148
153
  begin
149
154
  _puts(cmd)
@@ -184,6 +189,18 @@ module Pwrake
184
189
  end
185
190
  return status
186
191
  end
192
+ when "exit"
193
+ msg = "Shell#io_read_loop: exit"
194
+ $stderr.puts(msg)
195
+ Log.error(msg)
196
+ @chan.halt
197
+ return "exit"
198
+ when IOError
199
+ @chan.halt
200
+ return "ioerror"
201
+ when NBIO::TimeoutError
202
+ @chan.halt
203
+ return "timeout"
187
204
  end
188
205
  msg = "Shell#io_read_loop: Invalid result: #{s.inspect}"
189
206
  $stderr.puts(msg)
@@ -193,7 +210,8 @@ module Pwrake
193
210
 
194
211
  public
195
212
 
196
- def create_fiber(hdl)
213
+ def create_fiber(master_w)
214
+ @master_w = master_w
197
215
  if !@opened
198
216
  Log.warn "not opened: host=#{@host} id=#{@id}"
199
217
  end
@@ -218,13 +236,39 @@ module Pwrake
218
236
  Rake.application.display_error_message(e)
219
237
  Log.error e
220
238
  result = "taskfail:#{@id}:#{task.name}"
239
+ ensure
240
+ master_w.put_line result
221
241
  end
222
- hdl.put_line result
223
242
  end
224
- ensure
225
243
  Log.debug "shell id=#{@id} fiber end"
244
+ master_w.put_line "retire:#{@comm.id}"
245
+ @comm.shells.delete(self)
246
+ exit
247
+ if @comm.shells.empty?
248
+ @comm.dropout
249
+ end
250
+ @chan.halt
251
+ rescue => e
252
+ m = Log.bt(e)
253
+ #$stderr.puts m
254
+ Log.error(m)
255
+ end
256
+ end
257
+ end
258
+
259
# Finishes this shell's task queue and reports every task still waiting in
# it to the master as failed, then halts the channel.
#
# Each leftover entry must look like "<task_id>:<task_name>"; any other entry
# raises RuntimeError. Writes through @master_w, which is set by
# #create_fiber.
def finish_task_q
  @task_q.finish
  #Log.debug "finish_task_q: @task_q=#{@task_q.inspect}"
  while task_str = @task_q.deq_nonblock
    unless /^(\d+):(.*)$/ =~ task_str
      raise RuntimeError, "invalid task_str: #{task_str}"
    end
    task_name = $2
    # Build the report once so the line sent to the master and the log entry
    # agree (the log used to reference an undefined local).
    result = "taskfail:#{@id}:#{task_name}"
    @master_w.put_line result
    Log.warn "unexecuted task: #{result}"
  end
  @chan.halt
end
229
273
 
230
274
  end