pwrake 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +22 -9
  3. data/bin/gfwhere-pipe +33 -9
  4. data/bin/pwrake +5 -2
  5. data/bin/pwrake_branch +5 -3
  6. data/lib/pwrake/branch/branch.rb +95 -86
  7. data/lib/pwrake/branch/branch_application.rb +4 -0
  8. data/lib/pwrake/branch/communicator.rb +173 -0
  9. data/lib/pwrake/branch/communicator_set.rb +100 -0
  10. data/lib/pwrake/branch/fiber_queue.rb +10 -0
  11. data/lib/pwrake/branch/shell.rb +68 -24
  12. data/lib/pwrake/branch/shell_profiler.rb +2 -0
  13. data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
  14. data/lib/pwrake/logger.rb +5 -0
  15. data/lib/pwrake/master/master.rb +190 -87
  16. data/lib/pwrake/master/master_application.rb +8 -0
  17. data/lib/pwrake/nbio.rb +525 -0
  18. data/lib/pwrake/option/host_map.rb +36 -4
  19. data/lib/pwrake/option/option.rb +7 -1
  20. data/lib/pwrake/option/option_filesystem.rb +13 -3
  21. data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
  22. data/lib/pwrake/queue/queue_array.rb +31 -11
  23. data/lib/pwrake/queue/task_queue.rb +15 -18
  24. data/lib/pwrake/report/report.rb +2 -0
  25. data/lib/pwrake/task/task_algorithm.rb +4 -1
  26. data/lib/pwrake/task/task_manager.rb +2 -0
  27. data/lib/pwrake/task/task_property.rb +1 -0
  28. data/lib/pwrake/task/task_wrapper.rb +40 -21
  29. data/lib/pwrake/version.rb +1 -1
  30. data/lib/pwrake/worker/invoker.rb +4 -29
  31. data/pwrake.gemspec +3 -2
  32. metadata +24 -12
  33. data/lib/pwrake/branch.rb +0 -22
  34. data/lib/pwrake/branch/worker_communicator.rb +0 -104
  35. data/lib/pwrake/iomux/channel.rb +0 -70
  36. data/lib/pwrake/iomux/handler.rb +0 -124
  37. data/lib/pwrake/iomux/handler_set.rb +0 -35
  38. data/lib/pwrake/iomux/runner.rb +0 -62
  39. data/lib/pwrake/master.rb +0 -30
@@ -8,24 +8,37 @@ module Pwrake
8
8
  @weight = weight || 1.0
9
9
  @group = group || 0
10
10
  @id = id
11
+ @continuous_fail = 0
12
+ @total_fail = 0
13
+ @count_task = 0
11
14
  end
12
15
 
13
16
  attr_reader :name, :ncore, :weight, :group, :id, :steal_flag
14
17
  attr_accessor :idle_cores
15
18
 
16
19
  def set_ncore(n)
20
+ @retire = 0
21
+ @busy_cores = 0
17
22
  @ncore = @idle_cores = n
18
23
  end
19
24
 
20
# Moves n cores from the busy count to the idle count, then subtracts any
# pending retire count from the idle cores and clears that counter.
#
# @param n [Integer] number of cores that became idle
# @return [Boolean] true when idle + busy cores is below 1 (all retired)
def idle(n)
  @busy_cores -= n
  @idle_cores += n
  # Apply retirements requested via #retire while the cores were in use.
  @idle_cores -= @retire
  @retire = 0
  @idle_cores + @busy_cores < 1 # all retired
end
32
+
33
# Moves n cores from the idle count to the busy count.
#
# @param n [Integer] number of cores that became busy
# @return [Boolean] true when idle + busy cores is below 1 (all retired)
def busy(n)
  @busy_cores += n
  @idle_cores -= n
  @idle_cores + @busy_cores < 1 # all retired
end
23
38
 
24
39
  def decrease(n)
25
40
  @idle_cores -= n
26
- if @idle_cores < 0
27
- raise RuntimeError,"# of cores must be non-negative"
28
- end
41
+ @idle_cores + @busy_cores < 1 # all retired
29
42
  end
30
43
 
31
44
  def steal_phase
@@ -34,6 +47,25 @@ module Pwrake
34
47
  @steal_flag = false
35
48
  t
36
49
  end
50
+
51
# Adds n to the pending-retire counter and logs the request at debug level.
#
# Only the counter is changed here; idle/busy counts are not touched.
#
# @param n [Integer] number of cores to retire
def retire(n)
  @retire += n
  Log.debug "retire n=#{n}, host=#{@name}"
end
55
+
56
# Records the outcome of one task on this host.
#
# The task counter is incremented before the result is inspected, so it
# also counts calls that end up raising.
#
# @param result [String] "end" for success, "fail" for failure
# @return [Integer] current number of consecutive failures
# @raise [RuntimeError] when result is neither "end" nor "fail"
def task_result(result)
  @count_task += 1
  case result
  when "end"
    @continuous_fail = 0
  when "fail"
    @continuous_fail += 1
    @total_fail += 1
  else
    raise "unknown result: #{result}"
  end
  @continuous_fail
end
37
69
  end
38
70
 
39
71
  class HostMap < Hash
@@ -1,3 +1,7 @@
1
+ require "pathname"
2
+ require "yaml"
3
+ require "pwrake/option/host_map"
4
+
1
5
  module Pwrake
2
6
 
3
7
  START_TIME = Time.now
@@ -178,7 +182,7 @@ module Pwrake
178
182
  ['NUM_THREADS', proc{|v| v && v.to_i}],
179
183
  ['SHELL_START_INTERVAL', proc{|v| (v || 0.012).to_f}],
180
184
  ['HEARTBEAT', proc{|v| (v || 240).to_i}],
181
- ['RETRY', proc{|v| (v || 0).to_i}],
185
+ ['RETRY', proc{|v| (v || 1).to_i}],
182
186
  ['DISABLE_AFFINITY', proc{|v| v || ENV['AFFINITY']=='off'}],
183
187
  ['DISABLE_STEAL', proc{|v| v || ENV['STEAL']=='off'}],
184
188
  ['GFARM_BASEDIR', proc{|v| v || '/tmp'}],
@@ -356,3 +360,5 @@ done
356
360
 
357
361
  end
358
362
  end
363
+
364
+ require "pwrake/option/option_filesystem"
@@ -1,3 +1,6 @@
1
+ require "pwrake/option/option_filesystem"
2
+ require "parallel"
3
+
1
4
  module Pwrake
2
5
 
3
6
  class Option
@@ -8,7 +11,14 @@ module Pwrake
8
11
 
9
12
  def setup_filesystem
10
13
 
11
- @worker_progs = %w[ writer log_executor executor invoker shared_directory ]
14
+ @worker_progs = %w[
15
+ parallel/processor_count
16
+ pwrake/worker/writer
17
+ pwrake/worker/log_executor
18
+ pwrake/worker/executor
19
+ pwrake/worker/invoker
20
+ pwrake/worker/shared_directory
21
+ ]
12
22
  @worker_option = {
13
23
  :base_dir => "",
14
24
  :work_dir => self['WORK_DIR'],
@@ -47,7 +57,7 @@ module Pwrake
47
57
  :gfarm2fs_debug_wait => self['GFARM2FS_DEBUG_WAIT'],
48
58
  :single_mp => self['GFARM_SINGLE_MP']
49
59
  })
50
- @worker_progs << "gfarm_directory"
60
+ @worker_progs.push "pwrake/worker/gfarm_directory"
51
61
 
52
62
  if self['DISABLE_AFFINITY']
53
63
  @queue_class = "TaskQueue"
@@ -61,7 +71,7 @@ module Pwrake
61
71
  #@num_noaction_threads = (n_noaction_th || 1).to_i
62
72
  @worker_option[:shared_directory] = "SharedDirectory"
63
73
  end
64
- @worker_progs << "worker_main"
74
+ @worker_progs.push "pwrake/worker/worker_main"
65
75
  Log.debug "@queue_class=#{@queue_class}"
66
76
  end
67
77
 
@@ -6,9 +6,13 @@ module Pwrake
6
6
  # group_map = {gid1=>[hid1,hid2,...], ...}
7
7
  @size_q = 0
8
8
  @q = {}
9
- @host_map.by_id.each{|h| @q[h.id] = @array_class.new(h.ncore)}
9
+ @hostinfo_by_name = {}
10
+ @hostinfo_by_id.each do |id,h|
11
+ @hostinfo_by_name[h.name] = h
12
+ @q[id] = @array_class.new(h.ncore)
13
+ end
10
14
  @q_group = {}
11
- group_map ||= {1=>@host_map.by_id.map{|h| h.id}}
15
+ group_map ||= {1=>@hostinfo_by_id.map{|id,h| id}}
12
16
  group_map.each do |gid,ary|
13
17
  q1 = {} # same group
14
18
  q2 = @q.dup # other groups
@@ -31,7 +35,7 @@ module Pwrake
31
35
  else
32
36
  stored = false
33
37
  hints.each do |h|
34
- host_info = @host_map.by_name[h]
38
+ host_info = @hostinfo_by_name[h]
35
39
  if host_info && q = @q[host_info.id]
36
40
  t.assigned.push(host_info.id)
37
41
  q.push(t)
@@ -88,7 +92,9 @@ module Pwrake
88
92
  t = q.shift(run_host)
89
93
  if t
90
94
  t.assigned.each do |h|
91
- @q[h].delete(t)
95
+ if q_h = @q[h]
96
+ q_h.delete(t)
97
+ end
92
98
  end
93
99
  @size_q -= 1
94
100
  end
@@ -113,7 +119,7 @@ module Pwrake
113
119
  end
114
120
  end
115
121
  if max_num > 0
116
- max_info = @host_map.by_id[max_host]
122
+ max_info = @hostinfo_by_id[max_host]
117
123
  Log.debug "deq_steal max_host=#{max_info.name} max_num=#{max_num}"
118
124
  t = host_info.steal_phase{|h| deq_locate(max_info,h)}
119
125
  #Log.debug "deq_steal task=#{t.inspect}"
@@ -131,7 +137,12 @@ module Pwrake
131
137
  n = 0
132
138
  @q.each do |h,q|
133
139
  if q.size > 0
134
- s << _qstr(@host_map.by_id[h].name,q)
140
+ hinfo = @hostinfo_by_id[h]
141
+ if hinfo
142
+ s << _qstr(hinfo.name,q)
143
+ else
144
+ s << _qstr("(#{hinfo.inspect})",q)
145
+ end
135
146
  else
136
147
  n += 1
137
148
  end
@@ -139,6 +150,7 @@ module Pwrake
139
150
  end
140
151
  s << _qstr("local*#{n}",[]) if n > 0
141
152
  s << _qstr("remote",@q_remote)
153
+ s << "@size_q=#{@size_q}"
142
154
  s
143
155
  end
144
156
 
@@ -154,5 +166,28 @@ module Pwrake
154
166
  @q_remote.empty?
155
167
  end
156
168
 
169
# Removes the per-host queue of host_info and re-homes its tasks.
#
# Every task drained from the dropped queue is checked against its
# `assigned` host ids: if another assigned host still has a queue in @q the
# task is left to that queue; otherwise it is pushed onto @q_remote and
# @size_q is decremented.
#
# Does nothing when the host has no queue in @q.
#
# @param host_info [#id, #name] host being dropped
def drop_host(host_info)
  hid = host_info.id
  q_drop = @q.delete(hid)
  return unless q_drop

  q_size = q_drop.size
  n_move = 0
  while (t = q_drop.shift)
    # Still reachable through another live host queue: nothing to move.
    next if t.assigned.any? { |h| h != hid && @q[h] }

    @size_q -= 1
    @q_remote.push(t)
    n_move += 1
  end
  Log.debug "LAQ#drop_host: host=#{host_info.name} q.size=#{q_size} n_move=#{n_move}"
end
191
+
157
192
  end
158
193
  end
@@ -1,4 +1,5 @@
1
1
  require "forwardable"
2
+ require "pwrake/task/task_rank"
2
3
 
3
4
  module Pwrake
4
5
 
@@ -65,13 +66,20 @@ module Pwrake
65
66
  super()
66
67
  end
67
68
 
68
# Removes and returns a task wrapper for host_info, scanning from the tail
# of the array toward the head.
#
# Without host_info this is the superclass #shift. Otherwise the first entry
# met that is acceptable_for the host and untried on it is removed and
# returned. If every acceptable entry was already tried on the host, the
# first acceptable entry met during the scan is used as a fallback.
#
# The fallback entry is removed from the array too; returning it while
# leaving it queued would let the same wrapper be handed out again.
#
# @param host_info [Object, nil] host descriptor given to the wrappers
# @return [Object, nil] removed wrapper, or nil when nothing is acceptable
def shift(host_info=nil)
  return super() unless host_info
  fallback = nil
  (size-1).downto(0) do |i|
    tw = at(i)
    next unless tw.acceptable_for(host_info)
    return delete_at(i) if tw.untried_host?(host_info)
    # Nothing has been deleted yet, so this index stays valid after the loop.
    fallback ||= i
  end
  fallback && delete_at(fallback)
end
76
84
  end
77
85
 
@@ -80,13 +88,20 @@ module Pwrake
80
88
  super()
81
89
  end
82
90
 
83
# Removes and returns a task wrapper for host_info, scanning from the head
# of the array toward the tail.
#
# Without host_info this is the superclass #shift. Otherwise the first entry
# met that is acceptable_for the host and untried on it is removed and
# returned. If every acceptable entry was already tried on the host, the
# first acceptable entry met during the scan is used as a fallback.
#
# The fallback entry is removed from the array too; returning it while
# leaving it queued would let the same wrapper be handed out again.
#
# @param host_info [Object, nil] host descriptor given to the wrappers
# @return [Object, nil] removed wrapper, or nil when nothing is acceptable
def shift(host_info=nil)
  return super() unless host_info
  fallback = nil
  size.times do |i|
    tw = at(i)
    next unless tw.acceptable_for(host_info)
    return delete_at(i) if tw.untried_host?(host_info)
    # Nothing has been deleted yet, so this index stays valid after the loop.
    fallback ||= i
  end
  fallback && delete_at(fallback)
end
91
106
  end
92
107
 
@@ -157,13 +172,18 @@ module Pwrake
157
172
  end
158
173
 
159
174
  def pop_last_rank(r,host_info)
175
+ tw_found = nil
160
176
  (size-1).downto(0) do |i|
161
177
  tw = at(i)
162
178
  if tw.rank == r && tw.acceptable_for(host_info)
163
- return delete_at(i)
179
+ if tw.untried_host?(host_info)
180
+ return delete_at(i)
181
+ else
182
+ tw_found ||= tw
183
+ end
164
184
  end
165
185
  end
166
- nil
186
+ tw_found
167
187
  end
168
188
 
169
189
  def hrf_delete(t)
@@ -197,7 +217,7 @@ module Pwrake
197
217
  hrf_push(t)
198
218
  end
199
219
 
200
# Takes one task wrapper out of this queue for host_info.
#
# Returns nil right away when the queue is empty; otherwise the choice is
# delegated to hrf_get. host_info is optional and passed through as given.
#
# @param host_info [Object, nil] host descriptor forwarded to hrf_get
# @return [Object, nil] whatever hrf_get returns, or nil when empty
def shift(host_info=nil)
  return nil if empty?
  hrf_get(host_info)
end
@@ -1,15 +1,15 @@
1
+ require "pwrake/queue/queue_array"
2
+ require "pwrake/queue/no_action_queue"
3
+
1
4
  module Pwrake
2
5
 
3
6
  class TaskQueue
4
7
 
5
- def initialize(host_map, group_map=nil)
6
- @q = []
7
- @empty = []
8
-
8
+ def initialize(hostinfo_by_id, group_map=nil)
9
9
  @enable_steal = true
10
10
  @q_no_action = NoActionQueue.new
11
11
 
12
- @host_map = host_map
12
+ @hostinfo_by_id = hostinfo_by_id
13
13
 
14
14
  pri = Rake.application.pwrake_options['QUEUE_PRIORITY'] || "LIHR"
15
15
  case pri
@@ -62,22 +62,22 @@ module Pwrake
62
62
  end
63
63
 
64
64
  def deq_noaction_task(&block)
65
- Log.debug "deq_task:"+(empty? ? " empty" : "\n#{inspect_q}")
65
+ Log.debug "deq_noaction_task:"+(empty? ? " (empty)" : "\n#{inspect_q}")
66
66
  while tw = @q_no_action.shift
67
67
  Log.debug "deq_noaction: #{tw.name}"
68
- yield(tw,nil)
68
+ yield(tw)
69
69
  end
70
70
  end
71
71
 
72
72
  def deq_task(&block) # locality version
73
- Log.debug "deq_task:"+(empty? ? " empty" : "\n#{inspect_q}")
73
+ Log.debug "deq_task:"+(empty? ? " (empty)" : "\n#{inspect_q}")
74
74
  queued = 0
75
75
  @n_turn.times do |turn|
76
76
  next if turn_empty?(turn)
77
77
  queued += deq_turn(turn,&block)
78
78
  end
79
79
  if queued>0
80
- Log.debug "queued:#{queued}"
80
+ Log.debug "deq_task: queued=#{queued}"
81
81
  end
82
82
  end
83
83
 
@@ -85,9 +85,9 @@ module Pwrake
85
85
  queued = 0
86
86
  while true
87
87
  count = 0
88
- @host_map.by_id.each do |host_info|
88
+ @hostinfo_by_id.each_value do |host_info|
89
89
  #Log.debug "TaskQueue#deq_turn host_info=#{host_info.name}"
90
- if host_info.idle_cores > 0
90
+ if (n = host_info.idle_cores) && n > 0
91
91
  if turn_empty?(turn)
92
92
  return queued
93
93
  elsif tw = deq_impl(host_info,turn)
@@ -99,8 +99,7 @@ module Pwrake
99
99
  Log.fatal m
100
100
  raise RuntimeError,m
101
101
  else
102
- host_info.decrease(n_task_cores)
103
- yield(tw,host_info.id)
102
+ yield(tw,host_info,n_task_cores)
104
103
  count += 1
105
104
  queued += 1
106
105
  end
@@ -134,11 +133,6 @@ module Pwrake
134
133
  @q_no_input.empty?
135
134
  end
136
135
 
137
- def task_end(tw, hid)
138
- host_info = @host_map.by_id[hid]
139
- host_info.increase(tw.n_used_cores(host_info))
140
- end
141
-
142
136
  def _qstr(h,q)
143
137
  s = " #{h}: size=#{q.size} "
144
138
  case q.size
@@ -160,5 +154,8 @@ module Pwrake
160
154
  _qstr("no_input",@q_no_input)
161
155
  end
162
156
 
157
# Hook invoked when a host is dropped; this queue keeps no per-host state
# here, so the method is intentionally a no-op.
#
# @param host_info [Object] host being dropped (unused)
# @return [nil]
def drop_host(host_info)
end
159
+
163
160
  end
164
161
  end
@@ -1,3 +1,5 @@
1
+ require "csv"
2
+
1
3
  module Pwrake
2
4
 
3
5
  class Report
@@ -9,6 +9,7 @@ module Pwrake
9
9
  attr_reader :subsequents
10
10
  attr_reader :arguments
11
11
  attr_reader :property
12
+ attr_reader :unfinished_prereq
12
13
 
13
14
  def pw_search_tasks(args)
14
15
  Log.debug "#{self.class}#pw_search_tasks start, args=#{args.inspect}"
@@ -81,10 +82,12 @@ module Pwrake
81
82
  private :format_search_flags
82
83
 
83
84
  def pw_enq_subsequents
84
- t = Time.now
85
+ #t = Time.now
85
86
  #h = application.pwrake_options['HALT_QUEUE_WHILE_SEARCH']
86
87
  #application.task_queue.synchronize(h) do
87
88
  @subsequents.each do |t| # <<--- competition !!!
89
+ #u = t.unfinished_prereq.keys
90
+ #Log.debug "enq_subseq: self=#{self.name} subseq=#{t.name} @unfin_preq=#{u.inspect}"
88
91
  if t && t.check_prereq_finished(self.name)
89
92
  application.task_queue.enq(t.wrapper)
90
93
  end
@@ -1,3 +1,5 @@
1
+ require "pwrake/task/task_property"
2
+
1
3
  module Pwrake
2
4
 
3
5
  module TaskManager
@@ -39,6 +39,7 @@ module Pwrake
39
39
  end
40
40
 
41
41
  def acceptable_for(host_info)
42
+ return true unless host_info
42
43
  if @disable_steal && host_info.steal_flag
43
44
  #Log.debug("@disable_steal && host_info.steal_flag")
44
45
  return false
@@ -1,4 +1,6 @@
1
- require 'forwardable'
1
+ require "forwardable"
2
+ require "csv"
3
+ require "pwrake/task/task_rank"
2
4
 
3
5
  module Pwrake
4
6
 
@@ -16,18 +18,19 @@ module Pwrake
16
18
  @@current_id += 1
17
19
  @location = []
18
20
  @group = []
19
- @group_id
21
+ @group_id = nil
20
22
  @suggest_location = nil
21
- @file_stat
22
- @input_file_size
23
- @input_file_mtime
24
- @rank
25
- @priority
23
+ @file_stat = nil
24
+ @input_file_size = nil
25
+ @input_file_mtime = nil
26
+ @rank = nil
27
+ @priority = nil
26
28
  @lock_rank = Monitor.new
27
29
  @executed = false
28
30
  @assigned = []
29
31
  @exec_host = nil
30
- @nretry = @property.retry || Rake.application.pwrake_options["RETRY"] || 0
32
+ @tried_hosts = []
33
+ @n_retry = @property.retry || Rake.application.pwrake_options["RETRY"] || 1
31
34
  end
32
35
 
33
36
  def_delegators :@task, :name, :actions, :prerequisites, :subsequents
@@ -36,6 +39,7 @@ module Pwrake
36
39
  attr_reader :task, :task_id, :group, :group_id, :file_stat
37
40
  attr_reader :location
38
41
  attr_reader :assigned
42
+ attr_reader :tried_hosts
39
43
  attr_accessor :executed
40
44
  attr_accessor :exec_host
41
45
  attr_accessor :shell_id, :status
@@ -66,24 +70,19 @@ module Pwrake
66
70
  @time_start = Time.now
67
71
  end
68
72
 
69
# Tells whether this task should be retried.
#
# @return [Boolean] true when @status is not "end" and retries remain
def retry?
  @status != "end" && @n_retry > 0
end
76
+
77
# Tells whether the retry budget of this task is used up.
#
# @return [Boolean] true when the remaining retry count is exactly 0
def no_more_retry
  @n_retry == 0
end
81
80
 
82
81
  def postprocess(location)
83
82
  @executed = true if !@task.actions.empty?
84
- tm_taskend = Time.now
83
+ #tm_taskend = Time.now
85
84
  if is_file_task?
86
- t = Time.now
85
+ #t = Time.now
87
86
  if File.exist?(name)
88
87
  @file_stat = File::Stat.new(name)
89
88
  @location = location
@@ -92,8 +91,23 @@ module Pwrake
92
91
  #Log.debug "postprocess time=#{Time.now-tm_taskend}"
93
92
  log_task
94
93
  @shell.current_task = nil if @shell
94
+ end
95
+
96
# Decides what happens after this task ran on @exec_host.
#
# The executing host is appended to @tried_hosts in every case. Then:
# * status "end": the subsequent tasks are enqueued via the wrapped task;
# * retries remain: the suggested location is cleared, a warning is
#   logged and printed to stderr, the retry count is decremented and this
#   wrapper is put back on the application's task queue;
# * otherwise: an error is logged and printed to stderr.
def retry_or_subsequent
  @tried_hosts << @exec_host
  if @status=="end"
    @task.pw_enq_subsequents
  elsif @n_retry > 0
    @suggest_location = []
    # The message reports the count before it is decremented.
    s="retry task (retry_count=#{@n_retry}): #{name}"
    Log.warn(s)
    $stderr.puts(s)
    @n_retry -= 1
    Rake.application.task_queue.enq(self)
  else
    s="give up retry (retry_count=0): #{name}"
    Log.error(s)
    $stderr.puts(s)
  end
end
99
113
 
@@ -306,5 +320,10 @@ module Pwrake
306
320
  @n_used_cores ||= @property.n_used_cores(host_info)
307
321
  end
308
322
 
323
# Tells whether this task has not yet been tried on the given host.
#
# A nil host_info counts as untried. Otherwise the host's name is looked up
# in @tried_hosts.
#
# @param host_info [#name, nil] host descriptor
# @return [Boolean] true when host_info is nil or its name is not recorded
def untried_host?(host_info)
  return true unless host_info
  !@tried_hosts.include?(host_info.name)
end
327
+
309
328
  end
310
329
  end