pwrake 2.2.9 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 93b60deb41a4fda0bafa94a5b240812e5c5925ab204f8b802ef63d78fbcf81b2
4
- data.tar.gz: 6ac5c66ad22c8ba3316bca3aeab8b34fe0a4e9189f50a99b2056c5a8cced7277
3
+ metadata.gz: 99a7b5b9f4ecf97b511f58495f6b22facbd66c97d79aa0cab9f452d96ae44ea4
4
+ data.tar.gz: 2874a8eec50b7f508138567899da703e4c5d18687d58ad53a715ffa81b7fa402
5
5
  SHA512:
6
- metadata.gz: 7b0d50fb5cb90b8c7682be343c4ef70acfb324d01b46903f39606f8e044d3315ae7cf7a6cd8e4074e5b5178b3e16aa224c827f53b1ef8ea74a27b6176f84b39b
7
- data.tar.gz: 90911a12880e22276be78feae3e13b850d9614f93fc30195a7fb253e730a369bc51783778524aca2213672fca6b290c7622bdf503278f0e04dfe2eb29d46acb5
6
+ metadata.gz: c49bcd791f52010a2a221118bec1f59c08796c053ef43127f68142bd97d95499d23becf2c3d6ab9f351bc48384300999824fd3bd0f6fe1e94cba5de6b291664a
7
+ data.tar.gz: af1ecd762e914f1e891a40639ab8f59a55165132646816ccb1a5209e9c761a518cb09f0c4385086f23bb7d172e658127852780577d748baa0c83313edb72e848
data/README.md CHANGED
@@ -131,10 +131,13 @@ In this case, you need the rehash of command paths:
131
131
  SSH_OPTION SSH option
132
132
  PASS_ENV (Array) Environment variables passed to SSH
133
133
  HEARTBEAT default=240 - Heartbeat interval in seconds
134
- RETRY default=1 - The number of retry
134
+ RETRY default=1 - The number of task retries
135
+ HOST_FAILURE default=2 - The number of consecutive host failures allowed (since v2.3)
135
136
  FAILED_TARGET rename(default)|delete|leave - Treatment of failed target files
136
137
  FAILURE_TERMINATION wait(default)|kill|continue - Behavior of other tasks when a task is failed
137
- QUEUE_PRIORITY LIHR(default)|FIFO|LIFO|RANK
138
+ QUEUE_PRIORITY LIFO(default)|FIFO|LIHR(LIfo&Highest-Rank-first; obsolete)
139
+ DISABLE_RANK_PRIORITY false(default)|true - Disable rank-aware task scheduling (since v2.3)
140
+ RESERVE_NODE false(default)|true - Reserve a node for tasks with ncore>1 (since v2.3)
138
141
  NOACTION_QUEUE_PRIORITY FIFO(default)|LIFO|RAND
139
142
  SHELL_START_INTERVAL default=0.012 (sec)
140
143
  GRAPH_PARTITION false(default)|true
@@ -91,7 +91,7 @@ class CommunicatorSet
91
91
  end
92
92
 
93
93
  def handler_set
94
- @communicators.each_value.map{|comm| comm.handler}
94
+ @communicators.each_value.map{|comm| comm.handler}.compact
95
95
  end
96
96
 
97
97
  def kill(sig)
@@ -148,8 +148,7 @@ module Pwrake
148
148
  if @hostinfo_by_id.empty?
149
149
  raise RuntimeError,"no worker host"
150
150
  end
151
- queue_class = Pwrake.const_get(@option.queue_class)
152
- @task_queue = queue_class.new(@hostinfo_by_id)
151
+ @task_queue = TaskQueue.new(@option.queue_class,@hostinfo_by_id)
153
152
 
154
153
  @branch_setup_thread = Thread.new do
155
154
  create_fiber(@hdl_set) do |hdl|
@@ -171,15 +170,16 @@ module Pwrake
171
170
 
172
171
  def retire(hid)
173
172
  host_info = @hostinfo_by_id[hid.to_i]
174
- if host_info && host_info.decrease(1)
175
- # all retired
173
+ return if host_info.nil?
174
+ host_info.retire(1)
175
+ if host_info.retired?
176
176
  if !@exited
177
177
  m = "retired: host #{host_info.name}"
178
178
  Log.warn(m)
179
179
  $stderr.puts(m)
180
180
  drop_host(host_info) # delete from hostinfo_by_id
181
181
  if @hostinfo_by_id.empty?
182
- raise RuntimeError,"no worker host"
182
+ raise RuntimeError,"no worker host"
183
183
  end
184
184
  end
185
185
  end
@@ -231,7 +231,7 @@ module Pwrake
231
231
  end
232
232
 
233
233
  def setup_fiber(t)
234
- n_retry = @option["RETRY"]
234
+ @host_fail = @option["HOST_FAILURE"]
235
235
  create_fiber(@hdl_set) do |hdl|
236
236
  while s = hdl.get_line
237
237
  Log.debug "Master:recv #{s.inspect} from branch[#{hdl.host}]"
@@ -253,9 +253,10 @@ module Pwrake
253
253
  if tw.status == "fail"
254
254
  $stderr.puts %[task "#{tw.name}" failed.]
255
255
  if host_info
256
- continuous_fail = host_info.task_result(tw.status)
256
+ host_info.count_result(tw.status)
257
+ continuous_fail = host_info.continuous_fail
257
258
  Log.debug "task=#{tw.name} continuous_fail=#{continuous_fail}"
258
- if continuous_fail > n_retry && @hostinfo_by_id.size > 1
259
+ if continuous_fail > @host_fail && @hostinfo_by_id.size > 1
259
260
  # retire this host
260
261
  drop_host(host_info)
261
262
  Log.warn("retired host:#{host_info.name} due to continuous fail")
@@ -310,11 +311,12 @@ module Pwrake
310
311
  count = 0
311
312
  # @idle_cores.decrease(..
312
313
  @task_queue.deq_task do |tw,host_info,ncore|
313
- host_info.busy(ncore)
314
314
  count += 1
315
315
  @hostinfo_by_taskname[tw.name] = host_info
316
+ tw.set_used_cores(ncore)
316
317
  tw.preprocess
317
- if tw.has_action?
318
+ if host_info
319
+ host_info.busy(ncore)
318
320
  hid = host_info.id
319
321
  s = "#{hid}:#{tw.task_id}:#{tw.name}"
320
322
  @channel_by_hostid[hid].put_line(s)
@@ -322,14 +324,13 @@ module Pwrake
322
324
  tw.exec_host_id = hid
323
325
  else
324
326
  tw.status = "end"
325
- task_end(tw,host_info) # @idle_cores.increase(..
326
327
  @post_pool.enq(tw)
327
328
  end
328
329
  end
329
330
  if count == 0 && !@task_queue.empty? && @hostinfo_by_taskname.empty?
330
331
  m="No task was invoked while unexecuted tasks remain"
331
332
  Log.error m
332
- Log.error "count=#{count} @hostinfo_by_taskname.empty?=#{@hostinfo_by_taskname.empty?} @task_queue.empty?={@task_queue.empty?} @task_queue=\n"+@task_queue.inspect_q
333
+ Log.error "count=#{count} @hostinfo_by_taskname.empty?=#{@hostinfo_by_taskname.empty?} @hostinfo_by_taskname=#{@hostinfo_by_taskname.inspect} @task_queue.empty?=#{@task_queue.empty?} @task_queue=\n"+@task_queue.inspect_q
333
334
  raise RuntimeError,m
334
335
  end
335
336
  #Log.debug "#{self.class}#send_task_to_idle_core end time=#{Time.now-tm}"
@@ -359,7 +360,9 @@ module Pwrake
359
360
  end
360
361
 
361
362
  def task_end(tw,host_info)
362
- if host_info && host_info.idle(tw.n_used_cores(host_info))
363
+ return if host_info.nil?
364
+ host_info.idle(tw.n_used_cores)
365
+ if host_info.retired?
363
366
  # all retired
364
367
  Log.warn("retired host:#{host_info.name} because all core retired")
365
368
  drop_host(host_info)
@@ -21,10 +21,15 @@ module Pwrake
21
21
  @total_fail = 0
22
22
  @count_task = 0
23
23
  @ipaddr = []
24
+ begin
25
+ @ipaddr << IPSocket.getaddress(@name)
26
+ rescue
27
+ end
24
28
  end
25
29
 
26
30
  attr_reader :name, :ncore, :weight, :group, :id, :steal_flag
27
31
  attr_reader :ipaddr
32
+ attr_reader :continuous_fail
28
33
  attr_accessor :idle_cores
29
34
 
30
35
  def local?
@@ -33,7 +38,6 @@ module Pwrake
33
38
  end
34
39
 
35
40
  def set_ncore(n)
36
- @retire = 0
37
41
  @busy_cores = 0
38
42
  @ncore = @idle_cores = n
39
43
  end
@@ -45,19 +49,18 @@ module Pwrake
45
49
  def idle(n)
46
50
  @busy_cores -= n
47
51
  @idle_cores += n
48
- @idle_cores -= @retire
49
- @retire = 0
50
- @idle_cores + @busy_cores < 1 # all retired
51
52
  end
52
53
 
53
54
  def busy(n)
54
55
  @busy_cores += n
55
56
  @idle_cores -= n
56
- @idle_cores + @busy_cores < 1 # all retired
57
57
  end
58
58
 
59
- def decrease(n)
59
+ def retire(n)
60
60
  @idle_cores -= n
61
+ end
62
+
63
+ def retired?
61
64
  @idle_cores + @busy_cores < 1 # all retired
62
65
  end
63
66
 
@@ -68,12 +71,7 @@ module Pwrake
68
71
  t
69
72
  end
70
73
 
71
- def retire(n)
72
- @retire += n
73
- Log.debug "retire n=#{n}, host=#{@name}"
74
- end
75
-
76
- def task_result(result)
74
+ def count_result(result)
77
75
  @count_task += 1
78
76
  case result
79
77
  when "end"
@@ -87,20 +85,8 @@ module Pwrake
87
85
  @continuous_fail
88
86
  end
89
87
 
90
- def check_cores(use_cores)
91
- unless (1-@ncore..@ncore) === use_cores
92
- m = "use_cores=#{use_cores} is invalid for @ncore=#{@ncore}"
93
- Log.fatal m
94
- raise RuntimeError,m
95
- end
96
- if use_cores < 1
97
- use_cores += @ncore
98
- end
99
- use_cores
100
- end
101
-
102
88
  def accept_core(use_cores)
103
- check_cores(use_cores) <= @idle_cores
89
+ use_cores <= @idle_cores
104
90
  end
105
91
  end
106
92
 
@@ -166,8 +166,10 @@ module Pwrake
166
166
  'REPORT_IMAGE',
167
167
  'FAILED_TARGET', # rename(default), delete, leave
168
168
  'FAILURE_TERMINATION', # wait, kill, continue
169
- 'QUEUE_PRIORITY', # RANK(default), FIFO, LIFO, DFS
169
+ 'QUEUE_PRIORITY', # LIHR(default), FIFO, LIFO, RANK
170
170
  'NOACTION_QUEUE_PRIORITY', # FIFO(default), LIFO, RAND
171
+ 'DISABLE_RANK_PRIORITY',
172
+ ['RESERVE_NODE','RESERVE_HOST'],
171
173
  'GRAPH_PARTITION',
172
174
  'PLOT_PARTITION',
173
175
 
@@ -225,6 +227,7 @@ module Pwrake
225
227
  ['SHELL_START_INTERVAL', proc{|v| (v || 0.012).to_f}],
226
228
  ['HEARTBEAT', proc{|v| v && v.to_i}],
227
229
  ['RETRY', proc{|v| (v || 1).to_i}],
230
+ ['HOST_FAILURE', 'HOST_FAIL', proc{|v| (v || 2).to_i}],
228
231
  ['MASTER_HOSTNAME', proc{|v| (v || Socket.gethostname).chomp}],
229
232
  ['WORK_DIR', proc{|v|
230
233
  v ||= '%CWD_RELATIVE_TO_HOME'
@@ -286,9 +289,9 @@ module Pwrake
286
289
 
287
290
  def parse_opt(s)
288
291
  case s
289
- when /^(false|nil|off)$/i
292
+ when /^(false|nil|off|n|no)$/i
290
293
  false
291
- when /^(true|on)$/i
294
+ when /^(true|on|y|yes)$/i
292
295
  true
293
296
  when $stdout
294
297
  "stdout"
@@ -30,7 +30,7 @@ module Pwrake
30
30
  :shared_directory => "SharedDirectory"
31
31
  }
32
32
  @filesystem = "default"
33
- @queue_class = "TaskQueue"
33
+ @queue_class = "NonLocalityQueue"
34
34
  end
35
35
 
36
36
  def max_postprocess_pool
@@ -50,7 +50,7 @@ module Pwrake
50
50
  pwrake/worker/worker_main
51
51
  ]
52
52
  if self['DISABLE_AFFINITY']
53
- @queue_class = "TaskQueue"
53
+ @queue_class = "NonLocalityQueue"
54
54
  else
55
55
  @queue_class = "LocalityAwareQueue"
56
56
  end
@@ -1,13 +1,18 @@
1
1
  module Pwrake
2
2
 
3
- class LocalityAwareQueue < TaskQueue
3
+ class LocalityAwareQueue
4
+
5
+ def initialize(hostinfo_by_id, array_class, median_core, group_map=nil)
6
+ @hostinfo_by_id = hostinfo_by_id
7
+ @array_class = array_class
8
+ @median_core = median_core
4
9
 
5
- def init_queue(group_map=nil)
6
10
  # group_map = {gid1=>[hid1,hid2,...], ...}
7
- @size_q = 0
11
+ @total_core = 0
8
12
  @q = {}
9
- @hostinfo_by_id.each do |id,h|
10
- @q[id] = @array_class.new(h.ncore)
13
+ @hostinfo_by_id.each do |id,host_info|
14
+ @total_core += c = host_info.ncore
15
+ @q[id] = @array_class.new(c)
11
16
  end
12
17
  @q_group = {}
13
18
  group_map ||= {1=>@hostinfo_by_id.map{|id,h| id}}
@@ -18,15 +23,25 @@ module Pwrake
18
23
  a = [q1,q2]
19
24
  ary.each{|hid| @q_group[hid] = a}
20
25
  end
21
- @q_remote = @array_class.new(0)
26
+
27
+ @q_remote = @array_class.new(@total_core)
28
+ @q_all = @array_class.new(@total_core)
29
+
22
30
  @disable_steal = Rake.application.pwrake_options['DISABLE_STEAL']
31
+ Log.debug "#{self.class}: @disable_steal=#{@disable_steal.inspect}"
32
+ @disable_rank = Rake.application.pwrake_options['DISABLE_RANK_PRIORITY']
33
+ Log.debug "#{self.class}: @disable_rank=#{@disable_rank.inspect}"
34
+
35
+ @turns = @disable_steal ? [0] : [0,1]
23
36
  @last_enq_time = Time.now
24
- @n_turn = @disable_steal ? 1 : 2
25
37
  end
26
38
 
39
+ attr_reader :turns
40
+
27
41
  def enq_impl(t)
28
42
  hints = t && t.suggest_location
29
43
  Log.debug "enq #{t.name} hints=#{hints.inspect}"
44
+ @q_all.push(t)
30
45
  if hints.nil? || hints.empty?
31
46
  @q_remote.push(t)
32
47
  else
@@ -47,9 +62,7 @@ module Pwrake
47
62
  end
48
63
  end
49
64
  end
50
- if q_success
51
- @size_q += 1
52
- else
65
+ unless q_success
53
66
  @q_remote.push(t)
54
67
  end
55
68
  end
@@ -58,88 +71,71 @@ module Pwrake
58
71
 
59
72
  def turn_empty?(turn)
60
73
  case turn
61
- when 0
74
+ when 0,2
62
75
  empty?
63
- when 1
64
- @size_q == 0
76
+ when 1,3
77
+ @q_all.size == @q_remote.size
65
78
  end
66
79
  end
67
80
 
81
+ def deq_start
82
+ @rank = @disable_rank ? 0 : @q_all.find_rank(@median_core)
83
+ end
84
+
68
85
  def deq_impl(host_info, turn)
69
- host = host_info.name
70
86
  case turn
71
87
  when 0
72
- if t = @q_no_action.shift
73
- Log.debug "deq_no_action task=#{t&&t.name} host=#{host}"
74
- return t
75
- elsif t = deq_locate(host_info,host_info)
76
- Log.debug "deq_locate task=#{t&&t.name} host=#{host}"
77
- return t
78
- elsif t = @q_remote.shift(host_info)
79
- Log.debug "deq_remote task=#{t&&t.name}"
80
- return t
81
- else
82
- nil
83
- end
88
+ deq_local(host_info) ||
89
+ deq_remote(host_info)
84
90
  when 1
85
- if t = deq_steal(host_info)
86
- Log.debug "deq_steal task=#{t&&t.name} host=#{host}"
87
- return t
88
- else
89
- nil
90
- end
91
+ deq_steal(host_info)
91
92
  end
92
93
  end
93
94
 
94
- def deq_locate(q_host,run_host)
95
- q = @q[q_host.id]
95
+ def deq_local(run_host)
96
+ q = @q[run_host.id]
96
97
  if q && !q.empty?
97
- t = q.shift(run_host)
98
+ t = q.shift(run_host,@rank)
98
99
  if t
99
- t.assigned.each do |h|
100
- if q_h = @q[h]
101
- q_h.delete(t)
102
- end
103
- end
104
- @size_q -= 1
100
+ q_delete_assigned_to(t)
101
+ @q_all.delete(t)
102
+ Log.debug "deq_local task=#{t&&t.name} host=#{run_host.name} req_rank=#{@rank}"
103
+ return t
105
104
  end
105
+ end
106
+ nil
107
+ end
108
+
109
+ def deq_remote(host_info)
110
+ if t = @q_remote.shift(host_info,@rank)
111
+ @q_all.delete(t)
112
+ Log.debug "deq_remote task=#{t&&t.name} host=#{host_info.name} req_rank=#{@rank}"
106
113
  return t
107
- else
108
- nil
109
114
  end
115
+ nil
110
116
  end
111
117
 
112
- def deq_steal(host_info)
113
- # select a task based on many and close
114
- max_host = nil
115
- max_num = 0
116
- @q_group[host_info.id].each do |qg|
117
- qg.each do |h,a|
118
- if !a.empty? # && h!=host_info.id
119
- d = a.size
120
- if d > max_num
121
- max_host = h
122
- max_num = d
123
- end
124
- end
125
- end
126
- if max_num > 0
127
- max_info = @hostinfo_by_id[max_host]
128
- #Log.debug "deq_steal max_host=#{max_info.name} max_num=#{max_num}"
129
- t = host_info.steal_phase{|h| deq_locate(max_info,h)}
130
- #Log.debug "deq_steal task=#{t.inspect}"
131
- if t
132
- Log.debug "deq_steal max_host=#{max_info.name} max_num=#{max_num}"
133
- return t
134
- end
135
- end
118
+ def deq_steal(run_host)
119
+ if t = @q_all.shift(run_host,@rank)
120
+ q_delete_assigned_to(t)
121
+ @q_remote.delete(t)
122
+ Log.debug "deq_steal task=#{t&&t.name} host=#{run_host.name} req_rank=#{@rank}"
123
+ return t
136
124
  end
137
125
  nil
138
126
  end
139
127
 
128
+ def q_delete_assigned_to(t)
129
+ t.assigned.each do |h|
130
+ if q_h = @q[h]
131
+ q_h.delete(t)
132
+ end
133
+ end
134
+ end
135
+
140
136
  def inspect_q
141
- s = _qstr("noaction",@q_no_action)
142
- if @size_q == 0
137
+ s = ""
138
+ if @q_all.size == @q_remote.size
143
139
  n = @q.size
144
140
  else
145
141
  n = 0
@@ -147,35 +143,33 @@ module Pwrake
147
143
  if q.size > 0
148
144
  hinfo = @hostinfo_by_id[h]
149
145
  if hinfo
150
- s << _qstr(hinfo.name,q)
146
+ s << TaskQueue._qstr(hinfo.name,q)
151
147
  else
152
- s << _qstr("(#{hinfo.inspect})",q)
148
+ s << TaskQueue._qstr("(#{hinfo.inspect})",q)
153
149
  end
154
150
  else
155
151
  n += 1
156
152
  end
157
153
  end
158
154
  end
159
- s << _qstr("local*#{n}",[]) if n > 0
160
- s << _qstr("remote",@q_remote)
161
- s << _qstr("reserved",@q_reserved)
162
- s << "@size_q=#{@size_q}"
155
+ s << TaskQueue._qstr("local*#{n}",[]) if n > 0
156
+ s << TaskQueue._qstr("remote",@q_remote)
157
+ s << TaskQueue._qstr("all",@q_all)
163
158
  s
164
159
  end
165
160
 
161
+ def size
162
+ @q_all.size
163
+ end
164
+
166
165
  def clear
167
- @q_no_action.clear
168
- @q_reserved.clear
169
166
  @q.each{|h,q| q.clear}
170
- @size_q = 0
171
167
  @q_remote.clear
168
+ @q_all.clear
172
169
  end
173
170
 
174
171
  def empty?
175
- @size_q == 0 &&
176
- @q_no_action.empty? &&
177
- @q_reserved.empty? &&
178
- @q_remote.empty?
172
+ @q_all.empty?
179
173
  end
180
174
 
181
175
  def drop_host(host_info)
@@ -183,7 +177,7 @@ module Pwrake
183
177
  if q_drop = @q.delete(hid)
184
178
  n_move = 0
185
179
  q_size = q_drop.size
186
- while t = q_drop.shift
180
+ while t = q_drop.shift(host_info,@rank)
187
181
  assigned_other = false
188
182
  t.assigned.each do |h|
189
183
  if h != hid && @q[h]
@@ -192,7 +186,6 @@ module Pwrake
192
186
  end
193
187
  end
194
188
  if !assigned_other
195
- @size_q -= 1
196
189
  @q_remote.push(t)
197
190
  n_move += 1
198
191
  end