pwrake 2.2.9 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 93b60deb41a4fda0bafa94a5b240812e5c5925ab204f8b802ef63d78fbcf81b2
4
- data.tar.gz: 6ac5c66ad22c8ba3316bca3aeab8b34fe0a4e9189f50a99b2056c5a8cced7277
3
+ metadata.gz: 99a7b5b9f4ecf97b511f58495f6b22facbd66c97d79aa0cab9f452d96ae44ea4
4
+ data.tar.gz: 2874a8eec50b7f508138567899da703e4c5d18687d58ad53a715ffa81b7fa402
5
5
  SHA512:
6
- metadata.gz: 7b0d50fb5cb90b8c7682be343c4ef70acfb324d01b46903f39606f8e044d3315ae7cf7a6cd8e4074e5b5178b3e16aa224c827f53b1ef8ea74a27b6176f84b39b
7
- data.tar.gz: 90911a12880e22276be78feae3e13b850d9614f93fc30195a7fb253e730a369bc51783778524aca2213672fca6b290c7622bdf503278f0e04dfe2eb29d46acb5
6
+ metadata.gz: c49bcd791f52010a2a221118bec1f59c08796c053ef43127f68142bd97d95499d23becf2c3d6ab9f351bc48384300999824fd3bd0f6fe1e94cba5de6b291664a
7
+ data.tar.gz: af1ecd762e914f1e891a40639ab8f59a55165132646816ccb1a5209e9c761a518cb09f0c4385086f23bb7d172e658127852780577d748baa0c83313edb72e848
data/README.md CHANGED
@@ -131,10 +131,13 @@ In this case, you need the rehash of command paths:
131
131
  SSH_OPTION SSH option
132
132
  PASS_ENV (Array) Environment variables passed to SSH
133
133
  HEARTBEAT default=240 - Heartbeat interval in seconds
134
- RETRY default=1 - The number of retry
134
+ RETRY default=1 - The number of task retries
135
+ HOST_FAILURE default=2 - The number of allowed consecutive host failures (since v2.3)
135
136
  FAILED_TARGET rename(default)|delete|leave - Treatment of failed target files
136
137
  FAILURE_TERMINATION wait(default)|kill|continue - Behavior of other tasks when a task is failed
137
- QUEUE_PRIORITY LIHR(default)|FIFO|LIFO|RANK
138
+ QUEUE_PRIORITY LIFO(default)|FIFO|LIHR(LIfo&Highest-Rank-first; obsolete)
139
+ DISABLE_RANK_PRIORITY false(default)|true - Disable rank-aware task scheduling (since v2.3)
140
+ RESERVE_NODE false(default)|true - Reserve a node for tasks with ncore>1 (since v2.3)
138
141
  NOACTION_QUEUE_PRIORITY FIFO(default)|LIFO|RAND
139
142
  SHELL_START_INTERVAL default=0.012 (sec)
140
143
  GRAPH_PARTITION false(default)|true
@@ -91,7 +91,7 @@ class CommunicatorSet
91
91
  end
92
92
 
93
93
  def handler_set
94
- @communicators.each_value.map{|comm| comm.handler}
94
+ @communicators.each_value.map{|comm| comm.handler}.compact
95
95
  end
96
96
 
97
97
  def kill(sig)
@@ -148,8 +148,7 @@ module Pwrake
148
148
  if @hostinfo_by_id.empty?
149
149
  raise RuntimeError,"no worker host"
150
150
  end
151
- queue_class = Pwrake.const_get(@option.queue_class)
152
- @task_queue = queue_class.new(@hostinfo_by_id)
151
+ @task_queue = TaskQueue.new(@option.queue_class,@hostinfo_by_id)
153
152
 
154
153
  @branch_setup_thread = Thread.new do
155
154
  create_fiber(@hdl_set) do |hdl|
@@ -171,15 +170,16 @@ module Pwrake
171
170
 
172
171
  def retire(hid)
173
172
  host_info = @hostinfo_by_id[hid.to_i]
174
- if host_info && host_info.decrease(1)
175
- # all retired
173
+ return if host_info.nil?
174
+ host_info.retire(1)
175
+ if host_info.retired?
176
176
  if !@exited
177
177
  m = "retired: host #{host_info.name}"
178
178
  Log.warn(m)
179
179
  $stderr.puts(m)
180
180
  drop_host(host_info) # delete from hostinfo_by_id
181
181
  if @hostinfo_by_id.empty?
182
- raise RuntimeError,"no worker host"
182
+ raise RuntimeError,"no worker host"
183
183
  end
184
184
  end
185
185
  end
@@ -231,7 +231,7 @@ module Pwrake
231
231
  end
232
232
 
233
233
  def setup_fiber(t)
234
- n_retry = @option["RETRY"]
234
+ @host_fail = @option["HOST_FAILURE"]
235
235
  create_fiber(@hdl_set) do |hdl|
236
236
  while s = hdl.get_line
237
237
  Log.debug "Master:recv #{s.inspect} from branch[#{hdl.host}]"
@@ -253,9 +253,10 @@ module Pwrake
253
253
  if tw.status == "fail"
254
254
  $stderr.puts %[task "#{tw.name}" failed.]
255
255
  if host_info
256
- continuous_fail = host_info.task_result(tw.status)
256
+ host_info.count_result(tw.status)
257
+ continuous_fail = host_info.continuous_fail
257
258
  Log.debug "task=#{tw.name} continuous_fail=#{continuous_fail}"
258
- if continuous_fail > n_retry && @hostinfo_by_id.size > 1
259
+ if continuous_fail > @host_fail && @hostinfo_by_id.size > 1
259
260
  # retire this host
260
261
  drop_host(host_info)
261
262
  Log.warn("retired host:#{host_info.name} due to continuous fail")
@@ -310,11 +311,12 @@ module Pwrake
310
311
  count = 0
311
312
  # @idle_cores.decrease(..
312
313
  @task_queue.deq_task do |tw,host_info,ncore|
313
- host_info.busy(ncore)
314
314
  count += 1
315
315
  @hostinfo_by_taskname[tw.name] = host_info
316
+ tw.set_used_cores(ncore)
316
317
  tw.preprocess
317
- if tw.has_action?
318
+ if host_info
319
+ host_info.busy(ncore)
318
320
  hid = host_info.id
319
321
  s = "#{hid}:#{tw.task_id}:#{tw.name}"
320
322
  @channel_by_hostid[hid].put_line(s)
@@ -322,14 +324,13 @@ module Pwrake
322
324
  tw.exec_host_id = hid
323
325
  else
324
326
  tw.status = "end"
325
- task_end(tw,host_info) # @idle_cores.increase(..
326
327
  @post_pool.enq(tw)
327
328
  end
328
329
  end
329
330
  if count == 0 && !@task_queue.empty? && @hostinfo_by_taskname.empty?
330
331
  m="No task was invoked while unexecuted tasks remain"
331
332
  Log.error m
332
- Log.error "count=#{count} @hostinfo_by_taskname.empty?=#{@hostinfo_by_taskname.empty?} @task_queue.empty?={@task_queue.empty?} @task_queue=\n"+@task_queue.inspect_q
333
+ Log.error "count=#{count} @hostinfo_by_taskname.empty?=#{@hostinfo_by_taskname.empty?} @hostinfo_by_taskname=#{@hostinfo_by_taskname.inspect} @task_queue.empty?=#{@task_queue.empty?} @task_queue=\n"+@task_queue.inspect_q
333
334
  raise RuntimeError,m
334
335
  end
335
336
  #Log.debug "#{self.class}#send_task_to_idle_core end time=#{Time.now-tm}"
@@ -359,7 +360,9 @@ module Pwrake
359
360
  end
360
361
 
361
362
  def task_end(tw,host_info)
362
- if host_info && host_info.idle(tw.n_used_cores(host_info))
363
+ return if host_info.nil?
364
+ host_info.idle(tw.n_used_cores)
365
+ if host_info.retired?
363
366
  # all retired
364
367
  Log.warn("retired host:#{host_info.name} because all core retired")
365
368
  drop_host(host_info)
@@ -21,10 +21,15 @@ module Pwrake
21
21
  @total_fail = 0
22
22
  @count_task = 0
23
23
  @ipaddr = []
24
+ begin
25
+ @ipaddr << IPSocket.getaddress(@name)
26
+ rescue
27
+ end
24
28
  end
25
29
 
26
30
  attr_reader :name, :ncore, :weight, :group, :id, :steal_flag
27
31
  attr_reader :ipaddr
32
+ attr_reader :continuous_fail
28
33
  attr_accessor :idle_cores
29
34
 
30
35
  def local?
@@ -33,7 +38,6 @@ module Pwrake
33
38
  end
34
39
 
35
40
  def set_ncore(n)
36
- @retire = 0
37
41
  @busy_cores = 0
38
42
  @ncore = @idle_cores = n
39
43
  end
@@ -45,19 +49,18 @@ module Pwrake
45
49
  def idle(n)
46
50
  @busy_cores -= n
47
51
  @idle_cores += n
48
- @idle_cores -= @retire
49
- @retire = 0
50
- @idle_cores + @busy_cores < 1 # all retired
51
52
  end
52
53
 
53
54
  def busy(n)
54
55
  @busy_cores += n
55
56
  @idle_cores -= n
56
- @idle_cores + @busy_cores < 1 # all retired
57
57
  end
58
58
 
59
- def decrease(n)
59
+ def retire(n)
60
60
  @idle_cores -= n
61
+ end
62
+
63
+ def retired?
61
64
  @idle_cores + @busy_cores < 1 # all retired
62
65
  end
63
66
 
@@ -68,12 +71,7 @@ module Pwrake
68
71
  t
69
72
  end
70
73
 
71
- def retire(n)
72
- @retire += n
73
- Log.debug "retire n=#{n}, host=#{@name}"
74
- end
75
-
76
- def task_result(result)
74
+ def count_result(result)
77
75
  @count_task += 1
78
76
  case result
79
77
  when "end"
@@ -87,20 +85,8 @@ module Pwrake
87
85
  @continuous_fail
88
86
  end
89
87
 
90
- def check_cores(use_cores)
91
- unless (1-@ncore..@ncore) === use_cores
92
- m = "use_cores=#{use_cores} is invalid for @ncore=#{@ncore}"
93
- Log.fatal m
94
- raise RuntimeError,m
95
- end
96
- if use_cores < 1
97
- use_cores += @ncore
98
- end
99
- use_cores
100
- end
101
-
102
88
  def accept_core(use_cores)
103
- check_cores(use_cores) <= @idle_cores
89
+ use_cores <= @idle_cores
104
90
  end
105
91
  end
106
92
 
@@ -166,8 +166,10 @@ module Pwrake
166
166
  'REPORT_IMAGE',
167
167
  'FAILED_TARGET', # rename(default), delete, leave
168
168
  'FAILURE_TERMINATION', # wait, kill, continue
169
- 'QUEUE_PRIORITY', # RANK(default), FIFO, LIFO, DFS
169
+ 'QUEUE_PRIORITY', # LIHR(default), FIFO, LIFO, RANK
170
170
  'NOACTION_QUEUE_PRIORITY', # FIFO(default), LIFO, RAND
171
+ 'DISABLE_RANK_PRIORITY',
172
+ ['RESERVE_NODE','RESERVE_HOST'],
171
173
  'GRAPH_PARTITION',
172
174
  'PLOT_PARTITION',
173
175
 
@@ -225,6 +227,7 @@ module Pwrake
225
227
  ['SHELL_START_INTERVAL', proc{|v| (v || 0.012).to_f}],
226
228
  ['HEARTBEAT', proc{|v| v && v.to_i}],
227
229
  ['RETRY', proc{|v| (v || 1).to_i}],
230
+ ['HOST_FAILURE', 'HOST_FAIL', proc{|v| (v || 2).to_i}],
228
231
  ['MASTER_HOSTNAME', proc{|v| (v || Socket.gethostname).chomp}],
229
232
  ['WORK_DIR', proc{|v|
230
233
  v ||= '%CWD_RELATIVE_TO_HOME'
@@ -286,9 +289,9 @@ module Pwrake
286
289
 
287
290
  def parse_opt(s)
288
291
  case s
289
- when /^(false|nil|off)$/i
292
+ when /^(false|nil|off|n|no)$/i
290
293
  false
291
- when /^(true|on)$/i
294
+ when /^(true|on|y|yes)$/i
292
295
  true
293
296
  when $stdout
294
297
  "stdout"
@@ -30,7 +30,7 @@ module Pwrake
30
30
  :shared_directory => "SharedDirectory"
31
31
  }
32
32
  @filesystem = "default"
33
- @queue_class = "TaskQueue"
33
+ @queue_class = "NonLocalityQueue"
34
34
  end
35
35
 
36
36
  def max_postprocess_pool
@@ -50,7 +50,7 @@ module Pwrake
50
50
  pwrake/worker/worker_main
51
51
  ]
52
52
  if self['DISABLE_AFFINITY']
53
- @queue_class = "TaskQueue"
53
+ @queue_class = "NonLocalityQueue"
54
54
  else
55
55
  @queue_class = "LocalityAwareQueue"
56
56
  end
@@ -1,13 +1,18 @@
1
1
  module Pwrake
2
2
 
3
- class LocalityAwareQueue < TaskQueue
3
+ class LocalityAwareQueue
4
+
5
+ def initialize(hostinfo_by_id, array_class, median_core, group_map=nil)
6
+ @hostinfo_by_id = hostinfo_by_id
7
+ @array_class = array_class
8
+ @median_core = median_core
4
9
 
5
- def init_queue(group_map=nil)
6
10
  # group_map = {gid1=>[hid1,hid2,...], ...}
7
- @size_q = 0
11
+ @total_core = 0
8
12
  @q = {}
9
- @hostinfo_by_id.each do |id,h|
10
- @q[id] = @array_class.new(h.ncore)
13
+ @hostinfo_by_id.each do |id,host_info|
14
+ @total_core += c = host_info.ncore
15
+ @q[id] = @array_class.new(c)
11
16
  end
12
17
  @q_group = {}
13
18
  group_map ||= {1=>@hostinfo_by_id.map{|id,h| id}}
@@ -18,15 +23,25 @@ module Pwrake
18
23
  a = [q1,q2]
19
24
  ary.each{|hid| @q_group[hid] = a}
20
25
  end
21
- @q_remote = @array_class.new(0)
26
+
27
+ @q_remote = @array_class.new(@total_core)
28
+ @q_all = @array_class.new(@total_core)
29
+
22
30
  @disable_steal = Rake.application.pwrake_options['DISABLE_STEAL']
31
+ Log.debug "#{self.class}: @disable_steal=#{@disable_steal.inspect}"
32
+ @disable_rank = Rake.application.pwrake_options['DISABLE_RANK_PRIORITY']
33
+ Log.debug "#{self.class}: @disable_rank=#{@disable_rank.inspect}"
34
+
35
+ @turns = @disable_steal ? [0] : [0,1]
23
36
  @last_enq_time = Time.now
24
- @n_turn = @disable_steal ? 1 : 2
25
37
  end
26
38
 
39
+ attr_reader :turns
40
+
27
41
  def enq_impl(t)
28
42
  hints = t && t.suggest_location
29
43
  Log.debug "enq #{t.name} hints=#{hints.inspect}"
44
+ @q_all.push(t)
30
45
  if hints.nil? || hints.empty?
31
46
  @q_remote.push(t)
32
47
  else
@@ -47,9 +62,7 @@ module Pwrake
47
62
  end
48
63
  end
49
64
  end
50
- if q_success
51
- @size_q += 1
52
- else
65
+ unless q_success
53
66
  @q_remote.push(t)
54
67
  end
55
68
  end
@@ -58,88 +71,71 @@ module Pwrake
58
71
 
59
72
  def turn_empty?(turn)
60
73
  case turn
61
- when 0
74
+ when 0,2
62
75
  empty?
63
- when 1
64
- @size_q == 0
76
+ when 1,3
77
+ @q_all.size == @q_remote.size
65
78
  end
66
79
  end
67
80
 
81
+ def deq_start
82
+ @rank = @disable_rank ? 0 : @q_all.find_rank(@median_core)
83
+ end
84
+
68
85
  def deq_impl(host_info, turn)
69
- host = host_info.name
70
86
  case turn
71
87
  when 0
72
- if t = @q_no_action.shift
73
- Log.debug "deq_no_action task=#{t&&t.name} host=#{host}"
74
- return t
75
- elsif t = deq_locate(host_info,host_info)
76
- Log.debug "deq_locate task=#{t&&t.name} host=#{host}"
77
- return t
78
- elsif t = @q_remote.shift(host_info)
79
- Log.debug "deq_remote task=#{t&&t.name}"
80
- return t
81
- else
82
- nil
83
- end
88
+ deq_local(host_info) ||
89
+ deq_remote(host_info)
84
90
  when 1
85
- if t = deq_steal(host_info)
86
- Log.debug "deq_steal task=#{t&&t.name} host=#{host}"
87
- return t
88
- else
89
- nil
90
- end
91
+ deq_steal(host_info)
91
92
  end
92
93
  end
93
94
 
94
- def deq_locate(q_host,run_host)
95
- q = @q[q_host.id]
95
+ def deq_local(run_host)
96
+ q = @q[run_host.id]
96
97
  if q && !q.empty?
97
- t = q.shift(run_host)
98
+ t = q.shift(run_host,@rank)
98
99
  if t
99
- t.assigned.each do |h|
100
- if q_h = @q[h]
101
- q_h.delete(t)
102
- end
103
- end
104
- @size_q -= 1
100
+ q_delete_assigned_to(t)
101
+ @q_all.delete(t)
102
+ Log.debug "deq_local task=#{t&&t.name} host=#{run_host.name} req_rank=#{@rank}"
103
+ return t
105
104
  end
105
+ end
106
+ nil
107
+ end
108
+
109
+ def deq_remote(host_info)
110
+ if t = @q_remote.shift(host_info,@rank)
111
+ @q_all.delete(t)
112
+ Log.debug "deq_remote task=#{t&&t.name} host=#{host_info.name} req_rank=#{@rank}"
106
113
  return t
107
- else
108
- nil
109
114
  end
115
+ nil
110
116
  end
111
117
 
112
- def deq_steal(host_info)
113
- # select a task based on many and close
114
- max_host = nil
115
- max_num = 0
116
- @q_group[host_info.id].each do |qg|
117
- qg.each do |h,a|
118
- if !a.empty? # && h!=host_info.id
119
- d = a.size
120
- if d > max_num
121
- max_host = h
122
- max_num = d
123
- end
124
- end
125
- end
126
- if max_num > 0
127
- max_info = @hostinfo_by_id[max_host]
128
- #Log.debug "deq_steal max_host=#{max_info.name} max_num=#{max_num}"
129
- t = host_info.steal_phase{|h| deq_locate(max_info,h)}
130
- #Log.debug "deq_steal task=#{t.inspect}"
131
- if t
132
- Log.debug "deq_steal max_host=#{max_info.name} max_num=#{max_num}"
133
- return t
134
- end
135
- end
118
+ def deq_steal(run_host)
119
+ if t = @q_all.shift(run_host,@rank)
120
+ q_delete_assigned_to(t)
121
+ @q_remote.delete(t)
122
+ Log.debug "deq_steal task=#{t&&t.name} host=#{run_host.name} req_rank=#{@rank}"
123
+ return t
136
124
  end
137
125
  nil
138
126
  end
139
127
 
128
+ def q_delete_assigned_to(t)
129
+ t.assigned.each do |h|
130
+ if q_h = @q[h]
131
+ q_h.delete(t)
132
+ end
133
+ end
134
+ end
135
+
140
136
  def inspect_q
141
- s = _qstr("noaction",@q_no_action)
142
- if @size_q == 0
137
+ s = ""
138
+ if @q_all.size == @q_remote.size
143
139
  n = @q.size
144
140
  else
145
141
  n = 0
@@ -147,35 +143,33 @@ module Pwrake
147
143
  if q.size > 0
148
144
  hinfo = @hostinfo_by_id[h]
149
145
  if hinfo
150
- s << _qstr(hinfo.name,q)
146
+ s << TaskQueue._qstr(hinfo.name,q)
151
147
  else
152
- s << _qstr("(#{hinfo.inspect})",q)
148
+ s << TaskQueue._qstr("(#{hinfo.inspect})",q)
153
149
  end
154
150
  else
155
151
  n += 1
156
152
  end
157
153
  end
158
154
  end
159
- s << _qstr("local*#{n}",[]) if n > 0
160
- s << _qstr("remote",@q_remote)
161
- s << _qstr("reserved",@q_reserved)
162
- s << "@size_q=#{@size_q}"
155
+ s << TaskQueue._qstr("local*#{n}",[]) if n > 0
156
+ s << TaskQueue._qstr("remote",@q_remote)
157
+ s << TaskQueue._qstr("all",@q_all)
163
158
  s
164
159
  end
165
160
 
161
+ def size
162
+ @q_all.size
163
+ end
164
+
166
165
  def clear
167
- @q_no_action.clear
168
- @q_reserved.clear
169
166
  @q.each{|h,q| q.clear}
170
- @size_q = 0
171
167
  @q_remote.clear
168
+ @q_all.clear
172
169
  end
173
170
 
174
171
  def empty?
175
- @size_q == 0 &&
176
- @q_no_action.empty? &&
177
- @q_reserved.empty? &&
178
- @q_remote.empty?
172
+ @q_all.empty?
179
173
  end
180
174
 
181
175
  def drop_host(host_info)
@@ -183,7 +177,7 @@ module Pwrake
183
177
  if q_drop = @q.delete(hid)
184
178
  n_move = 0
185
179
  q_size = q_drop.size
186
- while t = q_drop.shift
180
+ while t = q_drop.shift(host_info,@rank)
187
181
  assigned_other = false
188
182
  t.assigned.each do |h|
189
183
  if h != hid && @q[h]
@@ -192,7 +186,6 @@ module Pwrake
192
186
  end
193
187
  end
194
188
  if !assigned_other
195
- @size_q -= 1
196
189
  @q_remote.push(t)
197
190
  n_move += 1
198
191
  end