drbqs 0.0.17 → 0.0.18

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/docs/FormatExecute.md +44 -2
  2. data/example/group/execute.rb +19 -0
  3. data/example/group/server.rb +27 -0
  4. data/example/group/sum.rb +9 -0
  5. data/example/mandelbrot/README.md +8 -0
  6. data/example/mandelbrot/execute.rb +4 -0
  7. data/lib/drbqs/command_line/command_node.rb +1 -0
  8. data/lib/drbqs/execute/execute_node.rb +4 -21
  9. data/lib/drbqs/node/connection.rb +1 -2
  10. data/lib/drbqs/node/node.rb +163 -102
  11. data/lib/drbqs/node/state.rb +100 -35
  12. data/lib/drbqs/node/task_client.rb +46 -33
  13. data/lib/drbqs/server/message.rb +13 -7
  14. data/lib/drbqs/server/server.rb +57 -29
  15. data/lib/drbqs/server/server_hook.rb +19 -5
  16. data/lib/drbqs/server/test/node.rb +31 -6
  17. data/lib/drbqs/setting/node.rb +11 -2
  18. data/lib/drbqs/setting/server.rb +1 -1
  19. data/lib/drbqs/task/task.rb +26 -6
  20. data/lib/drbqs/task/task_generator.rb +2 -1
  21. data/lib/drbqs/utility/temporary.rb +27 -6
  22. data/lib/drbqs/utility/transfer/transfer_client.rb +10 -12
  23. data/lib/drbqs/version.rb +1 -1
  24. data/lib/drbqs/worker.rb +2 -0
  25. data/lib/drbqs/worker/forked_process.rb +100 -0
  26. data/lib/drbqs/worker/serialize.rb +66 -0
  27. data/lib/drbqs/worker/worker.rb +133 -0
  28. data/lib/drbqs/worker/worker_process_set.rb +219 -0
  29. data/spec/integration_test/01_basic_usage_spec.rb +3 -2
  30. data/spec/integration_test/06_node_exit_after_task_spec.rb +3 -2
  31. data/spec/integration_test/07_command_server_with_node_spec.rb +1 -0
  32. data/spec/integration_test/08_shutdown_unused_nodes_spec.rb +3 -2
  33. data/spec/integration_test/10_test_server_spec.rb +2 -2
  34. data/spec/integration_test/11_special_tasks_spec.rb +61 -0
  35. data/spec/integration_test/12_multiple_workers_spec.rb +43 -0
  36. data/spec/integration_test/definition/task_obj_definition.rb +33 -6
  37. data/spec/node/connection_spec.rb +6 -6
  38. data/spec/node/node_spec.rb +10 -2
  39. data/spec/node/state_spec.rb +146 -62
  40. data/spec/node/task_client_spec.rb +58 -53
  41. data/spec/server/message_spec.rb +10 -6
  42. data/spec/server/queue_spec.rb +7 -4
  43. data/spec/server/server_hook_spec.rb +28 -1
  44. data/spec/task/task_spec.rb +43 -6
  45. data/spec/utility/temporary_spec.rb +32 -9
  46. data/spec/worker/forked_process_spec.rb +66 -0
  47. data/spec/worker/serialize_spec.rb +73 -0
  48. data/spec/worker/worker_process_set_spec.rb +104 -0
  49. data/spec/worker/worker_spec.rb +127 -0
  50. metadata +34 -19
@@ -11,7 +11,9 @@ drbqs-execute execute a server of which uri is made from hostname and port.
11
11
  Moreover, drbqs-execute makes nodes connect to the URI of the server.
12
12
  The server and nodes can be over SSH.
13
13
 
14
- ## Example: execute.rb
14
+ ## Example
15
+
16
+ ### execute.rb
15
17
 
16
18
  #!/usr/bin/env drbqs-execute
17
19
  # -*-ruby-*-
@@ -55,7 +57,21 @@ The server and nodes can be over SSH.
55
57
  node :even, group: [:node02, :node04, :node06]
56
58
  node :odd, group: [:node01, :node03, :node05]
57
59
 
58
- ## Help message
60
+ ### Execution
61
+
62
+ In the above example, there is the shebang line
63
+
64
+ #!/usr/bin/env drbqs-execute
65
+
66
+ and therefore we can execute by
67
+
68
+ ./execute.rb
69
+
70
+ If there is no shebang line, we type
71
+
72
+ drbqs-execute execute.rb
73
+
74
+ ### Help message
59
75
 
60
76
  If we run the following command
61
77
 
@@ -63,6 +79,32 @@ If we run the following command
63
79
 
64
80
  then the help message of server.rb is displayed in addition to that of drbqs-execute.
65
81
 
82
+ ### Information of server and nodes
83
+
84
+ The command
85
+
86
+ drbqs-execute -i
87
+
88
+ shows information of server and nodes.
89
+ The output is
90
+
91
+ Server:
92
+ * server1 ssh
93
+ local local
94
+ Node:
95
+ - node_base ssh,template
96
+ node01 ssh
97
+ node02 ssh
98
+ node03 ssh
99
+ node04 ssh
100
+ node05 ssh
101
+ node06 ssh
102
+ - even group: node02,node04,node06
103
+ - odd group: node01,node03,node05
104
+ Port: 12345
105
+
106
+ The character "*" means default and "-" means virtual nodes (template or group).
107
+
66
108
  ## Methods
67
109
 
68
110
  The following methods are available.
@@ -0,0 +1,19 @@
1
+ default :server => :server_local, :port => 13789, :log => '/tmp/drbqs_execute'
2
+
3
+ current_dir = File.expand_path(File.dirname(__FILE__))
4
+
5
+ usage :message => "Calculate sum of numbers", :server => File.join(current_dir, 'server.rb')
6
+
7
+ server :server, "localhost" do |server|
8
+ server.load File.expand_path(File.join(File.dirname(__FILE__), 'server.rb'))
9
+ end
10
+
11
+ node :node_odd do |node|
12
+ node.load File.join(current_dir, 'sum.rb')
13
+ node.group :odd
14
+ end
15
+
16
+ node :node_even do |node|
17
+ node.load File.join(current_dir, 'sum.rb')
18
+ node.group :even
19
+ end
@@ -0,0 +1,27 @@
1
+ #
2
+ # Usage:
3
+ # drbqs-server server.rb -- 30 50
4
+ # drbqs-server server.rb -- 100 500 --step 100
5
+ #
6
+
7
+ require_relative 'sum.rb'
8
+
9
+ DRbQS.option_parser do |opt, hash|
10
+ opt.on('--step NUM', Integer) do |v|
11
+ hash[:step] = v
12
+ end
13
+ end
14
+
15
+ DRbQS.define_server do |server, argv, opts|
16
+ start_num = (argv[0] || 10).to_i
17
+ end_num = (argv[1] || 100).to_i
18
+ step_num = opts[:step] || 10
19
+
20
+ server.task_generator(:generate => 10) do |tgen|
21
+ start_num.step(end_num, step_num).with_index do |i, count|
22
+ create_add(Sum.new(i - 10, i), :exec, group: (count.even? ? :even : :odd)) do |srv, ret|
23
+ puts "Receive: #{ret.inspect}"
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,9 @@
1
+ class Sum
2
+ def initialize(start_num, end_num)
3
+ @num = [start_num, end_num]
4
+ end
5
+
6
+ def exec
7
+ (@num[0]..@num[1]).inject(0) { |sum, i| sum += i }
8
+ end
9
+ end
@@ -10,6 +10,14 @@ This program requires gunzip command.
10
10
 
11
11
  drbqs-server server.rb -h
12
12
 
13
+ or
14
+
15
+ drbqs-execute execute.rb -h
16
+
13
17
  ## Execute
14
18
 
15
19
  drbqs-server server.rb --execute-node <process_number>
20
+
21
+ or
22
+
23
+ bin/drbqs-execute example/mandelbrot/execute.rb
@@ -1,5 +1,9 @@
1
1
  DIR = File.dirname(__FILE__)
2
2
 
3
+ current_dir = File.expand_path(File.dirname(__FILE__))
4
+
5
+ usage :message => "Calculate Mandelbrot set", :server => File.join(current_dir, 'server.rb')
6
+
3
7
  server :local_server, "localhost" do |srv|
4
8
  srv.load File.join(DIR, 'server.rb')
5
9
  end
@@ -15,6 +15,7 @@ HELP
15
15
  argv = option_parser_base(argv, :log_level => true, :daemon => true, :debug => true) do
16
16
  set(:load, '-l', '--load FILE', String, 'Add a file to load.')
17
17
  set(:process, '-P', '--process NUM', Integer, 'Set the number of node processes to execute.')
18
+ set(:group, '--group STR', String, 'Set the group of node.')
18
19
  set(:loadavg, '--loadavg STR', String, 'Set the threshold load average to sleep.')
19
20
  set(:log_prefix, '--log-prefix STR', String, "Set the prefix of log files. The default is '#{setting.default[:log_prefix][0]}'.")
20
21
  set(:log_stdout, '--log-stdout', 'Use stdout for outputting logs. This option cancels --log-prefix.')
@@ -1,8 +1,6 @@
1
1
  module DRbQS
2
2
  class Execution
3
3
  class ExecuteNode
4
- attr_reader :pid
5
-
6
4
  def initialize(uri, log_prefix, log_level, node_opts = {})
7
5
  @uri = uri
8
6
  @log_level = log_level
@@ -13,7 +11,6 @@ module DRbQS
13
11
  else
14
12
  @fname = nil
15
13
  end
16
- @pid = []
17
14
  @node_opts = node_opts
18
15
  end
19
16
 
@@ -25,25 +22,11 @@ module DRbQS
25
22
  end
26
23
  private :get_log_file
27
24
 
28
- def create_process
29
- @pid << fork do
30
- opts = @node_opts.merge({ :log_level => @log_level, :log_file => get_log_file })
31
- node = DRbQS::Node.new(@uri, opts)
32
- node.connect
33
- node.calculate
34
- end
35
- end
36
- private :create_process
37
-
38
25
  def execute(process_num, interval = 0)
39
- process_num.times do |i|
40
- create_process
41
- sleep(interval) if interval > 0
42
- end
43
- end
44
-
45
- def wait
46
- Process.waitall
26
+ opts = @node_opts.merge({ :log_level => @log_level, :log_file => get_log_file, :process => process_num })
27
+ node = DRbQS::Node.new(@uri, opts)
28
+ node.connect
29
+ node.calculate
47
30
  end
48
31
  end
49
32
  end
@@ -28,8 +28,7 @@ module DRbQS
28
28
 
29
29
  def get_special_task(label)
30
30
  begin
31
- ary = @message.read([label, nil, Symbol, nil], 0)
32
- ary[1..-1]
31
+ @message.read([label, Array], 0)[1]
33
32
  rescue Rinda::RequestExpiredError
34
33
  nil
35
34
  end
@@ -4,6 +4,7 @@ require 'drbqs/utility/transfer/transfer_client'
4
4
  require 'drbqs/node/connection'
5
5
  require 'drbqs/node/task_client'
6
6
  require 'drbqs/node/state'
7
+ require 'drbqs/worker/worker'
7
8
 
8
9
  module DRbQS
9
10
 
@@ -13,25 +14,41 @@ module DRbQS
13
14
  PRIORITY_CALCULATE = 0
14
15
  OUTPUT_NOT_SEND_RESULT = 'not_send_result'
15
16
  DEFAULT_LOG_FILE = 'drbqs_client.log'
16
- INTERVAL_TIME_DEFAULT = 1
17
+ INTERVAL_TIME_DEFAULT = 0.1
18
+ SAME_HOST_GROUP = :local
17
19
 
18
- # :continue
19
- # :max_loadavg
20
- # :sleep_time
20
+ # @param [String] access_uri Set the uri of server
21
+ # @param [Hash] opts Options of a node
22
+ # @option opts [Fixnum] :process Number of worker processes
23
+ # @option opts [Array] :group An array of group symbols
24
+ # @option opts [Boolean] :continue If we set true then the node process does not exit
25
+ # @option opts [Fixnum] :sleep_time Time interval during sleep of the node
26
+ # @option opts [String] :max_loadavg Note that this option is experimental
21
27
  def initialize(access_uri, opts = {})
22
28
  @access_uri = access_uri
23
29
  @logger = DRbQS::Misc.create_logger(opts[:log_file] || DEFAULT_LOG_FILE, opts[:log_level])
24
30
  @connection = nil
25
31
  @task_client = nil
26
- @state = DRbQS::Node::State.new(:wait, :max_loadavg => opts[:max_loadavg], :sleep_time => opts[:sleep_time])
32
+ @worker_number = opts[:process] || 1
33
+ @state = DRbQS::Node::State.new(:wait, @worker_number, :max_loadavg => opts[:max_loadavg], :sleep_time => opts[:sleep_time])
27
34
  @process_continue = opts[:continue]
28
- @signal_queue = Queue.new
35
+ @group = opts[:group] || []
36
+ @signal_to_server_queue = Queue.new
29
37
  @config = DRbQS::Config.new
38
+ @special_task_number = 0
39
+ @worker = DRbQS::Worker::ProcessSet.new(DRbQS::Worker::ForkedProcess)
40
+ @worker.on_result do |proc_key, res|
41
+ task_id, h = res
42
+ queue_result(task_id, h)
43
+ end
44
+ @worker.on_error do |proc_key, res|
45
+ @signal_to_server_queue.push([:node_error, res])
46
+ end
30
47
  end
31
48
 
32
- def transfer_file
49
+ def transfer_file(files)
33
50
  begin
34
- DRbQS::Transfer::Client.transfer_to_server
51
+ DRbQS::Transfer::Client.transfer_to_server(files)
35
52
  rescue Exception => err
36
53
  @logger.error("Fail to transfer files.") do
37
54
  "#{err.to_s} (#{err.class})\n#{err.backtrace.join("\n")}"
@@ -41,33 +58,62 @@ module DRbQS
41
58
  end
42
59
  private :transfer_file
43
60
 
44
- def execute_task(marshal_obj, method_sym, args)
45
- result = DRbQS::Task.execute_task(marshal_obj, method_sym, args)
46
- transfer_file
47
- DRbQS::Temporary.delete
48
- result
61
+ def queue_result(task_id, result_hash)
62
+ if files = result_hash[:transfer]
63
+ transfer_file(files)
64
+ end
65
+ if subdir = result_hash[:tmp]
66
+ FileUtils.rm_r(result_hash[:tmp])
67
+ end
68
+ @task_client.queue_result(task_id, result_hash[:result])
49
69
  end
50
- private :execute_task
70
+ private :queue_result
51
71
 
52
72
  def node_data
53
73
  { :uri => @access_uri }
54
74
  end
55
75
  private :node_data
56
76
 
77
+ # @param [Array] task_ary An array from @connection.get_initialization or @connection.get_finalization.
78
+ def send_special_task_ary_to_all_workers(task_ary)
79
+ task_ary.each do |ary|
80
+ ary_to_send = [nil] + ary
81
+ @state.each_worker_id do |wid|
82
+ @worker.send_task(wid, ary_to_send)
83
+ end
84
+ end
85
+ end
86
+ private :send_special_task_ary_to_all_workers
87
+
88
+ # Connect to the server and finish initialization of the node.
57
89
  def connect
58
90
  obj = DRbObject.new_with_uri(@access_uri)
59
91
  @server_key = obj[:key]
60
- @connection = Node::Connection.new(obj[:message], @logger)
61
- @task_client = Node::TaskClient.new(@connection.node_number, obj[:queue], obj[:result], @logger)
92
+ @connection = DRbQS::Node::Connection.new(obj[:message], @logger)
93
+ set_node_group_for_task
94
+ @task_client = DRbQS::Node::TaskClient.new(@connection.node_number, obj[:queue], obj[:result],
95
+ @group, @logger)
62
96
  DRbQS::Transfer::Client.set(obj[:transfer].get_client(server_on_same_host?)) if obj[:transfer]
63
- if ary = @connection.get_initialization
64
- execute_task(*ary)
97
+ @state.each_worker_id do |wid|
98
+ @worker.create_process(wid)
99
+ end
100
+ if ary_initialization = @connection.get_initialization
101
+ send_special_task_ary_to_all_workers(ary_initialization)
65
102
  end
66
103
  @config.list.node.save(Process.pid, node_data)
67
104
  end
68
105
 
106
+ # This method must be executed after @connection is set.
107
+ def set_node_group_for_task
108
+ if server_on_same_host?
109
+ @group << DRbQS::Node::SAME_HOST_GROUP
110
+ end
111
+ end
112
+ private :set_node_group_for_task
113
+
69
114
  def server_on_same_host?
70
- @config.list.server.server_of_key_exist?(@access_uri, @server_key)
115
+ @server_on_same_host ||
116
+ (@server_on_same_host = @config.list.server.server_of_key_exist?(@access_uri, @server_key))
71
117
  end
72
118
 
73
119
  def dump_not_send_result_to_file
@@ -83,17 +129,10 @@ module DRbQS
83
129
  end
84
130
  private :output_error
85
131
 
86
- def process_exit
87
- dump_not_send_result_to_file
88
- unless @process_continue
89
- Kernel.exit
90
- end
91
- end
92
- private :process_exit
93
-
94
132
  def execute_finalization
95
- if ary = @connection.get_finalization
96
- execute_task(*ary)
133
+ @worker.waitall
134
+ if ary_finalization = @connection.get_finalization
135
+ send_special_task_ary_to_all_workers(ary_finalization)
97
136
  end
98
137
  rescue => err
99
138
  output_error(err, "On finalization")
@@ -102,80 +141,88 @@ module DRbQS
102
141
 
103
142
  def send_error(err, mes)
104
143
  output_error(err, mes)
105
- @connection.send_node_error("#{err.to_s}\n#{err.backtrace.join("\n")}")
144
+ begin
145
+ @connection.send_node_error("#{err.to_s}\n#{err.backtrace.join("\n")}")
146
+ rescue
147
+ end
106
148
  end
107
149
  private :send_error
108
150
 
109
151
  def get_new_task
110
- if @state.request?
111
- if @state.change_to_sleep_for_busy_system
112
- @logger.info("Sleep because system is busy.")
113
- elsif @task_client.add_new_task
114
- @state.change_to_calculate
115
- end
152
+ if @state.request? && (obtained_task_id = @task_client.add_new_task(@state.request_task_number))
153
+ return obtained_task_id
116
154
  end
155
+ nil
117
156
  end
118
157
  private :get_new_task
119
158
 
120
- def send_result
121
- flag_finilize_exit = @task_client.send_result
122
- if @state.calculate? && !@task_client.calculating_task
123
- @state.change_to_finish_calculating
159
+ def send_result_to_server
160
+ if sent_task_id = @task_client.send_result
161
+ @state.set_finish_of_task(sent_task_id)
124
162
  end
125
- flag_finilize_exit
126
163
  end
127
- private :send_result
164
+ private :send_result_to_server
128
165
 
166
+ # Send signals from @signal_to_server_queue,
167
+ # which stores errors of workers and signals to current process.
168
+ # @return [Boolean] If some error signal is sent then this method returns true. Otherwise, nil.
129
169
  def send_signal
130
- until @signal_queue.empty?
131
- signal, obj = @signal_queue.pop
170
+ flag_finalize_exit = nil
171
+ until @signal_to_server_queue.empty?
172
+ signal, obj = @signal_to_server_queue.pop
132
173
  case signal
133
174
  when :node_error
134
175
  send_error(obj, "Communicating with server")
135
- process_exit
176
+ dump_not_send_result_to_file
177
+ flag_finalize_exit = true
178
+ when :signal_kill
179
+ flag_finalize_exit = true
180
+ else
181
+ raise "Not implemented"
136
182
  end
137
183
  end
184
+ flag_finalize_exit
138
185
  end
139
186
  private :send_signal
140
187
 
141
- def process_signal
188
+ # If the method returns nil, the node finishes; it returns true while the node should keep running.
189
+ def process_signal_for_server
190
+ flag_finalize_exit = send_signal
142
191
  case @connection.respond_signal
143
192
  when :wake
144
- @state.change_to_wait
193
+ @state.wakeup_sleeping_worker
145
194
  when :sleep
146
195
  @state.change_to_sleep
147
196
  when :exit
148
- return :exit
197
+ return nil
149
198
  when :finalize
150
- return :finalize
199
+ flag_finalize_exit = true
151
200
  when :exit_after_task
152
- @task_client.set_exit_after_task
201
+ @state.set_exit_after_task
153
202
  @process_continue = nil
154
203
  end
155
- nil
156
- end
157
- private :process_signal
158
-
159
- def communicate_with_server
160
- get_new_task
161
- sig = process_signal
162
- return nil if sig == :exit
163
- flag_finilize_exit = send_result
164
- send_signal
165
- if sig == :finalize || flag_finilize_exit
204
+ if flag_finalize_exit
166
205
  execute_finalization
167
206
  return nil
168
207
  end
169
208
  @state.wakeup_automatically_for_unbusy_system
170
209
  true
171
210
  end
172
- private :communicate_with_server
211
+ private :process_signal_for_server
173
212
 
174
- def calculate_task
175
- marshal_obj, method_sym, args = @task_client.dequeue_task
176
- @task_client.queue_result(execute_task(marshal_obj, method_sym, args))
213
+ # Dequeue tasks from @task_client and send them to worker processes.
214
+ def send_task_to_worker
215
+ wids = @state.waiting_worker_id
216
+ wids.each do |wid|
217
+ if ary = @task_client.dequeue_task
218
+ @state.set_calculating_task(wid, ary[0])
219
+ @worker.send_task(wid, ary)
220
+ else
221
+ break
222
+ end
223
+ end
177
224
  end
178
- private :calculate_task
225
+ private :send_task_to_worker
179
226
 
180
227
  def clear_node_files
181
228
  DRbQS::Temporary.delete_all
@@ -188,52 +235,66 @@ module DRbQS
188
235
  end
189
236
  private :wait_interval_of_connection
190
237
 
191
- def thread_communicate
192
- Thread.new do
193
- begin
194
- loop do
195
- unless communicate_with_server
196
- clear_node_files
197
- break
198
- end
199
- wait_interval_of_connection
200
- end
201
- rescue => err
202
- send_error(err, "Calculating thread")
203
- ensure
204
- process_exit
205
- end
238
+ def set_signal_trap
239
+ Signal.trap(:TERM) do
240
+ @signal_to_server_queue.push([:signal_kill])
206
241
  end
207
242
  end
208
- private :thread_communicate
209
243
 
210
- def thread_calculate
211
- Thread.new do
212
- begin
213
- loop do
214
- calculate_task
215
- end
216
- rescue => err
217
- @signal_queue.push([:node_error, err])
218
- end
219
- end
244
+ MAX_WAIT_FINISH = 3
245
+ WAIT_INTERVAL = 0.1
246
+
247
+ def respond_worker_signal
248
+ @worker.respond_signal
220
249
  end
221
- private :thread_calculate
250
+ private :respond_worker_signal
222
251
 
223
- def set_signal_trap
224
- Signal.trap(:TERM) do
225
- process_exit
252
+ def wait_process_finish
253
+ @worker.prepare_to_exit
254
+ total_wait_time = 0.0
255
+ loop do
256
+ respond_worker_signal
257
+ if !@worker.has_process?
258
+ break
259
+ elsif total_wait_time > MAX_WAIT_FINISH
260
+ # Kill worker processes forcibly.
261
+ @worker.kill_all_processes
262
+ break
263
+ end
264
+ sleep(WAIT_INTERVAL)
265
+ total_wait_time += WAIT_INTERVAL
226
266
  end
267
+ send_result_to_server
227
268
  end
269
+ private :wait_process_finish
228
270
 
229
271
  def calculate(opts = {})
230
272
  set_signal_trap
231
- cn = thread_communicate
232
- exec = thread_calculate
233
- cn.priority = PRIORITY_RESPOND
234
- exec.priority = PRIORITY_CALCULATE
235
- cn.join
273
+ begin
274
+ loop do
275
+ send_result_to_server
276
+ unless process_signal_for_server
277
+ break
278
+ end
279
+ if @state.change_to_sleep_for_busy_system
280
+ @logger.info("Sleep because system is busy.")
281
+ end
282
+ if get_new_task
283
+ send_task_to_worker
284
+ elsif @state.all_workers_waiting? && !@process_continue
285
+ execute_finalization
286
+ break
287
+ end
288
+ unless respond_worker_signal
289
+ wait_interval_of_connection
290
+ end
291
+ end
292
+ rescue => err
293
+ send_error(err, "Node error occurs.")
294
+ @worker.kill_all_processes
295
+ end
296
+ wait_process_finish
297
+ clear_node_files
236
298
  end
237
299
  end
238
-
239
300
  end