drbqs 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data/docs/FormatExecute.md +44 -2
  2. data/example/group/execute.rb +19 -0
  3. data/example/group/server.rb +27 -0
  4. data/example/group/sum.rb +9 -0
  5. data/example/mandelbrot/README.md +8 -0
  6. data/example/mandelbrot/execute.rb +4 -0
  7. data/lib/drbqs/command_line/command_node.rb +1 -0
  8. data/lib/drbqs/execute/execute_node.rb +4 -21
  9. data/lib/drbqs/node/connection.rb +1 -2
  10. data/lib/drbqs/node/node.rb +163 -102
  11. data/lib/drbqs/node/state.rb +100 -35
  12. data/lib/drbqs/node/task_client.rb +46 -33
  13. data/lib/drbqs/server/message.rb +13 -7
  14. data/lib/drbqs/server/server.rb +57 -29
  15. data/lib/drbqs/server/server_hook.rb +19 -5
  16. data/lib/drbqs/server/test/node.rb +31 -6
  17. data/lib/drbqs/setting/node.rb +11 -2
  18. data/lib/drbqs/setting/server.rb +1 -1
  19. data/lib/drbqs/task/task.rb +26 -6
  20. data/lib/drbqs/task/task_generator.rb +2 -1
  21. data/lib/drbqs/utility/temporary.rb +27 -6
  22. data/lib/drbqs/utility/transfer/transfer_client.rb +10 -12
  23. data/lib/drbqs/version.rb +1 -1
  24. data/lib/drbqs/worker.rb +2 -0
  25. data/lib/drbqs/worker/forked_process.rb +100 -0
  26. data/lib/drbqs/worker/serialize.rb +66 -0
  27. data/lib/drbqs/worker/worker.rb +133 -0
  28. data/lib/drbqs/worker/worker_process_set.rb +219 -0
  29. data/spec/integration_test/01_basic_usage_spec.rb +3 -2
  30. data/spec/integration_test/06_node_exit_after_task_spec.rb +3 -2
  31. data/spec/integration_test/07_command_server_with_node_spec.rb +1 -0
  32. data/spec/integration_test/08_shutdown_unused_nodes_spec.rb +3 -2
  33. data/spec/integration_test/10_test_server_spec.rb +2 -2
  34. data/spec/integration_test/11_special_tasks_spec.rb +61 -0
  35. data/spec/integration_test/12_multiple_workers_spec.rb +43 -0
  36. data/spec/integration_test/definition/task_obj_definition.rb +33 -6
  37. data/spec/node/connection_spec.rb +6 -6
  38. data/spec/node/node_spec.rb +10 -2
  39. data/spec/node/state_spec.rb +146 -62
  40. data/spec/node/task_client_spec.rb +58 -53
  41. data/spec/server/message_spec.rb +10 -6
  42. data/spec/server/queue_spec.rb +7 -4
  43. data/spec/server/server_hook_spec.rb +28 -1
  44. data/spec/task/task_spec.rb +43 -6
  45. data/spec/utility/temporary_spec.rb +32 -9
  46. data/spec/worker/forked_process_spec.rb +66 -0
  47. data/spec/worker/serialize_spec.rb +73 -0
  48. data/spec/worker/worker_process_set_spec.rb +104 -0
  49. data/spec/worker/worker_spec.rb +127 -0
  50. metadata +34 -19
@@ -11,7 +11,9 @@ drbqs-execute execute a server of which uri is made from hostname and port.
11
11
  Moreover, drbqs-execute makes nodes connect to the URI of the server.
12
12
  The server and nodes can be over SSH.
13
13
 
14
- ## Example: execute.rb
14
+ ## Example
15
+
16
+ ### execute.rb
15
17
 
16
18
  #!/usr/bin/env drbqs-execute
17
19
  # -*-ruby-*-
@@ -55,7 +57,21 @@ The server and nodes can be over SSH.
55
57
  node :even, group: [:node02, :node04, :node06]
56
58
  node :odd, group: [:node01, :node03, :node05]
57
59
 
58
- ## Help message
60
+ ### Execution
61
+
62
+ In the above example, there is the shebang line
63
+
64
+ #!/usr/bin/env drbqs-execute
65
+
66
+ and therefore we can execute by
67
+
68
+ ./execute.rb
69
+
70
+ If there is no shebang line, we type
71
+
72
+ drbqs-execute execute.rb
73
+
74
+ ### Help message
59
75
 
60
76
  If we run the following command
61
77
 
@@ -63,6 +79,32 @@ If we run the following command
63
79
 
64
80
  then the help message of server.rb is displayed in addition to that of drbqs-execute.
65
81
 
82
+ ### Information of server and nodes
83
+
84
+ The command
85
+
86
+ drbqs-execute -i
87
+
88
+ shows information of server and nodes.
89
+ The output is
90
+
91
+ Server:
92
+ * server1 ssh
93
+ local local
94
+ Node:
95
+ - node_base ssh,template
96
+ node01 ssh
97
+ node02 ssh
98
+ node03 ssh
99
+ node04 ssh
100
+ node05 ssh
101
+ node06 ssh
102
+ - even group: node02,node04,node06
103
+ - odd group: node01,node03,node05
104
+ Port: 12345
105
+
106
+ The character "*" means default and "-" means virtual nodes (template or group).
107
+
66
108
  ## Methods
67
109
 
68
110
  The following methods are available.
@@ -0,0 +1,19 @@
1
+ default :server => :server_local, :port => 13789, :log => '/tmp/drbqs_execute'
2
+
3
+ current_dir = File.expand_path(File.dirname(__FILE__))
4
+
5
+ usage :message => "Calculate sum of numbers", :server => File.join(current_dir, 'server.rb')
6
+
7
+ server :server, "localhost" do |server|
8
+ server.load File.expand_path(File.join(File.dirname(__FILE__), 'server.rb'))
9
+ end
10
+
11
+ node :node_odd do |node|
12
+ node.load File.join(current_dir, 'sum.rb')
13
+ node.group :odd
14
+ end
15
+
16
+ node :node_even do |node|
17
+ node.load File.join(current_dir, 'sum.rb')
18
+ node.group :even
19
+ end
@@ -0,0 +1,27 @@
1
+ #
2
+ # Usage:
3
+ # drbqs-server server.rb -- 30 50
4
+ # drbqs-server server.rb -- 100 500 --step 100
5
+ #
6
+
7
+ require_relative 'sum.rb'
8
+
9
+ DRbQS.option_parser do |opt, hash|
10
+ opt.on('--step NUM', Integer) do |v|
11
+ hash[:step] = v
12
+ end
13
+ end
14
+
15
+ DRbQS.define_server do |server, argv, opts|
16
+ start_num = (argv[0] || 10).to_i
17
+ end_num = (argv[1] || 100).to_i
18
+ step_num = opts[:step] || 10
19
+
20
+ server.task_generator(:generate => 10) do |tgen|
21
+ start_num.step(end_num, step_num).with_index do |i, count|
22
+ create_add(Sum.new(i - 10, i), :exec, group: (count.even? ? :even : :odd)) do |srv, ret|
23
+ puts "Receive: #{ret.inspect}"
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,9 @@
1
+ class Sum
2
+ def initialize(start_num, end_num)
3
+ @num = [start_num, end_num]
4
+ end
5
+
6
+ def exec
7
+ (@num[0]..@num[1]).inject(0) { |sum, i| sum += i }
8
+ end
9
+ end
@@ -10,6 +10,14 @@ This program requires gunzip command.
10
10
 
11
11
  drbqs-server server.rb -h
12
12
 
13
+ or
14
+
15
+ drbqs-execute execute.rb -h
16
+
13
17
  ## Execute
14
18
 
15
19
  drbqs-server server.rb --execute-node <process_number>
20
+
21
+ or
22
+
23
+ bin/drbqs-execute example/mandelbrot/execute.rb
@@ -1,5 +1,9 @@
1
1
  DIR = File.dirname(__FILE__)
2
2
 
3
+ current_dir = File.expand_path(File.dirname(__FILE__))
4
+
5
+ usage :message => "Calculate Mandelbrot set", :server => File.join(current_dir, 'server.rb')
6
+
3
7
  server :local_server, "localhost" do |srv|
4
8
  srv.load File.join(DIR, 'server.rb')
5
9
  end
@@ -15,6 +15,7 @@ HELP
15
15
  argv = option_parser_base(argv, :log_level => true, :daemon => true, :debug => true) do
16
16
  set(:load, '-l', '--load FILE', String, 'Add a file to load.')
17
17
  set(:process, '-P', '--process NUM', Integer, 'Set the number of node processes to execute.')
18
+ set(:group, '--group STR', String, 'Set the group of node.')
18
19
  set(:loadavg, '--loadavg STR', String, 'Set the threshold load average to sleep.')
19
20
  set(:log_prefix, '--log-prefix STR', String, "Set the prefix of log files. The default is '#{setting.default[:log_prefix][0]}'.")
20
21
  set(:log_stdout, '--log-stdout', 'Use stdout for outputting logs. This option cancels --log-prefix.')
@@ -1,8 +1,6 @@
1
1
  module DRbQS
2
2
  class Execution
3
3
  class ExecuteNode
4
- attr_reader :pid
5
-
6
4
  def initialize(uri, log_prefix, log_level, node_opts = {})
7
5
  @uri = uri
8
6
  @log_level = log_level
@@ -13,7 +11,6 @@ module DRbQS
13
11
  else
14
12
  @fname = nil
15
13
  end
16
- @pid = []
17
14
  @node_opts = node_opts
18
15
  end
19
16
 
@@ -25,25 +22,11 @@ module DRbQS
25
22
  end
26
23
  private :get_log_file
27
24
 
28
- def create_process
29
- @pid << fork do
30
- opts = @node_opts.merge({ :log_level => @log_level, :log_file => get_log_file })
31
- node = DRbQS::Node.new(@uri, opts)
32
- node.connect
33
- node.calculate
34
- end
35
- end
36
- private :create_process
37
-
38
25
  def execute(process_num, interval = 0)
39
- process_num.times do |i|
40
- create_process
41
- sleep(interval) if interval > 0
42
- end
43
- end
44
-
45
- def wait
46
- Process.waitall
26
+ opts = @node_opts.merge({ :log_level => @log_level, :log_file => get_log_file, :process => process_num })
27
+ node = DRbQS::Node.new(@uri, opts)
28
+ node.connect
29
+ node.calculate
47
30
  end
48
31
  end
49
32
  end
@@ -28,8 +28,7 @@ module DRbQS
28
28
 
29
29
  def get_special_task(label)
30
30
  begin
31
- ary = @message.read([label, nil, Symbol, nil], 0)
32
- ary[1..-1]
31
+ @message.read([label, Array], 0)[1]
33
32
  rescue Rinda::RequestExpiredError
34
33
  nil
35
34
  end
@@ -4,6 +4,7 @@ require 'drbqs/utility/transfer/transfer_client'
4
4
  require 'drbqs/node/connection'
5
5
  require 'drbqs/node/task_client'
6
6
  require 'drbqs/node/state'
7
+ require 'drbqs/worker/worker'
7
8
 
8
9
  module DRbQS
9
10
 
@@ -13,25 +14,41 @@ module DRbQS
13
14
  PRIORITY_CALCULATE = 0
14
15
  OUTPUT_NOT_SEND_RESULT = 'not_send_result'
15
16
  DEFAULT_LOG_FILE = 'drbqs_client.log'
16
- INTERVAL_TIME_DEFAULT = 1
17
+ INTERVAL_TIME_DEFAULT = 0.1
18
+ SAME_HOST_GROUP = :local
17
19
 
18
- # :continue
19
- # :max_loadavg
20
- # :sleep_time
20
+ # @param [String] access_uri Set the uri of server
21
+ # @param [Hash] opts Options of a node
22
+ # @option opts [Fixnum] :process Number of worker processes
23
+ # @option opts [Array] :group An array of group symbols
24
+ # @option opts [Boolean] :continue If we set true then the node process does not exit
25
+ # @option opts [Fixnum] :sleep_time Time interval during sleep of the node
26
+ # @option opts [String] :max_loadavg Note that this option is experimental
21
27
  def initialize(access_uri, opts = {})
22
28
  @access_uri = access_uri
23
29
  @logger = DRbQS::Misc.create_logger(opts[:log_file] || DEFAULT_LOG_FILE, opts[:log_level])
24
30
  @connection = nil
25
31
  @task_client = nil
26
- @state = DRbQS::Node::State.new(:wait, :max_loadavg => opts[:max_loadavg], :sleep_time => opts[:sleep_time])
32
+ @worker_number = opts[:process] || 1
33
+ @state = DRbQS::Node::State.new(:wait, @worker_number, :max_loadavg => opts[:max_loadavg], :sleep_time => opts[:sleep_time])
27
34
  @process_continue = opts[:continue]
28
- @signal_queue = Queue.new
35
+ @group = opts[:group] || []
36
+ @signal_to_server_queue = Queue.new
29
37
  @config = DRbQS::Config.new
38
+ @special_task_number = 0
39
+ @worker = DRbQS::Worker::ProcessSet.new(DRbQS::Worker::ForkedProcess)
40
+ @worker.on_result do |proc_key, res|
41
+ task_id, h = res
42
+ queue_result(task_id, h)
43
+ end
44
+ @worker.on_error do |proc_key, res|
45
+ @signal_to_server_queue.push([:node_error, res])
46
+ end
30
47
  end
31
48
 
32
- def transfer_file
49
+ def transfer_file(files)
33
50
  begin
34
- DRbQS::Transfer::Client.transfer_to_server
51
+ DRbQS::Transfer::Client.transfer_to_server(files)
35
52
  rescue Exception => err
36
53
  @logger.error("Fail to transfer files.") do
37
54
  "#{err.to_s} (#{err.class})\n#{err.backtrace.join("\n")}"
@@ -41,33 +58,62 @@ module DRbQS
41
58
  end
42
59
  private :transfer_file
43
60
 
44
- def execute_task(marshal_obj, method_sym, args)
45
- result = DRbQS::Task.execute_task(marshal_obj, method_sym, args)
46
- transfer_file
47
- DRbQS::Temporary.delete
48
- result
61
+ def queue_result(task_id, result_hash)
62
+ if files = result_hash[:transfer]
63
+ transfer_file(files)
64
+ end
65
+ if subdir = result_hash[:tmp]
66
+ FileUtils.rm_r(result_hash[:tmp])
67
+ end
68
+ @task_client.queue_result(task_id, result_hash[:result])
49
69
  end
50
- private :execute_task
70
+ private :queue_result
51
71
 
52
72
  def node_data
53
73
  { :uri => @access_uri }
54
74
  end
55
75
  private :node_data
56
76
 
77
+ # @param [Array] task_ary An array from @connection.get_initialization or @connection.get_finalization.
78
+ def send_special_task_ary_to_all_workers(task_ary)
79
+ task_ary.each do |ary|
80
+ ary_to_send = [nil] + ary
81
+ @state.each_worker_id do |wid|
82
+ @worker.send_task(wid, ary_to_send)
83
+ end
84
+ end
85
+ end
86
+ private :send_special_task_ary_to_all_workers
87
+
88
+ # Connect to the server and finish initialization of the node.
57
89
  def connect
58
90
  obj = DRbObject.new_with_uri(@access_uri)
59
91
  @server_key = obj[:key]
60
- @connection = Node::Connection.new(obj[:message], @logger)
61
- @task_client = Node::TaskClient.new(@connection.node_number, obj[:queue], obj[:result], @logger)
92
+ @connection = DRbQS::Node::Connection.new(obj[:message], @logger)
93
+ set_node_group_for_task
94
+ @task_client = DRbQS::Node::TaskClient.new(@connection.node_number, obj[:queue], obj[:result],
95
+ @group, @logger)
62
96
  DRbQS::Transfer::Client.set(obj[:transfer].get_client(server_on_same_host?)) if obj[:transfer]
63
- if ary = @connection.get_initialization
64
- execute_task(*ary)
97
+ @state.each_worker_id do |wid|
98
+ @worker.create_process(wid)
99
+ end
100
+ if ary_initialization = @connection.get_initialization
101
+ send_special_task_ary_to_all_workers(ary_initialization)
65
102
  end
66
103
  @config.list.node.save(Process.pid, node_data)
67
104
  end
68
105
 
106
+ # This method must be executed after @connection is set.
107
+ def set_node_group_for_task
108
+ if server_on_same_host?
109
+ @group << DRbQS::Node::SAME_HOST_GROUP
110
+ end
111
+ end
112
+ private :set_node_group_for_task
113
+
69
114
  def server_on_same_host?
70
- @config.list.server.server_of_key_exist?(@access_uri, @server_key)
115
+ @server_on_same_host ||
116
+ (@server_on_same_host = @config.list.server.server_of_key_exist?(@access_uri, @server_key))
71
117
  end
72
118
 
73
119
  def dump_not_send_result_to_file
@@ -83,17 +129,10 @@ module DRbQS
83
129
  end
84
130
  private :output_error
85
131
 
86
- def process_exit
87
- dump_not_send_result_to_file
88
- unless @process_continue
89
- Kernel.exit
90
- end
91
- end
92
- private :process_exit
93
-
94
132
  def execute_finalization
95
- if ary = @connection.get_finalization
96
- execute_task(*ary)
133
+ @worker.waitall
134
+ if ary_finalization = @connection.get_finalization
135
+ send_special_task_ary_to_all_workers(ary_finalization)
97
136
  end
98
137
  rescue => err
99
138
  output_error(err, "On finalization")
@@ -102,80 +141,88 @@ module DRbQS
102
141
 
103
142
  def send_error(err, mes)
104
143
  output_error(err, mes)
105
- @connection.send_node_error("#{err.to_s}\n#{err.backtrace.join("\n")}")
144
+ begin
145
+ @connection.send_node_error("#{err.to_s}\n#{err.backtrace.join("\n")}")
146
+ rescue
147
+ end
106
148
  end
107
149
  private :send_error
108
150
 
109
151
  def get_new_task
110
- if @state.request?
111
- if @state.change_to_sleep_for_busy_system
112
- @logger.info("Sleep because system is busy.")
113
- elsif @task_client.add_new_task
114
- @state.change_to_calculate
115
- end
152
+ if @state.request? && (obtained_task_id = @task_client.add_new_task(@state.request_task_number))
153
+ return obtained_task_id
116
154
  end
155
+ nil
117
156
  end
118
157
  private :get_new_task
119
158
 
120
- def send_result
121
- flag_finilize_exit = @task_client.send_result
122
- if @state.calculate? && !@task_client.calculating_task
123
- @state.change_to_finish_calculating
159
+ def send_result_to_server
160
+ if sent_task_id = @task_client.send_result
161
+ @state.set_finish_of_task(sent_task_id)
124
162
  end
125
- flag_finilize_exit
126
163
  end
127
- private :send_result
164
+ private :send_result_to_server
128
165
 
166
+ # Send signals from @signal_to_server_queue,
167
+ # which stores errors of workers and signals to current process.
168
+ # @return [Boolean, nil] true if a :node_error or :signal_kill signal was processed. Otherwise, nil.
129
169
  def send_signal
130
- until @signal_queue.empty?
131
- signal, obj = @signal_queue.pop
170
+ flag_finalize_exit = nil
171
+ until @signal_to_server_queue.empty?
172
+ signal, obj = @signal_to_server_queue.pop
132
173
  case signal
133
174
  when :node_error
134
175
  send_error(obj, "Communicating with server")
135
- process_exit
176
+ dump_not_send_result_to_file
177
+ flag_finalize_exit = true
178
+ when :signal_kill
179
+ flag_finalize_exit = true
180
+ else
181
+ raise "Not implemented"
136
182
  end
137
183
  end
184
+ flag_finalize_exit
138
185
  end
139
186
  private :send_signal
140
187
 
141
- def process_signal
188
+ # If the method returns true, the node keeps running; if it returns nil, the node finishes.
189
+ def process_signal_for_server
190
+ flag_finalize_exit = send_signal
142
191
  case @connection.respond_signal
143
192
  when :wake
144
- @state.change_to_wait
193
+ @state.wakeup_sleeping_worker
145
194
  when :sleep
146
195
  @state.change_to_sleep
147
196
  when :exit
148
- return :exit
197
+ return nil
149
198
  when :finalize
150
- return :finalize
199
+ flag_finalize_exit = true
151
200
  when :exit_after_task
152
- @task_client.set_exit_after_task
201
+ @state.set_exit_after_task
153
202
  @process_continue = nil
154
203
  end
155
- nil
156
- end
157
- private :process_signal
158
-
159
- def communicate_with_server
160
- get_new_task
161
- sig = process_signal
162
- return nil if sig == :exit
163
- flag_finilize_exit = send_result
164
- send_signal
165
- if sig == :finalize || flag_finilize_exit
204
+ if flag_finalize_exit
166
205
  execute_finalization
167
206
  return nil
168
207
  end
169
208
  @state.wakeup_automatically_for_unbusy_system
170
209
  true
171
210
  end
172
- private :communicate_with_server
211
+ private :process_signal_for_server
173
212
 
174
- def calculate_task
175
- marshal_obj, method_sym, args = @task_client.dequeue_task
176
- @task_client.queue_result(execute_task(marshal_obj, method_sym, args))
213
+ # Dequeue tasks from @task_client and send them to worker processes.
214
+ def send_task_to_worker
215
+ wids = @state.waiting_worker_id
216
+ wids.each do |wid|
217
+ if ary = @task_client.dequeue_task
218
+ @state.set_calculating_task(wid, ary[0])
219
+ @worker.send_task(wid, ary)
220
+ else
221
+ break
222
+ end
223
+ end
177
224
  end
178
- private :calculate_task
225
+ private :send_task_to_worker
179
226
 
180
227
  def clear_node_files
181
228
  DRbQS::Temporary.delete_all
@@ -188,52 +235,66 @@ module DRbQS
188
235
  end
189
236
  private :wait_interval_of_connection
190
237
 
191
- def thread_communicate
192
- Thread.new do
193
- begin
194
- loop do
195
- unless communicate_with_server
196
- clear_node_files
197
- break
198
- end
199
- wait_interval_of_connection
200
- end
201
- rescue => err
202
- send_error(err, "Calculating thread")
203
- ensure
204
- process_exit
205
- end
238
+ def set_signal_trap
239
+ Signal.trap(:TERM) do
240
+ @signal_to_server_queue.push([:signal_kill])
206
241
  end
207
242
  end
208
- private :thread_communicate
209
243
 
210
- def thread_calculate
211
- Thread.new do
212
- begin
213
- loop do
214
- calculate_task
215
- end
216
- rescue => err
217
- @signal_queue.push([:node_error, err])
218
- end
219
- end
244
+ MAX_WAIT_FINISH = 3
245
+ WAIT_INTERVAL = 0.1
246
+
247
+ def respond_worker_signal
248
+ @worker.respond_signal
220
249
  end
221
- private :thread_calculate
250
+ private :respond_worker_signal
222
251
 
223
- def set_signal_trap
224
- Signal.trap(:TERM) do
225
- process_exit
252
+ def wait_process_finish
253
+ @worker.prepare_to_exit
254
+ total_wait_time = 0.0
255
+ loop do
256
+ respond_worker_signal
257
+ if !@worker.has_process?
258
+ break
259
+ elsif total_wait_time > MAX_WAIT_FINISH
260
+ # Kill worker processes forcibly.
261
+ @worker.kill_all_processes
262
+ break
263
+ end
264
+ sleep(WAIT_INTERVAL)
265
+ total_wait_time += WAIT_INTERVAL
226
266
  end
267
+ send_result_to_server
227
268
  end
269
+ private :wait_process_finish
228
270
 
229
271
  def calculate(opts = {})
230
272
  set_signal_trap
231
- cn = thread_communicate
232
- exec = thread_calculate
233
- cn.priority = PRIORITY_RESPOND
234
- exec.priority = PRIORITY_CALCULATE
235
- cn.join
273
+ begin
274
+ loop do
275
+ send_result_to_server
276
+ unless process_signal_for_server
277
+ break
278
+ end
279
+ if @state.change_to_sleep_for_busy_system
280
+ @logger.info("Sleep because system is busy.")
281
+ end
282
+ if get_new_task
283
+ send_task_to_worker
284
+ elsif @state.all_workers_waiting? && !@process_continue
285
+ execute_finalization
286
+ break
287
+ end
288
+ unless respond_worker_signal
289
+ wait_interval_of_connection
290
+ end
291
+ end
292
+ rescue => err
293
+ send_error(err, "Node error occurs.")
294
+ @worker.kill_all_processes
295
+ end
296
+ wait_process_finish
297
+ clear_node_files
236
298
  end
237
299
  end
238
-
239
300
  end