scbi_mapreduce 0.0.29

Files changed (46)
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
data/lib/scbi_mapreduce.rb
@@ -0,0 +1,15 @@
+ $:.unshift(File.dirname(__FILE__)) unless
+   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+
+ # $: << File.join(File.dirname(__FILE__),File.basename(__FILE__,File.extname(__FILE__)))
+
+ module ScbiMapreduce
+   VERSION = '0.0.29'
+ end
+
+ require 'scbi_mapreduce/manager'
+ require 'scbi_mapreduce/worker_launcher'
+ require 'scbi_mapreduce/worker'
+ require 'scbi_mapreduce/work_manager'
+ require 'scbi_mapreduce/error_handler'
+ require 'scbi_mapreduce/zlib_serializer'
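The entry point only adds the lib directory to the load path, defines the gem version and requires the remaining components, so a minimal check after installing the gem is simply:

    require 'scbi_mapreduce'

    puts ScbiMapreduce::VERSION   # => "0.0.29"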
data/lib/scbi_mapreduce/error_handler.rb
@@ -0,0 +1,15 @@
+ module ScbiMapreduce
+
+   class WorkerError < Exception
+
+     attr_reader :worker_id, :original_exception, :object
+
+     def initialize(message, original_exception, worker_id, object)
+       @message = message
+       @worker_id = worker_id
+       @original_exception = original_exception
+       @object = object
+     end
+
+   end
+ end
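The manager's receive_object (see work_manager.rb below) treats any incoming Exception as a failed job and reads worker_id, original_exception and object from it. The worker side is not part of this diff, but wrapping a failure would look roughly like this sketch (process and the sending step are placeholders):

    begin
      result = process(data)   # hypothetical user calculation
    rescue Exception => e
      error = ScbiMapreduce::WorkerError.new('job failed', e, worker_id, data)
      # the worker then returns error to the manager instead of a result
    end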
data/lib/scbi_mapreduce/main_worker.rb
@@ -0,0 +1,50 @@
+ #!/usr/bin/env ruby
+
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_mapreduce/lib'
+
+ require 'scbi_mapreduce'
+
+ class String
+   def camelize
+     self.split(/[^a-z0-9]/i).map{|w| w.capitalize}.join
+   end
+
+   def decamelize
+     self.to_s.
+       gsub(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2').
+       gsub(/([a-z]+)([A-Z\d])/, '\1_\2').
+       gsub(/([A-Z]{2,})(\d+)/i, '\1_\2').
+       gsub(/(\d+)([a-z])/i, '\1_\2').
+       gsub(/(.+?)\&(.+?)/, '\1_&_\2').
+       gsub(/\s/, '_').downcase
+   end
+ end
+
+ #================= MAIN
+
+ if ARGV.size != 4
+   puts "Usage #{$0} worker_id server_ip server_port custom_worker_class"
+   puts "Eg.: #{$0} 1 localhost 50000 MyWorker"
+   exit
+ end
+
+ worker_id = ARGV[0]
+ ip = ARGV[1]
+ port = ARGV[2].to_i
+ custom_worker_file = ARGV[3]
+
+ #$: << File.expand_path(File.dirname(custom_worker_file))
+
+ require custom_worker_file
+
+ klass_name = File.basename(custom_worker_file,File.extname(custom_worker_file)).camelize
+
+ worker_class = Object.const_get(klass_name)
+
+ worker_class.start_worker(worker_id,ip,port)
+
+ puts "FINISH WORKER"
+
+
+
+ # ============
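This launcher derives the worker class from the worker file name, so a file named my_worker.rb (as in the skeleton examples) must define a class MyWorker. A quick illustration of the mapping and of a matching invocation (file names assumed):

    'my_worker'.camelize            # => "MyWorker"
    'remove_mids_worker'.camelize   # => "RemoveMidsWorker"

    # main_worker.rb worker_id server_ip server_port custom_worker_file
    ruby main_worker.rb 1 localhost 50000 my_worker.rb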
data/lib/scbi_mapreduce/manager.rb
@@ -0,0 +1,110 @@
+ require 'eventmachine'
+ require 'socket'
+ # require 'worker_launcher'
+ require 'logger'
+ require 'fileutils'
+
+ #
+ #= Manager class
+ #
+ # The manager side of scbi_mapreduce
+ #
+
+ module ScbiMapreduce
+
+
+
+
+   class Manager
+
+     attr_accessor :checkpointing, :keep_order, :retry_failed_jobs, :exit_on_many_errors, :chunk_size
+
+     # initialize Manager
+     def initialize(server_ip, port, workers, work_manager_class, custom_worker_file, log_file=nil, init_env_file=nil)
+       @port=port
+
+
+       ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
+
+       ip=ip_list.select{|one_ip| one_ip.index(server_ip)==0}.first
+
+       if !ip
+         ip='0.0.0.0'
+       end
+
+       @ip = ip
+
+       port = 0
+
+
+       @checkpointing=false
+       @keep_order=false
+       @retry_failed_jobs=false
+
+       @chunk_size=1
+
+
+       @worker_names=[]
+       if workers.is_a?(Integer)
+         @workers=workers
+       else
+         # puts "find worker_names"
+         host_name=`hostname`.chomp
+         @workers=workers.count(host_name)
+
+         @worker_names=workers
+         @worker_names.delete(host_name)
+         # puts @workers
+       end
+
+       @work_manager_class = work_manager_class
+       @worker_launcher = WorkerLauncher.new(@ip,port,@workers,custom_worker_file,log_file,init_env_file)
+
+
+       if log_file.nil?
+         log_file = File.join('logs','server_log.txt')
+       end
+
+       FileUtils.mkdir_p(File.dirname(log_file)) if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
+       $SERVER_LOG = Logger.new(log_file)
+
+
+       $SERVER_LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+
+     end
+
+
+     # Start an EventMachine loop acting as a server for incoming worker connections
+     def start_server
+
+       # set a custom error handler, otherwise errors are silently ignored when they occur inside a callback
+       EM.error_handler{ |e|
+         $SERVER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
+       }
+
+       # start EM loop
+       EventMachine::run {
+
+         @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_failed_jobs, @exit_on_many_errors, @chunk_size)
+
+         evm=EventMachine::start_server @ip, @port, @work_manager_class
+         dir=Socket.unpack_sockaddr_in( EM.get_sockname( evm ))
+
+         @port = dir[0].to_i
+         @ip=dir[1].to_s
+
+         $SERVER_LOG.info 'Server running at : ['+@ip.to_s+':'+@port.to_s+']'
+         @worker_launcher.server_port=@port
+         @worker_launcher.launch_workers
+         @worker_launcher.launch_external_workers(@worker_names)
+
+       }
+     rescue Exception => e
+       $SERVER_LOG.error("Exiting server due to exception:\n" + e.message+"\n"+e.backtrace.join("\n"))
+       @work_manager_class.end_work_manager
+     end
+
+
+   end
+
+ end
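Based on the constructor signature and accessors above, a driver script (along the lines of the skeleton main.rb files; the class and file names below are assumed) would be roughly:

    require 'scbi_mapreduce'
    require_relative 'my_worker_manager'

    # serve on any local IPv4 interface, port 50000, launching 4 local workers
    mgr = ScbiMapreduce::Manager.new('0.0.0.0', 50000, 4, MyWorkerManager, 'my_worker.rb', 'logs/server_log.txt')
    mgr.chunk_size    = 100    # objects packed into each job sent to a worker
    mgr.checkpointing = true   # resume from scbi_mapreduce_checkpoint if one exists
    mgr.start_server           # blocks inside the EventMachine loop until all workers finish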
data/lib/scbi_mapreduce/work_manager.rb
@@ -0,0 +1,405 @@
+ # = WorkManager
+ #
+ # One instance of this class is created automatically by EM to attend to each worker.
+ #
+ # This class handles server <-> worker communications. It waits for worker connections, sends them the initial configuration parameters,
+ # and then sends a new job each time a worker requests one, until no more work is available.
+ #
+ # Reliability could be increased by using a hash in @@running_jobs to track the object_id of each running job. That approach would be slower than the current one.
+
+ # require 'error_handler'
+
+ # TODO - Data preload (queue?) instead of on-demand loading
+ # DONE - Add serializer with marshal + zlib deflate/inflate
+
+ module ScbiMapreduce
+
+
+   PENDING_TO_SAVE=100
+
+
+   class WorkManagerData
+
+     @@job_id=1
+
+     attr_reader :job_identifier
+     attr_accessor :status, :data
+
+     def initialize(job)
+
+       @job_identifier=@@job_id
+       @@job_id+=1
+       @data=job
+       @status=:running
+     end
+
+     def inspect
+       return "WorkManagerData: #{@job_identifier} => #{@status}"
+     end
+
+     def self.job_id=(c)
+       # puts "Setting job_id to #{c}"
+       @@job_id=c
+     end
+
+     def self.job_id
+       # puts "Setting job_id to #{c}"
+       @@job_id
+     end
+
+   end
+
+   #require 'json'
+   class WorkManager < EventMachine::Connection
+
+     include EM::P::ObjectProtocol
+
+     def self.init_work_manager
+
+     end
+
+     def self.end_work_manager
+
+     end
+
+     def next_work
+
+     end
+
+     def work_received(obj)
+
+     end
+
+     def worker_initial_config
+
+     end
+
+     def error_received(worker_error, obj)
+
+     end
+
+     def too_many_errors_received
+
+     end
+
+     def read_until_checkpoint(checkpoint)
+
+     end
+
+     # If this method returns -1, automatic checkpoint restoration is performed.
+     # Return 0 for no checkpointing.
+     # Return the restored checkpoint number to start at that point.
+     def load_user_checkpoint(checkpoint)
+       return -1
+     end
+
+     def save_user_checkpoint
+     end
+
+     def trash_checkpointed_work
+
+     end
+
+     ############
+
+     def self.init_work_manager_internals(checkpointing, keep_order, retry_failed_jobs, exit_on_many_errors, chunk_size)
+       @@count = 0
+       @@chunk_count = 0
+       @@workers = 0
+       @@error_count = 0
+       @@running_jobs=[]
+       # @@compress=true
+
+       @@checkpointing=checkpointing
+       @@keep_order=keep_order
+       @@retry_failed_jobs=retry_failed_jobs
+       @@exit_on_many_errors=exit_on_many_errors
+
+       # TODO - Implement a dynamic chunk_size
+
+       @@chunk_size=chunk_size
+       $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
+
+       @@checkpoint=0
+       if @@checkpointing
+         @@checkpoint=self.get_checkpoint
+         $SERVER_LOG.info "Detected checkpoint at #{@@checkpoint}"
+       end
+
+     end
+
+     def self.checkpoint
+       return @@checkpoint
+     end
+
+     def save_checkpoint
+       checkpoint_file = File.open('scbi_mapreduce_checkpoint','w')
+
+       if !@@running_jobs.empty?
+         checkpoint_file.puts @@running_jobs.first.job_identifier
+       else
+         checkpoint_file.puts WorkManagerData.job_id-1
+       end
+
+       checkpoint_file.close
+
+       save_user_checkpoint
+
+     end
+
+     def self.get_checkpoint
+       res = 0
+       begin
+         if File.exists?('scbi_mapreduce_checkpoint')
+           res=File.read('scbi_mapreduce_checkpoint').chomp
+           # puts "read checkpoint #{res}"
+
+           res = res.to_i
+         end
+       rescue
+         res = 0
+       end
+
+       return res
+     end
+
+     def send_initial_config
+       config = worker_initial_config
+
+       if config.nil?
+         obj = :no_initial_config
+       else
+         obj = {:initial_config => config}
+       end
+
+       send_object(obj)
+     end
+
+     # send next work to the worker
+     def send_next_work
+
+       objs=[]
+
+       @@chunk_size.times do
+         obj=next_work
+         if obj.nil?
+           break
+         else
+           # add to obj array
+           objs << obj
+         end
+       end
+
+
+       if objs.count>0
+         @@count += objs.count
+         @@chunk_count += 1
+
+         work_data=WorkManagerData.new(objs)
+
+         send_object(work_data)
+
+         # to keep order or retry failed jobs, we need the job status
+         if @@keep_order || @@retry_failed_jobs
+           @@running_jobs.push work_data
+         end
+       else
+
+         send_object(:quit)
+       end
+
+
+     end
+
+     def goto_checkpoint
+       if @@checkpoint>0
+         $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
+
+         checkpoint=load_user_checkpoint(@@checkpoint)
+
+         # do an automatic checkpoint restore
+         if checkpoint==-1
+           @@checkpoint.times do |i|
+             # puts "Skipping #{i+1}"
+
+             # get next work
+             trash_checkpointed_work
+             # if obj
+             #   if obj.methods.include?(:count)
+             #     @@count += obj.count
+             #   else
+             #     @@count += 1
+             #   end
+             # end
+           end
+
+           $SERVER_LOG.info "Automatic checkpoint finished"
+
+           WorkManagerData.job_id=@@checkpoint
+
+         # user has done the checkpoint restoration
+         elsif checkpoint>0
+
+           WorkManagerData.job_id=checkpoint
+         elsif checkpoint==0
+           $SERVER_LOG.info "Automatic checkpoint not done"
+         end
+
+
+         @@checkpoint=0
+
+       end
+
+     end
+
+     def post_init
+       @@workers += 1
+
+       # when the first worker connects, do special config
+       if @@workers == 1
+         @@total_seconds = Time.now
+         $SERVER_LOG.info "First worker connected"
+
+         if @@checkpointing
+           $SERVER_LOG.info "Checking for checkpoint"
+           goto_checkpoint
+         end
+       end
+
+       $SERVER_LOG.info "#{@@workers} workers connected"
+       send_initial_config
+       send_next_work
+     end
+
+
+     def receive_object(obj)
+
+       # check if the response is an error
+       if obj.is_a?(Exception)
+         $SERVER_LOG.error("Error in worker #{obj.worker_id} while processing object #{obj.object.inspect}\n" + obj.original_exception.message + ":\n" + obj.original_exception.backtrace.join("\n"))
+
+         @@error_count += 1
+
+         error_received(obj,obj.object.data)
+
+         # if there are too many errors
+         if (@@count>100) && (@@error_count >= @@count*0.8)
+           @@exit = @@exit_on_many_errors
+
+           # notify the programmer
+           res=too_many_errors_received
+
+           # force exit if too_many_errors_received returns true
+           if res==true
+             @@exit=res
+           end
+         end
+
+       else
+         # track the job when checkpointing, ordering or retries are enabled; otherwise deliver it directly
+
+         if @@checkpointing || @@keep_order || @@retry_failed_jobs
+           checkpointable_job_received(obj)
+         else
+           work_received(obj.data)
+         end
+       end
+
+
+       send_next_work
+
+     end
+
+
+     def checkpointable_job_received(obj)
+       received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}
+
+       # save job
+       if received_job
+
+         # change job's status to received
+         received_job.data=obj.data
+         received_job.status=:received
+
+         # if there are sufficient jobs, count pending ones
+         if (@@running_jobs.count>=PENDING_TO_SAVE)
+           # count received objects pending to be written
+           pending=0
+
+           @@running_jobs.each do |job|
+             if job.status==:received
+               pending += 1
+             else
+               break
+             end
+           end
+
+
+           if (pending>PENDING_TO_SAVE) || (pending==@@running_jobs.count)
+             # purge contiguous saved data
+             to_remove = 0
+
+             @@running_jobs.each_with_index do |job,i|
+               if job.status==:received
+                 # puts "Sent to save: #{job.inspect}"
+                 work_received(job.data)
+                 job.status=:saved
+                 to_remove += 1
+               else
+                 break
+               end
+             end
+
+             # if some objects were saved
+             if to_remove > 0
+               to_remove.times do |i|
+                 o=@@running_jobs.shift
+                 # puts "Job removed #{o.inspect}"
+               end
+
+               save_checkpoint
+             end
+           end
+         end
+       else
+         $SERVER_LOG.info "Job already processed #{obj.inspect}"
+       end
+     end
+
+     def initialize(*args)
+       super
+       # puts "WORK MANAGER INITIALIZE, one instance per worker"
+     end
+
+     # A worker has disconnected
+     def unbind
+
+       @@workers -= 1
+       # puts @@running_jobs.to_json
+
+       $SERVER_LOG.info "Worker disconnected. #{@@workers} kept running"
+
+       # no more workers left, shut down EM and stop the server
+       if @@workers == 0
+         $SERVER_LOG.info "All workers finished"
+         EM.stop
+         $SERVER_LOG.info "Exiting server"
+
+
+
+         self.class.end_work_manager
+
+         @@total_seconds = Time.now-@@total_seconds
+         $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
+         $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
+         $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
+
+         $SERVER_LOG.info "Number of errors: #{@@error_count}"
+         $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
+
+
+       end
+     end
+
+   end
+ end
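The empty methods near the top of WorkManager (init_work_manager, end_work_manager, next_work, work_received, worker_initial_config) are the hooks a user overrides. A minimal sketch of such a subclass, in the spirit of the skeleton/simple example (the input/output file names here are assumed):

    class MyWorkerManager < ScbiMapreduce::WorkManager

      # open data files once, before any worker connects
      def self.init_work_manager
        @@input  = File.open('input.txt', 'r')
        @@output = File.open('output.txt', 'w')
      end

      # close them when the last worker disconnects
      def self.end_work_manager
        @@input.close
        @@output.close
      end

      # called up to chunk_size times per job; returning nil means no more
      # work is left, after which workers receive :quit
      def next_work
        @@input.gets
      end

      # called with the array of processed objects returned by a worker
      def work_received(objs)
        objs.each { |obj| @@output.puts obj }
      end

      # nothing extra to send to workers on connection
      def worker_initial_config
        nil
      end
    end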