scbi_mapreduce 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
data/lib/scbi_mapreduce.rb
@@ -0,0 +1,15 @@
+ $:.unshift(File.dirname(__FILE__)) unless
+   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+
+ # $: << File.join(File.dirname(__FILE__),File.basename(__FILE__,File.extname(__FILE__)))
+
+ module ScbiMapreduce
+   VERSION = '0.0.29'
+ end
+
+ require 'scbi_mapreduce/manager'
+ require 'scbi_mapreduce/worker_launcher'
+ require 'scbi_mapreduce/worker'
+ require 'scbi_mapreduce/work_manager'
+ require 'scbi_mapreduce/error_handler'
+ require 'scbi_mapreduce/zlib_serializer'
data/lib/scbi_mapreduce/error_handler.rb
@@ -0,0 +1,15 @@
+ module ScbiMapreduce
+
+   class WorkerError < Exception
+
+     attr_reader :worker_id, :original_exception, :object
+
+     def initialize(message, original_exception, worker_id, object)
+       @message = message
+       @worker_id = worker_id
+       @original_exception = original_exception
+       @object = object
+     end
+
+   end
+ end
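
A hedged usage sketch (not part of this diff) of how a worker-side rescue block might wrap a failure with this class so the manager can log the worker id, the original exception and the offending object; `process`, `objs` and the surrounding connection are placeholders, and `send_object` assumes an EventMachine ObjectProtocol connection like the one the manager uses:

  # Hypothetical worker-side error wrapping (illustrative only)
  begin
    results = process(objs)                 # placeholder processing step
  rescue Exception => e
    error = ScbiMapreduce::WorkerError.new('Job failed', e, worker_id, objs)
    send_object(error)                      # manager's receive_object checks obj.is_a?(Exception)
  end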
data/lib/scbi_mapreduce/main_worker.rb
@@ -0,0 +1,50 @@
+ #!/usr/bin/env ruby
+
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_mapreduce/lib'
+
+ require 'scbi_mapreduce'
+
+ class String
+   def camelize
+     self.split(/[^a-z0-9]/i).map{|w| w.capitalize}.join
+   end
+
+   def decamelize
+     self.to_s.
+       gsub(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2').
+       gsub(/([a-z]+)([A-Z\d])/, '\1_\2').
+       gsub(/([A-Z]{2,})(\d+)/i, '\1_\2').
+       gsub(/(\d+)([a-z])/i, '\1_\2').
+       gsub(/(.+?)\&(.+?)/, '\1_&_\2').
+       gsub(/\s/, '_').downcase
+   end
+ end
+
+ #================= MAIN
+
+ if ARGV.size != 4
+   puts "Usage #{$0} worker_id server_ip server_port custom_worker_class"
+   puts "Eg.: #{$0} 1 localhost 50000 MyWorker"
+   exit
+ end
+
+ worker_id = ARGV[0]
+ ip = ARGV[1]
+ port = ARGV[2].to_i
+ custom_worker_file = ARGV[3]
+
+ #$: << File.expand_path(File.dirname(custom_worker_file))
+
+ require custom_worker_file
+
+ klass_name = File.basename(custom_worker_file,File.extname(custom_worker_file)).camelize
+
+ worker_class = Object.const_get(klass_name)
+
+ worker_class.start_worker(worker_id,ip,port)
+
+ puts "FINISH WORKER"
+
+
+
+ # ============
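
For illustration, the resolution step above maps the worker file name to a class constant via the `camelize` monkey patch and `Object.const_get`, so the file and class names must line up. With a skeleton worker file such as `my_worker.rb` (shipped under data/skeleton/ in this release), the lookup behaves roughly like this:

  # Hypothetical session showing the camelize + const_get resolution used above
  'my_worker'.camelize           # => "MyWorker"
  Object.const_get('MyWorker')   # => MyWorker, or NameError if the class is not defined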
data/lib/scbi_mapreduce/manager.rb
@@ -0,0 +1,110 @@
+ require 'eventmachine'
+ require 'socket'
+ # require 'worker_launcher'
+ require 'logger'
+ require 'fileutils'
+
+ #
+ #= Manager class
+ #
+ # The manager side of scbi_mapreduce
+ #
+
+ module ScbiMapreduce
+
+
+
+
+   class Manager
+
+     attr_accessor :checkpointing, :keep_order, :retry_failed_jobs, :exit_on_many_errors, :chunk_size
+
+     # initialize Manager
+     def initialize(server_ip, port, workers, work_manager_class,custom_worker_file,log_file=nil, init_env_file=nil)
+       @port=port
+
+
+       ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
+
+       ip=ip_list.select{|one_ip| one_ip.index(server_ip)==0}.first
+
+       if !ip
+         ip='0.0.0.0'
+       end
+
+       @ip = ip
+
+       port = 0
+
+
+       @checkpointing=false
+       @keep_order=false
+       @retry_failed_jobs=false
+
+       @chunk_size=1
+
+
+       @worker_names=[]
+       if workers.is_a?(Integer)
+         @workers=workers
+       else
+         # puts "find worker_names"
+         host_name=`hostname`.chomp
+         @workers=workers.count(host_name)
+
+         @worker_names=workers
+         @worker_names.delete(host_name)
+         # puts @workers
+       end
+
+       @work_manager_class = work_manager_class
+       @worker_launcher = WorkerLauncher.new(@ip,port,@workers,custom_worker_file,log_file,init_env_file)
+
+
+       if log_file.nil?
+         log_file = File.join('logs','server_log.txt')
+       end
+
+       FileUtils.mkdir_p(File.dirname(log_file)) if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
+       $SERVER_LOG = Logger.new(log_file)
+
+
+       $SERVER_LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+
+     end
+
+
+     # Start an EventMachine loop acting as a server for incoming worker connections
+     def start_server
+
+       # set a custom error handler, otherwise errors are silently ignored when they occur inside a callback.
+       EM.error_handler{ |e|
+         $SERVER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
+       }
+
+       # start EM loop
+       EventMachine::run {
+
+         @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_failed_jobs,@exit_on_many_errors,@chunk_size)
+
+         evm=EventMachine::start_server @ip, @port, @work_manager_class
+         dir=Socket.unpack_sockaddr_in( EM.get_sockname( evm ))
+
+         @port = dir[0].to_i
+         @ip=dir[1].to_s
+
+         $SERVER_LOG.info 'Server running at : ['+@ip.to_s+':'+@port.to_s+']'
+         @worker_launcher.server_port=@port
+         @worker_launcher.launch_workers
+         @worker_launcher.launch_external_workers(@worker_names)
+
+       }
+     rescue Exception => e
+       $SERVER_LOG.error("Exiting server due to exception:\n" + e.message+"\n"+e.backtrace.join("\n"))
+       @work_manager_class.end_work_manager
+     end
+
+
+   end
+
+ end
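
A minimal usage sketch of the constructor and accessors above, assuming a MyWorkerManager class and a my_worker.rb worker file like the ones in the skeleton directories of this release; the host, port and values are illustrative, not taken from this diff:

  require 'scbi_mapreduce'
  require_relative 'my_worker_manager'   # a ScbiMapreduce::WorkManager subclass (see skeletons)

  # workers can be an Integer count or a list of host names (see initialize above)
  manager = ScbiMapreduce::Manager.new('localhost', 50000, 4,
                                       MyWorkerManager, 'my_worker.rb')
  manager.checkpointing = false
  manager.chunk_size    = 100
  manager.start_server                   # blocks inside the EventMachine loop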
data/lib/scbi_mapreduce/work_manager.rb
@@ -0,0 +1,405 @@
+ # = WorkManager
+ #
+ # One instance of this class is created automatically by EM to handle each worker.
+ #
+ # This class handles server <-> worker communications. It waits for worker connections, sends them the initial configuration parameters,
+ # and later sends new jobs each time a worker requests one, until no more work is available.
+ #
+ # Reliability could be increased by using a hash @@running_jobs tracking the object_id of each running work, but this approach would be slower than the current one.
+
+ # require 'error_handler'
+
+ # TODO - Data preload (queue?) instead of on-demand loading
+ # DONE - Add serializer with marshal + zlib deflate/inflate
+
+ module ScbiMapreduce
+
+
+   PENDING_TO_SAVE=100
+
+
+   class WorkManagerData
+
+     @@job_id=1
+
+     attr_reader :job_identifier
+     attr_accessor :status, :data
+
+     def initialize(job)
+
+       @job_identifier=@@job_id
+       @@job_id+=1
+       @data=job
+       @status=:running
+     end
+
+     def inspect
+       return "WorkManagerData: #{@job_identifier} => #{@status}"
+     end
+
+     def self.job_id=(c)
+       # puts "Setting job_id to #{c}"
+       @@job_id=c
+     end
+
+     def self.job_id
+       # puts "Setting job_id to #{c}"
+       @@job_id
+     end
+
+   end
+
+   #require 'json'
+   class WorkManager < EventMachine::Connection
+
+     include EM::P::ObjectProtocol
+
+     def self.init_work_manager
+
+     end
+
+     def self.end_work_manager
+
+     end
+
+     def next_work
+
+     end
+
+     def work_received(obj)
+
+     end
+
+     def worker_initial_config
+
+     end
+
+     def error_received(worker_error, obj)
+
+     end
+
+     def too_many_errors_received
+
+     end
+
+     def read_until_checkpoint(checkpoint)
+
+     end
+
+     # if this function returns -1, then automatic checkpointing is done.
+     # Return 0 for no checkpointing.
+     # Return the restored checkpoint number to start at that point.
+     def load_user_checkpoint(checkpoint)
+       return -1
+     end
+
+     def save_user_checkpoint
+     end
+
+     def trash_checkpointed_work
+
+     end
+
+     ############
+
+     def self.init_work_manager_internals(checkpointing, keep_order, retry_failed_jobs,exit_on_many_errors,chunk_size)
+       @@count = 0
+       @@chunk_count = 0
+       @@workers = 0
+       @@error_count = 0
+       @@running_jobs=[]
+       # @@compress=true
+
+       @@checkpointing=checkpointing
+       @@keep_order=keep_order
+       @@retry_failed_jobs=retry_failed_jobs
+       @@exit_on_many_errors=exit_on_many_errors
+
+       # TODO - Implement a dynamic chunk_size
+
+       @@chunk_size=chunk_size
+       $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
+
+       @@checkpoint=0
+       if @@checkpointing
+         @@checkpoint=self.get_checkpoint
+         $SERVER_LOG.info "Detected checkpoint at #{@@checkpoint}"
+       end
+
+     end
+
+     def self.checkpoint
+       return @@checkpoint
+     end
+
+     def save_checkpoint
+       checkpoint_file = File.open('scbi_mapreduce_checkpoint','w')
+
+       if !@@running_jobs.empty?
+         checkpoint_file.puts @@running_jobs.first.job_identifier
+       else
+         checkpoint_file.puts WorkManagerData.job_id-1
+       end
+
+       checkpoint_file.close
+
+       save_user_checkpoint
+
+     end
+
+     def self.get_checkpoint
+       res = 0
+       begin
+         if File.exists?('scbi_mapreduce_checkpoint')
+           res=File.read('scbi_mapreduce_checkpoint').chomp
+           # puts "read checkpoint #{res}"
+
+           res = res.to_i
+         end
+       rescue
+         res = 0
+       end
+
+       return res
+     end
+
+     def send_initial_config
+       config = worker_initial_config
+
+       if config.nil?
+         obj = :no_initial_config
+       else
+         obj = {:initial_config => config}
+       end
+
+       send_object(obj)
+     end
+
+     # send next work to worker
+     def send_next_work
+
+       objs=[]
+
+       @@chunk_size.times do
+         obj=next_work
+         if obj.nil?
+           break
+         else
+           # add to obj array
+           objs << obj
+         end
+       end
+
+
+       if objs.count>0
+         @@count += objs.count
+         @@chunk_count += 1
+
+         work_data=WorkManagerData.new(objs)
+
+         send_object(work_data)
+
+         # to keep order or retry failed jobs, we need job status
+         if @@keep_order || @@retry_failed_jobs
+           @@running_jobs.push work_data
+         end
+       else
+
+         send_object(:quit)
+       end
+
+
+     end
+
+     def goto_checkpoint
+       if @@checkpoint>0
+         $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
+
+         checkpoint=load_user_checkpoint(@@checkpoint)
+
+         # do an automatic checkpoint restore
+         if checkpoint==-1
+           @@checkpoint.times do |i|
+             # puts "Skipping #{i+1}"
+
+             # get next work
+             trash_checkpointed_work
+             # if obj
+             #   if obj.methods.include?(:count)
+             #     @@count += obj.count
+             #   else
+             #     @@count += 1
+             #   end
+             # end
+           end
+
+           $SERVER_LOG.info "Automatic checkpoint finished"
+
+           WorkManagerData.job_id=@@checkpoint
+
+         # user has done the checkpoint restoration
+         elsif checkpoint>0
+
+           WorkManagerData.job_id=checkpoint
+         elsif checkpoint==0
+           $SERVER_LOG.info "Automatic checkpoint not done"
+         end
+
+
+         @@checkpoint=0
+
+       end
+
+     end
+
+     def post_init
+       @@workers += 1
+
+       # when the first worker is connected, do special config
+       if @@workers == 1
+         @@total_seconds = Time.now
+         $SERVER_LOG.info "First worker connected"
+
+         if @@checkpointing
+           $SERVER_LOG.info "Checking for checkpoint"
+           goto_checkpoint
+         end
+       end
+
+       $SERVER_LOG.info "#{@@workers} workers connected"
+       send_initial_config
+       send_next_work
+     end
+
+
+     def receive_object(obj)
+
+       # check if response is an error
+       if obj.is_a?(Exception)
+         $SERVER_LOG.error("Error in worker #{obj.worker_id} while processing object #{obj.object.inspect}\n" + obj.original_exception.message + ":\n" + obj.original_exception.backtrace.join("\n"))
+
+         @@error_count += 1
+
+         error_received(obj,obj.object.data)
+
+         # if there are too many errors
+         if (@@count>100) && (@@error_count >= @@count*0.8)
+           @@exit = @@exit_on_many_errors
+
+           # notify the programmer
+           res=too_many_errors_received
+
+           # force exit if too_many_errors_received returns true
+           if res==true
+             @@exit=res
+           end
+         end
+
+       else
+         # if not using checkpointing
+
+         if @@checkpointing || @@keep_order || @@retry_failed_jobs
+           checkpointable_job_received(obj)
+         else
+           work_received(obj.data)
+         end
+       end
+
+
+       send_next_work
+
+     end
+
+
+     def checkpointable_job_received(obj)
+       received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}
+
+       # save job
+       if received_job
+
+         # change job's status to received
+         received_job.data=obj.data
+         received_job.status=:received
+
+         # if there are sufficient jobs, count pending ones
+         if (@@running_jobs.count>=PENDING_TO_SAVE)
+           # count received objects pending to be written
+           pending=0
+
+           @@running_jobs.each do |job|
+             if job.status==:received
+               pending += 1
+             else
+               break
+             end
+           end
+
+
+           if (pending>PENDING_TO_SAVE) || (pending==@@running_jobs.count)
+             # purge contiguous saved data
+             to_remove = 0
+
+             @@running_jobs.each_with_index do |job,i|
+               if job.status==:received
+                 # puts "Sent to save: #{job.inspect}"
+                 work_received(job.data)
+                 job.status=:saved
+                 to_remove += 1
+               else
+                 break
+               end
+             end
+
+             # if some objects were saved
+             if to_remove > 0
+               to_remove.times do |i|
+                 o=@@running_jobs.shift
+                 # puts "Job removed #{o.inspect}"
+               end
+
+               save_checkpoint
+             end
+           end
+         end
+       else
+         $SERVER_LOG.info "Job already processed #{obj.inspect}"
+       end
+     end
+
+     def initialize(*args)
+       super
+       # puts "WORK MANAGER INITIALIZE NEWWWWWWWWWW, ONE per worker"
+     end
+
+     # A worker has disconnected
+     def unbind
+
+       @@workers -= 1
+       # puts @@running_jobs.to_json
+
+       $SERVER_LOG.info "Worker disconnected. #{@@workers} kept running"
+
+       # no more workers left, shutdown EM and stop server
+       if @@workers == 0
+         $SERVER_LOG.info "All workers finished"
+         EM.stop
+         $SERVER_LOG.info "Exiting server"
+
+
+
+         self.class.end_work_manager
+
+         @@total_seconds = Time.now-@@total_seconds
+         $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
+         $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
+         $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
+
+         $SERVER_LOG.info "Number of errors: #{@@error_count}"
+         $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
+
+
+       end
+     end
+
+   end
+ end
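
The empty methods at the top of WorkManager (init_work_manager, end_work_manager, next_work, work_received, worker_initial_config) form the contract a user subclass fills in, and the skeleton my_worker_manager.rb files in this release implement it. A hedged sketch, with an invented in-memory queue standing in for the real data source:

  # Illustrative subclass of the template methods above (not taken from this diff)
  class MyWorkerManager < ScbiMapreduce::WorkManager
    def self.init_work_manager
      @@queue = (1..1000).to_a        # hypothetical input; real skeletons read files
    end

    def self.end_work_manager
      # flush output files, print a summary, etc.
    end

    def next_work
      @@queue.shift                   # returning nil makes send_next_work send :quit
    end

    def work_received(objs)
      # persist or aggregate the chunk of processed objects returned by a worker
    end

    def worker_initial_config
      nil                             # or an object broadcast to every worker on connect
    end
  end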