scbi_mapreduce 0.0.37 → 0.0.38

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.0.38 2012-04-13
2
+
3
+ Automatic checkpointing improvements
4
+
1
5
  === 0.0.37 2011-10-20
2
6
 
3
7
  Memory management improvement
@@ -17,13 +17,12 @@ module ScbiMapreduce
17
17
 
18
18
  class Manager
19
19
 
20
- attr_accessor :checkpointing, :keep_order, :retry_failed_jobs, :exit_on_many_errors, :chunk_size
20
+ attr_accessor :checkpointing, :keep_order, :retry_stuck_jobs, :exit_on_many_errors, :chunk_size
21
21
 
22
22
  # initialize Manager
23
23
  def initialize(server_ip, port, workers, work_manager_class,custom_worker_file,log_file=nil, init_env_file=nil)
24
24
  @port=port
25
25
 
26
-
27
26
  if log_file.nil?
28
27
  log_file = File.join('logs','server_log.txt')
29
28
  end
@@ -55,8 +54,9 @@ module ScbiMapreduce
55
54
 
56
55
  @checkpointing=false
57
56
  @keep_order=false
58
- @retry_failed_jobs=false
59
-
57
+ @retry_stuck_jobs=false
58
+ @exit_on_many_errors=true
59
+
60
60
  @chunk_size=1
61
61
 
62
62
 
@@ -84,7 +84,6 @@ module ScbiMapreduce
84
84
 
85
85
  @worker_launcher = WorkerLauncher.new(@ip,port,ip_list,@workers,custom_worker_file,log_file,init_env_file)
86
86
 
87
-
88
87
  $SERVER_LOG.info("Local workers: #{@workers}")
89
88
  $SERVER_LOG.info("Remote workers: #{@worker_names}")
90
89
 
@@ -101,11 +100,15 @@ module ScbiMapreduce
101
100
  EM.error_handler{ |e|
102
101
  $SERVER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
103
102
  }
103
+
104
+ # $SERVER_LOG.info("Installing INT and TERM traps in #{@work_manager_class}")
105
+ # Signal.trap("INT") { puts "TRAP INT";@work_manager_class.controlled_exit; EM.stop}
106
+ # Signal.trap("TERM") { puts "TRAP TERM";@work_manager_class.controlled_exit; EM.stop}
104
107
 
105
108
  # start EM loop
106
109
  EventMachine::run {
107
110
 
108
- @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_failed_jobs,@exit_on_many_errors,@chunk_size)
111
+ @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_stuck_jobs,@exit_on_many_errors,@chunk_size)
109
112
 
110
113
  evm=EventMachine::start_server @ip, @port, @work_manager_class
111
114
  dir=Socket.unpack_sockaddr_in( EM.get_sockname( evm ))
@@ -14,27 +14,58 @@
14
14
 
15
15
  module ScbiMapreduce
16
16
 
17
-
18
- PENDING_TO_SAVE=100
19
17
 
18
+ PENDING_TO_SAVE=10
19
+ CHECKPOINT_FILE='scbi_mapreduce_checkpoint'
20
+ OLD_CHECKPOINT_FILE='old_scbi_mapreduce_checkpoint'
20
21
 
21
22
  class WorkManagerData
22
23
 
23
24
  @@job_id=1
25
+ @@longest_processing_time=0
24
26
 
25
27
  attr_reader :job_identifier
26
- attr_accessor :status, :data
28
+ attr_accessor :status, :data,:sent_time,:received_time
27
29
 
28
- def initialize(job)
30
+ def initialize(objs)
29
31
 
30
32
  @job_identifier=@@job_id
31
33
  @@job_id+=1
32
- @data=job
34
+ @data=objs
35
+
36
+ sent!
37
+ @received_time=0
38
+ @processing_time=nil
39
+ end
40
+
41
+ def received!(objs)
42
+ @data=objs
43
+ @received_time=Time.now
44
+ @processing_time=@received_time-@sent_time
45
+
46
+ # save longest processing time
47
+ @@longest_processing_time=[@@longest_processing_time,@processing_time].max
48
+
49
+ @status=:received
50
+ end
51
+
52
+ def sent!
33
53
  @status=:running
54
+ @sent_time=Time.now
55
+ end
56
+
57
+ def stuck?
58
+ (@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*2))
59
+ end
60
+
61
+ # return running or real processing time
62
+ def processing_time
63
+ return (@processing_time || (Time.now-@sent_time))
34
64
  end
35
65
 
36
66
  def inspect
37
- return "WorkManagerData: #{@job_identifier} => #{@status}"
67
+ time="; time: #{processing_time} seg"
68
+ return "WorkManagerData: #{@job_identifier} => #{@status} #{time}"
38
69
  end
39
70
 
40
71
  def self.job_id=(c)
@@ -53,7 +84,7 @@ module ScbiMapreduce
53
84
  class WorkManager < EventMachine::Connection
54
85
 
55
86
  include EM::P::ObjectProtocol
56
-
87
+
57
88
  def self.init_work_manager
58
89
 
59
90
  end
@@ -102,8 +133,9 @@ module ScbiMapreduce
102
133
 
103
134
  ############
104
135
 
105
- def self.init_work_manager_internals(checkpointing, keep_order, retry_failed_jobs,exit_on_many_errors,chunk_size)
136
+ def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs,exit_on_many_errors,chunk_size)
106
137
  @@count = 0
138
+ @@want_to_exit=false
107
139
  @@chunk_count = 0
108
140
  @@workers = 0
109
141
  @@max_workers = 0
@@ -113,13 +145,17 @@ module ScbiMapreduce
113
145
 
114
146
  @@checkpointing=checkpointing
115
147
  @@keep_order=keep_order
116
- @@retry_failed_jobs=retry_failed_jobs
148
+ @@retry_stuck_jobs=retry_stuck_jobs
117
149
  @@exit_on_many_errors=exit_on_many_errors
118
150
 
119
151
  # TODO - Implement a dynamic chunk_size
120
152
 
121
153
  @@chunk_size=chunk_size
122
154
  $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
155
+ $SERVER_LOG.info "Checkpointing: #{@@checkpointing}"
156
+ $SERVER_LOG.info "Keeping output order: #{@@keep_order}"
157
+ $SERVER_LOG.info "Retrying stuck jobs: #{@@retry_stuck_jobs}"
158
+ $SERVER_LOG.info "Exiting on too many errors: #{@@exit_on_many_errors}"
123
159
 
124
160
  @@checkpoint=0
125
161
  if @@checkpointing
@@ -133,17 +169,28 @@ module ScbiMapreduce
133
169
  return @@checkpoint
134
170
  end
135
171
 
136
- def save_checkpoint
137
- checkpoint_file = File.open('scbi_mapreduce_checkpoint','w')
172
+ def remove_checkpoint
173
+ if File.exists?(CHECKPOINT_FILE)
174
+ checkpoint_file = FileUtils.mv(CHECKPOINT_FILE,OLD_CHECKPOINT_FILE)
175
+ end
176
+ end
177
+
138
178
 
179
+ def save_checkpoint
180
+ checkpoint_file = File.open(CHECKPOINT_FILE,'w')
181
+
139
182
  if !@@running_jobs.empty?
140
- checkpoint_file.puts @@running_jobs.first.job_identifier
183
+ checkpoint_value = @@running_jobs.first.job_identifier
141
184
  else
142
- checkpoint_file.puts WorkManagerData.job_id-1
185
+ checkpoint_value = WorkManagerData.job_id
143
186
  end
144
-
187
+
188
+ $SERVER_LOG.info "Saving checkpoint: #{checkpoint_value}"
189
+
190
+ checkpoint_file.puts checkpoint_value
191
+
145
192
  checkpoint_file.close
146
-
193
+
147
194
  save_user_checkpoint
148
195
 
149
196
  end
@@ -151,8 +198,8 @@ module ScbiMapreduce
151
198
  def self.get_checkpoint
152
199
  res = 0
153
200
  begin
154
- if File.exists?('scbi_mapreduce_checkpoint')
155
- res=File.read('scbi_mapreduce_checkpoint').chomp
201
+ if File.exists?(CHECKPOINT_FILE)
202
+ res=File.read(CHECKPOINT_FILE).chomp
156
203
  # puts "read checkpoint #{res}"
157
204
 
158
205
  res = res.to_i
@@ -176,42 +223,80 @@ module ScbiMapreduce
176
223
  send_object(obj)
177
224
  end
178
225
 
179
- # send next work to worker
180
- def send_next_work
226
+ def print_running_jobs
227
+ jobs=@@running_jobs.map{|j| j.inspect}.join("\n")
228
+ $SERVER_LOG.debug("Running Jobs:\n#{jobs}")
229
+ end
181
230
 
182
- objs=[]
231
+ def send_stuck_work
232
+ sent=false
183
233
 
184
- @@chunk_size.times do
185
- obj=next_work
186
- if obj.nil?
187
- break
188
- else
189
- # add to obj array
190
- objs << obj
191
- end
192
- end
234
+ if @@retry_stuck_jobs
235
+ # count stuck jobs and re-send the first one
236
+ stuck_works=@@running_jobs.select{|job| job.stuck?}
193
237
 
238
+ if !stuck_works.empty?
239
+ jobs=stuck_works.map{|j| j.inspect}.join("\n")
240
+ $SERVER_LOG.info("Stuck Jobs:\n#{jobs}")
194
241
 
195
- if objs.count>0
196
- @@count += objs.count
197
- @@chunk_count += 1
242
+ # send_object
243
+ send_object(stuck_works.first)
244
+ stuck_works.first.sent!
245
+ $SERVER_LOG.info("Sending stuck work #{stuck_works.first.inspect}")
246
+ sent=true
247
+ end
248
+ end
198
249
 
199
- work_data=WorkManagerData.new(objs)
250
+ return sent
251
+ end
200
252
 
201
- send_object(work_data)
253
+ # send next work to worker
254
+ def send_next_work
202
255
 
203
- # to keep order or retry failed job, we need job status
204
- if @@keep_order || @@retry_failed_jobs
205
- @@running_jobs.push work_data
256
+ # if we need to exit, send quit to workers
257
+
258
+ if @@want_to_exit
259
+ send_object(:quit)
260
+
261
+ elsif !send_stuck_work
262
+
263
+ #send stuck work
264
+ objs=[]
265
+
266
+ # prepare new data
267
+ @@chunk_size.times do
268
+ obj=next_work
269
+ if obj.nil?
270
+ break
271
+ else
272
+ # add to obj array
273
+ objs << obj
274
+ end
206
275
  end
207
- else
208
276
 
209
- send_object(:quit)
210
- end
277
+ # if new data was collected, send it
278
+ if objs.count>0
279
+ @@count += objs.count
280
+ @@chunk_count += 1
211
281
 
282
+ work_data=WorkManagerData.new(objs)
283
+ send_object(work_data)
212
284
 
285
+ # to keep order or retry failed job, we need job status
286
+ if @@keep_order || @@retry_stuck_jobs
287
+ # do not remove data to be able to send it again
288
+ # work_data.data=nil
289
+ @@running_jobs.push work_data
290
+ # print_running_jobs
291
+ end
292
+ else
293
+ # otherwise, send a quit value indicating no more data available
294
+ send_object(:quit)
295
+ end
296
+ end
213
297
  end
214
298
 
299
+ # loads a checkpoint
215
300
  def goto_checkpoint
216
301
  if @@checkpoint>0
217
302
  $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
@@ -220,18 +305,13 @@ module ScbiMapreduce
220
305
 
221
306
  # do an automatic checkpoint restore
222
307
  if checkpoint==-1
223
- @@checkpoint.times do |i|
224
- # puts "Skipping #{i+1}"
225
-
308
+ (@@checkpoint - 1).times do |i|
309
+ $SERVER_LOG.info "Automatic trashing Chunk #{i+1}"
226
310
  # get next work
227
- trash_checkpointed_work
228
- # if obj
229
- # if obj.methods.include?(:count)
230
- # @@count += obj.count
231
- # else
232
- # @@count += 1
233
- # end
234
- # end
311
+ @@chunk_size.times do
312
+ obj=next_work
313
+ end
314
+ # trash_checkpointed_work
235
315
  end
236
316
 
237
317
  $SERVER_LOG.info "Automatic checkpoint finished"
@@ -240,8 +320,9 @@ module ScbiMapreduce
240
320
 
241
321
  #user has done the checkpoint restoration
242
322
  elsif checkpoint>0
243
-
323
+
244
324
  WorkManagerData.job_id=checkpoint
325
+
245
326
  elsif checkpoint==0
246
327
  $SERVER_LOG.info "Automatic checkpoint not done"
247
328
  end
@@ -271,6 +352,11 @@ module ScbiMapreduce
271
352
  send_initial_config
272
353
  send_next_work
273
354
  end
355
+
356
+ def self.controlled_exit
357
+ $SERVER_LOG.info("Controlled exit. Workers will be noticed in next round")
358
+ @@want_to_exit=true
359
+ end
274
360
 
275
361
 
276
362
  def receive_object(obj)
@@ -285,21 +371,23 @@ module ScbiMapreduce
285
371
 
286
372
  # if there are too many errors
287
373
  if (@@count>100) && (@@error_count >= @@count*0.8)
288
- @@exit = @@exit_on_many_errors
289
374
 
290
375
  # notice programmer
291
376
  res=too_many_errors_received
292
377
 
293
378
  # force exit if too_many_errors_received returns true
294
- if res==true
295
- @@exit=res
379
+ if @@exit_on_many_errors || res
380
+ $SERVER_LOG.error("Want to exit due to too many errors")
381
+ self.controlled_exit
296
382
  end
297
383
  end
298
384
 
299
385
  else
300
386
  # if not using checkpointing
301
387
 
302
- if @@checkpointing || @@keep_order || @@retry_failed_jobs
388
+
389
+ if @@checkpointing || @@keep_order || @@retry_stuck_jobs
390
+ # print_running_jobs
303
391
  checkpointable_job_received(obj)
304
392
  else
305
393
  work_received(obj.data)
@@ -314,58 +402,71 @@ module ScbiMapreduce
314
402
 
315
403
 
316
404
  def checkpointable_job_received(obj)
405
+
406
+ # find received object between sent jobs
317
407
  received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}
318
408
 
319
- # save job
409
+ # save job if there was a valid work previously sent
320
410
  if received_job
321
411
 
322
- # change job's status to received
323
- received_job.data=obj.data
324
- received_job.status=:received
412
+ # change this job's status to received
413
+ received_job.received!(obj.data)
414
+
415
+
325
416
 
326
- # if there are sufficient jobs, count pending ones
327
- if (@@running_jobs.count>=PENDING_TO_SAVE)
328
- # count received objects pending to be written
329
- pending=0
417
+ # # if there are sufficient jobs, count pending ones
418
+ # if (@@running_jobs.count>=PENDING_TO_SAVE)
330
419
 
420
+ # count received objects pending to be written, only until one that is still running is found
421
+ pending_to_save=0
422
+ @@running_jobs.each do |job|
423
+ if job.status==:received
424
+ pending_to_save += 1
425
+ else
426
+ break
427
+ end
428
+ end
429
+
430
+ # if there are a few pending to save works, or all remaining works are pending, then save
431
+ if (pending_to_save>=PENDING_TO_SAVE) || (pending_to_save==@@running_jobs.count)
432
+ # save pending jobs and write to disk
433
+ to_remove = 0
434
+
435
+ if @@checkpointing
436
+ remove_checkpoint
437
+ end
438
+
331
439
  @@running_jobs.each do |job|
332
440
  if job.status==:received
333
- pending += 1
441
+ # puts "Sent to save: #{job.inspect}"
442
+ work_received(job.data)
443
+ job.status=:saved
444
+ to_remove += 1
334
445
  else
335
446
  break
336
447
  end
337
448
  end
338
449
 
450
+ # if some objects were saved, remove them from the running_jobs
451
+ if to_remove > 0
452
+ to_remove.times do |i|
453
+ o=@@running_jobs.shift
339
454
 
340
- if (pending>PENDING_TO_SAVE) || (pending==@@running_jobs.count)
341
- # purge contiguos saved data
342
- to_remove = 0
343
-
344
- @@running_jobs.each_with_index do |job,i|
345
- if job.status==:received
346
- # puts "Sent to save: #{job.inspect}"
347
- work_received(job.data)
348
- job.status=:saved
349
- to_remove += 1
350
- else
351
- break
352
- end
455
+ # puts "Job removed #{o.inspect}"
456
+ o=nil
353
457
  end
354
458
 
355
- # if some objects were saved
356
- if to_remove > 0
357
- to_remove.times do |i|
358
- o=@@running_jobs.shift
359
- # puts "Job removed #{o.inspect}"
360
- o=nil
361
- end
459
+ # print_running_jobs
460
+
461
+ if @@checkpointing && !@@want_to_exit
362
462
 
363
463
  save_checkpoint
364
464
  end
365
465
  end
366
466
  end
467
+ # end
367
468
  else
368
- $SERVER_LOG.info "Job already processed #{obj.inspect}"
469
+ $SERVER_LOG.warn "Job already processed #{obj.inspect}"
369
470
  end
370
471
  end
371
472
 
@@ -385,26 +486,29 @@ module ScbiMapreduce
385
486
  # no more workers left, shutdown EM and stop server
386
487
  if @@workers == 0
387
488
  $SERVER_LOG.info "All workers finished"
388
- EM.stop
389
- $SERVER_LOG.info "Exiting server"
390
-
391
-
392
-
393
- self.class.end_work_manager
394
-
395
- @@total_seconds = Time.now-@@total_seconds
396
- $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
397
- $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
398
- $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
399
-
400
- $SERVER_LOG.info "Number of errors: #{@@error_count}"
401
- $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
402
- $SERVER_LOG.info "Total connected workers: #{@@max_workers}"
403
-
404
-
405
-
489
+ stop_work_manager
406
490
  end
407
491
  end
492
+
493
+ def stop_work_manager
494
+
495
+
496
+
497
+ EM.stop
498
+ $SERVER_LOG.info "Exiting server"
499
+
500
+ self.class.end_work_manager
501
+
502
+ @@total_seconds = Time.now-@@total_seconds
503
+ $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
504
+ $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
505
+ $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
506
+
507
+ $SERVER_LOG.info "Number of errors: #{@@error_count}"
508
+ $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
509
+ $SERVER_LOG.info "Total connected workers: #{@@max_workers}"
510
+
511
+ end
408
512
 
409
513
  end
410
514
  end
@@ -10,7 +10,7 @@ module ScbiMapreduce
10
10
  class Worker < EventMachine::Connection
11
11
  include EM::P::ObjectProtocol
12
12
 
13
-
13
+ @@want_to_exit_worker=false
14
14
 
15
15
  def receive_initial_config(obj)
16
16
 
@@ -40,7 +40,6 @@ module ScbiMapreduce
40
40
 
41
41
  def initialize(*args)
42
42
  super
43
-
44
43
  end
45
44
 
46
45
  def post_init
@@ -67,8 +66,15 @@ module ScbiMapreduce
67
66
  # At first iteration, start worker
68
67
  starting_worker
69
68
  else
70
-
71
- if obj == :quit
69
+ $WORKER_LOG.info("received:"+obj.to_s)
70
+
71
+ if (obj == :quit) || @@want_to_exit_worker
72
+ $WORKER_LOG.info('Quit received')
73
+
74
+ stop_worker
75
+
76
+ elsif @@want_to_exit_worker
77
+ $WORKER_LOG.info('Want to exit worker')
72
78
  stop_worker
73
79
  else
74
80
  @@count += 1
@@ -94,6 +100,10 @@ module ScbiMapreduce
94
100
  modified_data=process_object(obj.data)
95
101
  obj.data = modified_data
96
102
 
103
+ # if obj.job_identifier==3
104
+ # sleep 15
105
+ # end
106
+
97
107
  send_object(obj)
98
108
 
99
109
  rescue Exception => e
@@ -114,18 +124,30 @@ module ScbiMapreduce
114
124
  end
115
125
 
116
126
  def stop_worker
127
+ $WORKER_LOG.info "Closing connection with WORKER"
128
+ $WORKER_LOG.info("Worker processed #{@@count} chunks")
129
+
117
130
  close_connection
118
131
  EventMachine::stop_event_loop
119
132
  closing_worker
120
133
  end
134
+
135
+ def self.controlled_exit_worker
136
+ @@want_to_exit_worker=true
137
+ end
121
138
 
122
139
  def self.start_worker(worker_id,ip,port,log_file=nil)
123
140
  #puts "NEW WORKER - INIIIIIIIIIIIIIIIIIIIIT #{self}"
141
+
142
+
124
143
  ip = ip
125
144
  port = port
126
145
  @@count = -1
127
146
 
128
147
  @@worker_id=worker_id
148
+
149
+ # Signal.trap("INT") { puts "TRAP INT in worker #{@@worker_id}"; controlled_exit_worker; EM.stop}
150
+ # Signal.trap("TERM") { puts "TRAP TERM in worker #{@@worker_id}";controlled_exit_worker; EM.stop}
129
151
 
130
152
  if log_file.nil?
131
153
  log_file = 'logs/worker'+worker_id+'_'+`hostname`.chomp+'_log.txt'
@@ -7,7 +7,7 @@ $: << File.expand_path('scbi_mapreduce')
7
7
  # puts $:
8
8
 
9
9
  module ScbiMapreduce
10
- VERSION = '0.0.37'
10
+ VERSION = '0.0.38'
11
11
  end
12
12
 
13
13
  require 'scbi_mapreduce/manager'
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: scbi_mapreduce
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.37
5
+ version: 0.0.38
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-10-20 00:00:00 Z
13
+ date: 2012-04-13 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: eventmachine