scbi_mapreduce 0.0.37 → 0.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.0.38 2012-04-13
2
+
3
+ Automatic checkpointing improvements
4
+
1
5
  === 0.0.37 2011-10-20
2
6
 
3
7
  Memory management improvement
@@ -17,13 +17,12 @@ module ScbiMapreduce
17
17
 
18
18
  class Manager
19
19
 
20
- attr_accessor :checkpointing, :keep_order, :retry_failed_jobs, :exit_on_many_errors, :chunk_size
20
+ attr_accessor :checkpointing, :keep_order, :retry_stuck_jobs, :exit_on_many_errors, :chunk_size
21
21
 
22
22
  # initialize Manager
23
23
  def initialize(server_ip, port, workers, work_manager_class,custom_worker_file,log_file=nil, init_env_file=nil)
24
24
  @port=port
25
25
 
26
-
27
26
  if log_file.nil?
28
27
  log_file = File.join('logs','server_log.txt')
29
28
  end
@@ -55,8 +54,9 @@ module ScbiMapreduce
55
54
 
56
55
  @checkpointing=false
57
56
  @keep_order=false
58
- @retry_failed_jobs=false
59
-
57
+ @retry_stuck_jobs=false
58
+ @exit_on_many_errors=true
59
+
60
60
  @chunk_size=1
61
61
 
62
62
 
@@ -84,7 +84,6 @@ module ScbiMapreduce
84
84
 
85
85
  @worker_launcher = WorkerLauncher.new(@ip,port,ip_list,@workers,custom_worker_file,log_file,init_env_file)
86
86
 
87
-
88
87
  $SERVER_LOG.info("Local workers: #{@workers}")
89
88
  $SERVER_LOG.info("Remote workers: #{@worker_names}")
90
89
 
@@ -101,11 +100,15 @@ module ScbiMapreduce
101
100
  EM.error_handler{ |e|
102
101
  $SERVER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
103
102
  }
103
+
104
+ # $SERVER_LOG.info("Installing INT and TERM traps in #{@work_manager_class}")
105
+ # Signal.trap("INT") { puts "TRAP INT";@work_manager_class.controlled_exit; EM.stop}
106
+ # Signal.trap("TERM") { puts "TRAP TERM";@work_manager_class.controlled_exit; EM.stop}
104
107
 
105
108
  # start EM loop
106
109
  EventMachine::run {
107
110
 
108
- @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_failed_jobs,@exit_on_many_errors,@chunk_size)
111
+ @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_stuck_jobs,@exit_on_many_errors,@chunk_size)
109
112
 
110
113
  evm=EventMachine::start_server @ip, @port, @work_manager_class
111
114
  dir=Socket.unpack_sockaddr_in( EM.get_sockname( evm ))
@@ -14,27 +14,58 @@
14
14
 
15
15
  module ScbiMapreduce
16
16
 
17
-
18
- PENDING_TO_SAVE=100
19
17
 
18
+ PENDING_TO_SAVE=10
19
+ CHECKPOINT_FILE='scbi_mapreduce_checkpoint'
20
+ OLD_CHECKPOINT_FILE='old_scbi_mapreduce_checkpoint'
20
21
 
21
22
  class WorkManagerData
22
23
 
23
24
  @@job_id=1
25
+ @@longest_processing_time=0
24
26
 
25
27
  attr_reader :job_identifier
26
- attr_accessor :status, :data
28
+ attr_accessor :status, :data,:sent_time,:received_time
27
29
 
28
- def initialize(job)
30
+ def initialize(objs)
29
31
 
30
32
  @job_identifier=@@job_id
31
33
  @@job_id+=1
32
- @data=job
34
+ @data=objs
35
+
36
+ sent!
37
+ @received_time=0
38
+ @processing_time=nil
39
+ end
40
+
41
+ def received!(objs)
42
+ @data=objs
43
+ @received_time=Time.now
44
+ @processing_time=@received_time-@sent_time
45
+
46
+ # keep track of the longest processing time seen so far
47
+ @@longest_processing_time=[@@longest_processing_time,@processing_time].max
48
+
49
+ @status=:received
50
+ end
51
+
52
+ def sent!
33
53
  @status=:running
54
+ @sent_time=Time.now
55
+ end
56
+
57
+ def stuck?
58
+ (@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*2))
59
+ end
60
+
61
+ # return running or real processing time
62
+ def processing_time
63
+ return (@processing_time || (Time.now-@sent_time))
34
64
  end
35
65
 
36
66
  def inspect
37
- return "WorkManagerData: #{@job_identifier} => #{@status}"
67
+ time="; time: #{processing_time} seg"
68
+ return "WorkManagerData: #{@job_identifier} => #{@status} #{time}"
38
69
  end
39
70
 
40
71
  def self.job_id=(c)
@@ -53,7 +84,7 @@ module ScbiMapreduce
53
84
  class WorkManager < EventMachine::Connection
54
85
 
55
86
  include EM::P::ObjectProtocol
56
-
87
+
57
88
  def self.init_work_manager
58
89
 
59
90
  end
@@ -102,8 +133,9 @@ module ScbiMapreduce
102
133
 
103
134
  ############
104
135
 
105
- def self.init_work_manager_internals(checkpointing, keep_order, retry_failed_jobs,exit_on_many_errors,chunk_size)
136
+ def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs,exit_on_many_errors,chunk_size)
106
137
  @@count = 0
138
+ @@want_to_exit=false
107
139
  @@chunk_count = 0
108
140
  @@workers = 0
109
141
  @@max_workers = 0
@@ -113,13 +145,17 @@ module ScbiMapreduce
113
145
 
114
146
  @@checkpointing=checkpointing
115
147
  @@keep_order=keep_order
116
- @@retry_failed_jobs=retry_failed_jobs
148
+ @@retry_stuck_jobs=retry_stuck_jobs
117
149
  @@exit_on_many_errors=exit_on_many_errors
118
150
 
119
151
  # TODO - Implement a dynamic chunk_size
120
152
 
121
153
  @@chunk_size=chunk_size
122
154
  $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
155
+ $SERVER_LOG.info "Checkpointing: #{@@checkpointing}"
156
+ $SERVER_LOG.info "Keeping output order: #{@@keep_order}"
157
+ $SERVER_LOG.info "Retrying stuck jobs: #{@@retry_stuck_jobs}"
158
+ $SERVER_LOG.info "Exiting on too many errors: #{@@exit_on_many_errors}"
123
159
 
124
160
  @@checkpoint=0
125
161
  if @@checkpointing
@@ -133,17 +169,28 @@ module ScbiMapreduce
133
169
  return @@checkpoint
134
170
  end
135
171
 
136
- def save_checkpoint
137
- checkpoint_file = File.open('scbi_mapreduce_checkpoint','w')
172
+ def remove_checkpoint
173
+ if File.exists?(CHECKPOINT_FILE)
174
+ checkpoint_file = FileUtils.mv(CHECKPOINT_FILE,OLD_CHECKPOINT_FILE)
175
+ end
176
+ end
177
+
138
178
 
179
+ def save_checkpoint
180
+ checkpoint_file = File.open(CHECKPOINT_FILE,'w')
181
+
139
182
  if !@@running_jobs.empty?
140
- checkpoint_file.puts @@running_jobs.first.job_identifier
183
+ checkpoint_value = @@running_jobs.first.job_identifier
141
184
  else
142
- checkpoint_file.puts WorkManagerData.job_id-1
185
+ checkpoint_value = WorkManagerData.job_id
143
186
  end
144
-
187
+
188
+ $SERVER_LOG.info "Saving checkpoint: #{checkpoint_value}"
189
+
190
+ checkpoint_file.puts checkpoint_value
191
+
145
192
  checkpoint_file.close
146
-
193
+
147
194
  save_user_checkpoint
148
195
 
149
196
  end
@@ -151,8 +198,8 @@ module ScbiMapreduce
151
198
  def self.get_checkpoint
152
199
  res = 0
153
200
  begin
154
- if File.exists?('scbi_mapreduce_checkpoint')
155
- res=File.read('scbi_mapreduce_checkpoint').chomp
201
+ if File.exists?(CHECKPOINT_FILE)
202
+ res=File.read(CHECKPOINT_FILE).chomp
156
203
  # puts "read checkpoint #{res}"
157
204
 
158
205
  res = res.to_i
@@ -176,42 +223,80 @@ module ScbiMapreduce
176
223
  send_object(obj)
177
224
  end
178
225
 
179
- # send next work to worker
180
- def send_next_work
226
+ def print_running_jobs
227
+ jobs=@@running_jobs.map{|j| j.inspect}.join("\n")
228
+ $SERVER_LOG.debug("Running Jobs:\n#{jobs}")
229
+ end
181
230
 
182
- objs=[]
231
+ def send_stuck_work
232
+ sent=false
183
233
 
184
- @@chunk_size.times do
185
- obj=next_work
186
- if obj.nil?
187
- break
188
- else
189
- # add to obj array
190
- objs << obj
191
- end
192
- end
234
+ if @@retry_stuck_jobs
235
+ # collect stuck jobs and re-send the first one
236
+ stuck_works=@@running_jobs.select{|job| job.stuck?}
193
237
 
238
+ if !stuck_works.empty?
239
+ jobs=stuck_works.map{|j| j.inspect}.join("\n")
240
+ $SERVER_LOG.info("Stuck Jobs:\n#{jobs}")
194
241
 
195
- if objs.count>0
196
- @@count += objs.count
197
- @@chunk_count += 1
242
+ # send_object
243
+ send_object(stuck_works.first)
244
+ stuck_works.first.sent!
245
+ $SERVER_LOG.info("Sending stuck work #{stuck_works.first.inspect}")
246
+ sent=true
247
+ end
248
+ end
198
249
 
199
- work_data=WorkManagerData.new(objs)
250
+ return sent
251
+ end
200
252
 
201
- send_object(work_data)
253
+ # send next work to worker
254
+ def send_next_work
202
255
 
203
- # to keep order or retry failed job, we need job status
204
- if @@keep_order || @@retry_failed_jobs
205
- @@running_jobs.push work_data
256
+ # if we need to exit, send quit to workers
257
+
258
+ if @@want_to_exit
259
+ send_object(:quit)
260
+
261
+ elsif !send_stuck_work
262
+
263
+ # no stuck work was re-sent, so send new work instead
264
+ objs=[]
265
+
266
+ # prepare new data
267
+ @@chunk_size.times do
268
+ obj=next_work
269
+ if obj.nil?
270
+ break
271
+ else
272
+ # add to obj array
273
+ objs << obj
274
+ end
206
275
  end
207
- else
208
276
 
209
- send_object(:quit)
210
- end
277
+ # if new data was collected, send it
278
+ if objs.count>0
279
+ @@count += objs.count
280
+ @@chunk_count += 1
211
281
 
282
+ work_data=WorkManagerData.new(objs)
283
+ send_object(work_data)
212
284
 
285
+ # to keep order or retry stuck jobs, we need job status
286
+ if @@keep_order || @@retry_stuck_jobs
287
+ # do not remove data, to be able to send it again
288
+ # work_data.data=nil
289
+ @@running_jobs.push work_data
290
+ # print_running_jobs
291
+ end
292
+ else
293
+ # otherwise, send a quit value indicating no more data available
294
+ send_object(:quit)
295
+ end
296
+ end
213
297
  end
214
298
 
299
+ # loads a checkpoint
215
300
  def goto_checkpoint
216
301
  if @@checkpoint>0
217
302
  $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
@@ -220,18 +305,13 @@ module ScbiMapreduce
220
305
 
221
306
  # do an automatic checkpoint restore
222
307
  if checkpoint==-1
223
- @@checkpoint.times do |i|
224
- # puts "Skipping #{i+1}"
225
-
308
+ (@@checkpoint - 1).times do |i|
309
+ $SERVER_LOG.info "Automatic trashing Chunk #{i+1}"
226
310
  # get next work
227
- trash_checkpointed_work
228
- # if obj
229
- # if obj.methods.include?(:count)
230
- # @@count += obj.count
231
- # else
232
- # @@count += 1
233
- # end
234
- # end
311
+ @@chunk_size.times do
312
+ obj=next_work
313
+ end
314
+ # trash_checkpointed_work
235
315
  end
236
316
 
237
317
  $SERVER_LOG.info "Automatic checkpoint finished"
@@ -240,8 +320,9 @@ module ScbiMapreduce
240
320
 
241
321
  #user has done the checkpoint restoration
242
322
  elsif checkpoint>0
243
-
323
+
244
324
  WorkManagerData.job_id=checkpoint
325
+
245
326
  elsif checkpoint==0
246
327
  $SERVER_LOG.info "Automatic checkpoint not done"
247
328
  end
@@ -271,6 +352,11 @@ module ScbiMapreduce
271
352
  send_initial_config
272
353
  send_next_work
273
354
  end
355
+
356
+ def self.controlled_exit
357
+ $SERVER_LOG.info("Controlled exit. Workers will be noticed in next round")
358
+ @@want_to_exit=true
359
+ end
274
360
 
275
361
 
276
362
  def receive_object(obj)
@@ -285,21 +371,23 @@ module ScbiMapreduce
285
371
 
286
372
  # if there are too many errors
287
373
  if (@@count>100) && (@@error_count >= @@count*0.8)
288
- @@exit = @@exit_on_many_errors
289
374
 
290
375
  # notice programmer
291
376
  res=too_many_errors_received
292
377
 
293
378
  # force exit if too_many_errors_received returns true
294
- if res==true
295
- @@exit=res
379
+ if @@exit_on_many_errors || res
380
+ $SERVER_LOG.error("Want to exit due to too many errors")
381
+ self.controlled_exit
296
382
  end
297
383
  end
298
384
 
299
385
  else
300
386
  # if not using checkpointing
301
387
 
302
- if @@checkpointing || @@keep_order || @@retry_failed_jobs
388
+
389
+ if @@checkpointing || @@keep_order || @@retry_stuck_jobs
390
+ # print_running_jobs
303
391
  checkpointable_job_received(obj)
304
392
  else
305
393
  work_received(obj.data)
@@ -314,58 +402,71 @@ module ScbiMapreduce
314
402
 
315
403
 
316
404
  def checkpointable_job_received(obj)
405
+
406
+ # find received object among sent jobs
317
407
  received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}
318
408
 
319
- # save job
409
+ # save job if there was a valid work previously sent
320
410
  if received_job
321
411
 
322
- # change job's status to received
323
- received_job.data=obj.data
324
- received_job.status=:received
412
+ # change this job's status to received
413
+ received_job.received!(obj.data)
414
+
415
+
325
416
 
326
- # if there are sufficient jobs, count pending ones
327
- if (@@running_jobs.count>=PENDING_TO_SAVE)
328
- # count received objects pending to be written
329
- pending=0
417
+ # # if there are sufficient jobs, count pending ones
418
+ # if (@@running_jobs.count>=PENDING_TO_SAVE)
330
419
 
420
+ # count received objects pending to be written, only until one that is still running is found
421
+ pending_to_save=0
422
+ @@running_jobs.each do |job|
423
+ if job.status==:received
424
+ pending_to_save += 1
425
+ else
426
+ break
427
+ end
428
+ end
429
+
430
+ # if there are a few pending to save works, or all remaining works are pending, then save
431
+ if (pending_to_save>=PENDING_TO_SAVE) || (pending_to_save==@@running_jobs.count)
432
+ # save pending jobs and write to disk
433
+ to_remove = 0
434
+
435
+ if @@checkpointing
436
+ remove_checkpoint
437
+ end
438
+
331
439
  @@running_jobs.each do |job|
332
440
  if job.status==:received
333
- pending += 1
441
+ # puts "Sent to save: #{job.inspect}"
442
+ work_received(job.data)
443
+ job.status=:saved
444
+ to_remove += 1
334
445
  else
335
446
  break
336
447
  end
337
448
  end
338
449
 
450
+ # if some objects were saved, remove them from the running_jobs
451
+ if to_remove > 0
452
+ to_remove.times do |i|
453
+ o=@@running_jobs.shift
339
454
 
340
- if (pending>PENDING_TO_SAVE) || (pending==@@running_jobs.count)
341
- # purge contiguos saved data
342
- to_remove = 0
343
-
344
- @@running_jobs.each_with_index do |job,i|
345
- if job.status==:received
346
- # puts "Sent to save: #{job.inspect}"
347
- work_received(job.data)
348
- job.status=:saved
349
- to_remove += 1
350
- else
351
- break
352
- end
455
+ # puts "Job removed #{o.inspect}"
456
+ o=nil
353
457
  end
354
458
 
355
- # if some objects were saved
356
- if to_remove > 0
357
- to_remove.times do |i|
358
- o=@@running_jobs.shift
359
- # puts "Job removed #{o.inspect}"
360
- o=nil
361
- end
459
+ # print_running_jobs
460
+
461
+ if @@checkpointing && !@@want_to_exit
362
462
 
363
463
  save_checkpoint
364
464
  end
365
465
  end
366
466
  end
467
+ # end
367
468
  else
368
- $SERVER_LOG.info "Job already processed #{obj.inspect}"
469
+ $SERVER_LOG.warn "Job already processed #{obj.inspect}"
369
470
  end
370
471
  end
371
472
 
@@ -385,26 +486,29 @@ module ScbiMapreduce
385
486
  # no more workers left, shutdown EM and stop server
386
487
  if @@workers == 0
387
488
  $SERVER_LOG.info "All workers finished"
388
- EM.stop
389
- $SERVER_LOG.info "Exiting server"
390
-
391
-
392
-
393
- self.class.end_work_manager
394
-
395
- @@total_seconds = Time.now-@@total_seconds
396
- $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
397
- $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
398
- $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
399
-
400
- $SERVER_LOG.info "Number of errors: #{@@error_count}"
401
- $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
402
- $SERVER_LOG.info "Total connected workers: #{@@max_workers}"
403
-
404
-
405
-
489
+ stop_work_manager
406
490
  end
407
491
  end
492
+
493
+ def stop_work_manager
494
+
495
+
496
+
497
+ EM.stop
498
+ $SERVER_LOG.info "Exiting server"
499
+
500
+ self.class.end_work_manager
501
+
502
+ @@total_seconds = Time.now-@@total_seconds
503
+ $SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
504
+ $SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
505
+ $SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
506
+
507
+ $SERVER_LOG.info "Number of errors: #{@@error_count}"
508
+ $SERVER_LOG.info "Chunk size: #{@@chunk_size}"
509
+ $SERVER_LOG.info "Total connected workers: #{@@max_workers}"
510
+
511
+ end
408
512
 
409
513
  end
410
514
  end
@@ -10,7 +10,7 @@ module ScbiMapreduce
10
10
  class Worker < EventMachine::Connection
11
11
  include EM::P::ObjectProtocol
12
12
 
13
-
13
+ @@want_to_exit_worker=false
14
14
 
15
15
  def receive_initial_config(obj)
16
16
 
@@ -40,7 +40,6 @@ module ScbiMapreduce
40
40
 
41
41
  def initialize(*args)
42
42
  super
43
-
44
43
  end
45
44
 
46
45
  def post_init
@@ -67,8 +66,15 @@ module ScbiMapreduce
67
66
  # At first iteration, start worker
68
67
  starting_worker
69
68
  else
70
-
71
- if obj == :quit
69
+ $WORKER_LOG.info("received:"+obj.to_s)
70
+
71
+ if (obj == :quit) || @@want_to_exit_worker
72
+ $WORKER_LOG.info('Quit received')
73
+
74
+ stop_worker
75
+
76
+ elsif @@want_to_exit_worker
77
+ $WORKER_LOG.info('Want to exit worker')
72
78
  stop_worker
73
79
  else
74
80
  @@count += 1
@@ -94,6 +100,10 @@ module ScbiMapreduce
94
100
  modified_data=process_object(obj.data)
95
101
  obj.data = modified_data
96
102
 
103
+ # if obj.job_identifier==3
104
+ # sleep 15
105
+ # end
106
+
97
107
  send_object(obj)
98
108
 
99
109
  rescue Exception => e
@@ -114,18 +124,30 @@ module ScbiMapreduce
114
124
  end
115
125
 
116
126
  def stop_worker
127
+ $WORKER_LOG.info "Closing connection with WORKER"
128
+ $WORKER_LOG.info("Worker processed #{@@count} chunks")
129
+
117
130
  close_connection
118
131
  EventMachine::stop_event_loop
119
132
  closing_worker
120
133
  end
134
+
135
+ def self.controlled_exit_worker
136
+ @@want_to_exit_worker=true
137
+ end
121
138
 
122
139
  def self.start_worker(worker_id,ip,port,log_file=nil)
123
140
  #puts "NEW WORKER - INIIIIIIIIIIIIIIIIIIIIT #{self}"
141
+
142
+
124
143
  ip = ip
125
144
  port = port
126
145
  @@count = -1
127
146
 
128
147
  @@worker_id=worker_id
148
+
149
+ # Signal.trap("INT") { puts "TRAP INT in worker #{@@worker_id}"; controlled_exit_worker; EM.stop}
150
+ # Signal.trap("TERM") { puts "TRAP TERM in worker #{@@worker_id}";controlled_exit_worker; EM.stop}
129
151
 
130
152
  if log_file.nil?
131
153
  log_file = 'logs/worker'+worker_id+'_'+`hostname`.chomp+'_log.txt'
@@ -7,7 +7,7 @@ $: << File.expand_path('scbi_mapreduce')
7
7
  # puts $:
8
8
 
9
9
  module ScbiMapreduce
10
- VERSION = '0.0.37'
10
+ VERSION = '0.0.38'
11
11
  end
12
12
 
13
13
  require 'scbi_mapreduce/manager'
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: scbi_mapreduce
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.37
5
+ version: 0.0.38
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-10-20 00:00:00 Z
13
+ date: 2012-04-13 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: eventmachine