scbi_mapreduce 0.0.38 → 0.0.40
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +8 -0
- data/README.rdoc +8 -2
- data/lib/scbi_mapreduce.rb +10 -1
- data/lib/scbi_mapreduce/manager.rb +18 -4
- data/lib/scbi_mapreduce/work_manager.rb +219 -24
- data/lib/scbi_mapreduce/worker.rb +22 -5
- data/lib/scbi_mapreduce/worker_launcher.rb +21 -8
- metadata +102 -53
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -238,22 +238,28 @@ Your worker file will be used to launch workers.
|
|
238
238
|
|
239
239
|
You can also set additional properties:
|
240
240
|
|
241
|
-
|
242
241
|
# if you want basic checkpointing. Some performance drop should be expected
|
243
242
|
# mgr.checkpointing=true
|
244
243
|
|
245
244
|
# if you want to keep the order of input data. Some performance drop should be expected
|
246
245
|
# mgr.keep_order=true
|
247
246
|
|
247
|
+
# Enable fault tolerance for stuck jobs. Jobs that have been stuck will be sent again to another worker. Some performance drop should be expected
|
248
|
+
# mgr.retry_stuck_jobs=true
|
249
|
+
|
248
250
|
# you can set the size of packets of data sent to workers
|
249
251
|
mgr.chunk_size=100
|
250
252
|
|
251
253
|
|
252
|
-
And finally, start the server:
|
254
|
+
And finally, start the server, and write a file with specific statistics if desired:
|
253
255
|
|
254
256
|
# start processing
|
255
257
|
mgr.start_server
|
256
258
|
|
259
|
+
# save full stats and a custom value in json format to a file
|
260
|
+
mgr.stats[:my_stats]=11
|
261
|
+
|
262
|
+
mgr.save_stats
|
257
263
|
|
258
264
|
# this line is reached when all data has been processed
|
259
265
|
puts "Program finished"
|
data/lib/scbi_mapreduce.rb
CHANGED
@@ -7,9 +7,18 @@ $: << File.expand_path('scbi_mapreduce')
|
|
7
7
|
# puts $:
|
8
8
|
|
9
9
|
module ScbiMapreduce
|
10
|
-
VERSION = '0.0.
|
10
|
+
VERSION = '0.0.40'
|
11
|
+
|
12
|
+
|
11
13
|
end
|
12
14
|
|
15
|
+
class Time
|
16
|
+
def self.now_us
|
17
|
+
return (Time.now.to_f)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
|
13
22
|
require 'scbi_mapreduce/manager'
|
14
23
|
require 'scbi_mapreduce/worker_launcher'
|
15
24
|
require 'scbi_mapreduce/worker'
|
@@ -26,8 +26,12 @@ module ScbiMapreduce
|
|
26
26
|
if log_file.nil?
|
27
27
|
log_file = File.join('logs','server_log.txt')
|
28
28
|
end
|
29
|
-
|
30
|
-
|
29
|
+
|
30
|
+
if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
|
31
|
+
FileUtils.mkdir_p(File.dirname(log_file))
|
32
|
+
$SERVER_LOG.info("Creating logs folder")
|
33
|
+
end
|
34
|
+
|
31
35
|
$SERVER_LOG = Logger.new(log_file)
|
32
36
|
|
33
37
|
|
@@ -92,7 +96,6 @@ module ScbiMapreduce
|
|
92
96
|
|
93
97
|
end
|
94
98
|
|
95
|
-
|
96
99
|
# Start an EventMachine loop acting as a server for incoming worker connections
|
97
100
|
def start_server
|
98
101
|
|
@@ -128,6 +131,17 @@ module ScbiMapreduce
|
|
128
131
|
end
|
129
132
|
|
130
133
|
|
131
|
-
end
|
132
134
|
|
135
|
+
def stats
|
136
|
+
@work_manager_class.stats
|
137
|
+
end
|
138
|
+
|
139
|
+
def save_stats(stats=nil, filename='scbi_mapreduce_stats.json')
|
140
|
+
@work_manager_class.save_stats(stats,filename)
|
141
|
+
end
|
142
|
+
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
|
133
147
|
end
|
@@ -12,12 +12,14 @@
|
|
12
12
|
# TODO - Data preload (queue?) instead of under demand loading
|
13
13
|
# DONE - Add serializer with marshal + zlib deflate/inflate
|
14
14
|
|
15
|
-
|
15
|
+
require 'json'
|
16
16
|
|
17
|
+
module ScbiMapreduce
|
17
18
|
|
18
19
|
PENDING_TO_SAVE=10
|
19
20
|
CHECKPOINT_FILE='scbi_mapreduce_checkpoint'
|
20
21
|
OLD_CHECKPOINT_FILE='old_scbi_mapreduce_checkpoint'
|
22
|
+
PROCESSING_TIMEOUT_MULTIPLIER=10
|
21
23
|
|
22
24
|
class WorkManagerData
|
23
25
|
|
@@ -25,48 +27,101 @@ module ScbiMapreduce
|
|
25
27
|
@@longest_processing_time=0
|
26
28
|
|
27
29
|
attr_reader :job_identifier
|
28
|
-
attr_accessor :status, :data
|
30
|
+
attr_accessor :status, :data, :sent_time, :received_time, :working_time, :worker_start_time, :worker_end_time, :worker_identifier
|
29
31
|
|
30
32
|
def initialize(objs)
|
31
|
-
|
33
|
+
@worker_identifier=0
|
32
34
|
@job_identifier=@@job_id
|
33
35
|
@@job_id+=1
|
34
36
|
@data=objs
|
35
37
|
|
36
|
-
|
37
|
-
@
|
38
|
+
@received_time=nil
|
39
|
+
@sent_time=0
|
38
40
|
@processing_time=nil
|
41
|
+
|
42
|
+
@worker_start_time=0
|
43
|
+
@worker_end_time=0
|
44
|
+
@worker_time=0
|
45
|
+
|
46
|
+
sent!
|
39
47
|
end
|
40
48
|
|
49
|
+
def update_with_received!(job)
|
50
|
+
@received_time=job.received_time
|
51
|
+
@sent_time=job.sent_time
|
52
|
+
@worker_end_time=job.worker_end_time
|
53
|
+
@worker_start_time=job.worker_start_time
|
54
|
+
|
55
|
+
@processing_time=@received_time-@sent_time
|
56
|
+
@worker_time=@worker_end_time-@worker_start_time
|
57
|
+
|
58
|
+
# save longer processing time
|
59
|
+
@@longest_processing_time=[@@longest_processing_time,@processing_time].max
|
60
|
+
|
61
|
+
@data=job.data
|
62
|
+
|
63
|
+
# if job.worker_identifier==0
|
64
|
+
# puts print_worker_time
|
65
|
+
# end
|
66
|
+
|
67
|
+
@status=:received
|
68
|
+
|
69
|
+
end
|
41
70
|
def received!(objs)
|
42
|
-
|
43
|
-
@received_time=Time.
|
71
|
+
|
72
|
+
@received_time=Time.now_us
|
73
|
+
|
44
74
|
@processing_time=@received_time-@sent_time
|
75
|
+
@worker_time=@worker_end_time-@worker_start_time
|
45
76
|
|
46
77
|
# save longer processing time
|
47
78
|
@@longest_processing_time=[@@longest_processing_time,@processing_time].max
|
48
79
|
|
80
|
+
@data=objs
|
81
|
+
|
49
82
|
@status=:received
|
50
83
|
end
|
51
84
|
|
85
|
+
def end_worker_time!
|
86
|
+
@worker_end_time=Time.now_us
|
87
|
+
@worker_time= (@worker_end_time - @worker_start_time)
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
def start_worker_time!
|
92
|
+
@worker_start_time=Time.now_us
|
93
|
+
end
|
94
|
+
|
52
95
|
def sent!
|
53
96
|
@status=:running
|
54
|
-
@sent_time=Time.
|
97
|
+
@sent_time=Time.now_us
|
55
98
|
end
|
56
99
|
|
57
100
|
def stuck?
|
58
|
-
(@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*
|
101
|
+
(@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*PROCESSING_TIMEOUT_MULTIPLIER))
|
59
102
|
end
|
60
103
|
|
61
104
|
# return running or real processing time
|
62
105
|
def processing_time
|
63
|
-
return (@processing_time || (Time.
|
106
|
+
return (@processing_time || (Time.now_us-@sent_time))
|
64
107
|
end
|
108
|
+
|
109
|
+
def worker_time
|
110
|
+
return (@worker_time)
|
111
|
+
end
|
112
|
+
|
113
|
+
def transmission_time
|
114
|
+
return (processing_time - worker_time)
|
115
|
+
end
|
65
116
|
|
66
117
|
def inspect
|
67
|
-
time="; time: #{processing_time}
|
118
|
+
time="; time: #{processing_time} usecs"
|
68
119
|
return "WorkManagerData: #{@job_identifier} => #{@status} #{time}"
|
69
120
|
end
|
121
|
+
|
122
|
+
def print_worker_time
|
123
|
+
return "WorkManagerData Times: #{@worker_start_time} => #{@worker_end_time} #{worker_time}"
|
124
|
+
end
|
70
125
|
|
71
126
|
def self.job_id=(c)
|
72
127
|
# puts "Setting job_id to #{c}"
|
@@ -132,9 +187,28 @@ module ScbiMapreduce
|
|
132
187
|
end
|
133
188
|
|
134
189
|
############
|
190
|
+
def self.stats
|
191
|
+
@@stats
|
192
|
+
end
|
193
|
+
|
194
|
+
def self.save_stats(stats=nil, filename='scbi_mapreduce_stats.json')
|
195
|
+
f=File.open(filename,'w')
|
196
|
+
|
197
|
+
if stats.nil?
|
198
|
+
f.puts JSON::pretty_generate @@stats
|
199
|
+
else
|
200
|
+
f.puts JSON::pretty_generate stats
|
201
|
+
end
|
202
|
+
|
203
|
+
f.close
|
204
|
+
end
|
135
205
|
|
136
206
|
def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs,exit_on_many_errors,chunk_size)
|
207
|
+
@@stats={}
|
137
208
|
@@count = 0
|
209
|
+
@@retried_jobs=0
|
210
|
+
@@sent_chunks=0
|
211
|
+
@@received_objects=0
|
138
212
|
@@want_to_exit=false
|
139
213
|
@@chunk_count = 0
|
140
214
|
@@workers = 0
|
@@ -162,9 +236,51 @@ module ScbiMapreduce
|
|
162
236
|
@@checkpoint=self.get_checkpoint
|
163
237
|
$SERVER_LOG.info "Detected checkpoint at #{@@checkpoint}"
|
164
238
|
end
|
239
|
+
|
240
|
+
# for statistics:
|
241
|
+
@@total_seconds=0
|
242
|
+
@@total_manager_time=0
|
243
|
+
# mean_worker_time=0
|
244
|
+
@@each_worker_time={}
|
245
|
+
@@each_transmission_time={}
|
246
|
+
|
247
|
+
@@total_read_time=0
|
248
|
+
@@total_write_time=0
|
249
|
+
# mean_transmission_time=0
|
165
250
|
|
166
251
|
end
|
167
252
|
|
253
|
+
|
254
|
+
def mean_time(h)
|
255
|
+
r=0
|
256
|
+
i=0
|
257
|
+
|
258
|
+
h.each do |k,v|
|
259
|
+
r+=h[k]
|
260
|
+
i+=1
|
261
|
+
end
|
262
|
+
|
263
|
+
if r>0
|
264
|
+
r=r/i.to_f
|
265
|
+
end
|
266
|
+
|
267
|
+
return r
|
268
|
+
end
|
269
|
+
def each_worker_time(worker,time)
|
270
|
+
if @@each_worker_time[worker].nil? then
|
271
|
+
@@each_worker_time[worker]=0
|
272
|
+
end
|
273
|
+
@@each_worker_time[worker]+=time
|
274
|
+
end
|
275
|
+
|
276
|
+
def each_transmission_time(worker,time)
|
277
|
+
if @@each_transmission_time[worker].nil? then
|
278
|
+
@@each_transmission_time[worker]=0
|
279
|
+
end
|
280
|
+
@@each_transmission_time[worker]+=time
|
281
|
+
end
|
282
|
+
|
283
|
+
|
168
284
|
def self.checkpoint
|
169
285
|
return @@checkpoint
|
170
286
|
end
|
@@ -232,6 +348,8 @@ module ScbiMapreduce
|
|
232
348
|
sent=false
|
233
349
|
|
234
350
|
if @@retry_stuck_jobs
|
351
|
+
# $SERVER_LOG.debug("="*40)
|
352
|
+
# print_running_jobs
|
235
353
|
# count stuck jobs and re-send the first one
|
236
354
|
stuck_works=@@running_jobs.select{|job| job.stuck?}
|
237
355
|
|
@@ -240,8 +358,10 @@ module ScbiMapreduce
|
|
240
358
|
$SERVER_LOG.info("Stuck Jobs:\n#{jobs}")
|
241
359
|
|
242
360
|
# send_object
|
243
|
-
send_object(stuck_works.first)
|
244
361
|
stuck_works.first.sent!
|
362
|
+
send_object(stuck_works.first)
|
363
|
+
@@sent_chunks+=1
|
364
|
+
@@retried_jobs+=1
|
245
365
|
$SERVER_LOG.info("Sending stuck work #{stuck_works.first.inspect}")
|
246
366
|
sent=true
|
247
367
|
end
|
@@ -262,7 +382,9 @@ module ScbiMapreduce
|
|
262
382
|
|
263
383
|
#send stuck work
|
264
384
|
objs=[]
|
265
|
-
|
385
|
+
|
386
|
+
t=Time.now_us
|
387
|
+
|
266
388
|
# prepare new data
|
267
389
|
@@chunk_size.times do
|
268
390
|
obj=next_work
|
@@ -273,7 +395,9 @@ module ScbiMapreduce
|
|
273
395
|
objs << obj
|
274
396
|
end
|
275
397
|
end
|
276
|
-
|
398
|
+
|
399
|
+
@@total_read_time+=(Time.now_us - t)
|
400
|
+
|
277
401
|
# if new data was collected, send it
|
278
402
|
if objs.count>0
|
279
403
|
@@count += objs.count
|
@@ -281,6 +405,7 @@ module ScbiMapreduce
|
|
281
405
|
|
282
406
|
work_data=WorkManagerData.new(objs)
|
283
407
|
send_object(work_data)
|
408
|
+
@@sent_chunks+=1
|
284
409
|
|
285
410
|
# to keep order or retry failed job, we need job status
|
286
411
|
if @@keep_order || @@retry_stuck_jobs
|
@@ -290,8 +415,15 @@ module ScbiMapreduce
|
|
290
415
|
# print_running_jobs
|
291
416
|
end
|
292
417
|
else
|
293
|
-
# otherwise,
|
294
|
-
|
418
|
+
# otherwise,
|
419
|
+
if @@running_jobs.count >0
|
420
|
+
$SERVER_LOG.info("Worker, go to sleep")
|
421
|
+
send_object(:sleep)
|
422
|
+
|
423
|
+
else
|
424
|
+
# send a quit value indicating no more data available
|
425
|
+
send_object(:quit)
|
426
|
+
end
|
295
427
|
end
|
296
428
|
end
|
297
429
|
end
|
@@ -339,7 +471,7 @@ module ScbiMapreduce
|
|
339
471
|
@@max_workers +=1
|
340
472
|
# when first worker is connected, do special config
|
341
473
|
if @@workers == 1
|
342
|
-
@@total_seconds = Time.
|
474
|
+
@@total_seconds = Time.now_us
|
343
475
|
$SERVER_LOG.info "First worker connected"
|
344
476
|
|
345
477
|
if @@checkpointing
|
@@ -382,16 +514,31 @@ module ScbiMapreduce
|
|
382
514
|
end
|
383
515
|
end
|
384
516
|
|
517
|
+
elsif obj == :waking_up
|
518
|
+
$SERVER_LOG.info("Worker woke up")
|
385
519
|
else
|
520
|
+
|
386
521
|
# if not using checkpointing
|
387
|
-
|
522
|
+
obj.received!(obj.data)
|
388
523
|
|
389
524
|
if @@checkpointing || @@keep_order || @@retry_stuck_jobs
|
390
525
|
# print_running_jobs
|
391
526
|
checkpointable_job_received(obj)
|
392
527
|
else
|
528
|
+
# change this job's status to received
|
529
|
+
|
530
|
+
t=Time.now_us
|
393
531
|
work_received(obj.data)
|
532
|
+
@@received_objects+=obj.data.count
|
533
|
+
@@total_write_time+=(Time.now_us - t)
|
394
534
|
end
|
535
|
+
|
536
|
+
# puts obj.worker_identifier,obj.worker_identifier.class
|
537
|
+
# if obj.worker_identifier==0 then
|
538
|
+
# end
|
539
|
+
|
540
|
+
each_worker_time(obj.worker_identifier, obj.worker_time)
|
541
|
+
each_transmission_time(obj.worker_identifier, obj.transmission_time)
|
395
542
|
end
|
396
543
|
|
397
544
|
# free mem
|
@@ -409,10 +556,8 @@ module ScbiMapreduce
|
|
409
556
|
# save job if a valid work was previously sent
|
410
557
|
if received_job
|
411
558
|
|
412
|
-
# change this job's status to received
|
413
|
-
received_job.
|
414
|
-
|
415
|
-
|
559
|
+
# change this job's status to received, already done in previous method
|
560
|
+
received_job.update_with_received!(obj)
|
416
561
|
|
417
562
|
# # if there are sufficient jobs, count pending ones
|
418
563
|
# if (@@running_jobs.count>=PENDING_TO_SAVE)
|
@@ -439,7 +584,11 @@ module ScbiMapreduce
|
|
439
584
|
@@running_jobs.each do |job|
|
440
585
|
if job.status==:received
|
441
586
|
# puts "Sent to save: #{job.inspect}"
|
587
|
+
t=Time.now_us
|
442
588
|
work_received(job.data)
|
589
|
+
@@received_objects+=job.data.count
|
590
|
+
@@total_write_time+=(Time.now_us - t)
|
591
|
+
|
443
592
|
job.status=:saved
|
444
593
|
to_remove += 1
|
445
594
|
else
|
@@ -490,6 +639,8 @@ module ScbiMapreduce
|
|
490
639
|
end
|
491
640
|
end
|
492
641
|
|
642
|
+
|
643
|
+
|
493
644
|
def stop_work_manager
|
494
645
|
|
495
646
|
|
@@ -499,12 +650,56 @@ module ScbiMapreduce
|
|
499
650
|
|
500
651
|
self.class.end_work_manager
|
501
652
|
|
502
|
-
@@total_seconds = Time.
|
653
|
+
@@total_seconds = (Time.now_us-@@total_seconds)
|
654
|
+
@@total_manager_time= @@total_manager_time
|
655
|
+
|
656
|
+
@@total_read_time=@@total_read_time
|
657
|
+
@@total_write_time=@@total_write_time
|
658
|
+
|
659
|
+
mean_worker_time=mean_time(@@each_worker_time)
|
660
|
+
mean_transmission_time=mean_time(@@each_transmission_time)
|
661
|
+
|
662
|
+
idle_time=(@@total_seconds - @@total_read_time -@@total_write_time - mean_transmission_time)
|
663
|
+
|
664
|
+
@@stats={}
|
665
|
+
@@stats[:total_objects]=@@count
|
666
|
+
@@stats[:total_seconds]=@@total_seconds
|
667
|
+
@@stats[:sent_chunks]=@@sent_chunks
|
668
|
+
@@stats[:received_objects]=@@received_objects
|
669
|
+
@@stats[:processing_rate]=(@@count/@@total_seconds.to_f)
|
670
|
+
@@stats[:total_read_time]=@@total_read_time
|
671
|
+
@@stats[:total_write_time]=@@total_write_time
|
672
|
+
@@stats[:mean_worker_time]=mean_worker_time
|
673
|
+
@@stats[:mean_transmission_time]=mean_transmission_time
|
674
|
+
@@stats[:total_manager_idle_time]=idle_time
|
675
|
+
|
676
|
+
@@stats[:error_count]=@@error_count
|
677
|
+
@@stats[:retried_jobs]=@@retried_jobs
|
678
|
+
@@stats[:chunk_size]=@@chunk_size
|
679
|
+
@@stats[:connected_workers]=@@max_workers
|
680
|
+
@@stats[:each_transmission_time]=@@each_transmission_time
|
681
|
+
@@stats[:each_worker_time]=@@each_worker_time
|
682
|
+
|
683
|
+
|
503
684
|
$SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
|
685
|
+
$SERVER_LOG.info "Total sent chunks: #{@@sent_chunks} objects"
|
686
|
+
|
687
|
+
$SERVER_LOG.info "Total sent objects: #{@@count} objects"
|
688
|
+
$SERVER_LOG.info "Total received objects: #{@@received_objects} objects"
|
689
|
+
|
504
690
|
$SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
|
505
691
|
$SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
|
506
|
-
|
692
|
+
|
693
|
+
$SERVER_LOG.info "Total read time #{@@total_read_time} seconds"
|
694
|
+
$SERVER_LOG.info "Total write time #{@@total_write_time} seconds"
|
695
|
+
# mean_worker_time=mean_worker_time/@@max_workers
|
696
|
+
$SERVER_LOG.info "Total worker time #{mean_worker_time} seconds"
|
697
|
+
$SERVER_LOG.info "Total transmission time #{mean_transmission_time} seconds"
|
698
|
+
$SERVER_LOG.info "Total manager_idle time #{idle_time} seconds"
|
699
|
+
# $SERVER_LOG.info "Total manager time #{@@total_read_time + @@total_write_time + mean_transmission_time} seconds"
|
700
|
+
|
507
701
|
$SERVER_LOG.info "Number of errors: #{@@error_count}"
|
702
|
+
$SERVER_LOG.info "Number of retried stuck jobs: #{@@retried_jobs}"
|
508
703
|
$SERVER_LOG.info "Chunk size: #{@@chunk_size}"
|
509
704
|
$SERVER_LOG.info "Total connected workers: #{@@max_workers}"
|
510
705
|
|
@@ -68,6 +68,7 @@ module ScbiMapreduce
|
|
68
68
|
else
|
69
69
|
$WORKER_LOG.info("received:"+obj.to_s)
|
70
70
|
|
71
|
+
|
71
72
|
if (obj == :quit) || @@want_to_exit_worker
|
72
73
|
$WORKER_LOG.info('Quit received')
|
73
74
|
|
@@ -76,8 +77,13 @@ module ScbiMapreduce
|
|
76
77
|
elsif @@want_to_exit_worker
|
77
78
|
$WORKER_LOG.info('Want to exit worker')
|
78
79
|
stop_worker
|
80
|
+
elsif (obj== :sleep)
|
81
|
+
$WORKER_LOG.info('Sleeping 10 secs')
|
82
|
+
sleep 10
|
83
|
+
send_object(:waking_up)
|
79
84
|
else
|
80
85
|
@@count += 1
|
86
|
+
obj.worker_identifier=@@worker_id.to_i
|
81
87
|
|
82
88
|
# OJO - HAY QUE PASAR EL MODIFIED OBJECT
|
83
89
|
# operation = proc {
|
@@ -96,10 +102,14 @@ module ScbiMapreduce
|
|
96
102
|
|
97
103
|
|
98
104
|
begin
|
99
|
-
|
105
|
+
|
106
|
+
obj.start_worker_time!
|
107
|
+
|
100
108
|
modified_data=process_object(obj.data)
|
101
109
|
obj.data = modified_data
|
102
|
-
|
110
|
+
|
111
|
+
obj.end_worker_time!
|
112
|
+
|
103
113
|
# if obj.job_identifier==3
|
104
114
|
# sleep 15
|
105
115
|
# end
|
@@ -160,20 +170,27 @@ module ScbiMapreduce
|
|
160
170
|
|
161
171
|
$LOG = $WORKER_LOG
|
162
172
|
|
163
|
-
total_seconds = Time.
|
173
|
+
total_seconds = Time.now_us
|
164
174
|
|
165
175
|
EM.error_handler{ |e|
|
166
176
|
$WORKER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
|
167
177
|
}
|
178
|
+
|
179
|
+
Signal.trap("CONT") do
|
180
|
+
$WORKER_LOG.info("SIGCONT: Worker: #{@@worker_id} with PID: #{Process.pid} sleeping 15 seconds before waking up")
|
181
|
+
puts "SIGCONT: Worker: #{@@worker_id} with PID: #{Process.pid} sleeping 15 seconds before waking up"
|
182
|
+
|
183
|
+
sleep 15
|
184
|
+
end
|
168
185
|
|
169
186
|
EventMachine::run {
|
170
187
|
|
171
188
|
EventMachine::connect ip, port, self
|
172
|
-
$WORKER_LOG.info "Worker connected to #{ip}:#{port}"
|
189
|
+
$WORKER_LOG.info "Worker: #{@@worker_id} with PID: #{Process.pid} connected to #{ip}:#{port}"
|
173
190
|
|
174
191
|
}
|
175
192
|
|
176
|
-
total_seconds = Time.
|
193
|
+
total_seconds = Time.now_us-total_seconds
|
177
194
|
$WORKER_LOG.info "Client #{@@worker_id} processed: #{@@count} objs"
|
178
195
|
$WORKER_LOG.info "Client #{@@worker_id} proc rate: #{@@count/total_seconds.to_f} objects/seg"
|
179
196
|
|
@@ -9,6 +9,8 @@ module ScbiMapreduce
|
|
9
9
|
class WorkerLauncher
|
10
10
|
|
11
11
|
attr_accessor :server_ip, :server_port
|
12
|
+
|
13
|
+
@@worker_id=0
|
12
14
|
|
13
15
|
def initialize(server_ip,server_port, server_ip_list,workers, worker_file, log_file=nil, init_env_file=nil)
|
14
16
|
@server_ip = server_ip
|
@@ -19,8 +21,9 @@ module ScbiMapreduce
|
|
19
21
|
@server_ip_list=server_ip_list
|
20
22
|
|
21
23
|
|
24
|
+
|
22
25
|
if log_file.nil?
|
23
|
-
log_file =
|
26
|
+
log_file = "logs/launcher_global_log.txt"
|
24
27
|
end
|
25
28
|
|
26
29
|
FileUtils.mkdir_p(File.dirname(log_file)) if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
|
@@ -44,10 +47,10 @@ module ScbiMapreduce
|
|
44
47
|
threads = []
|
45
48
|
@workers.times do |i|
|
46
49
|
pid=fork{
|
47
|
-
launch_worker(
|
50
|
+
launch_worker(@@worker_id,@server_ip,@server_port)
|
48
51
|
$LAUNCHER_LOG.info "Worker #{i} launched [#{@server_ip}:#{@server_port}]"
|
49
52
|
}
|
50
|
-
|
53
|
+
@@worker_id+=1
|
51
54
|
#threads.each { |aThread| aThread.join }
|
52
55
|
end
|
53
56
|
#Process.waitall
|
@@ -109,7 +112,14 @@ module ScbiMapreduce
|
|
109
112
|
|
110
113
|
def launch_external_workers(workers)
|
111
114
|
puts "Launching #{workers.count} external workers: #{workers}"
|
112
|
-
|
115
|
+
puts "INIT_ENV_FILE: #{@init_env_file}"
|
116
|
+
|
117
|
+
# This sleep is necessary to give Lustre filesystems time to sync the log folder between all nodes. Otherwise, external workers will not be launched.
|
118
|
+
if !workers.empty?
|
119
|
+
puts "SLEEP 10 for logs folder sync in lustre fs"
|
120
|
+
sleep 10
|
121
|
+
end
|
122
|
+
|
113
123
|
init=''
|
114
124
|
if @init_env_file
|
115
125
|
init_path = File.expand_path(@init_env_file)
|
@@ -129,10 +139,13 @@ module ScbiMapreduce
|
|
129
139
|
cd = "cd #{init_dir}; "
|
130
140
|
end
|
131
141
|
|
132
|
-
|
142
|
+
|
133
143
|
|
134
144
|
workers.each do |machine|
|
135
|
-
|
145
|
+
|
146
|
+
log_file=File.join(init_dir,'logs',"launcher_#{@@worker_id}")
|
147
|
+
log_dir=File.join(init_dir,'logs')
|
148
|
+
|
136
149
|
# if server_ip is not in valid ips
|
137
150
|
if !@server_ip_list.include?(@server_ip)
|
138
151
|
# find matching ip between server and worker
|
@@ -146,7 +159,7 @@ module ScbiMapreduce
|
|
146
159
|
if !found_ip.nil?
|
147
160
|
# cmd = "ssh #{machine} \"#{init} #{cd} #{INTERPRETER} #{File.join(File.dirname(__FILE__),'main_worker.rb')} #{worker_id.to_s} #{@server_ip} #{@server_port} #{@worker_file}\""
|
148
161
|
# cmd = "ssh #{machine} \"nohup #{File.join(File.dirname(__FILE__),'launcher.sh')} #{worker_id.to_s} #{@server_ip} #{@server_port} #{@worker_file} #{init_dir} #{init_path} </dev/null >> #{log_file} 2>> #{log_file} & \""
|
149
|
-
cmd = "ssh #{machine} \"nohup
|
162
|
+
cmd = "ssh #{machine} \"nohup #{File.join(File.dirname(__FILE__),'launcher.sh')} #{@@worker_id.to_s} #{found_ip} #{@server_port} #{@worker_file} #{init_dir} #{init_path} </dev/null >> #{log_file} 2>> #{log_file} & \""
|
150
163
|
|
151
164
|
$LAUNCHER_LOG.info cmd
|
152
165
|
|
@@ -154,7 +167,7 @@ module ScbiMapreduce
|
|
154
167
|
exec(cmd)
|
155
168
|
}
|
156
169
|
|
157
|
-
worker_id+=1
|
170
|
+
@@worker_id+=1
|
158
171
|
else
|
159
172
|
$LAUNCHER_LOG.error("Couldn't find a matching ip between worker (#{machine_ip}) and server #{ip_list.to_json}")
|
160
173
|
end
|
metadata
CHANGED
@@ -1,65 +1,113 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: scbi_mapreduce
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.40
|
4
5
|
prerelease:
|
5
|
-
version: 0.0.38
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Dario Guerrero
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2013-09-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
16
15
|
name: eventmachine
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
19
17
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
23
21
|
version: 0.12.0
|
24
22
|
type: :runtime
|
25
|
-
version_requirements: *id001
|
26
|
-
- !ruby/object:Gem::Dependency
|
27
|
-
name: json
|
28
23
|
prerelease: false
|
29
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.12.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: json
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
30
33
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
34
|
-
version:
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
35
38
|
type: :runtime
|
36
|
-
version_requirements: *id002
|
37
|
-
- !ruby/object:Gem::Dependency
|
38
|
-
name: hoe
|
39
39
|
prerelease: false
|
40
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rdoc
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
41
49
|
none: false
|
42
|
-
requirements:
|
43
|
-
- -
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version:
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '4.0'
|
46
54
|
type: :development
|
47
|
-
|
48
|
-
|
49
|
-
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: newgem
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.5.3
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.5.3
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: hoe
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '3.6'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '3.6'
|
94
|
+
description: scbi_mapreduce brings parallel and distributed computing capabilities
|
95
|
+
to your code, with a very easy to use framework that allows you to exploit your
|
96
|
+
clustered or cloud computational resources.
|
97
|
+
email:
|
50
98
|
- dariogf@gmail.com
|
51
|
-
executables:
|
99
|
+
executables:
|
52
100
|
- scbi_mapreduce
|
53
101
|
extensions: []
|
54
|
-
|
55
|
-
extra_rdoc_files:
|
102
|
+
extra_rdoc_files:
|
56
103
|
- History.txt
|
57
104
|
- Manifest.txt
|
58
105
|
- PostInstall.txt
|
106
|
+
- README.rdoc
|
59
107
|
- skeleton/simple/README.txt
|
60
108
|
- skeleton/remove_mids/README.txt
|
61
109
|
- skeleton/dummy_calcs/README.txt
|
62
|
-
files:
|
110
|
+
files:
|
63
111
|
- History.txt
|
64
112
|
- lib/scbi_mapreduce/error_handler.rb
|
65
113
|
- lib/scbi_mapreduce/main_worker.rb
|
@@ -106,34 +154,35 @@ files:
|
|
106
154
|
- skeleton/dummy_calcs/my_worker_manager.rb
|
107
155
|
- skeleton/dummy_calcs/README.txt
|
108
156
|
- skeleton/dummy_calcs/threads_implementation.rb
|
157
|
+
- .gemtest
|
109
158
|
homepage: http://www.scbi.uma.es/downloads
|
110
159
|
licenses: []
|
111
|
-
|
112
160
|
post_install_message: PostInstall.txt
|
113
|
-
rdoc_options:
|
161
|
+
rdoc_options:
|
114
162
|
- --main
|
115
163
|
- README.rdoc
|
116
|
-
require_paths:
|
164
|
+
require_paths:
|
117
165
|
- lib
|
118
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
167
|
none: false
|
120
|
-
requirements:
|
121
|
-
- -
|
122
|
-
- !ruby/object:Gem::Version
|
123
|
-
version:
|
124
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
168
|
+
requirements:
|
169
|
+
- - ! '>='
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: '0'
|
172
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
173
|
none: false
|
126
|
-
requirements:
|
127
|
-
- -
|
128
|
-
- !ruby/object:Gem::Version
|
129
|
-
version:
|
174
|
+
requirements:
|
175
|
+
- - ! '>='
|
176
|
+
- !ruby/object:Gem::Version
|
177
|
+
version: '0'
|
130
178
|
requirements: []
|
131
|
-
|
132
179
|
rubyforge_project: scbi_mapreduce
|
133
|
-
rubygems_version: 1.
|
180
|
+
rubygems_version: 1.8.24
|
134
181
|
signing_key:
|
135
182
|
specification_version: 3
|
136
|
-
summary: scbi_mapreduce brings parallel and distributed computing capabilities to
|
137
|
-
|
183
|
+
summary: scbi_mapreduce brings parallel and distributed computing capabilities to
|
184
|
+
your code, with a very easy to use framework that allows you to exploit your clustered
|
185
|
+
or cloud computational resources.
|
186
|
+
test_files:
|
138
187
|
- test/test_helper.rb
|
139
188
|
- test/test_scbi_drb.rb
|