scbi_mapreduce 0.0.38 → 0.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +8 -0
- data/README.rdoc +8 -2
- data/lib/scbi_mapreduce.rb +10 -1
- data/lib/scbi_mapreduce/manager.rb +18 -4
- data/lib/scbi_mapreduce/work_manager.rb +219 -24
- data/lib/scbi_mapreduce/worker.rb +22 -5
- data/lib/scbi_mapreduce/worker_launcher.rb +21 -8
- metadata +102 -53
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -238,22 +238,28 @@ Your worker file will be used to launch workers.
|
|
238
238
|
|
239
239
|
You can also set additional properties:
|
240
240
|
|
241
|
-
|
242
241
|
# if you want basic checkpointing. Some performance drop should be expected
|
243
242
|
# mgr.checkpointing=true
|
244
243
|
|
245
244
|
# if you want to keep the order of input data. Some performance drop should be expected
|
246
245
|
# mgr.keep_order=true
|
247
246
|
|
247
|
+
# Enable fault tolerance for stuck jobs. Jobs that have been stuck will be sent again to another worker. Some performance drop should be expected
|
248
|
+
# mgr.retry_stuck_jobs=true
|
249
|
+
|
248
250
|
# you can set the size of packets of data sent to workers
|
249
251
|
mgr.chunk_size=100
|
250
252
|
|
251
253
|
|
252
|
-
And finally, start the server:
|
254
|
+
And finally, start the server, and write a file with specific statistics if desired:
|
253
255
|
|
254
256
|
# start processing
|
255
257
|
mgr.start_server
|
256
258
|
|
259
|
+
# save full stats and a custom value in json format to a file
|
260
|
+
mgr.stats[:my_stats]=11
|
261
|
+
|
262
|
+
mgr.save_stats
|
257
263
|
|
258
264
|
# this line is reached when all data has been processed
|
259
265
|
puts "Program finished"
|
data/lib/scbi_mapreduce.rb
CHANGED
@@ -7,9 +7,18 @@ $: << File.expand_path('scbi_mapreduce')
|
|
7
7
|
# puts $:
|
8
8
|
|
9
9
|
module ScbiMapreduce
|
10
|
-
VERSION = '0.0.
|
10
|
+
VERSION = '0.0.40'
|
11
|
+
|
12
|
+
|
11
13
|
end
|
12
14
|
|
15
|
+
class Time
|
16
|
+
def self.now_us
|
17
|
+
return (Time.now.to_f)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
|
13
22
|
require 'scbi_mapreduce/manager'
|
14
23
|
require 'scbi_mapreduce/worker_launcher'
|
15
24
|
require 'scbi_mapreduce/worker'
|
@@ -26,8 +26,12 @@ module ScbiMapreduce
|
|
26
26
|
if log_file.nil?
|
27
27
|
log_file = File.join('logs','server_log.txt')
|
28
28
|
end
|
29
|
-
|
30
|
-
|
29
|
+
|
30
|
+
if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
|
31
|
+
FileUtils.mkdir_p(File.dirname(log_file))
|
32
|
+
$SERVER_LOG.info("Creating logs folder")
|
33
|
+
end
|
34
|
+
|
31
35
|
$SERVER_LOG = Logger.new(log_file)
|
32
36
|
|
33
37
|
|
@@ -92,7 +96,6 @@ module ScbiMapreduce
|
|
92
96
|
|
93
97
|
end
|
94
98
|
|
95
|
-
|
96
99
|
# Start an EventMachine loop acting as a server for incoming worker connections
|
97
100
|
def start_server
|
98
101
|
|
@@ -128,6 +131,17 @@ module ScbiMapreduce
|
|
128
131
|
end
|
129
132
|
|
130
133
|
|
131
|
-
end
|
132
134
|
|
135
|
+
def stats
|
136
|
+
@work_manager_class.stats
|
137
|
+
end
|
138
|
+
|
139
|
+
def save_stats(stats=nil, filename='scbi_mapreduce_stats.json')
|
140
|
+
@work_manager_class.save_stats(stats,filename)
|
141
|
+
end
|
142
|
+
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
|
133
147
|
end
|
@@ -12,12 +12,14 @@
|
|
12
12
|
# TODO - Data preload (queue?) instead of under demand loading
|
13
13
|
# DONE - Add serializer with marshal + zlib deflate/inflate
|
14
14
|
|
15
|
-
|
15
|
+
require 'json'
|
16
16
|
|
17
|
+
module ScbiMapreduce
|
17
18
|
|
18
19
|
PENDING_TO_SAVE=10
|
19
20
|
CHECKPOINT_FILE='scbi_mapreduce_checkpoint'
|
20
21
|
OLD_CHECKPOINT_FILE='old_scbi_mapreduce_checkpoint'
|
22
|
+
PROCESSING_TIMEOUT_MULTIPLIER=10
|
21
23
|
|
22
24
|
class WorkManagerData
|
23
25
|
|
@@ -25,48 +27,101 @@ module ScbiMapreduce
|
|
25
27
|
@@longest_processing_time=0
|
26
28
|
|
27
29
|
attr_reader :job_identifier
|
28
|
-
attr_accessor :status, :data
|
30
|
+
attr_accessor :status, :data, :sent_time, :received_time, :working_time, :worker_start_time, :worker_end_time, :worker_identifier
|
29
31
|
|
30
32
|
def initialize(objs)
|
31
|
-
|
33
|
+
@worker_identifier=0
|
32
34
|
@job_identifier=@@job_id
|
33
35
|
@@job_id+=1
|
34
36
|
@data=objs
|
35
37
|
|
36
|
-
|
37
|
-
@
|
38
|
+
@received_time=nil
|
39
|
+
@sent_time=0
|
38
40
|
@processing_time=nil
|
41
|
+
|
42
|
+
@worker_start_time=0
|
43
|
+
@worker_end_time=0
|
44
|
+
@worker_time=0
|
45
|
+
|
46
|
+
sent!
|
39
47
|
end
|
40
48
|
|
49
|
+
def update_with_received!(job)
|
50
|
+
@received_time=job.received_time
|
51
|
+
@sent_time=job.sent_time
|
52
|
+
@worker_end_time=job.worker_end_time
|
53
|
+
@worker_start_time=job.worker_start_time
|
54
|
+
|
55
|
+
@processing_time=@received_time-@sent_time
|
56
|
+
@worker_time=@worker_end_time-@worker_start_time
|
57
|
+
|
58
|
+
# save longer processing time
|
59
|
+
@@longest_processing_time=[@@longest_processing_time,@processing_time].max
|
60
|
+
|
61
|
+
@data=job.data
|
62
|
+
|
63
|
+
# if job.worker_identifier==0
|
64
|
+
# puts print_worker_time
|
65
|
+
# end
|
66
|
+
|
67
|
+
@status=:received
|
68
|
+
|
69
|
+
end
|
41
70
|
def received!(objs)
|
42
|
-
|
43
|
-
@received_time=Time.
|
71
|
+
|
72
|
+
@received_time=Time.now_us
|
73
|
+
|
44
74
|
@processing_time=@received_time-@sent_time
|
75
|
+
@worker_time=@worker_end_time-@worker_start_time
|
45
76
|
|
46
77
|
# save longer processing time
|
47
78
|
@@longest_processing_time=[@@longest_processing_time,@processing_time].max
|
48
79
|
|
80
|
+
@data=objs
|
81
|
+
|
49
82
|
@status=:received
|
50
83
|
end
|
51
84
|
|
85
|
+
def end_worker_time!
|
86
|
+
@worker_end_time=Time.now_us
|
87
|
+
@worker_time= (@worker_end_time - @worker_start_time)
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
def start_worker_time!
|
92
|
+
@worker_start_time=Time.now_us
|
93
|
+
end
|
94
|
+
|
52
95
|
def sent!
|
53
96
|
@status=:running
|
54
|
-
@sent_time=Time.
|
97
|
+
@sent_time=Time.now_us
|
55
98
|
end
|
56
99
|
|
57
100
|
def stuck?
|
58
|
-
(@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*
|
101
|
+
(@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*PROCESSING_TIMEOUT_MULTIPLIER))
|
59
102
|
end
|
60
103
|
|
61
104
|
# return running or real processing time
|
62
105
|
def processing_time
|
63
|
-
return (@processing_time || (Time.
|
106
|
+
return (@processing_time || (Time.now_us-@sent_time))
|
64
107
|
end
|
108
|
+
|
109
|
+
def worker_time
|
110
|
+
return (@worker_time)
|
111
|
+
end
|
112
|
+
|
113
|
+
def transmission_time
|
114
|
+
return (processing_time - worker_time)
|
115
|
+
end
|
65
116
|
|
66
117
|
def inspect
|
67
|
-
time="; time: #{processing_time}
|
118
|
+
time="; time: #{processing_time} usecs"
|
68
119
|
return "WorkManagerData: #{@job_identifier} => #{@status} #{time}"
|
69
120
|
end
|
121
|
+
|
122
|
+
def print_worker_time
|
123
|
+
return "WorkManagerData Times: #{@worker_start_time} => #{@worker_end_time} #{worker_time}"
|
124
|
+
end
|
70
125
|
|
71
126
|
def self.job_id=(c)
|
72
127
|
# puts "Setting job_id to #{c}"
|
@@ -132,9 +187,28 @@ module ScbiMapreduce
|
|
132
187
|
end
|
133
188
|
|
134
189
|
############
|
190
|
+
def self.stats
|
191
|
+
@@stats
|
192
|
+
end
|
193
|
+
|
194
|
+
def self.save_stats(stats=nil, filename='scbi_mapreduce_stats.json')
|
195
|
+
f=File.open(filename,'w')
|
196
|
+
|
197
|
+
if stats.nil?
|
198
|
+
f.puts JSON::pretty_generate @@stats
|
199
|
+
else
|
200
|
+
f.puts JSON::pretty_generate stats
|
201
|
+
end
|
202
|
+
|
203
|
+
f.close
|
204
|
+
end
|
135
205
|
|
136
206
|
def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs,exit_on_many_errors,chunk_size)
|
207
|
+
@@stats={}
|
137
208
|
@@count = 0
|
209
|
+
@@retried_jobs=0
|
210
|
+
@@sent_chunks=0
|
211
|
+
@@received_objects=0
|
138
212
|
@@want_to_exit=false
|
139
213
|
@@chunk_count = 0
|
140
214
|
@@workers = 0
|
@@ -162,9 +236,51 @@ module ScbiMapreduce
|
|
162
236
|
@@checkpoint=self.get_checkpoint
|
163
237
|
$SERVER_LOG.info "Detected checkpoint at #{@@checkpoint}"
|
164
238
|
end
|
239
|
+
|
240
|
+
# for statistics:
|
241
|
+
@@total_seconds=0
|
242
|
+
@@total_manager_time=0
|
243
|
+
# mean_worker_time=0
|
244
|
+
@@each_worker_time={}
|
245
|
+
@@each_transmission_time={}
|
246
|
+
|
247
|
+
@@total_read_time=0
|
248
|
+
@@total_write_time=0
|
249
|
+
# mean_transmission_time=0
|
165
250
|
|
166
251
|
end
|
167
252
|
|
253
|
+
|
254
|
+
def mean_time(h)
|
255
|
+
r=0
|
256
|
+
i=0
|
257
|
+
|
258
|
+
h.each do |k,v|
|
259
|
+
r+=h[k]
|
260
|
+
i+=1
|
261
|
+
end
|
262
|
+
|
263
|
+
if r>0
|
264
|
+
r=r/i.to_f
|
265
|
+
end
|
266
|
+
|
267
|
+
return r
|
268
|
+
end
|
269
|
+
def each_worker_time(worker,time)
|
270
|
+
if @@each_worker_time[worker].nil? then
|
271
|
+
@@each_worker_time[worker]=0
|
272
|
+
end
|
273
|
+
@@each_worker_time[worker]+=time
|
274
|
+
end
|
275
|
+
|
276
|
+
def each_transmission_time(worker,time)
|
277
|
+
if @@each_transmission_time[worker].nil? then
|
278
|
+
@@each_transmission_time[worker]=0
|
279
|
+
end
|
280
|
+
@@each_transmission_time[worker]+=time
|
281
|
+
end
|
282
|
+
|
283
|
+
|
168
284
|
def self.checkpoint
|
169
285
|
return @@checkpoint
|
170
286
|
end
|
@@ -232,6 +348,8 @@ module ScbiMapreduce
|
|
232
348
|
sent=false
|
233
349
|
|
234
350
|
if @@retry_stuck_jobs
|
351
|
+
# $SERVER_LOG.debug("="*40)
|
352
|
+
# print_running_jobs
|
235
353
|
# count stuck jobs and re-send the first one
|
236
354
|
stuck_works=@@running_jobs.select{|job| job.stuck?}
|
237
355
|
|
@@ -240,8 +358,10 @@ module ScbiMapreduce
|
|
240
358
|
$SERVER_LOG.info("Stuck Jobs:\n#{jobs}")
|
241
359
|
|
242
360
|
# send_object
|
243
|
-
send_object(stuck_works.first)
|
244
361
|
stuck_works.first.sent!
|
362
|
+
send_object(stuck_works.first)
|
363
|
+
@@sent_chunks+=1
|
364
|
+
@@retried_jobs+=1
|
245
365
|
$SERVER_LOG.info("Sending stuck work #{stuck_works.first.inspect}")
|
246
366
|
sent=true
|
247
367
|
end
|
@@ -262,7 +382,9 @@ module ScbiMapreduce
|
|
262
382
|
|
263
383
|
#send stuck work
|
264
384
|
objs=[]
|
265
|
-
|
385
|
+
|
386
|
+
t=Time.now_us
|
387
|
+
|
266
388
|
# prepare new data
|
267
389
|
@@chunk_size.times do
|
268
390
|
obj=next_work
|
@@ -273,7 +395,9 @@ module ScbiMapreduce
|
|
273
395
|
objs << obj
|
274
396
|
end
|
275
397
|
end
|
276
|
-
|
398
|
+
|
399
|
+
@@total_read_time+=(Time.now_us - t)
|
400
|
+
|
277
401
|
# if new data was collected, send it
|
278
402
|
if objs.count>0
|
279
403
|
@@count += objs.count
|
@@ -281,6 +405,7 @@ module ScbiMapreduce
|
|
281
405
|
|
282
406
|
work_data=WorkManagerData.new(objs)
|
283
407
|
send_object(work_data)
|
408
|
+
@@sent_chunks+=1
|
284
409
|
|
285
410
|
# to keep order or retry failed job, we need job status
|
286
411
|
if @@keep_order || @@retry_stuck_jobs
|
@@ -290,8 +415,15 @@ module ScbiMapreduce
|
|
290
415
|
# print_running_jobs
|
291
416
|
end
|
292
417
|
else
|
293
|
-
# otherwise,
|
294
|
-
|
418
|
+
# otherwise,
|
419
|
+
if @@running_jobs.count >0
|
420
|
+
$SERVER_LOG.info("Worker, go to sleep")
|
421
|
+
send_object(:sleep)
|
422
|
+
|
423
|
+
else
|
424
|
+
# send a quit value indicating no more data available
|
425
|
+
send_object(:quit)
|
426
|
+
end
|
295
427
|
end
|
296
428
|
end
|
297
429
|
end
|
@@ -339,7 +471,7 @@ module ScbiMapreduce
|
|
339
471
|
@@max_workers +=1
|
340
472
|
# when first worker is connected, do special config
|
341
473
|
if @@workers == 1
|
342
|
-
@@total_seconds = Time.
|
474
|
+
@@total_seconds = Time.now_us
|
343
475
|
$SERVER_LOG.info "First worker connected"
|
344
476
|
|
345
477
|
if @@checkpointing
|
@@ -382,16 +514,31 @@ module ScbiMapreduce
|
|
382
514
|
end
|
383
515
|
end
|
384
516
|
|
517
|
+
elsif obj == :waking_up
|
518
|
+
$SERVER_LOG.info("Worker woke up")
|
385
519
|
else
|
520
|
+
|
386
521
|
# if not using checkpointing
|
387
|
-
|
522
|
+
obj.received!(obj.data)
|
388
523
|
|
389
524
|
if @@checkpointing || @@keep_order || @@retry_stuck_jobs
|
390
525
|
# print_running_jobs
|
391
526
|
checkpointable_job_received(obj)
|
392
527
|
else
|
528
|
+
# change this job's status to received
|
529
|
+
|
530
|
+
t=Time.now_us
|
393
531
|
work_received(obj.data)
|
532
|
+
@@received_objects+=obj.data.count
|
533
|
+
@@total_write_time+=(Time.now_us - t)
|
394
534
|
end
|
535
|
+
|
536
|
+
# puts obj.worker_identifier,obj.worker_identifier.class
|
537
|
+
# if obj.worker_identifier==0 then
|
538
|
+
# end
|
539
|
+
|
540
|
+
each_worker_time(obj.worker_identifier, obj.worker_time)
|
541
|
+
each_transmission_time(obj.worker_identifier, obj.transmission_time)
|
395
542
|
end
|
396
543
|
|
397
544
|
# free mem
|
@@ -409,10 +556,8 @@ module ScbiMapreduce
|
|
409
556
|
# save job if there was a valid work previously sent
|
410
557
|
if received_job
|
411
558
|
|
412
|
-
# change this job's status to received
|
413
|
-
received_job.
|
414
|
-
|
415
|
-
|
559
|
+
# change this job's status to received, already done in previous method
|
560
|
+
received_job.update_with_received!(obj)
|
416
561
|
|
417
562
|
# # if there are sufficient jobs, count pending ones
|
418
563
|
# if (@@running_jobs.count>=PENDING_TO_SAVE)
|
@@ -439,7 +584,11 @@ module ScbiMapreduce
|
|
439
584
|
@@running_jobs.each do |job|
|
440
585
|
if job.status==:received
|
441
586
|
# puts "Sent to save: #{job.inspect}"
|
587
|
+
t=Time.now_us
|
442
588
|
work_received(job.data)
|
589
|
+
@@received_objects+=job.data.count
|
590
|
+
@@total_write_time+=(Time.now_us - t)
|
591
|
+
|
443
592
|
job.status=:saved
|
444
593
|
to_remove += 1
|
445
594
|
else
|
@@ -490,6 +639,8 @@ module ScbiMapreduce
|
|
490
639
|
end
|
491
640
|
end
|
492
641
|
|
642
|
+
|
643
|
+
|
493
644
|
def stop_work_manager
|
494
645
|
|
495
646
|
|
@@ -499,12 +650,56 @@ module ScbiMapreduce
|
|
499
650
|
|
500
651
|
self.class.end_work_manager
|
501
652
|
|
502
|
-
@@total_seconds = Time.
|
653
|
+
@@total_seconds = (Time.now_us-@@total_seconds)
|
654
|
+
@@total_manager_time= @@total_manager_time
|
655
|
+
|
656
|
+
@@total_read_time=@@total_read_time
|
657
|
+
@@total_write_time=@@total_write_time
|
658
|
+
|
659
|
+
mean_worker_time=mean_time(@@each_worker_time)
|
660
|
+
mean_transmission_time=mean_time(@@each_transmission_time)
|
661
|
+
|
662
|
+
idle_time=(@@total_seconds - @@total_read_time -@@total_write_time - mean_transmission_time)
|
663
|
+
|
664
|
+
@@stats={}
|
665
|
+
@@stats[:total_objects]=@@count
|
666
|
+
@@stats[:total_seconds]=@@total_seconds
|
667
|
+
@@stats[:sent_chunks]=@@sent_chunks
|
668
|
+
@@stats[:received_objects]=@@received_objects
|
669
|
+
@@stats[:processing_rate]=(@@count/@@total_seconds.to_f)
|
670
|
+
@@stats[:total_read_time]=@@total_read_time
|
671
|
+
@@stats[:total_write_time]=@@total_write_time
|
672
|
+
@@stats[:mean_worker_time]=mean_worker_time
|
673
|
+
@@stats[:mean_transmission_time]=mean_transmission_time
|
674
|
+
@@stats[:total_manager_idle_time]=idle_time
|
675
|
+
|
676
|
+
@@stats[:error_count]=@@error_count
|
677
|
+
@@stats[:retried_jobs]=@@retried_jobs
|
678
|
+
@@stats[:chunk_size]=@@chunk_size
|
679
|
+
@@stats[:connected_workers]=@@max_workers
|
680
|
+
@@stats[:each_transmission_time]=@@each_transmission_time
|
681
|
+
@@stats[:each_worker_time]=@@each_worker_time
|
682
|
+
|
683
|
+
|
503
684
|
$SERVER_LOG.info "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
|
685
|
+
$SERVER_LOG.info "Total sent chunks: #{@@sent_chunks} objects"
|
686
|
+
|
687
|
+
$SERVER_LOG.info "Total sent objects: #{@@count} objects"
|
688
|
+
$SERVER_LOG.info "Total received objects: #{@@received_objects} objects"
|
689
|
+
|
504
690
|
$SERVER_LOG.info "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
|
505
691
|
$SERVER_LOG.info "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
|
506
|
-
|
692
|
+
|
693
|
+
$SERVER_LOG.info "Total read time #{@@total_read_time} seconds"
|
694
|
+
$SERVER_LOG.info "Total write time #{@@total_write_time} seconds"
|
695
|
+
# mean_worker_time=mean_worker_time/@@max_workers
|
696
|
+
$SERVER_LOG.info "Total worker time #{mean_worker_time} seconds"
|
697
|
+
$SERVER_LOG.info "Total transmission time #{mean_transmission_time} seconds"
|
698
|
+
$SERVER_LOG.info "Total manager_idle time #{idle_time} seconds"
|
699
|
+
# $SERVER_LOG.info "Total manager time #{@@total_read_time + @@total_write_time + mean_transmission_time} seconds"
|
700
|
+
|
507
701
|
$SERVER_LOG.info "Number of errors: #{@@error_count}"
|
702
|
+
$SERVER_LOG.info "Number of retried stuck jobs: #{@@retried_jobs}"
|
508
703
|
$SERVER_LOG.info "Chunk size: #{@@chunk_size}"
|
509
704
|
$SERVER_LOG.info "Total connected workers: #{@@max_workers}"
|
510
705
|
|
@@ -68,6 +68,7 @@ module ScbiMapreduce
|
|
68
68
|
else
|
69
69
|
$WORKER_LOG.info("received:"+obj.to_s)
|
70
70
|
|
71
|
+
|
71
72
|
if (obj == :quit) || @@want_to_exit_worker
|
72
73
|
$WORKER_LOG.info('Quit received')
|
73
74
|
|
@@ -76,8 +77,13 @@ module ScbiMapreduce
|
|
76
77
|
elsif @@want_to_exit_worker
|
77
78
|
$WORKER_LOG.info('Want to exit worker')
|
78
79
|
stop_worker
|
80
|
+
elsif (obj== :sleep)
|
81
|
+
$WORKER_LOG.info('Sleeping 10 secs')
|
82
|
+
sleep 10
|
83
|
+
send_object(:waking_up)
|
79
84
|
else
|
80
85
|
@@count += 1
|
86
|
+
obj.worker_identifier=@@worker_id.to_i
|
81
87
|
|
82
88
|
# NOTE - the modified object has to be passed
|
83
89
|
# operation = proc {
|
@@ -96,10 +102,14 @@ module ScbiMapreduce
|
|
96
102
|
|
97
103
|
|
98
104
|
begin
|
99
|
-
|
105
|
+
|
106
|
+
obj.start_worker_time!
|
107
|
+
|
100
108
|
modified_data=process_object(obj.data)
|
101
109
|
obj.data = modified_data
|
102
|
-
|
110
|
+
|
111
|
+
obj.end_worker_time!
|
112
|
+
|
103
113
|
# if obj.job_identifier==3
|
104
114
|
# sleep 15
|
105
115
|
# end
|
@@ -160,20 +170,27 @@ module ScbiMapreduce
|
|
160
170
|
|
161
171
|
$LOG = $WORKER_LOG
|
162
172
|
|
163
|
-
total_seconds = Time.
|
173
|
+
total_seconds = Time.now_us
|
164
174
|
|
165
175
|
EM.error_handler{ |e|
|
166
176
|
$WORKER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
|
167
177
|
}
|
178
|
+
|
179
|
+
Signal.trap("CONT") do
|
180
|
+
$WORKER_LOG.info("SIGCONT: Worker: #{@@worker_id} with PID: #{Process.pid} sleeping 15 seconds before waking up")
|
181
|
+
puts "SIGCONT: Worker: #{@@worker_id} with PID: #{Process.pid} sleeping 15 seconds before waking up"
|
182
|
+
|
183
|
+
sleep 15
|
184
|
+
end
|
168
185
|
|
169
186
|
EventMachine::run {
|
170
187
|
|
171
188
|
EventMachine::connect ip, port, self
|
172
|
-
$WORKER_LOG.info "Worker connected to #{ip}:#{port}"
|
189
|
+
$WORKER_LOG.info "Worker: #{@@worker_id} with PID: #{Process.pid} connected to #{ip}:#{port}"
|
173
190
|
|
174
191
|
}
|
175
192
|
|
176
|
-
total_seconds = Time.
|
193
|
+
total_seconds = Time.now_us-total_seconds
|
177
194
|
$WORKER_LOG.info "Client #{@@worker_id} processed: #{@@count} objs"
|
178
195
|
$WORKER_LOG.info "Client #{@@worker_id} proc rate: #{@@count/total_seconds.to_f} objects/seg"
|
179
196
|
|
@@ -9,6 +9,8 @@ module ScbiMapreduce
|
|
9
9
|
class WorkerLauncher
|
10
10
|
|
11
11
|
attr_accessor :server_ip, :server_port
|
12
|
+
|
13
|
+
@@worker_id=0
|
12
14
|
|
13
15
|
def initialize(server_ip,server_port, server_ip_list,workers, worker_file, log_file=nil, init_env_file=nil)
|
14
16
|
@server_ip = server_ip
|
@@ -19,8 +21,9 @@ module ScbiMapreduce
|
|
19
21
|
@server_ip_list=server_ip_list
|
20
22
|
|
21
23
|
|
24
|
+
|
22
25
|
if log_file.nil?
|
23
|
-
log_file =
|
26
|
+
log_file = "logs/launcher_global_log.txt"
|
24
27
|
end
|
25
28
|
|
26
29
|
FileUtils.mkdir_p(File.dirname(log_file)) if ((log_file!=STDOUT) && (!File.exists?(File.dirname(log_file))))
|
@@ -44,10 +47,10 @@ module ScbiMapreduce
|
|
44
47
|
threads = []
|
45
48
|
@workers.times do |i|
|
46
49
|
pid=fork{
|
47
|
-
launch_worker(
|
50
|
+
launch_worker(@@worker_id,@server_ip,@server_port)
|
48
51
|
$LAUNCHER_LOG.info "Worker #{i} launched [#{@server_ip}:#{@server_port}]"
|
49
52
|
}
|
50
|
-
|
53
|
+
@@worker_id+=1
|
51
54
|
#threads.each { |aThread| aThread.join }
|
52
55
|
end
|
53
56
|
#Process.waitall
|
@@ -109,7 +112,14 @@ module ScbiMapreduce
|
|
109
112
|
|
110
113
|
def launch_external_workers(workers)
|
111
114
|
puts "Launching #{workers.count} external workers: #{workers}"
|
112
|
-
|
115
|
+
puts "INIT_ENV_FILE: #{@init_env_file}"
|
116
|
+
|
117
|
+
# This sleep is necessary to give Lustre filesystems time to sync the log folder between all nodes. Otherwise, external workers will not be launched.
|
118
|
+
if !workers.empty?
|
119
|
+
puts "SLEEP 10 for logs folder sync in lustre fs"
|
120
|
+
sleep 10
|
121
|
+
end
|
122
|
+
|
113
123
|
init=''
|
114
124
|
if @init_env_file
|
115
125
|
init_path = File.expand_path(@init_env_file)
|
@@ -129,10 +139,13 @@ module ScbiMapreduce
|
|
129
139
|
cd = "cd #{init_dir}; "
|
130
140
|
end
|
131
141
|
|
132
|
-
|
142
|
+
|
133
143
|
|
134
144
|
workers.each do |machine|
|
135
|
-
|
145
|
+
|
146
|
+
log_file=File.join(init_dir,'logs',"launcher_#{@@worker_id}")
|
147
|
+
log_dir=File.join(init_dir,'logs')
|
148
|
+
|
136
149
|
# if server_ip is not in valid ips
|
137
150
|
if !@server_ip_list.include?(@server_ip)
|
138
151
|
# find matching ip between server and worker
|
@@ -146,7 +159,7 @@ module ScbiMapreduce
|
|
146
159
|
if !found_ip.nil?
|
147
160
|
# cmd = "ssh #{machine} \"#{init} #{cd} #{INTERPRETER} #{File.join(File.dirname(__FILE__),'main_worker.rb')} #{worker_id.to_s} #{@server_ip} #{@server_port} #{@worker_file}\""
|
148
161
|
# cmd = "ssh #{machine} \"nohup #{File.join(File.dirname(__FILE__),'launcher.sh')} #{worker_id.to_s} #{@server_ip} #{@server_port} #{@worker_file} #{init_dir} #{init_path} </dev/null >> #{log_file} 2>> #{log_file} & \""
|
149
|
-
cmd = "ssh #{machine} \"nohup
|
162
|
+
cmd = "ssh #{machine} \"nohup #{File.join(File.dirname(__FILE__),'launcher.sh')} #{@@worker_id.to_s} #{found_ip} #{@server_port} #{@worker_file} #{init_dir} #{init_path} </dev/null >> #{log_file} 2>> #{log_file} & \""
|
150
163
|
|
151
164
|
$LAUNCHER_LOG.info cmd
|
152
165
|
|
@@ -154,7 +167,7 @@ module ScbiMapreduce
|
|
154
167
|
exec(cmd)
|
155
168
|
}
|
156
169
|
|
157
|
-
worker_id+=1
|
170
|
+
@@worker_id+=1
|
158
171
|
else
|
159
172
|
$LAUNCHER_LOG.error("Couldn't find a matching ip between worker (#{machine_ip}) and server #{ip_list.to_json}")
|
160
173
|
end
|
metadata
CHANGED
@@ -1,65 +1,113 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: scbi_mapreduce
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.40
|
4
5
|
prerelease:
|
5
|
-
version: 0.0.38
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Dario Guerrero
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2013-09-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
16
15
|
name: eventmachine
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
19
17
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
23
21
|
version: 0.12.0
|
24
22
|
type: :runtime
|
25
|
-
version_requirements: *id001
|
26
|
-
- !ruby/object:Gem::Dependency
|
27
|
-
name: json
|
28
23
|
prerelease: false
|
29
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.12.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: json
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
30
33
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
34
|
-
version:
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
35
38
|
type: :runtime
|
36
|
-
version_requirements: *id002
|
37
|
-
- !ruby/object:Gem::Dependency
|
38
|
-
name: hoe
|
39
39
|
prerelease: false
|
40
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rdoc
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
41
49
|
none: false
|
42
|
-
requirements:
|
43
|
-
- -
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version:
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '4.0'
|
46
54
|
type: :development
|
47
|
-
|
48
|
-
|
49
|
-
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: newgem
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.5.3
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.5.3
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: hoe
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '3.6'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '3.6'
|
94
|
+
description: scbi_mapreduce brings parallel and distributed computing capabilities
|
95
|
+
to your code, with a very easy to use framework that allows you to exploit your
|
96
|
+
clustered or cloud computational resources.
|
97
|
+
email:
|
50
98
|
- dariogf@gmail.com
|
51
|
-
executables:
|
99
|
+
executables:
|
52
100
|
- scbi_mapreduce
|
53
101
|
extensions: []
|
54
|
-
|
55
|
-
extra_rdoc_files:
|
102
|
+
extra_rdoc_files:
|
56
103
|
- History.txt
|
57
104
|
- Manifest.txt
|
58
105
|
- PostInstall.txt
|
106
|
+
- README.rdoc
|
59
107
|
- skeleton/simple/README.txt
|
60
108
|
- skeleton/remove_mids/README.txt
|
61
109
|
- skeleton/dummy_calcs/README.txt
|
62
|
-
files:
|
110
|
+
files:
|
63
111
|
- History.txt
|
64
112
|
- lib/scbi_mapreduce/error_handler.rb
|
65
113
|
- lib/scbi_mapreduce/main_worker.rb
|
@@ -106,34 +154,35 @@ files:
|
|
106
154
|
- skeleton/dummy_calcs/my_worker_manager.rb
|
107
155
|
- skeleton/dummy_calcs/README.txt
|
108
156
|
- skeleton/dummy_calcs/threads_implementation.rb
|
157
|
+
- .gemtest
|
109
158
|
homepage: http://www.scbi.uma.es/downloads
|
110
159
|
licenses: []
|
111
|
-
|
112
160
|
post_install_message: PostInstall.txt
|
113
|
-
rdoc_options:
|
161
|
+
rdoc_options:
|
114
162
|
- --main
|
115
163
|
- README.rdoc
|
116
|
-
require_paths:
|
164
|
+
require_paths:
|
117
165
|
- lib
|
118
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
167
|
none: false
|
120
|
-
requirements:
|
121
|
-
- -
|
122
|
-
- !ruby/object:Gem::Version
|
123
|
-
version:
|
124
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
168
|
+
requirements:
|
169
|
+
- - ! '>='
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: '0'
|
172
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
173
|
none: false
|
126
|
-
requirements:
|
127
|
-
- -
|
128
|
-
- !ruby/object:Gem::Version
|
129
|
-
version:
|
174
|
+
requirements:
|
175
|
+
- - ! '>='
|
176
|
+
- !ruby/object:Gem::Version
|
177
|
+
version: '0'
|
130
178
|
requirements: []
|
131
|
-
|
132
179
|
rubyforge_project: scbi_mapreduce
|
133
|
-
rubygems_version: 1.
|
180
|
+
rubygems_version: 1.8.24
|
134
181
|
signing_key:
|
135
182
|
specification_version: 3
|
136
|
-
summary: scbi_mapreduce brings parallel and distributed computing capabilities to
|
137
|
-
|
183
|
+
summary: scbi_mapreduce brings parallel and distributed computing capabilities to
|
184
|
+
your code, with a very easy to use framework that allows you to exploit your clustered
|
185
|
+
or cloud computational resources.
|
186
|
+
test_files:
|
138
187
|
- test/test_helper.rb
|
139
188
|
- test/test_scbi_drb.rb
|