scbi_mapreduce 0.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby

# Sequential version of the dummy calculation demo: it runs
# do_dummy_calculations a fixed number of times, one after another.

# make the bundled lib directory visible to require
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')

require 'calculations'
include Calculations

# run the calculation 1000 times in a row
1.upto(1000) { do_dummy_calculations }

puts "final"
22
+
@@ -0,0 +1,67 @@
1
#!/usr/bin/env ruby

$: << File.join(File.dirname(__FILE__))

# load required libraries
require 'socket' # Socket.ip_address_list is used below
require 'scbi_mapreduce'
require 'my_worker_manager'

# Choose the address to listen on: the first local IPv4 address that starts
# with server_ip, or all interfaces ('0.0.0.0') when there is none.
server_ip = '10.243'
ipv4_addresses = Socket.ip_address_list.select(&:ipv4?).map(&:ip_address)
ip = ipv4_addresses.find { |address| address.start_with?(server_ip) } || '0.0.0.0'

# NOTE(review): port 0 presumably lets the manager/OS pick a free port - confirm
port = 0

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 4

# read optional workers parameter
input_workers = ARGV.shift
unless input_workers.nil?
  workers = if File.exist?(input_workers)
              # it is a file: read one worker name per line into an array
              File.read(input_workers).split("\n").map(&:chomp)
            else
              # otherwise it is the number of workers
              input_workers.to_i
            end
end

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 1

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
@@ -0,0 +1,56 @@
1
+ $: << File.join(File.dirname(__FILE__),'lib')
2
+
3
+ require 'calculations'
4
+ include Calculations
5
+
6
+ # MyWorker defines the behaviour of workers.
7
+ # Here is where the real processing takes place
8
class MyWorker < ScbiMapreduce::Worker

  # starting_worker method is called one time at initialization
  # and allows you to initialize your variables
  def starting_worker
    # You can use worker logs at any time in this way:
    # $WORKER_LOG.info "Starting a worker"
  end

  # receive_initial_config is called only once just after
  # the first connection, when initial parameters are
  # received from manager
  def receive_initial_config(parameters)
    # You can use worker logs at any time in this way:
    # $WORKER_LOG.info "Params received"

    # save received parameters, if any
    # @params = parameters
  end

  # process_object method is called for each received chunk of work.
  # Be aware that objs is always an array, and you must iterate
  # over it if you need to process it independently
  #
  # The value returned here will be received by the work_received
  # method at your worker_manager subclass.
  def process_object(objs)
    # run one dummy calculation per received object, so that no work is
    # skipped when the manager sends chunks with more than one element
    objs.each { do_dummy_calculations }

    # return objs back to manager
    objs
  end

  # closing_worker is a hook that does nothing in this demo
  def closing_worker
  end
end
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+
3
+ # MyWorkerManager class is used to implement the methods
4
+ # to send and receive the data to or from workers
5
class MyWorkerManager < ScbiMapreduce::WorkManager

  # init_work_manager is executed at the start, prior to any processing.
  # You can use init_work_manager to initialize global variables, open files, etc...
  # Note that an instance of MyWorkerManager will be created for each
  # worker connection, and thus, all global variables here should be
  # class variables (starting with @@)
  def self.init_work_manager
    # number of jobs still to be handed out; each job makes a worker
    # execute the dummy calculation
    @@remaining_data = 1000
  end

  # end_work_manager is executed at the end, when all the process is done.
  # You can use it to close files opened in init_work_manager
  def self.end_work_manager
  end

  # worker_initial_config is used to send initial parameters to workers.
  # The method is executed once per each worker
  def worker_initial_config
  end

  # next_work method is called every time a worker needs a new work
  # Here you can read data from disk
  # This method must return the work data or nil if no more data is available
  #
  # The work data is the current countdown value (1000 down to 1), so exactly
  # as many jobs are handed out as init_work_manager configured.
  def next_work
    return nil if @@remaining_data <= 0

    job = @@remaining_data
    @@remaining_data -= 1
    job
  end

  # work_received is executed each time a worker has finished a job.
  # Here you can write results down to disk, perform some aggregated statistics, etc...
  def work_received(results)
    # write_data_to_disk(results)
  end

end
@@ -0,0 +1,33 @@
1
#!/usr/bin/env ruby

# Thread pool version of the dummy calculation demo: the calculations are
# queued on a ThreadPool whose size is given as the only command line argument.

# make the bundled lib directory visible to require
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')

require 'thread_pool'
require 'calculations'
include Calculations

# exactly one argument (the number of threads) is expected
unless ARGV.count == 1
  puts "use: #{$0} threads"
  exit
end

@pool = ThreadPool.new(ARGV[0].to_i)

# queue 1000 calculations on the pool
1000.times { @pool.process { do_dummy_calculations } }

puts "wait"
@pool.join
puts "final"
33
+
@@ -0,0 +1,30 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is a basic sequence processing template. It processes all
5
+ sequences in fastq_file (a file in FastQ format), removing the MIDs from them. It
6
+ needs some external requisites to work:
7
+
8
+ EXTERNAL REQUISITES
9
+ ===================
10
+
11
+ * scbi_blast gem installed
12
+
13
+
14
+ At lib/db you can find a preformatted MID database.
15
+
16
+ You can modify the files to perform more complicated processing.
17
+ There are other templates available, you can list them by issuing this command:
18
+
19
+ scbi_mapreduce
20
+
21
+ You can launch the application right now with the following command using 4 cpus/cores and chunks of 100 sequences at a time:
22
+
23
+ ruby main.rb fastq_file 4 100
24
+
25
+ A server and some workers will be launched, and all sequences in fastq_file will
26
+ be processed in blocks of 100 sequences.
27
+
28
+ A sequential (linear) example is also provided; you can launch it by issuing:
29
+
30
+ ruby linear_implementation.rb fastq_file
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby

# Launches only the workers, which are given the address of a manager.
#
# Usage: launch_only_workers.rb [ip] [port] [workers]
# Every argument is optional and defaults to the value that used to be
# hard-coded in this script.

$: << File.dirname(__FILE__)

require "logger"

# Optional: point SCBI_MAPREDUCE_LIB at a development checkout of the gem
# (its lib directory) to use it instead of relying on the installed gem.
dev_lib = ENV['SCBI_MAPREDUCE_LIB']
$: << dev_lib if dev_lib && File.directory?(dev_lib)

require 'scbi_mapreduce'
require 'my_worker_manager'

$LOG = Logger.new(STDOUT)
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"

# address handed to the worker launcher and number of workers to launch
ip = ARGV[0] || '10.247.255.5'
port = (ARGV[1] || 50000).to_i
workers = (ARGV[2] || 8).to_i

# path to the worker implementation that will be launched
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

$LOG.info 'Launching workers'

worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip, port, workers, custom_worker_file, STDOUT)
worker_launcher.launch_workers_and_wait

# reached once launch_workers_and_wait has returned
$LOG.info 'Closing workers'
@@ -0,0 +1,120 @@
1
+ >RL1
2
+ ACACGACGACT
3
+ >RL2
4
+ ACACGTAGTAT
5
+ >RL3
6
+ ACACTACTCGT
7
+ >RL4
8
+ ACGACACGTAT
9
+ >RL5
10
+ ACGAGTAGACT
11
+ >RL6
12
+ ACGCGTCTAGT
13
+ >RL7
14
+ ACGTACACACT
15
+ >RL8
16
+ ACGTACTGTGT
17
+ >RL9
18
+ ACGTAGATCGT
19
+ >RL10
20
+ ACTACGTCTCT
21
+ >RL11
22
+ ACTATACGAGT
23
+ >RL12
24
+ ACTCGCGTCGT
25
+ >MID1
26
+ ACGAGTGCGT
27
+ >MID2
28
+ ACGCTCGACA
29
+ >MID3
30
+ AGACGCACTC
31
+ >MID5
32
+ ATCAGACACG
33
+ >MID6
34
+ ATATCGCGAG
35
+ >MID7
36
+ CGTGTCTCTA
37
+ >MID8
38
+ CTCGCGTGTC
39
+ >MID10
40
+ TCTCTATGCG
41
+ >MID11
42
+ TGATACGTCT
43
+ >MID13
44
+ CATAGTAGTG
45
+ >MID14
46
+ CGAGAGATAC
47
+ >MID15
48
+ ATACGACGTA
49
+ >MID16
50
+ TCACGTACTA
51
+ >MID17
52
+ CGTCTAGTAC
53
+ >MID18
54
+ TCTACGTAGC
55
+ >MID19
56
+ TGTACTACTC
57
+ >MID20
58
+ ACGACTACAG
59
+ >MID21
60
+ CGTAGACTAG
61
+ >MID22
62
+ TACGAGTATG
63
+ >MID23
64
+ TACTCTCGTG
65
+ >MID24
66
+ TAGAGACGAG
67
+ >MID25
68
+ TCGTCGCTCG
69
+ >MID26
70
+ ACATACGCGT
71
+ >MID27
72
+ ACGCGAGTAT
73
+ >MID28
74
+ ACTACTATGT
75
+ >MID68
76
+ TCGCTGCGTA
77
+ >MID30
78
+ AGACTATACT
79
+ >MID31
80
+ AGCGTCGTCT
81
+ >MID32
82
+ AGTACGCTAT
83
+ >MID33
84
+ ATAGAGTACT
85
+ >MID34
86
+ CACGCTACGT
87
+ >MID35
88
+ CAGTAGACGT
89
+ >MID36
90
+ CGACGTGACT
91
+ >MID37
92
+ TACACACACT
93
+ >MID38
94
+ TACACGTGAT
95
+ >MID39
96
+ TACAGATCGT
97
+ >MID40
98
+ TACGCTGTCT
99
+ >MID69
100
+ TCTGACGTCA
101
+ >MID42
102
+ TCGATCACGT
103
+ >MID43
104
+ TCGCACTAGT
105
+ >MID44
106
+ TCTAGCGACT
107
+ >MID45
108
+ TCTATACTAT
109
+ >MID46
110
+ TGACGTATGT
111
+ >MID47
112
+ TGTGAGTAGT
113
+ >MID48
114
+ ACAGTATATA
115
+ >MID49
116
+ ACGCGATCGA
117
+ >MID50
118
+ ACTAGCAGTA
119
+ >MID67
120
+ TCGATAGTGA
@@ -0,0 +1,191 @@
1
+ require 'scbi_blast'
2
+ require 'global_match'
3
+ # require 'json'
4
+
5
+ # Module to find Mids in a set of sequences
6
# Module to find Mids in a set of sequences.
#
# The find_* methods receive an enumerable of [name, fasta, qual, comments]
# tuples and modify fasta, qual and comments in place.
module FindMids

  # Known MIDs (name => sequence), the same entries found in db/mids.fasta.
  # They are tried in this order by find_mid_without_blast.
  MIDS = {
    'RL1' => 'ACACGACGACT',
    'RL2' => 'ACACGTAGTAT',
    'RL3' => 'ACACTACTCGT',
    'RL4' => 'ACGACACGTAT',
    'RL5' => 'ACGAGTAGACT',
    'RL6' => 'ACGCGTCTAGT',
    'RL7' => 'ACGTACACACT',
    'RL8' => 'ACGTACTGTGT',
    'RL9' => 'ACGTAGATCGT',
    'RL10' => 'ACTACGTCTCT',
    'RL11' => 'ACTATACGAGT',
    'RL12' => 'ACTCGCGTCGT',
    'MID1' => 'ACGAGTGCGT',
    'MID2' => 'ACGCTCGACA',
    'MID3' => 'AGACGCACTC',
    'MID5' => 'ATCAGACACG',
    'MID6' => 'ATATCGCGAG',
    'MID7' => 'CGTGTCTCTA',
    'MID8' => 'CTCGCGTGTC',
    'MID10' => 'TCTCTATGCG',
    'MID11' => 'TGATACGTCT',
    'MID13' => 'CATAGTAGTG',
    'MID14' => 'CGAGAGATAC',
    'MID15' => 'ATACGACGTA',
    'MID16' => 'TCACGTACTA',
    'MID17' => 'CGTCTAGTAC',
    'MID18' => 'TCTACGTAGC',
    'MID19' => 'TGTACTACTC',
    'MID20' => 'ACGACTACAG',
    'MID21' => 'CGTAGACTAG',
    'MID22' => 'TACGAGTATG',
    'MID23' => 'TACTCTCGTG',
    'MID24' => 'TAGAGACGAG',
    'MID25' => 'TCGTCGCTCG',
    'MID26' => 'ACATACGCGT',
    'MID27' => 'ACGCGAGTAT',
    'MID28' => 'ACTACTATGT',
    'MID68' => 'TCGCTGCGTA',
    'MID30' => 'AGACTATACT',
    'MID31' => 'AGCGTCGTCT',
    'MID32' => 'AGTACGCTAT',
    'MID33' => 'ATAGAGTACT',
    'MID34' => 'CACGCTACGT',
    'MID35' => 'CAGTAGACGT',
    'MID36' => 'CGACGTGACT',
    'MID37' => 'TACACACACT',
    'MID38' => 'TACACGTGAT',
    'MID39' => 'TACAGATCGT',
    'MID40' => 'TACGCTGTCT',
    'MID69' => 'TCTGACGTCA',
    'MID42' => 'TCGATCACGT',
    'MID43' => 'TCGCACTAGT',
    'MID44' => 'TCTAGCGACT',
    'MID45' => 'TCTATACTAT',
    'MID46' => 'TGACGTATGT',
    'MID47' => 'TGTGAGTAGT',
    'MID48' => 'ACAGTATATA',
    'MID49' => 'ACGCGATCGA',
    'MID50' => 'ACTAGCAGTA',
    'MID67' => 'TCGATAGTGA'
  }.freeze

  # find mids using blast+ as an external tool
  #
  # seqs:: enumerable of [name, fasta, qual, comments] tuples; each one is
  #        trimmed in place by parse_seq
  #
  # Prints the elapsed time and the time per sequence to stdout.
  def find_mid_with_blast(seqs)
    t = Time.now

    # Create blast machine against mid database
    db_path = File.expand_path(File.join(File.dirname(__FILE__), 'db/mids.fasta'))
    blast = BatchBlast.new("-db #{db_path}", 'blastn', " -task blastn-short -perc_identity 95 -max_target_seqs 4 ") #get mids

    # build fastas to blast: a ">name" header line followed by the sequence
    fastas = seqs.flat_map { |name, fasta, _qual, _comments| [">" + name, fasta] }

    # execute blast
    blast_table_results = blast.do_blast(fastas)

    # Iterate over blast results and sequences; the i-th query result is
    # used for the i-th sequence
    seqs.each_with_index do |(name, fasta, qual, comments), i|
      parse_seq(blast_table_results.querys[i], name, fasta, qual, comments)
    end

    elapsed = Time.now - t

    puts "T:#{elapsed}, rate#{elapsed/seqs.count}"
  end

  # parse blast results and sequences to remove found MIDS
  #
  # query::    object whose hits respond to align_len, subject_id and q_beg
  # name::     sequence name (not used here)
  # fasta::    sequence, trimmed in place
  # qual::     qualities, trimmed in place by the same amount
  # comments:: receives the subject_id of the removed mid
  #
  # Only the first hit with align_len > 1 is used.
  def parse_seq(query, name, fasta, qual, comments)
    found_mid = query.hits.find { |hit| hit.align_len > 1 }
    return nil unless found_mid

    # modify comments by appending removed mid
    comments << found_mid.subject_id

    # keep fasta and qual from the end of the alignment onwards
    cut = found_mid.q_beg + found_mid.align_len
    fasta.slice!(0, cut)
    qual.slice!(0, cut)
    nil
  end

  # find mids comparing the start of each sequence against MIDS, without
  # using blast
  #
  # seqs:: enumerable of [name, fasta, qual, comment] tuples, modified in place
  #
  # Relies on String#lcs, which is not defined here.
  # NOTE(review): presumably it comes from global_match and returns the
  # longest common substring - confirm.
  #
  # Prints the elapsed time and the time per sequence to stdout.
  def find_mid_without_blast(seqs)
    t = Time.now

    # for each sequence
    seqs.each do |_name, fasta, qual, comment|
      # only the first 21 characters of the sequence are searched
      head = fasta[0..20]

      # try all mids, stopping at the first good enough match
      MIDS.each do |mid_name, mid|
        found_mid = head.lcs(mid)
        next unless found_mid.length > 5

        # keep fasta and qual from the end of the match onwards
        cut = head.index(found_mid) + found_mid.length
        fasta.slice!(0, cut)
        qual.slice!(0, cut)

        comment << "mid_name #{mid_name}\n"
        break
      end
    end

    elapsed = Time.now - t

    puts "T:#{elapsed}, rate#{elapsed/seqs.count}"
  end

  # Runs a loop of additions, printing progress every 100000 iterations
  # and the elapsed time at the end.
  #
  # number_of_calcs:: number of iterations (defaults to the former fixed value)
  def do_dummy_calculation(number_of_calcs = 250_000)
    t = Time.now

    x1 = 1
    x2 = 1

    # do a loop with calculations
    number_of_calcs.times do |i|
      x1, x2 = x2, x1 + x2

      # puts some info at regular intervals
      puts "Calculated #{i}" if (i % 100_000).zero?
    end

    puts Time.now - t
  end

end