scbi_mapreduce 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby

# Sequential baseline for the dummy_calcs example: runs the dummy
# calculation a fixed number of times in a single process.

# make the bundled lib directory loadable
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')

require 'calculations'
include Calculations

# number of times the dummy calculation is executed
iterations = 1000

iterations.times { do_dummy_calculations }

puts "final"
22
+
@@ -0,0 +1,67 @@
1
#!/usr/bin/env ruby

$: << File.join(File.dirname(__FILE__))

# load required libraries
require 'socket' # Socket.ip_address_list is used below
require 'scbi_mapreduce'
require 'my_worker_manager'

# Pick the first local IPv4 address that starts with server_ip;
# when none matches, fall back to 0.0.0.0
server_ip = '10.243'
ip_list = Socket.ip_address_list.select { |e| e.ipv4? }.map { |e| e.ip_address }

ip = ip_list.find { |candidate| candidate.start_with?(server_ip) } || '0.0.0.0'

# NOTE(review): port 0 presumably lets the manager pick a free port - confirm
port = 0

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 4

# read optional workers parameter
input_workers = ARGV.shift
unless input_workers.nil?
  # File.exist? is used because File.exists? was removed in Ruby 3.2
  if File.exist?(input_workers)
    # it is a file: read worker names into an array, one per line
    workers = File.read(input_workers).split("\n").map { |w| w.chomp }
  else
    # workers is a number
    workers = input_workers.to_i
  end
end

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 1

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
@@ -0,0 +1,56 @@
1
+ $: << File.join(File.dirname(__FILE__),'lib')
2
+
3
+ require 'calculations'
4
+ include Calculations
5
+
6
# MyWorker implements the worker side of the dummy_calcs example:
# each received chunk triggers one dummy calculation.
class MyWorker < ScbiMapreduce::Worker

  # Called one time at initialization; a place to set up variables.
  # Worker logs can be used at any time, for example:
  #   $WORKER_LOG.info "Starting a worker"
  def starting_worker
  end

  # Called only once, just after the first connection, when the
  # initial parameters are received from the manager.
  # This example ignores them; to keep them you could do:
  #   @params = parameters
  def receive_initial_config(parameters)
  end

  # Called for each received object. objs is always an array; iterate
  # over it if every element needs independent processing.
  #
  # The value returned here is received by the work_received method
  # of the worker_manager subclass.
  def process_object(objs)
    # one dummy calculation per received chunk; the payload itself is unused
    do_dummy_calculations

    # hand the received objects back to the manager
    objs
  end

  # Called when the worker is closing; nothing to clean up in this example.
  def closing_worker
  end
end
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+
3
# MyWorkerManager class is used to implement the methods
# to send and receive the data to or from workers
class MyWorkerManager < ScbiMapreduce::WorkManager

  # init_work_manager is executed at the start, prior to any processing.
  # You can use init_work_manager to initialize global variables, open files, etc...
  # Note that an instance of MyWorkerManager will be created for each
  # worker connection, and thus, all global variables here should be
  # class variables (starting with @@)
  def self.init_work_manager
    # number of work units still to be handed out to workers
    @@remaining_data = 1000
  end

  # end_work_manager is executed at the end, when all the process is done.
  # You can use it to close files opened in init_work_manager
  def self.end_work_manager
  end

  # worker_initial_config is used to send initial parameters to workers.
  # The method is executed once per each worker
  def worker_initial_config
  end

  # next_work method is called every time a worker needs a new work.
  # Returns the number of work units left (counting the one being handed
  # out), or nil when no more work is available.
  #
  # The counter is read before it is decremented so that exactly the
  # number of units set in init_work_manager is delivered; decrementing
  # first handed out one unit fewer than configured.
  def next_work
    return nil if @@remaining_data <= 0

    work = @@remaining_data
    @@remaining_data -= 1
    work
  end

  # work_received is executed each time a worker has finished a job.
  # Here you can write results down to disk, perform some aggregated statistics, etc...
  def work_received(results)
    # write_data_to_disk(results)
  end

end
@@ -0,0 +1,33 @@
1
#!/usr/bin/env ruby

# Threaded variant of the dummy_calcs example: the dummy calculation is
# queued a fixed number of times on a thread pool whose size is given
# as the only command line argument.

# make the bundled lib directory loadable
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')

require 'thread_pool'
require 'calculations'
include Calculations

# exactly one argument (the number of threads) is required
unless ARGV.count == 1
  puts "use: #{$0} threads"
  exit
end

thread_count = ARGV[0].to_i
@pool = ThreadPool.new(thread_count)

# queue the work
iterations = 1000

iterations.times { @pool.process { do_dummy_calculations } }

# wait until every queued job has finished
puts "wait"
@pool.join
puts "final"
33
+
@@ -0,0 +1,30 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is a basic sequence processing template. It processes all
5
+ sequences in fastq_file (a file in FastQ format), removing the MIDs from them. It
6
+ needs some external requisites to work:
7
+
8
+ EXTERNAL REQUISITES
9
+ ===================
10
+
11
+ * scbi_blast gem installed
12
+
13
+
14
+ At lib/db you can find a preformatted MID database.
15
+
16
+ You can modify the files to perform more complicated processing.
17
+ There are other templates available, you can list them by issuing this command:
18
+
19
+ scbi_mapreduce
20
+
21
+ You can launch the application right now with the following command using 4 cpus/cores and chunks of 100 sequences at a time:
22
+
23
+ ruby main.rb fastq_file 4 100
24
+
25
+ A server and some workers will be launched, and all sequences in fastq_file will
26
+ be processed in blocks of 100 sequences.
27
+
28
+ A sequential (linear) example is also provided; you can launch it by issuing:
29
+
30
+ ruby linear_implementation.rb fastq_file
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby

# Launches only the workers of the remove_mids example and waits for
# them; the address they connect to is hard-coded below.

$LOAD_PATH << File.dirname(__FILE__)

require "logger"

# NOTE(review): developer-specific absolute path - looks like a leftover
# from local development; confirm whether it is still needed
$LOAD_PATH << '/Users/dariogf/progs/ruby/gems/scbi_mapreduce/lib'

require 'scbi_mapreduce'
require 'my_worker_manager'

# global logger writing to standard output
$LOG = Logger.new(STDOUT)
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"

# connection settings and number of workers to launch
server_ip = '10.247.255.5'
server_port = 50000
worker_count = 8

# path to the worker implementation that will be launched
worker_script = File.join(File.dirname(__FILE__), 'my_worker.rb')

$LOG.info 'Starting server'

# launch the workers and block until launch_workers_and_wait returns
launcher = ScbiMapreduce::WorkerLauncher.new(server_ip, server_port, worker_count, worker_script, STDOUT)
launcher.launch_workers_and_wait

$LOG.info 'Closing workers'
@@ -0,0 +1,120 @@
1
+ >RL1
2
+ ACACGACGACT
3
+ >RL2
4
+ ACACGTAGTAT
5
+ >RL3
6
+ ACACTACTCGT
7
+ >RL4
8
+ ACGACACGTAT
9
+ >RL5
10
+ ACGAGTAGACT
11
+ >RL6
12
+ ACGCGTCTAGT
13
+ >RL7
14
+ ACGTACACACT
15
+ >RL8
16
+ ACGTACTGTGT
17
+ >RL9
18
+ ACGTAGATCGT
19
+ >RL10
20
+ ACTACGTCTCT
21
+ >RL11
22
+ ACTATACGAGT
23
+ >RL12
24
+ ACTCGCGTCGT
25
+ >MID1
26
+ ACGAGTGCGT
27
+ >MID2
28
+ ACGCTCGACA
29
+ >MID3
30
+ AGACGCACTC
31
+ >MID5
32
+ ATCAGACACG
33
+ >MID6
34
+ ATATCGCGAG
35
+ >MID7
36
+ CGTGTCTCTA
37
+ >MID8
38
+ CTCGCGTGTC
39
+ >MID10
40
+ TCTCTATGCG
41
+ >MID11
42
+ TGATACGTCT
43
+ >MID13
44
+ CATAGTAGTG
45
+ >MID14
46
+ CGAGAGATAC
47
+ >MID15
48
+ ATACGACGTA
49
+ >MID16
50
+ TCACGTACTA
51
+ >MID17
52
+ CGTCTAGTAC
53
+ >MID18
54
+ TCTACGTAGC
55
+ >MID19
56
+ TGTACTACTC
57
+ >MID20
58
+ ACGACTACAG
59
+ >MID21
60
+ CGTAGACTAG
61
+ >MID22
62
+ TACGAGTATG
63
+ >MID23
64
+ TACTCTCGTG
65
+ >MID24
66
+ TAGAGACGAG
67
+ >MID25
68
+ TCGTCGCTCG
69
+ >MID26
70
+ ACATACGCGT
71
+ >MID27
72
+ ACGCGAGTAT
73
+ >MID28
74
+ ACTACTATGT
75
+ >MID68
76
+ TCGCTGCGTA
77
+ >MID30
78
+ AGACTATACT
79
+ >MID31
80
+ AGCGTCGTCT
81
+ >MID32
82
+ AGTACGCTAT
83
+ >MID33
84
+ ATAGAGTACT
85
+ >MID34
86
+ CACGCTACGT
87
+ >MID35
88
+ CAGTAGACGT
89
+ >MID36
90
+ CGACGTGACT
91
+ >MID37
92
+ TACACACACT
93
+ >MID38
94
+ TACACGTGAT
95
+ >MID39
96
+ TACAGATCGT
97
+ >MID40
98
+ TACGCTGTCT
99
+ >MID69
100
+ TCTGACGTCA
101
+ >MID42
102
+ TCGATCACGT
103
+ >MID43
104
+ TCGCACTAGT
105
+ >MID44
106
+ TCTAGCGACT
107
+ >MID45
108
+ TCTATACTAT
109
+ >MID46
110
+ TGACGTATGT
111
+ >MID47
112
+ TGTGAGTAGT
113
+ >MID48
114
+ ACAGTATATA
115
+ >MID49
116
+ ACGCGATCGA
117
+ >MID50
118
+ ACTAGCAGTA
119
+ >MID67
120
+ TCGATAGTGA
@@ -0,0 +1,191 @@
1
+ require 'scbi_blast'
2
+ require 'global_match'
3
+ # require 'json'
4
+
5
# Module to find Mids in a set of sequences
module FindMids

  # Known MID sequences keyed by name (the same entries as db/mids.fasta).
  # Kept in insertion order because find_mid_without_blast stops at the
  # first entry that matches.
  KNOWN_MIDS = {
    'RL1' => 'ACACGACGACT',
    'RL2' => 'ACACGTAGTAT',
    'RL3' => 'ACACTACTCGT',
    'RL4' => 'ACGACACGTAT',
    'RL5' => 'ACGAGTAGACT',
    'RL6' => 'ACGCGTCTAGT',
    'RL7' => 'ACGTACACACT',
    'RL8' => 'ACGTACTGTGT',
    'RL9' => 'ACGTAGATCGT',
    'RL10' => 'ACTACGTCTCT',
    'RL11' => 'ACTATACGAGT',
    'RL12' => 'ACTCGCGTCGT',
    'MID1' => 'ACGAGTGCGT',
    'MID2' => 'ACGCTCGACA',
    'MID3' => 'AGACGCACTC',
    'MID5' => 'ATCAGACACG',
    'MID6' => 'ATATCGCGAG',
    'MID7' => 'CGTGTCTCTA',
    'MID8' => 'CTCGCGTGTC',
    'MID10' => 'TCTCTATGCG',
    'MID11' => 'TGATACGTCT',
    'MID13' => 'CATAGTAGTG',
    'MID14' => 'CGAGAGATAC',
    'MID15' => 'ATACGACGTA',
    'MID16' => 'TCACGTACTA',
    'MID17' => 'CGTCTAGTAC',
    'MID18' => 'TCTACGTAGC',
    'MID19' => 'TGTACTACTC',
    'MID20' => 'ACGACTACAG',
    'MID21' => 'CGTAGACTAG',
    'MID22' => 'TACGAGTATG',
    'MID23' => 'TACTCTCGTG',
    'MID24' => 'TAGAGACGAG',
    'MID25' => 'TCGTCGCTCG',
    'MID26' => 'ACATACGCGT',
    'MID27' => 'ACGCGAGTAT',
    'MID28' => 'ACTACTATGT',
    'MID68' => 'TCGCTGCGTA',
    'MID30' => 'AGACTATACT',
    'MID31' => 'AGCGTCGTCT',
    'MID32' => 'AGTACGCTAT',
    'MID33' => 'ATAGAGTACT',
    'MID34' => 'CACGCTACGT',
    'MID35' => 'CAGTAGACGT',
    'MID36' => 'CGACGTGACT',
    'MID37' => 'TACACACACT',
    'MID38' => 'TACACGTGAT',
    'MID39' => 'TACAGATCGT',
    'MID40' => 'TACGCTGTCT',
    'MID69' => 'TCTGACGTCA',
    'MID42' => 'TCGATCACGT',
    'MID43' => 'TCGCACTAGT',
    'MID44' => 'TCTAGCGACT',
    'MID45' => 'TCTATACTAT',
    'MID46' => 'TGACGTATGT',
    'MID47' => 'TGTGAGTAGT',
    'MID48' => 'ACAGTATATA',
    'MID49' => 'ACGCGATCGA',
    'MID50' => 'ACTAGCAGTA',
    'MID67' => 'TCGATAGTGA'
  }.freeze

  # Find mids using blast+ as an external tool.
  #
  # seqs is a list of [name, fasta, qual, comments] entries. Every
  # sequence is blasted against db/mids.fasta and then handed to
  # parse_seq, which modifies fasta, qual and comments in place.
  # Prints the elapsed time and the time per sequence.
  def find_mid_with_blast(seqs)
    t = Time.now

    # Create blast machine against the mid database
    db_path = File.expand_path(File.join(File.dirname(__FILE__), 'db/mids.fasta'))
    blast = BatchBlast.new("-db #{db_path}", 'blastn', " -task blastn-short -perc_identity 95 -max_target_seqs 4 ") #get mids

    # build fastas to blast: a ">name" header line followed by the sequence
    fastas = []
    seqs.each do |name, fasta, _qual, _comments|
      fastas.push ">" + name
      fastas.push fasta
    end

    # execute blast
    blast_table_results = blast.do_blast(fastas)

    # pair each sequence with the blast query at the same position
    seqs.each_with_index do |(name, fasta, qual, comments), i|
      parse_seq(blast_table_results.querys[i], name, fasta, qual, comments)
    end

    elapsed = Time.now - t

    puts "T:#{elapsed}, rate#{elapsed/seqs.count}"
  end

  # Parse the blast results of one sequence and remove the found MID.
  #
  # Uses the first hit whose alignment is longer than 1: its subject_id
  # is appended to comments, and fasta and qual are cut in place so that
  # they start right after the end of the alignment.
  def parse_seq(query, name, fasta, qual, comments)
    query.hits.each do |found_mid|
      next unless found_mid.align_len > 1

      # modify comments by appending removed mid
      comments << found_mid.subject_id

      # number of leading positions to drop
      cut = found_mid.q_beg + found_mid.align_len

      # keep fasta and qual from the end of the match onwards
      fasta.slice!(0, cut)
      qual.slice!(0, cut)
      break
    end
  end

  # Find mids without blast, comparing the start of each sequence
  # against KNOWN_MIDS with String#lcs.
  #
  # seqs is a list of [name, fasta, qual, comment] entries. For the
  # first MID sharing more than 5 characters with the first 21
  # characters of the sequence, fasta and qual are cut in place after
  # the match and "mid_name <name>\n" is appended to comment.
  # Prints the elapsed time and the time per sequence.
  def find_mid_without_blast(seqs)
    t = Time.now

    seqs.each do |name, fasta, qual, comment|
      # only the beginning of the sequence is searched; fasta is not
      # modified until a match is found, so this is computed once
      head = fasta[0..20]

      KNOWN_MIDS.each do |mid_name, mid|
        found_mid = head.lcs(mid)
        next unless found_mid.length > 5

        pos = head.index(found_mid)

        # keep fasta and qual from the end of the match onwards
        fasta.slice!(0, pos + found_mid.length)
        qual.slice!(0, pos + found_mid.length)

        comment << "mid_name #{mid_name}\n"
        break
      end
    end

    elapsed = Time.now - t

    puts "T:#{elapsed}, rate#{elapsed/seqs.count}"
  end

  # Run a loop of additions to simulate a CPU-bound task.
  #
  # numer_of_calcs is the number of iterations (250000 by default).
  # Prints a progress line every 100000 iterations and finally the
  # elapsed time. The progress line no longer interpolates a thread
  # identifier: the variable it used was never defined, so the method
  # raised NameError on its first iteration.
  def do_dummy_calculation(numer_of_calcs = 250_000)
    t = Time.now

    x1 = 1
    x2 = 1

    # do a loop with calculations
    numer_of_calcs.times do |i|
      x1, x2 = x2, x1 + x2

      # puts some info at regular intervals
      puts "Calculated #{i}" if (i % 100000) == 0
    end

    puts Time.now - t
  end

end