scbi_mapreduce 0.0.40 → 0.0.45

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.txt +22 -0
  5. data/{README.rdoc → README.md} +0 -0
  6. data/Rakefile +8 -28
  7. data/lib/scbi_mapreduce.rb +2 -10
  8. data/lib/scbi_mapreduce/main_worker.rb +20 -6
  9. data/lib/scbi_mapreduce/manager.rb +4 -0
  10. data/lib/scbi_mapreduce/version.rb +3 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +22 -0
  12. data/lib/scbi_mapreduce/worker_launcher.rb +34 -0
  13. data/scbi_mapreduce.gemspec +27 -0
  14. data/skeleton/.DS_Store +0 -0
  15. data/skeleton/dummy_calcs/.DS_Store +0 -0
  16. data/skeleton/old/dummy_calculations/README.txt +25 -0
  17. data/skeleton/old/dummy_calculations/lib/calculations.rb +37 -0
  18. data/skeleton/old/dummy_calculations/lib/thread_pool.rb +107 -0
  19. data/skeleton/old/dummy_calculations/main.rb +59 -0
  20. data/skeleton/old/dummy_calculations/my_worker.rb +56 -0
  21. data/skeleton/old/dummy_calculations/my_worker_manager.rb +52 -0
  22. data/skeleton/old/dummy_calculations/threads_implementation.rb +29 -0
  23. data/skeleton/old/sequences_blast/README.txt +31 -0
  24. data/{test/drb_test/main.rb → skeleton/old/sequences_blast/launch_only_workers.rb} +6 -10
  25. data/skeleton/old/sequences_blast/lib/db/mids.fasta +64 -0
  26. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nhr +0 -0
  27. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nin +0 -0
  28. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nog +0 -0
  29. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsd +48 -0
  30. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsi +0 -0
  31. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsq +0 -0
  32. data/skeleton/old/sequences_blast/lib/find_mids.rb +134 -0
  33. data/skeleton/old/sequences_blast/lib/thread_pool.rb +107 -0
  34. data/skeleton/old/sequences_blast/linear_implementation.rb +86 -0
  35. data/skeleton/old/sequences_blast/logs/worker0_osiris-2.local_log.txt +13 -0
  36. data/skeleton/old/sequences_blast/logs/worker1_osiris-2.local_log.txt +13 -0
  37. data/skeleton/old/sequences_blast/main.rb +63 -0
  38. data/skeleton/old/sequences_blast/my_worker.rb +58 -0
  39. data/skeleton/old/sequences_blast/my_worker_manager.rb +60 -0
  40. data/skeleton/old/sequences_blast/results.fastq +3996 -0
  41. data/skeleton/old/sequences_blast/test_threads.rb +32 -0
  42. data/skeleton/old/sequences_blast/threads_implementation.rb +108 -0
  43. data/skeleton/remove_mids/lib/db/mids.fasta.nhr +0 -0
  44. data/skeleton/remove_mids/lib/db/mids.fasta.nin +0 -0
  45. data/skeleton/remove_mids/lib/db/mids.fasta.nog +0 -0
  46. data/skeleton/remove_mids/lib/db/mids.fasta.nsd +120 -0
  47. data/skeleton/remove_mids/lib/db/mids.fasta.nsi +0 -0
  48. data/skeleton/remove_mids/lib/db/mids.fasta.nsq +0 -0
  49. data/{.gemtest → skeleton/remove_mids/results.fastq558292} +0 -0
  50. data/skeleton/remove_mids/results.fastq662870 +3996 -0
  51. data/skeleton/simple/launch_only_workers.rb +29 -0
  52. metadata +102 -110
  53. data/History.txt +0 -93
  54. data/Manifest.txt +0 -47
  55. data/PostInstall.txt +0 -7
  56. data/script/console +0 -10
  57. data/script/destroy +0 -14
  58. data/script/generate +0 -14
  59. data/test/drb_test/my_worker.rb +0 -36
  60. data/test/drb_test/my_worker_manager.rb +0 -41
  61. data/test/drb_test/scbi_drb_checkpoint +0 -1
  62. data/test/drb_test/scbi_mapreduce_checkpoint +0 -1
  63. data/test/test_helper.rb +0 -3
  64. data/test/test_scbi_drb.rb +0 -11
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.join(File.dirname(__FILE__))
4
+
5
+ # load required libraries
6
+ require 'scbi_mapreduce'
7
+ require 'my_worker_manager'
8
+
9
+ # listen on all ips at port 50000
10
+ ip='0.0.0.0'
11
+ port = 50000
12
+
13
+ # set number of workers. You can also provide an array with worker names.
14
+ # Those worker names can be read from a file produced by the existing
15
+ # queue system, if any.
16
+
17
+
18
+
19
+ workers = 4
20
+
21
+ # read optional workers parameter
22
+ input_workers = ARGV.shift
23
+ if !input_workers.nil?
24
+ # if it is a file
25
+ if File.exists?(input_workers)
26
+ # read workers into array
27
+ workers=File.read(input_workers).split("\n").map{|w| w.chomp}
28
+ else
29
+ # workers is a number
30
+ workers = input_workers.to_i
31
+ end
32
+ end
33
+
34
+ # we need the path to my_worker in order to launch it when necessary
35
+ custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')
36
+
37
+ # initialize the work manager. Here you can pass parameters like file names
38
+ MyWorkerManager.init_work_manager
39
+
40
+ # launch processor server
41
+ mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)
42
+
43
+ # you can set additional properties
44
+ # =================================
45
+
46
+ # if you want basic checkpointing. Some performance drop should be expected
47
+ # mgr.checkpointing=true
48
+
49
+ # if you want to keep the order of input data. Some performance drop should be expected
50
+ # mgr.keep_order=true
51
+
52
+ # you can set the size of packets of data sent to workers
53
+ mgr.chunk_size=1
54
+
55
+ # start processing
56
+ mgr.start_server
57
+
58
+ # this line is reached when all data has been processed
59
+ puts "Program finished"
@@ -0,0 +1,56 @@
1
+ $: << File.join(File.dirname(__FILE__),'lib')
2
+
3
+ require 'calculations'
4
+ include Calculations
5
+
6
+ # MyWorker defines the behaviour of workers.
7
+ # Here is where the real processing takes place
8
+ class MyWorker < ScbiMapreduce::Worker
9
+
10
+ # starting_worker method is called one time at initialization
11
+ # and allows you to initialize your variables
12
+ def starting_worker
13
+
14
+ # You can use worker logs at any time in this way:
15
+ # $WORKER_LOG.info "Starting a worker"
16
+
17
+ end
18
+
19
+
20
+ # receive_initial_config is called only once just after
21
+ # the first connection, when initial parameters are
22
+ # received from manager
23
+ def receive_initial_config(parameters)
24
+
25
+ # Reads the parameters
26
+
27
+ # You can use worker logs at any time in this way:
28
+ # $WORKER_LOG.info "Params received"
29
+
30
+ # save received parameters, if any
31
+ # @params = parameters
32
+ end
33
+
34
+
35
+ # process_object method is called for each received object.
36
+ # Be aware that objs is always an array, and you must iterate
37
+ # over it if you need to process each element independently
38
+ #
39
+ # The value returned here will be received by the work_received
40
+ # method at your worker_manager subclass.
41
+ def process_object(objs)
42
+
43
+ # iterate over all objects received
44
+ # objs.each do |obj|
45
+ # run the dummy calculations
46
+ do_dummy_calculations
47
+ # end
48
+
49
+ # return objs back to manager
50
+ return objs
51
+ end
52
+
53
+ def closing_worker
54
+
55
+ end
56
+ end
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+
3
+ # MyWorkerManager class is used to implement the methods
4
+ # to send and receive the data to or from workers
5
+ class MyWorkerManager < ScbiMapreduce::WorkManager
6
+
7
+ # init_work_manager is executed at the start, prior to any processing.
8
+ # You can use init_work_manager to initialize global variables, open files, etc...
9
+ # Note that an instance of MyWorkerManager will be created for each
10
+ # worker connection, and thus, all global variables here should be
11
+ # class variables (starting with @@)
12
+ def self.init_work_manager
13
+
14
+ # countdown of pending work units; next_work decrements @@remaining_data and returns nil once it reaches 0
15
+ @@remaining_data = 1000
16
+ end
17
+
18
+ # end_work_manager is executed at the end, when all processing is done.
19
+ # You can use it to close files opened in init_work_manager
20
+ def self.end_work_manager
21
+
22
+ end
23
+
24
+ # worker_initial_config is used to send initial parameters to workers.
25
+ # The method is executed once for each worker
26
+ def worker_initial_config
27
+
28
+ end
29
+
30
+ # next_work method is called every time a worker needs new work
31
+ # Here you can read data from disk
32
+ # This method must return the work data or nil if no more data is available
33
+ def next_work
34
+ @@remaining_data -= 1
35
+
36
+ e = @@remaining_data
37
+
38
+ e = nil if @@remaining_data<=0
39
+
40
+ return e
41
+
42
+ end
43
+
44
+
45
+ # work_received is executed each time a worker has finished a job.
46
+ # Here you can write results down to disk, perform some aggregated statistics, etc...
47
+ def work_received(results)
48
+
49
+ # write_data_to_disk(results)
50
+ end
51
+
52
+ end
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # load required libraries
4
+
5
+ # modify include path
6
+ $: << File.join(File.dirname(__FILE__),'lib')
7
+
8
+ require 'thread_pool'
9
+ require 'calculations'
10
+ include Calculations
11
+
12
+
13
+ @pool=ThreadPool.new(4)
14
+
15
+ # process
16
+
17
+ times_to_calculate=1000
18
+
19
+ times_to_calculate.times do
20
+
21
+ @pool.process {do_dummy_calculations}
22
+
23
+ end
24
+
25
+
26
+ puts "wait"
27
+ @pool.join
28
+ puts "final"
29
+
@@ -0,0 +1,31 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is a basic sequence processing template. It processes all
5
+ sequences in fastq_file (a file in FastQ format), removing the MIDs from them. It
6
+ needs some external requisites to work:
7
+
8
+ EXTERNAL REQUISITES
9
+ ===================
10
+
11
+ * Blast+ 2.2.24 or greater installed
12
+ * scbi_blast gem installed
13
+
14
+
15
+ At lib/db you can find a preformatted MID database for blast+ (formatted with makeblastdb).
16
+
17
+ You can modify the files to perform more complicated processing.
18
+ There are other templates available; you can list them by issuing this command:
19
+
20
+ scbi_mapreduce
21
+
22
+ You can launch the application right now with the following command:
23
+
24
+ ruby main.rb fastq_file
25
+
26
+ A server and some workers will be launched, and all sequences in fastq_file will
27
+ be processed in blocks of 100 sequences.
28
+
29
+ A sequential example is also provided; you can launch it by issuing:
30
+
31
+ ruby linear_implementation.rb fastq_file
@@ -4,7 +4,7 @@ $: << File.dirname(__FILE__)
4
4
 
5
5
  require "logger"
6
6
 
7
- $: << File.expand_path('../../lib')
7
+ $: << '/Users/dariogf/progs/ruby/gems/scbi_mapreduce/lib'
8
8
 
9
9
  require 'scbi_mapreduce'
10
10
  require 'my_worker_manager'
@@ -13,21 +13,17 @@ require 'my_worker_manager'
13
13
  $LOG = Logger.new(STDOUT)
14
14
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
15
15
 
16
- ip='0.0.0.0'
16
+ ip='10.247.255.5'
17
17
  port = 50000
18
18
  workers = 8
19
19
 
20
- workers = File.expand_path('workers')
21
-
22
20
  custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')
23
21
 
24
22
  $LOG.info 'Starting server'
25
23
 
26
- MyWorkerManager.init_work_manager
24
+
25
+ worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip,port, workers, custom_worker_file, STDOUT)
26
+ worker_launcher.launch_workers_and_wait
27
27
 
28
28
  # launch processor server
29
- mgr = ScbiMapreduce::Manager.new(ip,port, workers, MyWorkerManager,custom_worker_file, STDOUT)
30
- # mgr.checkpointing=false
31
- # mgr.keep_order=true
32
- mgr.start_server
33
- $LOG.info 'Closing server'
29
+ $LOG.info 'Closing workers'
@@ -0,0 +1,64 @@
1
+ >RL1
2
+ ACACGACGACT
3
+ >RL2
4
+ ACACGTAGTAT
5
+ >RL3
6
+ ACACTACTCGT
7
+ >RL4
8
+ ACGACACGTAT
9
+ >RL5
10
+ ACGAGTAGACT
11
+ >RL6
12
+ ACGCGTCTAGT
13
+ >RL7
14
+ ACGTACACACT
15
+ >RL8
16
+ ACGTACTGTGT
17
+ >RL9
18
+ ACGTAGATCGT
19
+ >RL10
20
+ ACTACGTCTCT
21
+ >RL11
22
+ ACTATACGAGT
23
+ >RL12
24
+ ACTCGCGTCGT
25
+
26
+
27
+ >MID1
28
+ ACGAGTGCGT
29
+
30
+ >MID2
31
+ ACGCTCGACA
32
+
33
+ >MID3
34
+ AGACGCACTC
35
+
36
+ >MID4
37
+ AGCACTGTAG
38
+
39
+ >MID5
40
+ ATCAGACACG
41
+
42
+ >MID6
43
+ ATATCGCGAG
44
+
45
+ >MID7
46
+ CGTGTCTCTA
47
+
48
+ >MID8
49
+ CTCGCGTGTC
50
+
51
+ >MID9
52
+ TAGTATCAGC
53
+
54
+ >MID10
55
+ TCTCTATGCG
56
+
57
+ >MID11
58
+ TGATACGTCT
59
+
60
+ >MID12
61
+ TACTGAGCTA
62
+
63
+
64
+
@@ -0,0 +1,48 @@
1
+ lcl|mid112
2
+ lcl|mid1021
3
+ lcl|mid1122
4
+ lcl|mid1223
5
+ lcl|mid213
6
+ lcl|mid314
7
+ lcl|mid415
8
+ lcl|mid516
9
+ lcl|mid617
10
+ lcl|mid718
11
+ lcl|mid819
12
+ lcl|mid920
13
+ lcl|rl10
14
+ lcl|rl109
15
+ lcl|rl1110
16
+ lcl|rl1211
17
+ lcl|rl21
18
+ lcl|rl32
19
+ lcl|rl43
20
+ lcl|rl54
21
+ lcl|rl65
22
+ lcl|rl76
23
+ lcl|rl87
24
+ lcl|rl98
25
+ mid112
26
+ mid1021
27
+ mid1122
28
+ mid1223
29
+ mid213
30
+ mid314
31
+ mid415
32
+ mid516
33
+ mid617
34
+ mid718
35
+ mid819
36
+ mid920
37
+ rl10
38
+ rl109
39
+ rl1110
40
+ rl1211
41
+ rl21
42
+ rl32
43
+ rl43
44
+ rl54
45
+ rl65
46
+ rl76
47
+ rl87
48
+ rl98
@@ -0,0 +1,134 @@
1
+ require 'scbi_blast'
2
+ # require 'json'
3
+
4
+ # Module to find Mids in a set of sequences
5
+ module FindMids
6
+
7
+ # find mids using blast+ as an external tool
8
+ def find_mid_with_blast(seqs)
9
+ t=Time.now
10
+
11
+ # Create blast machine against the MID database
12
+ blast = BatchBlast.new("-db #{File.expand_path(File.join(File.dirname(__FILE__),'db/mids.fasta'))}",'blastn'," -task blastn-short -perc_identity 95 -max_target_seqs 4 ") #get mids
13
+
14
+ # build fastas to blast
15
+ fastas=[]
16
+
17
+ seqs.each do |name,fasta,qual,comments|
18
+ fastas.push ">"+name
19
+ fastas.push fasta
20
+ end
21
+
22
+ # execute blast
23
+ blast_table_results = blast.do_blast(fastas)
24
+
25
+ puts blast_table_results.inspect
26
+
27
+ # Iterate over blast results and sequences
28
+ i=0
29
+ seqs.each do |name,fasta,qual,comments|
30
+ parse_seq(blast_table_results.querys[i],name,fasta,qual,comments)
31
+ i+=1
32
+ end
33
+
34
+ puts Time.now-t
35
+
36
+ end
37
+
38
+
39
+ # parse blast results and sequences to remove found MIDS
40
+ def parse_seq(query,name,fasta,qual,comments)
41
+
42
+ query.hits.each do |found_mid|
43
+
44
+ if found_mid.align_len>7
45
+
46
+ # modify comments by appending removed mid
47
+ comments << found_mid.subject_id
48
+
49
+ # keep fasta from pos to end
50
+ fasta.slice!(0, found_mid.q_beg + found_mid.align_len)
51
+
52
+ # keep qual from pos to end
53
+ qual.slice!(0, found_mid.q_beg + found_mid.align_len)
54
+ break
55
+ end
56
+ end
57
+ end
58
+
59
+ def find_mid_without_blast(seqs)
60
+ # these are the MIDs contained in the database
61
+
62
+ mids={}
63
+ mids['RL1']='ACACGACGACT'
64
+ mids['RL2']='ACACGTAGTAT'
65
+ mids['RL3']='ACACTACTCGT'
66
+ mids['RL4']='ACGACACGTAT'
67
+ mids['RL5']='ACGAGTAGACT'
68
+ mids['RL6']='ACGCGTCTAGT'
69
+ mids['RL7']='ACGTACACACT'
70
+ mids['RL8']='ACGTACTGTGT'
71
+ mids['RL9']='ACGTAGATCGT'
72
+ mids['RL10']='ACTACGTCTCT'
73
+ mids['RL11']='ACTATACGAGT'
74
+ mids['RL12']='ACTCGCGTCGT'
75
+ mids['MID1']='ACGAGTGCGT'
76
+ mids['MID2']='ACGCTCGACA'
77
+ mids['MID3']='AGACGCACTC'
78
+ mids['MID4']='AGCACTGTAG'
79
+ mids['MID5']='ATCAGACACG'
80
+ mids['MID6']='ATATCGCGAG'
81
+ mids['MID7']='CGTGTCTCTA'
82
+ mids['MID8']='CTCGCGTGTC'
83
+ mids['MID9']='TAGTATCAGC'
84
+ mids['MID10']='TCTCTATGCG'
85
+ mids['MID11']='TGATACGTCT'
86
+ mids['MID12']='TACTGAGCTA'
87
+
88
+ mids.each do |mid_name,mid|
89
+
90
+ seqs.each do |name,fasta,qual,comment|
91
+
92
+ # find a known MID position
93
+ pos=fasta.upcase.index(mid)
94
+
95
+ if pos
96
+
97
+ # keep fasta from pos to end
98
+ fasta.slice!(0,pos+mid.length)
99
+
100
+ # keep qual from pos to end
101
+ qual.slice!(0,pos+mid.length)
102
+
103
+ end
104
+ end
105
+ end
106
+
107
+ end
108
+
109
+
110
+ def do_dummy_calculation
111
+ numer_of_calcs=250000
112
+
113
+ t=Time.now
114
+
115
+ x1=1
116
+ x2=1
117
+
118
+ # do a loop with calculations
119
+ numer_of_calcs.times do |i|
120
+ x=x1+x2
121
+
122
+ x1=x2
123
+ x2=x
124
+
125
+ # puts some info at regular intervals
126
+ if (i % 100000)==0
127
+ puts "Calculated #{i} by thread #{n}"
128
+ end
129
+ end
130
+ puts Time.now-t
131
+
132
+ end
133
+
134
+ end