scbi_mapreduce 0.0.40 → 0.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.txt +22 -0
  5. data/{README.rdoc → README.md} +0 -0
  6. data/Rakefile +8 -28
  7. data/lib/scbi_mapreduce.rb +2 -10
  8. data/lib/scbi_mapreduce/main_worker.rb +20 -6
  9. data/lib/scbi_mapreduce/manager.rb +4 -0
  10. data/lib/scbi_mapreduce/version.rb +3 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +22 -0
  12. data/lib/scbi_mapreduce/worker_launcher.rb +34 -0
  13. data/scbi_mapreduce.gemspec +27 -0
  14. data/skeleton/.DS_Store +0 -0
  15. data/skeleton/dummy_calcs/.DS_Store +0 -0
  16. data/skeleton/old/dummy_calculations/README.txt +25 -0
  17. data/skeleton/old/dummy_calculations/lib/calculations.rb +37 -0
  18. data/skeleton/old/dummy_calculations/lib/thread_pool.rb +107 -0
  19. data/skeleton/old/dummy_calculations/main.rb +59 -0
  20. data/skeleton/old/dummy_calculations/my_worker.rb +56 -0
  21. data/skeleton/old/dummy_calculations/my_worker_manager.rb +52 -0
  22. data/skeleton/old/dummy_calculations/threads_implementation.rb +29 -0
  23. data/skeleton/old/sequences_blast/README.txt +31 -0
  24. data/{test/drb_test/main.rb → skeleton/old/sequences_blast/launch_only_workers.rb} +6 -10
  25. data/skeleton/old/sequences_blast/lib/db/mids.fasta +64 -0
  26. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nhr +0 -0
  27. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nin +0 -0
  28. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nog +0 -0
  29. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsd +48 -0
  30. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsi +0 -0
  31. data/skeleton/old/sequences_blast/lib/db/mids.fasta.nsq +0 -0
  32. data/skeleton/old/sequences_blast/lib/find_mids.rb +134 -0
  33. data/skeleton/old/sequences_blast/lib/thread_pool.rb +107 -0
  34. data/skeleton/old/sequences_blast/linear_implementation.rb +86 -0
  35. data/skeleton/old/sequences_blast/logs/worker0_osiris-2.local_log.txt +13 -0
  36. data/skeleton/old/sequences_blast/logs/worker1_osiris-2.local_log.txt +13 -0
  37. data/skeleton/old/sequences_blast/main.rb +63 -0
  38. data/skeleton/old/sequences_blast/my_worker.rb +58 -0
  39. data/skeleton/old/sequences_blast/my_worker_manager.rb +60 -0
  40. data/skeleton/old/sequences_blast/results.fastq +3996 -0
  41. data/skeleton/old/sequences_blast/test_threads.rb +32 -0
  42. data/skeleton/old/sequences_blast/threads_implementation.rb +108 -0
  43. data/skeleton/remove_mids/lib/db/mids.fasta.nhr +0 -0
  44. data/skeleton/remove_mids/lib/db/mids.fasta.nin +0 -0
  45. data/skeleton/remove_mids/lib/db/mids.fasta.nog +0 -0
  46. data/skeleton/remove_mids/lib/db/mids.fasta.nsd +120 -0
  47. data/skeleton/remove_mids/lib/db/mids.fasta.nsi +0 -0
  48. data/skeleton/remove_mids/lib/db/mids.fasta.nsq +0 -0
  49. data/{.gemtest → skeleton/remove_mids/results.fastq558292} +0 -0
  50. data/skeleton/remove_mids/results.fastq662870 +3996 -0
  51. data/skeleton/simple/launch_only_workers.rb +29 -0
  52. metadata +102 -110
  53. data/History.txt +0 -93
  54. data/Manifest.txt +0 -47
  55. data/PostInstall.txt +0 -7
  56. data/script/console +0 -10
  57. data/script/destroy +0 -14
  58. data/script/generate +0 -14
  59. data/test/drb_test/my_worker.rb +0 -36
  60. data/test/drb_test/my_worker_manager.rb +0 -41
  61. data/test/drb_test/scbi_drb_checkpoint +0 -1
  62. data/test/drb_test/scbi_mapreduce_checkpoint +0 -1
  63. data/test/test_helper.rb +0 -3
  64. data/test/test_scbi_drb.rb +0 -11
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.join(File.dirname(__FILE__))
4
+
5
+ # load required libraries
6
+ require 'scbi_mapreduce'
7
+ require 'my_worker_manager'
8
+
9
# listen on all ips at port 50000
ip = '0.0.0.0'
port = 50000

# Default number of workers. Instead of a number you can also provide an
# array with worker names. Those worker names can be read from a file
# produced by the existing queue system, if any.
workers = 4

# Optional first command-line argument: either the path of a file holding
# one worker name per line, or a plain number of workers.
input_workers = ARGV.shift
if input_workers
  # File.exist? is used on purpose: the File.exists? alias was removed in
  # Ruby 3.2 and raises NoMethodError there.
  if File.exist?(input_workers)
    # read workers into an array, one name per line
    workers = File.read(input_workers).split("\n").map { |w| w.chomp }
  else
    # the argument is a number of workers
    workers = input_workers.to_i
  end
end

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 1

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
@@ -0,0 +1,56 @@
1
+ $: << File.join(File.dirname(__FILE__),'lib')
2
+
3
+ require 'calculations'
4
+ include Calculations
5
+
6
+ # MyWorker defines the behaviour of workers.
7
+ # Here is where the real processing takes place
8
# MyWorker describes how a worker behaves; the real processing of the
# received data is implemented in this class.
class MyWorker < ScbiMapreduce::Worker
  # Called one time at initialization, so you can set up your variables.
  def starting_worker
    # Worker logs can be used at any time, for example:
    # $WORKER_LOG.info "Starting a worker"
  end

  # Called only once, just after the first connection, when the initial
  # parameters are received from the manager.
  #
  # parameters - the initial parameters sent by the manager.
  def receive_initial_config(parameters)
    # Read the parameters here.

    # Worker logs can be used at any time, for example:
    # $WORKER_LOG.info "Params received"

    # Keep the received parameters, if any:
    # @params = parameters
  end

  # Called for each received object. objs is always an array, so iterate
  # over it if every element needs independent processing.
  #
  # Whatever is returned here is what the work_received method of your
  # worker_manager subclass receives.
  def process_object(objs)
    # To handle every element on its own, wrap the call below in:
    # objs.each do |obj|
    #   ...
    # end
    do_dummy_calculations

    # hand objs back to the manager
    objs
  end

  # Left empty in this example.
  def closing_worker
  end
end
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+
3
+ # MyWorkerManager class is used to implement the methods
4
+ # to send and receive the data to or from workers
5
# MyWorkerManager class is used to implement the methods
# to send and receive the data to or from workers
class MyWorkerManager < ScbiMapreduce::WorkManager
  # init_work_manager is executed at the start, prior to any processing.
  # You can use init_work_manager to initialize global variables, open files, etc...
  # Note that an instance of MyWorkerManager will be created for each
  # worker connection, and thus, all global variables here should be
  # class variables (starting with @@)
  def self.init_work_manager
    # number of work units still to be handed out; the dummy calculation
    # is executed in the workers this many times
    @@remaining_data = 1000
  end

  # end_work_manager is executed at the end, when all the process is done.
  # You can use it to close files opened in init_work_manager
  def self.end_work_manager
  end

  # worker_initial_config is used to send initial parameters to workers.
  # The method is executed once per each worker
  def worker_initial_config
  end

  # next_work method is called every time a worker needs a new work.
  # Here you can read data from disk.
  #
  # Returns the work data (here, the number of units still pending,
  # counting down from the initial value to 1), or nil if no more data is
  # available.
  def next_work
    # Check before decrementing so that exactly @@remaining_data units are
    # handed out; decrementing first would skip one unit.
    return nil if @@remaining_data <= 0

    work = @@remaining_data
    @@remaining_data -= 1
    work
  end

  # work_received is executed each time a worker has finished a job.
  # Here you can write results down to disk, perform some aggregated statistics, etc...
  def work_received(results)
    # write_data_to_disk(results)
  end
end
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # load required libraries
4
+
5
+ # modify include path
6
+ $: << File.join(File.dirname(__FILE__),'lib')
7
+
8
+ require 'thread_pool'
9
+ require 'calculations'
10
+ include Calculations
11
+
12
+
13
# ThreadPool built with an argument of 4 (presumably the number of
# threads - confirm against lib/thread_pool.rb)
@pool = ThreadPool.new(4)

# hand the dummy calculation to the pool this many times
times_to_calculate = 1000
times_to_calculate.times { @pool.process { do_dummy_calculations } }

# join the pool between the two progress messages
puts "wait"
@pool.join
puts "final"
29
+
@@ -0,0 +1,31 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is a basic sequence processing template. It processes all
5
+ sequences in fastq_file (a file in FastQ format), removing MIDs from them. It
6
+ needs some external requisites to work:
7
+
8
+ EXTERNAL REQUISITES
9
+ ===================
10
+
11
+ * Blast+ 2.2.24 or greater installed
12
+ * scbi_blast gem installed
13
+
14
+
15
+ At lib/db you can find a preformatted MID database for blast+ (formatted with makeblastdb).
16
+
17
+ You can modify the files to perform more complicated processing.
18
+ There are other templates available, you can list them by issuing this command:
19
+
20
+ scbi_mapreduce
21
+
22
+ You can launch the application right now with the following command:
23
+
24
+ ruby main.rb fastq_file
25
+
26
+ A server and some workers will be launched, and all sequences in fastq_file will
27
+ be processed in blocks of 100 sequences.
28
+
29
+ A sequential example is also provided, you can launch it by issuing:
30
+
31
+ ruby linear_implementation.rb fastq_file
@@ -4,7 +4,7 @@ $: << File.dirname(__FILE__)
4
4
 
5
5
  require "logger"
6
6
 
7
- $: << File.expand_path('../../lib')
7
+ $: << '/Users/dariogf/progs/ruby/gems/scbi_mapreduce/lib'
8
8
 
9
9
  require 'scbi_mapreduce'
10
10
  require 'my_worker_manager'
@@ -13,21 +13,17 @@ require 'my_worker_manager'
13
13
  $LOG = Logger.new(STDOUT)
14
14
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
15
15
 
16
- ip='0.0.0.0'
16
+ ip='10.247.255.5'
17
17
  port = 50000
18
18
  workers = 8
19
19
 
20
- workers = File.expand_path('workers')
21
-
22
20
  custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')
23
21
 
24
22
  $LOG.info 'Starting server'
25
23
 
26
- MyWorkerManager.init_work_manager
24
+
25
+ worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip,port, workers, custom_worker_file, STDOUT)
26
+ worker_launcher.launch_workers_and_wait
27
27
 
28
28
  # launch processor server
29
- mgr = ScbiMapreduce::Manager.new(ip,port, workers, MyWorkerManager,custom_worker_file, STDOUT)
30
- # mgr.checkpointing=false
31
- # mgr.keep_order=true
32
- mgr.start_server
33
- $LOG.info 'Closing server'
29
+ $LOG.info 'Closing workers'
@@ -0,0 +1,64 @@
1
+ >RL1
2
+ ACACGACGACT
3
+ >RL2
4
+ ACACGTAGTAT
5
+ >RL3
6
+ ACACTACTCGT
7
+ >RL4
8
+ ACGACACGTAT
9
+ >RL5
10
+ ACGAGTAGACT
11
+ >RL6
12
+ ACGCGTCTAGT
13
+ >RL7
14
+ ACGTACACACT
15
+ >RL8
16
+ ACGTACTGTGT
17
+ >RL9
18
+ ACGTAGATCGT
19
+ >RL10
20
+ ACTACGTCTCT
21
+ >RL11
22
+ ACTATACGAGT
23
+ >RL12
24
+ ACTCGCGTCGT
25
+
26
+
27
+ >MID1
28
+ ACGAGTGCGT
29
+
30
+ >MID2
31
+ ACGCTCGACA
32
+
33
+ >MID3
34
+ AGACGCACTC
35
+
36
+ >MID4
37
+ AGCACTGTAG
38
+
39
+ >MID5
40
+ ATCAGACACG
41
+
42
+ >MID6
43
+ ATATCGCGAG
44
+
45
+ >MID7
46
+ CGTGTCTCTA
47
+
48
+ >MID8
49
+ CTCGCGTGTC
50
+
51
+ >MID9
52
+ TAGTATCAGC
53
+
54
+ >MID10
55
+ TCTCTATGCG
56
+
57
+ >MID11
58
+ TGATACGTCT
59
+
60
+ >MID12
61
+ TACTGAGCTA
62
+
63
+
64
+
@@ -0,0 +1,48 @@
1
+ lcl|mid112
2
+ lcl|mid1021
3
+ lcl|mid1122
4
+ lcl|mid1223
5
+ lcl|mid213
6
+ lcl|mid314
7
+ lcl|mid415
8
+ lcl|mid516
9
+ lcl|mid617
10
+ lcl|mid718
11
+ lcl|mid819
12
+ lcl|mid920
13
+ lcl|rl10
14
+ lcl|rl109
15
+ lcl|rl1110
16
+ lcl|rl1211
17
+ lcl|rl21
18
+ lcl|rl32
19
+ lcl|rl43
20
+ lcl|rl54
21
+ lcl|rl65
22
+ lcl|rl76
23
+ lcl|rl87
24
+ lcl|rl98
25
+ mid112
26
+ mid1021
27
+ mid1122
28
+ mid1223
29
+ mid213
30
+ mid314
31
+ mid415
32
+ mid516
33
+ mid617
34
+ mid718
35
+ mid819
36
+ mid920
37
+ rl10
38
+ rl109
39
+ rl1110
40
+ rl1211
41
+ rl21
42
+ rl32
43
+ rl43
44
+ rl54
45
+ rl65
46
+ rl76
47
+ rl87
48
+ rl98
@@ -0,0 +1,134 @@
1
+ require 'scbi_blast'
2
+ # require 'json'
3
+
4
+ # Module to find Mids in a set of sequences
5
# Module to find MIDs in a set of sequences
module FindMids
  # Find MIDs using blast+ as an external tool.
  #
  # seqs - an enumerable of [name, fasta, qual, comments] entries. fasta,
  #        qual and comments are modified in place by parse_seq when a
  #        MID is found.
  #
  # Prints the inspected blast results and the elapsed time.
  def find_mid_with_blast(seqs)
    t = Time.now

    # Create blast machine against the MID database stored next to this file
    db_path = File.expand_path(File.join(File.dirname(__FILE__), 'db/mids.fasta'))
    blast = BatchBlast.new("-db #{db_path}", 'blastn', " -task blastn-short -perc_identity 95 -max_target_seqs 4 ") # get mids

    # build fastas to blast: a ">name" line followed by the sequence
    fastas = []
    seqs.each do |name, fasta, _qual, _comments|
      fastas.push ">" + name
      fastas.push fasta
    end

    # execute blast
    blast_table_results = blast.do_blast(fastas)

    puts blast_table_results.inspect

    # Iterate over blast results and sequences; the i-th query is paired
    # with the i-th sequence
    seqs.each_with_index do |(name, fasta, qual, comments), i|
      parse_seq(blast_table_results.querys[i], name, fasta, qual, comments)
    end

    puts Time.now - t
  end

  # Parse the blast results of one sequence and remove the first MID found.
  #
  # query    - object whose hits respond to align_len, q_beg and subject_id.
  # name     - sequence name (not used here).
  # fasta    - sequence; trimmed in place.
  # qual     - qualities; trimmed in place by the same amount as fasta.
  # comments - receives the subject_id of the removed MID via <<.
  #
  # Only the first hit with an alignment longer than 7 is applied.
  def parse_seq(query, name, fasta, qual, comments)
    query.hits.each do |found_mid|
      next unless found_mid.align_len > 7

      # modify comments by appending removed mid
      comments << found_mid.subject_id

      # drop everything up to the end of the alignment
      cut_length = found_mid.q_beg + found_mid.align_len

      # keep fasta from the cut position to the end
      fasta.slice!(0, cut_length)

      # keep qual from the cut position to the end
      qual.slice!(0, cut_length)
      break
    end
  end

  # Find and remove MIDs with a plain string search instead of blast.
  #
  # seqs - an enumerable of [name, fasta, qual, comment] entries; fasta and
  #        qual are trimmed in place up to the end of each MID found.
  #
  # The search is case-insensitive on the sequence (it is upcased before
  # searching). MIDs are tried in the order listed below, against every
  # sequence in turn.
  def find_mid_without_blast(seqs)
    # those are the mids found in database
    mids = {
      'RL1' => 'ACACGACGACT',
      'RL2' => 'ACACGTAGTAT',
      'RL3' => 'ACACTACTCGT',
      'RL4' => 'ACGACACGTAT',
      'RL5' => 'ACGAGTAGACT',
      'RL6' => 'ACGCGTCTAGT',
      'RL7' => 'ACGTACACACT',
      'RL8' => 'ACGTACTGTGT',
      'RL9' => 'ACGTAGATCGT',
      'RL10' => 'ACTACGTCTCT',
      'RL11' => 'ACTATACGAGT',
      'RL12' => 'ACTCGCGTCGT',
      'MID1' => 'ACGAGTGCGT',
      'MID2' => 'ACGCTCGACA',
      'MID3' => 'AGACGCACTC',
      'MID4' => 'AGCACTGTAG',
      'MID5' => 'ATCAGACACG',
      'MID6' => 'ATATCGCGAG',
      'MID7' => 'CGTGTCTCTA',
      'MID8' => 'CTCGCGTGTC',
      'MID9' => 'TAGTATCAGC',
      'MID10' => 'TCTCTATGCG',
      'MID11' => 'TGATACGTCT',
      'MID12' => 'TACTGAGCTA'
    }

    mids.each do |_mid_name, mid|
      seqs.each do |_name, fasta, qual, _comment|
        # find a known MID position
        pos = fasta.upcase.index(mid)
        next unless pos

        # keep fasta from the end of the MID onwards
        fasta.slice!(0, pos + mid.length)

        # keep qual from the end of the MID onwards
        qual.slice!(0, pos + mid.length)
      end
    end
  end

  # Run a loop of additions to simulate a workload.
  #
  # number_of_calcs - how many iterations to run (defaults to 250000, the
  #                   value that was previously hard-coded).
  #
  # Prints a progress line every 100000 iterations (including iteration 0)
  # and the elapsed time at the end.
  def do_dummy_calculation(number_of_calcs = 250000)
    t = Time.now

    x1 = 1
    x2 = 1

    # do a loop with calculations
    number_of_calcs.times do |i|
      x1, x2 = x2, x1 + x2

      # puts some info at regular intervals; the current thread is
      # identified by its object_id (the message used to interpolate an
      # undefined variable, which raised NameError on the first iteration)
      if (i % 100000) == 0
        puts "Calculated #{i} by thread #{Thread.current.object_id}"
      end
    end
    puts Time.now - t
  end
end