scbi_mapreduce 0.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
@@ -0,0 +1,97 @@
# GMatch is the result record produced by Regexp#global_match: it pairs a
# MatchData with the position, inside the original input string, at which
# the searched slice started.
class GMatch
  # Index in the original input string where the searched slice began.
  # Add it to match.begin(n) / match.end(n) to obtain absolute positions.
  attr_accessor :offset

  # MatchData returned by matching the regexp against the slice.
  attr_accessor :match

  # Both arguments are optional so the historical `GMatch.new()` followed by
  # attribute assignment keeps working.
  #
  # @param offset [Integer, nil] start of the searched slice in the input
  # @param match [MatchData, nil] match found in that slice
  def initialize(offset = nil, match = nil)
    @offset = offset
    @match = match
  end
end
8
+
class String
  # Returns the longest common *substring* (a contiguous run of characters)
  # shared by the receiver and +s2+.
  #
  # When several common substrings share the maximum length, the one whose
  # last character comes first in the receiver is returned. An empty string
  # is returned when there is nothing in common or either string is empty.
  #
  # @param s2 [String] the string to compare against
  # @return [String] the longest common substring, taken from the receiver
  def lcs(s2)
    # each_char keeps every character, including newlines; scanning with /./
    # would skip them and shift all indices.
    own_chars = each_char.to_a
    other_chars = s2.each_char.to_a

    best_len = 0
    best_start = 0
    # prev_row[j] holds the length of the common run ending at the previous
    # character of the receiver and at other_chars[j].
    prev_row = Array.new(other_chars.size, 0)

    own_chars.each_with_index do |own, i|
      row = Array.new(other_chars.size, 0)

      other_chars.each_with_index do |other, j|
        next unless own == other

        run = j.zero? ? 1 : prev_row[j - 1] + 1
        row[j] = run

        # strict comparison: the first run reaching a given length wins
        if run > best_len
          best_len = run
          best_start = i - run + 1
        end
      end

      prev_row = row
    end

    self[best_start, best_len]
  end
end
39
+
40
+
class Regexp
  # Finds successive matches of the regexp in +input_str+.
  #
  # The search is repeated on the remainder of the string: after each match
  # the start position moves forward and the regexp is applied to the slice
  # beginning there. Because every search runs on a slice, anchors and
  # look-behinds see the slice start as the start of the string.
  #
  # @param input_str [String] the string to search
  # @param overlap_group_no [Integer] 0 (default) resumes after the end of
  #   the whole match, giving non-overlapping matches. Any other group
  #   resumes at the beginning of that group, so later matches may overlap
  #   the part of the previous match from that group onwards.
  # @return [Array<GMatch>] one entry per match; +offset+ is where the
  #   searched slice started in +input_str+ and +match+ is the MatchData
  #   relative to that slice
  def global_match(input_str, overlap_group_no = 0)
    res = []
    last_end = 0

    while last_end < input_str.length
      str = input_str.slice(last_end, input_str.length - last_end)

      m = match(str)
      break if m.nil?

      new_match = GMatch.new
      new_match.offset = last_end
      new_match.match = m
      res.push(new_match)

      step = overlap_group_no == 0 ? m.end(0) : m.begin(overlap_group_no)
      # the overlap group may not have taken part in the match
      step = m.end(0) if step.nil?
      # always move forward: a zero-length match, or a group located at the
      # very start of the slice, would otherwise repeat the same search forever
      step = 1 if step < 1

      last_end += step
    end

    res
  end

  # Previous block-based implementation, kept for reference:
  #
  # def global_match(str, &proc)
  #   retval = nil
  #   loop do
  #     res = str.sub(self) do |m|
  #       proc.call($~) # pass MatchData obj
  #       ''
  #     end
  #     break retval if res == str
  #     str = res
  #     retval ||= true
  #   end
  # end
end
@@ -0,0 +1,87 @@
#!/usr/bin/env ruby

# Linear (single process) implementation: reads a FastQ file in chunks,
# runs find_mid_without_blast on each chunk and writes the sequences to
# ./results2.fastq.

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

# modify include path
$: << File.join(File.dirname(__FILE__), 'lib')

require 'find_mids'
include FindMids

# check arguments
if ARGV.count != 2
  puts "Usage #{File.basename($0)} fastq_file chunk"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a KNOWN_MID from it"
  exit
end

fastq_file_path = ARGV[0]

# File.exist? is used because File.exists? is no longer available in current Ruby
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# number of sequences processed per iteration
chunk_size = ARGV[1].to_i

# a non-numeric or non-positive chunk would read nothing and silently
# produce an empty results file
if chunk_size < 1
  puts "Error, chunk must be a positive integer"
  exit 1
end

# make processing

# open files (plain locals: class variables cannot be used at top level)
fastq_file = FastqFile.new(fastq_file_path)
results = FastqFile.new('./results2.fastq', 'w+')

begin
  # iterate over file
  loop do
    seqs = []

    chunk_size.times do
      # read data from file
      name, fasta, qual, comments = fastq_file.next_seq

      # a nil name marks the end of the input
      break if name.nil?

      seqs << [name, fasta, qual, comments]
    end

    break if seqs.empty?

    # process it
    find_mid_without_blast(seqs)

    # # find a known MID position
    # pos=fasta.upcase.index(KNOWN_MID)
    #
    # if pos
    #
    #   # keep fasta from pos to end
    #   fasta.slice!(0,pos + KNOWN_MID.length)
    #   # keep qual from pos to end
    #   qual.slice!(0,pos + KNOWN_MID.length)
    #
    # end
    #

    # write data to disk
    seqs.each do |name, fasta, qual, comments|
      results.write_seq(name, fasta, qual, comments)
    end
  end
ensure
  # close files, even when processing fails midway
  fastq_file.close
  results.close
end
@@ -0,0 +1,89 @@
#!/usr/bin/env ruby

# Launches a ScbiMapreduce manager that distributes the sequences of a
# FastQ file to workers running my_worker.rb.

$: << File.dirname(__FILE__)

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

# Socket.ip_address_list is used below
require 'socket'

require 'my_worker_manager.rb'

# check arguments
if ARGV.count != 3
  puts "Usage #{File.basename($0)} fastq_file workers chunk_size"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a MID (barcode) from it"
  exit
end

fastq_file_path = ARGV.shift

# File.exist? is used because File.exists? is no longer available in current Ruby
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# Prefer a local IPv4 address that starts with server_ip; when there is
# none, listen on all interfaces.
server_ip = '10.243'
ip_list = Socket.ip_address_list.select { |e| e.ipv4? }.map { |e| e.ip_address }

ip = ip_list.find { |address| address.index(server_ip) == 0 }
ip ||= '0.0.0.0'

# NOTE(review): port 0 presumably lets the system choose a free port -
# confirm against ScbiMapreduce::Manager
port = 0

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 4

# read workers parameter
input_workers = ARGV.shift
unless input_workers.nil?
  if File.exist?(input_workers)
    # it is a file: read workers into array, one name per line
    workers = File.read(input_workers).split("\n").map { |w| w.chomp }
  else
    # workers is a number
    workers = input_workers.to_i
  end
end

chunk_size = ARGV.shift.to_i

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager(fastq_file_path)

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT, '~seqtrimnext/init_env')

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = chunk_size

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
@@ -0,0 +1,59 @@
1
+
2
+ # adjust import paths
3
+ $: << File.join(File.dirname(__FILE__),'lib')
4
+
5
+ # load external module
6
+ require 'find_mids'
7
+ include FindMids
8
+
9
+
# MyWorker describes what each worker does with the data it receives;
# the actual processing happens in this class.
class MyWorker < ScbiMapreduce::Worker
  # Called a single time when the worker is initialized. Use it to set up
  # any variables the worker needs.
  def starting_worker
    # Worker logs are available at any point, for example:
    # $WORKER_LOG.info "Starting a worker"
  end

  # Called only once, right after the first connection, when the manager
  # sends the initial parameters.
  def receive_initial_config(parameters)
    # Read the parameters here.

    # Worker logs are available at any point, for example:
    # $WORKER_LOG.info "Params received"

    # Keep the received parameters if they are needed later:
    # @params = parameters
  end

  # Called for every object received from the manager. objs is always an
  # array; iterate over it when the items must be handled one by one.
  #
  # Whatever is returned here reaches the work_received method of the
  # worker_manager subclass.
  def process_object(objs)
    # locate the mid in the sequences
    # (alternative: find_mid_with_blast(objs))
    find_mid_without_blast(objs)

    # hand the modified objs back to the manager
    objs
  end

  def closing_worker
  end
end
@@ -0,0 +1,68 @@
1
+ require 'json'
2
+
# MyWorkerManager implements the methods that hand data to the workers and
# collect what they send back.
class MyWorkerManager < ScbiMapreduce::WorkManager
  # Runs once at start-up, before any processing. Use it to set up shared
  # state or open files. One MyWorkerManager instance is created per worker
  # connection, so anything shared between them has to live in class
  # variables (the ones starting with @@).
  def self.init_work_manager(fastq_file_path)
    # input is read through the scbi_fastq gem
    @@fastq_file = FastqFile.new(fastq_file_path)
    # the microseconds of the current time are appended to the output name
    @@results = FastqFile.new("./results.fastq#{Time.now.usec}", 'w+')
    @@cache = []

    # Optional preloading of the whole input, currently disabled:
    # @@fastq_file.each do |name,fasta,qual,comments|
    #   @@cache << [name,fasta,qual,comments]
    # end
  end

  # Runs once at the end, when everything has been processed. Closes the
  # files opened in init_work_manager.
  def self.end_work_manager
    @@fastq_file.close
    @@results.close
  end

  # Provides the initial parameters for a worker; runs once per worker.
  # Nothing is sent in this example.
  def worker_initial_config
  end

  # Called whenever a worker asks for more work; data can be read from disk
  # here. Must return the work data, or nil when no more data is available.
  def next_work
    name, fasta, qual, comments = @@fastq_file.next_seq
    # disabled alternative reading from the cache:
    # name,fasta,qual,comments=@@cache.shift

    return nil if name.nil?

    [name, fasta, qual, comments]
  end

  # Called each time a worker finishes a job. Results can be written to
  # disk or aggregated into statistics here.
  def work_received(results)
    # write results to disk
    results.each do |name, fasta, qual, comments|
      @@results.write_seq(name, fasta, qual, comments)
    end
  end
end
@@ -0,0 +1,16 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is only a basic template. You should modify the files
5
+ to perform the desired tasks. Other templates are available; you
6
+ can list them by issuing this command:
7
+
8
+ scbi_mapreduce
9
+
10
+ You can launch the application right now with the following command:
11
+
12
+ ruby main.rb
13
+
14
+ A server and some workers will be launched, and 200.000 1Kb strings will be
15
+ converted from lowercase to uppercase, in blocks of 100 strings.
16
+
@@ -0,0 +1,41 @@
#!/usr/bin/env ruby

# Entry point of the simple demo: starts a ScbiMapreduce manager that
# launches workers running my_worker.rb.

# load required libraries
require 'scbi_mapreduce'

# Load the manager from this script's own directory, so the demo also works
# when it is launched from a different working directory (the worker file
# below is located the same way).
require File.expand_path('my_worker_manager.rb', File.dirname(__FILE__))

# listen on all ips at port 50000
ip = '0.0.0.0'
port = 50000

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 8

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 100

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"