scbi_mapreduce 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
@@ -0,0 +1,97 @@
1
# Plain value holder for one hit produced by Regexp#global_match.
#
# offset - Integer position in the original input string at which the
#          searched slice started.
# match  - MatchData for the hit; its positions are relative to that slice,
#          so the absolute position is offset + match.begin(0).
class GMatch
  attr_accessor :offset, :match
end
8
+
9
class String
  # Returns the longest common (contiguous) substring shared by this string
  # and +s2+. When several substrings share the maximum length, the one
  # found first while walking this string left to right (and +s2+ left to
  # right for each character) is returned. Returns "" when nothing matches.
  #
  # Characters are enumerated with each_char, so every character -- newlines
  # included -- keeps its real index. The previous scan(/./) skipped "\n",
  # which shifted the indices and produced a wrong substring for multi-line
  # input.
  #
  # s2 - the String to compare against.
  def lcs(s2)
    return "" if empty? || s2.empty?

    other = s2.chars
    # prev[j] holds the length of the common suffix ending at (i-1, j);
    # only the previous row of the dynamic-programming table is needed.
    prev = Array.new(other.size, 0)
    best_len = 0
    best_start = 0

    each_char.with_index do |c1, i|
      row = Array.new(other.size, 0)
      other.each_with_index do |c2, j|
        next unless c1 == c2

        row[j] = j.zero? ? 1 : prev[j - 1] + 1
        # strict '>' keeps the first longest substring encountered
        if row[j] > best_len
          best_len = row[j]
          best_start = i - best_len + 1
        end
      end
      prev = row
    end

    self[best_start, best_len]
  end
end
39
+
40
+
41
class Regexp
  # Repeatedly applies this pattern to +input_str+ and collects every hit.
  #
  # After each hit the search restarts on the remainder of the string:
  # * overlap_group_no == 0: after the end of the whole match;
  # * otherwise: at the beginning of capture group +overlap_group_no+, so
  #   the next match may overlap the tail of the previous one.
  #
  # Each search runs on a slice of the input, so the positions in the
  # returned MatchData are relative to GMatch#offset, and anchors such as
  # \A apply to the slice rather than to the original string.
  #
  # The scan always moves forward by at least one character. Without that,
  # an empty match (or an overlap group starting at offset 0) would loop
  # forever. If the overlap group did not take part in the match, the end
  # of the whole match is used instead.
  #
  # input_str        - String to search.
  # overlap_group_no - Integer capture group at which to restart (default 0).
  #
  # Returns an Array of GMatch (empty when nothing matches).
  def global_match(input_str, overlap_group_no = 0)
    res = []
    last_end = 0

    loop do
      str = input_str.slice(last_end, input_str.length - last_end)
      break if str.nil? || str.empty?

      m = match(str)
      break if m.nil?

      new_match = GMatch.new
      new_match.offset = last_end
      new_match.match = m
      res.push new_match

      advance = overlap_group_no == 0 ? m.end(0) : m.begin(overlap_group_no)
      # group did not participate in the match: skip the whole match
      advance = m.end(0) if advance.nil?
      # guarantee forward progress
      last_end += advance > 0 ? advance : 1
    end

    res
  end

  # Alternative block-yielding implementation (disabled):
  #
  # def global_match(str, &proc)
  #   retval = nil
  #   loop do
  #     res = str.sub(self) do |m|
  #       proc.call($~) # pass MatchData obj
  #       ''
  #     end
  #     break retval if res == str
  #     str = res
  #     retval ||= true
  #   end
  # end
end
@@ -0,0 +1,87 @@
1
#!/usr/bin/env ruby

# Sequential (non-distributed) version of the remove_mids example: reads a
# FastQ file in chunks, runs find_mid_without_blast on every chunk and writes
# the sequences to ./results2.fastq.

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

# modify include path
$: << File.join(File.dirname(__FILE__), 'lib')

require 'find_mids'
include FindMids

# check arguments
if ARGV.count != 2
  puts "Usage #{File.basename($0)} fastq_file chunk"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a KNOWN_MID from it"
  exit
end

fastq_file_path = ARGV[0]

# File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# number of sequences handled per iteration
chunk_size = ARGV[1].to_i

# open files (plain locals: class variables are not allowed at top level on
# Ruby 3.0+)
fastq_file = FastqFile.new(fastq_file_path)
results = FastqFile.new('./results2.fastq', 'w+')

begin
  # iterate over file, one chunk at a time
  loop do
    seqs = []

    chunk_size.times do
      # read data from file; a nil name marks the end of the input
      name, fasta, qual, comments = fastq_file.next_seq
      break if name.nil?

      seqs << [name, fasta, qual, comments]
    end

    break if seqs.empty?

    # process it
    find_mid_without_blast(seqs)

    # write data to disk
    seqs.each do |name, fasta, qual, comments|
      results.write_seq(name, fasta, qual, comments)
    end
  end
ensure
  # close files even if processing raised
  fastq_file.close
  results.close
end
@@ -0,0 +1,89 @@
1
#!/usr/bin/env ruby

# Entry point of the distributed remove_mids example: validates the command
# line, picks the address to listen on and starts a ScbiMapreduce::Manager
# that hands the FastQ sequences to the workers.

$: << File.dirname(__FILE__)

# Socket.ip_address_list is used below
require 'socket'

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

require 'my_worker_manager.rb'

# check arguments
if ARGV.count != 3
  puts "Usage #{File.basename($0)} fastq_file workers chunk_size"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a MID (barcode) from it"
  exit
end

fastq_file_path = ARGV.shift

# File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# prefer the first local IPv4 address starting with server_ip; otherwise
# listen on all interfaces
server_ip = '10.243'
ip_list = Socket.ip_address_list.select { |e| e.ipv4? }.map { |e| e.ip_address }
ip = ip_list.find { |addr| addr.index(server_ip) == 0 } || '0.0.0.0'

# NOTE(review): port 0 presumably lets the system choose a free port --
# confirm against ScbiMapreduce::Manager
port = 0

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 4

# read workers parameter
input_workers = ARGV.shift
unless input_workers.nil?
  if File.exist?(input_workers)
    # it is a file: read worker names into an array, one per line
    workers = File.read(input_workers).split("\n").map { |w| w.chomp }
  else
    # workers is a number
    workers = input_workers.to_i
  end
end

chunk_size = ARGV.shift.to_i

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager(fastq_file_path)

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT, '~seqtrimnext/init_env')

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = chunk_size

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
@@ -0,0 +1,59 @@
1
+
2
+ # adjust import paths
3
+ $: << File.join(File.dirname(__FILE__),'lib')
4
+
5
+ # load external module
6
+ require 'find_mids'
7
+ include FindMids
8
+
9
+
10
+ # MyWorker defines the behaviour of workers.
11
+ # Here is where the real processing takes place
12
# MyWorker describes what every worker of the remove_mids example does;
# the actual sequence processing happens in process_object.
class MyWorker < ScbiMapreduce::Worker

  # Called one time at initialization; a place to set up your variables.
  # Worker logs are available at any time, e.g.:
  #   $WORKER_LOG.info "Starting a worker"
  def starting_worker
  end

  # Called only once, just after the first connection, when the initial
  # parameters arrive from the manager. Nothing is stored by default; to
  # keep them:
  #   @params = parameters
  def receive_initial_config(parameters)
  end

  # Called for each received object. objs is always an array, so iterate
  # over it if the elements must be handled independently.
  #
  # The returned value is what the work_received method of your
  # worker_manager subclass receives.
  def process_object(objs)
    # find mid in sequences (alternative: find_mid_with_blast(objs))
    find_mid_without_blast(objs)

    # hand the same objs array back to the manager
    objs
  end

  # Hook for worker shutdown; nothing to release in this example.
  def closing_worker
  end
end
@@ -0,0 +1,68 @@
1
+ require 'json'
2
+
3
+ # MyWorkerManager class is used to implement the methods
4
+ # to send and receive the data to or from workers
5
# MyWorkerManager implements the methods that send data to the workers and
# receive their results.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Executed at the start, prior to any processing: opens the input file
  # (via the scbi_fastq gem) and the results file.
  #
  # An instance of MyWorkerManager is created for each worker connection,
  # so shared state must live in class variables (@@).
  #
  # fastq_file_path - path of the FastQ file to read.
  def self.init_work_manager(fastq_file_path)
    @@fastq_file = FastqFile.new(fastq_file_path)
    # results file name gets the current microseconds appended
    @@results = FastqFile.new("./results.fastq#{Time.now.usec}", 'w+')
    @@cache = []
  end

  # Executed at the end, when all the processing is done: closes the files
  # opened in init_work_manager.
  def self.end_work_manager
    @@fastq_file.close
    @@results.close
  end

  # Sends initial parameters to workers; executed once per worker.
  # This example sends nothing.
  def worker_initial_config
  end

  # Called every time a worker needs new work. Reads the next sequence from
  # the input file and returns it as [name, fasta, qual, comments], or nil
  # when no more data is available.
  def next_work
    name, fasta, qual, comments = @@fastq_file.next_seq
    return nil if name.nil?

    [name, fasta, qual, comments]
  end

  # Executed each time a worker has finished a job: writes every returned
  # sequence to the results file.
  #
  # results - array of [name, fasta, qual, comments] entries.
  def work_received(results)
    results.each { |name, fasta, qual, comments| @@results.write_seq(name, fasta, qual, comments) }
  end

end
@@ -0,0 +1,16 @@
1
+ A simple scbi_mapreduce application demo
2
+ ========================================
3
+
4
+ This application is only a basic template. You should modify the files
5
+ to perform the desired tasks. There are other templates available; you
6
+ can list them by issuing this command:
7
+
8
+ scbi_mapreduce
9
+
10
+ You can launch the application right now with the following command:
11
+
12
+ ruby main.rb
13
+
14
+ A server and some workers will be launched, and 200,000 1 KB strings will be
15
+ converted from downcase to uppercase, in blocks of 100 strings.
16
+
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby

# Entry point of the "simple" scbi_mapreduce template: starts a manager
# that launches the workers and distributes the work.

# load required libraries
require 'scbi_mapreduce'

# resolve the worker manager relative to this script (like
# custom_worker_file below) so the program can be launched from any directory
require File.expand_path('my_worker_manager.rb', File.dirname(__FILE__))

# listen on all ips at port 50000
ip = '0.0.0.0'
port = 50000

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 8

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 100

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"