scbi_mapreduce 0.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +49 -0
- data/Manifest.txt +46 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +295 -0
- data/Rakefile +28 -0
- data/bin/scbi_mapreduce +52 -0
- data/lib/scbi_mapreduce.rb +15 -0
- data/lib/scbi_mapreduce/error_handler.rb +15 -0
- data/lib/scbi_mapreduce/main_worker.rb +50 -0
- data/lib/scbi_mapreduce/manager.rb +110 -0
- data/lib/scbi_mapreduce/work_manager.rb +405 -0
- data/lib/scbi_mapreduce/worker.rb +163 -0
- data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
- data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/skeleton/dummy_calcs/README.txt +25 -0
- data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
- data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
- data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
- data/skeleton/dummy_calcs/main.rb +67 -0
- data/skeleton/dummy_calcs/my_worker.rb +56 -0
- data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
- data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
- data/skeleton/remove_mids/README.txt +30 -0
- data/skeleton/remove_mids/launch_only_workers.rb +29 -0
- data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
- data/skeleton/remove_mids/lib/find_mids.rb +191 -0
- data/skeleton/remove_mids/lib/global_match.rb +97 -0
- data/skeleton/remove_mids/linear_implementation.rb +87 -0
- data/skeleton/remove_mids/main.rb +89 -0
- data/skeleton/remove_mids/my_worker.rb +59 -0
- data/skeleton/remove_mids/my_worker_manager.rb +68 -0
- data/skeleton/simple/README.txt +16 -0
- data/skeleton/simple/main.rb +41 -0
- data/skeleton/simple/my_worker.rb +53 -0
- data/skeleton/simple/my_worker_manager.rb +55 -0
- data/test/drb_test/main.rb +31 -0
- data/test/drb_test/my_worker.rb +36 -0
- data/test/drb_test/my_worker_manager.rb +41 -0
- data/test/drb_test/scbi_drb_checkpoint +1 -0
- data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scbi_drb.rb +11 -0
- metadata +127 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
# GMatch pairs a match with the offset of the text it was found in.
#
# Regexp#global_match in this file creates one GMatch per hit: +offset+ is set
# to the index in the input string where the scanned substring started, and
# +match+ is set to the MatchData obtained against that substring.
class GMatch
  attr_accessor :offset, :match

  # Both arguments are optional, so the argument-less GMatch.new call style
  # followed by attribute assignment keeps working.
  #
  # offset:: start index of the scanned substring (nil until assigned)
  # match::  the match object (nil until assigned)
  def initialize(offset = nil, match = nil)
    @offset = offset
    @match = match
  end
end
|
8
|
+
|
9
|
+
class String
  # Returns the longest common substring (a contiguous run of characters)
  # shared by the receiver and +s2+.
  #
  # When several common substrings have the same maximal length, the one that
  # is completed first while scanning the receiver from left to right wins.
  # Returns an empty string when nothing is shared or either string is empty.
  #
  # s2:: the String to compare against
  def lcs(s2)
    other = s2.each_char.to_a
    best_len = 0
    best_end = 0

    # prev[j] holds the length of the common run ending at the previous
    # character of the receiver and at other[j]; only two rows are ever needed.
    prev = Array.new(other.size, 0)

    # each_char (unlike scan(/./)) also yields newlines, so the index i always
    # matches the real character position used for slicing below.
    each_char.with_index do |c1, i|
      row = Array.new(other.size, 0)

      other.each_with_index do |c2, j|
        next unless c1 == c2

        # A run either starts here (first column) or extends the diagonal.
        row[j] = j.zero? ? 1 : prev[j - 1] + 1

        # Strictly greater: the earliest run of maximal length is kept.
        if row[j] > best_len
          best_len = row[j]
          best_end = i
        end
      end

      prev = row
    end

    best_len.zero? ? "" : self[best_end - best_len + 1, best_len]
  end
end
|
39
|
+
|
40
|
+
|
41
|
+
class Regexp
  # Finds successive matches of this regexp in +input_str+.
  #
  # The string is scanned piecewise: each search runs against the substring
  # that starts at the current position, and a GMatch is recorded for every
  # hit. GMatch#offset is the index in +input_str+ where that substring
  # started and GMatch#match is the MatchData relative to it, so an absolute
  # position is offset + match.begin(n).
  #
  # input_str::        the String to search
  # overlap_group_no:: 0 (default) resumes after the end of each match;
  #                    any other value resumes at the start of that capture
  #                    group, which lets consecutive matches overlap
  #
  # Returns an Array of GMatch (empty when nothing matches).
  def global_match(input_str, overlap_group_no = 0)
    res = []
    last_end = 0

    while last_end < input_str.length
      m = match(input_str[last_end..-1])
      break if m.nil?

      found = GMatch.new
      found.offset = last_end
      found.match = m
      res.push found

      step = overlap_group_no == 0 ? m.end(0) : m.begin(overlap_group_no)

      # The overlap group may not have taken part in the match (begin is nil);
      # fall back to resuming after the whole match.
      step = m.end(0) if step.nil?

      # Always resume past the start of the current match. Without this a
      # zero-width match, or an overlap group that begins where the match
      # begins, would be found again at the same place forever.
      min_step = m.begin(0) + 1
      step = min_step if step < min_step

      last_end += step
    end

    res
  end

  # Earlier block-based variant, kept for reference:
  #
  # def global_match(str, &proc)
  #   retval = nil
  #   loop do
  #     res = str.sub(self) do |m|
  #       proc.call($~) # pass MatchData obj
  #       ''
  #     end
  #     break retval if res == str
  #     str = res
  #     retval ||= true
  #   end
  # end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Single-process version of the MID removal demo: reads a FastQ file in chunks,
# runs find_mid_without_blast on every chunk and writes the sequences to
# ./results2.fastq.

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

# modify include path
$: << File.join(File.dirname(__FILE__),'lib')

require 'find_mids'
include FindMids

# check arguments
if ARGV.count != 2
  puts "Usage #{File.basename($0)} fastq_file chunk"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a KNOWN_MID from it"
  exit 1
end

fastq_file_path = ARGV[0]

unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exist"
  exit 1
end

# number of sequences handled per iteration; zero or a non-numeric value would
# make the loop below finish without reading anything
chunk_size = ARGV[1].to_i

if chunk_size < 1
  puts "Error, chunk must be a positive integer"
  exit 1
end

# open files (plain locals: class variables are not allowed at top level in
# current Ruby versions)
fastq_file = FastqFile.new(fastq_file_path)
results = nil

begin
  results = FastqFile.new('./results2.fastq','w+')

  # iterate over file
  loop do
    seqs = []

    chunk_size.times do
      # read data from file
      name, fasta, qual, comments = fastq_file.next_seq

      # a nil name marks the end of the input
      break if name.nil?

      seqs << [name, fasta, qual, comments]
    end

    break if seqs.empty?

    # process it
    find_mid_without_blast(seqs)

    # Former inline variant, kept for reference:
    #
    # # find a known MID position
    # pos=fasta.upcase.index(KNOWN_MID)
    #
    # if pos
    #
    #   # keep fasta from pos to end
    #   fasta.slice!(0,pos + KNOWN_MID.length)
    #   # keep qual from pos to end
    #   qual.slice!(0,pos + KNOWN_MID.length)
    #
    # end

    # write data to disk
    seqs.each do |name, fasta, qual, comments|
      results.write_seq(name, fasta, qual, comments)
    end
  end
ensure
  # close files even when processing fails half-way
  fastq_file.close
  results.close if results
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Distributed version of the MID removal demo: starts a ScbiMapreduce manager
# that hands chunks of a FastQ file to workers running my_worker.rb.

$: << File.dirname(__FILE__)

# load required libraries
require 'scbi_mapreduce'

# Socket.ip_address_list is used below to pick the listening address
require 'socket'

# in order to load fastq files
require 'scbi_fastq'

require 'my_worker_manager.rb'

# check arguments
if ARGV.count != 3
  puts "Usage #{File.basename($0)} fastq_file workers chunk_size"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a MID (barcode) from it"
  exit 1
end

fastq_file_path = ARGV.shift

unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exist"
  exit 1
end

# Listen on the first local IPv4 address that starts with server_ip;
# when there is none, fall back to all interfaces.
server_ip = '10.243'
ip_list = Socket.ip_address_list.select { |e| e.ipv4? }.map { |e| e.ip_address }

ip = ip_list.find { |candidate| candidate.start_with?(server_ip) } || '0.0.0.0'

# port value handed to the manager as-is
port = 0

# Set the workers. The argument is either the path of a file with one worker
# name per line (for example produced by the existing queue system, if any)
# or a plain number of workers. The argument count check above guarantees
# that it is present.
input_workers = ARGV.shift

if File.exist?(input_workers)
  # read workers into array
  workers = File.read(input_workers).split("\n").map { |w| w.chomp }
else
  # workers is a number
  workers = input_workers.to_i
end

chunk_size = ARGV.shift.to_i

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager(fastq_file_path)

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT,'~seqtrimnext/init_env')

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = chunk_size

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# adjust import paths
|
3
|
+
$: << File.join(File.dirname(__FILE__),'lib')
|
4
|
+
|
5
|
+
# load external module
|
6
|
+
require 'find_mids'
|
7
|
+
include FindMids
|
8
|
+
|
9
|
+
|
10
|
+
# MyWorker implements the worker side of the application: the callbacks
# below are where the actual processing of the received data happens.
class MyWorker < ScbiMapreduce::Worker

  # Called one time at initialization; the place to set up your variables.
  def starting_worker
    # Worker logs are available at any time, for example:
    # $WORKER_LOG.info "Starting a worker"
  end

  # Called only once, just after the first connection, when the initial
  # parameters are received from the manager.
  def receive_initial_config(parameters)
    # Worker logs are available at any time, for example:
    # $WORKER_LOG.info "Params received"

    # store the received parameters, if any
    # @params = parameters
  end

  # Called for each received object. objs is always an array, so iterate
  # over it when the elements have to be handled independently.
  #
  # Whatever is returned here is what the work_received method of your
  # worker_manager subclass receives.
  def process_object(objs)
    # look for the mid in the sequences
    # (alternative: find_mid_with_blast(objs))
    find_mid_without_blast(objs)

    # hand the modified objs back to the manager
    objs
  end

  # Intentionally empty.
  def closing_worker
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# MyWorkerManager implements the manager-side callbacks used to send data to
# the workers and to collect what they send back.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Runs at the start, before any processing; use it to initialize shared
  # state, open files, etc. An instance of MyWorkerManager is created for
  # each worker connection, so shared state must live in class variables
  # (starting with @@).
  def self.init_work_manager(fastq_file_path)
    # input is opened through the scbi_fastq gem
    @@fastq_file = FastqFile.new(fastq_file_path)

    # the output name gets the current microseconds appended
    suffix = Time.now.usec.to_s
    @@results = FastqFile.new('./results.fastq' + suffix, 'w+')

    @@cache = []

    # Disabled preloading of the whole input into @@cache:
    # @@fastq_file.each do |name,fasta,qual,comments|
    #   @@cache << [name,fasta,qual,comments]
    # end
  end

  # Runs at the end, when all the processing is done; closes the files
  # opened in init_work_manager.
  def self.end_work_manager
    @@fastq_file.close
    @@results.close
  end

  # Used to send initial parameters to workers; executed once per worker.
  # Nothing is sent here.
  def worker_initial_config
  end

  # Called every time a worker needs new work. Reads the next sequence and
  # returns it as [name, fasta, qual, comments], or nil when no more data
  # is available.
  def next_work
    name, fasta, qual, comments = @@fastq_file.next_seq
    # disabled cache variant: name,fasta,qual,comments=@@cache.shift

    return nil if name.nil?

    [name, fasta, qual, comments]
  end

  # Called each time a worker has finished a job; writes every returned
  # sequence to the results file.
  def work_received(results)
    results.each { |name, fasta, qual, comments| @@results.write_seq(name, fasta, qual, comments) }
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
A simple scbi_mapreduce application demo
|
2
|
+
========================================
|
3
|
+
|
4
|
+
This application is only a basic template. You should modify the files
|
5
|
+
to perform the desired tasks. There are other templates available, you
|
6
|
+
can list them by issuing this command:
|
7
|
+
|
8
|
+
scbi_mapreduce
|
9
|
+
|
10
|
+
You can launch the application right now with the following command:
|
11
|
+
|
12
|
+
ruby main.rb
|
13
|
+
|
14
|
+
A server and some workers will be launched, and 200,000 1 KB strings will be
|
15
|
+
converted from lowercase to uppercase, in blocks of 100 strings.
|
16
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Entry point of the simple demo: starts a ScbiMapreduce manager whose workers
# run my_worker.rb.

# load required libraries
require 'scbi_mapreduce'

# Resolve the manager relative to this script (like custom_worker_file below),
# so the demo also works when launched from another directory.
require File.expand_path('my_worker_manager.rb', File.dirname(__FILE__))

# listen on all ips at port 50000
ip = '0.0.0.0'
port = 50000

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 8

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 100

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
|