scbi_mapreduce 0.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +49 -0
- data/Manifest.txt +46 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +295 -0
- data/Rakefile +28 -0
- data/bin/scbi_mapreduce +52 -0
- data/lib/scbi_mapreduce.rb +15 -0
- data/lib/scbi_mapreduce/error_handler.rb +15 -0
- data/lib/scbi_mapreduce/main_worker.rb +50 -0
- data/lib/scbi_mapreduce/manager.rb +110 -0
- data/lib/scbi_mapreduce/work_manager.rb +405 -0
- data/lib/scbi_mapreduce/worker.rb +163 -0
- data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
- data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/skeleton/dummy_calcs/README.txt +25 -0
- data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
- data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
- data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
- data/skeleton/dummy_calcs/main.rb +67 -0
- data/skeleton/dummy_calcs/my_worker.rb +56 -0
- data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
- data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
- data/skeleton/remove_mids/README.txt +30 -0
- data/skeleton/remove_mids/launch_only_workers.rb +29 -0
- data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
- data/skeleton/remove_mids/lib/find_mids.rb +191 -0
- data/skeleton/remove_mids/lib/global_match.rb +97 -0
- data/skeleton/remove_mids/linear_implementation.rb +87 -0
- data/skeleton/remove_mids/main.rb +89 -0
- data/skeleton/remove_mids/my_worker.rb +59 -0
- data/skeleton/remove_mids/my_worker_manager.rb +68 -0
- data/skeleton/simple/README.txt +16 -0
- data/skeleton/simple/main.rb +41 -0
- data/skeleton/simple/my_worker.rb +53 -0
- data/skeleton/simple/my_worker_manager.rb +55 -0
- data/test/drb_test/main.rb +31 -0
- data/test/drb_test/my_worker.rb +36 -0
- data/test/drb_test/my_worker_manager.rb +41 -0
- data/test/drb_test/scbi_drb_checkpoint +1 -0
- data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scbi_drb.rb +11 -0
- metadata +127 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
# Plain record produced by Regexp#global_match: one entry per match found.
#
# offset - position in the original input string where the searched
#          substring started when the match was made.
# match  - the MatchData returned for that substring (its positions are
#          relative to the substring, not to the original input).
class GMatch
  attr_accessor :offset, :match
end
|
8
|
+
|
9
|
+
class String
  # Returns the longest common substring (contiguous run of characters)
  # shared by the receiver and +s2+.
  #
  # When several common substrings have the same maximal length, the one
  # that ends first in the receiver is returned. An empty string is
  # returned when there is nothing in common or either string is empty.
  #
  # s2 - the String to compare against.
  #
  # Characters are enumerated with each_char/chars rather than scan(/./):
  # the dot does not match "\n", so scanning silently dropped newlines and
  # shifted every index after them away from the real position in self.
  def lcs(s2)
    other = s2.chars            # split once, not once per receiver char
    best_len = 0                # length of the longest run seen so far
    best_start = 0              # index in self where that run starts
    prev_row = Array.new(other.size, 0)

    each_char.with_index do |c1, i|
      # row[j] = length of the common run ending at self[i] and s2[j]
      row = Array.new(other.size, 0)

      other.each_with_index do |c2, j|
        next unless c1 == c2

        # A run can only be extended from the diagonal neighbour; the
        # first column has no such neighbour, so a run starts there.
        row[j] = j.zero? ? 1 : prev_row[j - 1] + 1

        # Strictly greater: ties keep the earliest run found.
        if row[j] > best_len
          best_len = row[j]
          best_start = i - best_len + 1
        end
      end

      prev_row = row
    end

    self[best_start, best_len]
  end
end
|
39
|
+
|
40
|
+
|
41
|
+
class Regexp
  # Finds every match of this regexp in +input_str+, scanning left to right,
  # and returns them as an Array of GMatch records (empty when nothing
  # matches).
  #
  # Each search runs against the tail of +input_str+ that starts at the
  # current scan position. For every hit a GMatch is stored whose +offset+
  # is that scan position and whose +match+ is the MatchData for the tail,
  # so an absolute position is <tt>gm.offset + gm.match.begin(0)</tt>.
  #
  # input_str        - the String to search.
  # overlap_group_no - 0 (default): the next search starts right after the
  #                    end of the whole match (no overlapping).
  #                    N > 0: the next search starts where capture group N
  #                    began, so consecutive matches may overlap.
  #
  # Raises IndexError (from MatchData) if +overlap_group_no+ is not a group
  # of this regexp.
  def global_match(input_str, overlap_group_no = 0)
    res = []
    last_end = 0

    loop do
      str = input_str.slice(last_end, input_str.length - last_end)
      break if str.nil? || str.empty?

      m = match(str)
      break if m.nil?

      new_match = GMatch.new
      new_match.offset = last_end
      new_match.match = m
      res.push new_match

      advance =
        if overlap_group_no == 0
          m.end(0)
        else
          # begin() is nil when the group took no part in the match; there
          # is no overlap point then, so continue after the whole match.
          m.begin(overlap_group_no) || m.end(0)
        end

      # Always move forward: a zero-length match, or an overlap group that
      # starts at position 0, would otherwise repeat the same search forever.
      last_end += advance > 0 ? advance : 1
    end

    res
  end

  # Earlier block-based variant, kept disabled for reference:
  #
  # def global_match(str, &proc)
  #   retval = nil
  #   loop do
  #     res = str.sub(self) do |m|
  #       proc.call($~) # pass MatchData obj
  #       ''
  #     end
  #     break retval if res == str
  #     str = res
  #     retval ||= true
  #   end
  # end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Single-process reference implementation of the remove_mids example:
# reads a FastQ file in chunks, runs find_mid_without_blast on each chunk
# and writes the (modified in place) sequences to ./results2.fastq.
#
# Usage: linear_implementation.rb fastq_file chunk

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

# modify include path
$: << File.join(File.dirname(__FILE__), 'lib')

require 'find_mids'
include FindMids

# check arguments
if ARGV.count != 2
  puts "Usage #{File.basename($0)} fastq_file chunk"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a KNOWN_MID from it"
  exit
end

fastq_file_path = ARGV[0]

# File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# make processing

# open files
# Plain locals are used here: class variables (@@name) at the top level of
# a script raise a RuntimeError on Ruby 3.0 and later.
fastq_file = FastqFile.new(fastq_file_path)
results = FastqFile.new('./results2.fastq', 'w+')

# number of sequences handled per iteration
chunk_size = ARGV[1].to_i

begin
  # iterate over file, one chunk at a time, until a chunk comes back empty
  loop do
    seqs = []

    chunk_size.times do
      # read data from file
      name, fasta, qual, comments = fastq_file.next_seq

      # a nil name marks the end of the input file
      break if name.nil?

      seqs << [name, fasta, qual, comments]
    end

    break if seqs.empty?

    # process it
    find_mid_without_blast(seqs)

    # Earlier inline variant, kept disabled for reference:
    #
    # # find a known MID position
    # pos=fasta.upcase.index(KNOWN_MID)
    #
    # if pos
    #
    #   # keep fasta from pos to end
    #   fasta.slice!(0,pos + KNOWN_MID.length)
    #   # keep qual from pos to end
    #   qual.slice!(0,pos + KNOWN_MID.length)
    #
    # end

    # write data to disk
    seqs.each do |name, fasta, qual, comments|
      results.write_seq(name, fasta, qual, comments)
    end
  end
ensure
  # close files, even when processing fails half way through
  fastq_file.close
  results.close
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Launcher for the remove_mids example: starts a ScbiMapreduce manager that
# distributes the sequences of a FastQ file to workers running my_worker.rb.
#
# Usage: main.rb fastq_file workers chunk_size
#   workers    - a number of workers, or the path of a file with one worker
#                name per line
#   chunk_size - number of sequences sent to a worker in each packet

$: << File.dirname(__FILE__)

# Socket.ip_address_list is used below
require 'socket'

# load required libraries
require 'scbi_mapreduce'

# in order to load fastq files
require 'scbi_fastq'

require 'my_worker_manager.rb'

# check arguments
if ARGV.count != 3
  puts "Usage #{File.basename($0)} fastq_file workers chunk_size"
  puts ""
  puts "#{File.basename($0)} iterates over all sequences in fastq_file (a file in FastQ format) and removes a MID (barcode) from it"
  exit
end

fastq_file_path = ARGV.shift

# File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2
unless File.exist?(fastq_file_path)
  puts "Error, #{fastq_file_path} doesn't exists"
  exit
end

# Prefer the first local IPv4 address that starts with this prefix;
# fall back to listening on all interfaces when there is none.
server_ip = '10.243'
ip_list = Socket.ip_address_list.select { |e| e.ipv4? }.map { |e| e.ip_address }

ip = ip_list.find { |candidate| candidate.start_with?(server_ip) }
ip = '0.0.0.0' unless ip

# NOTE(review): port 0 presumably lets the manager/OS choose a free port -
# confirm against ScbiMapreduce::Manager
port = 0

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 4

# read workers parameter
input_workers = ARGV.shift
unless input_workers.nil?
  if File.exist?(input_workers)
    # it is a file: read worker names into an array, one per line
    workers = File.read(input_workers).split("\n").map { |w| w.chomp }
  else
    # workers is a number
    workers = input_workers.to_i
  end
end

chunk_size = ARGV.shift.to_i

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager(fastq_file_path)

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT, '~seqtrimnext/init_env')

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = chunk_size

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# adjust import paths
|
3
|
+
$: << File.join(File.dirname(__FILE__),'lib')
|
4
|
+
|
5
|
+
# load external module
|
6
|
+
require 'find_mids'
|
7
|
+
include FindMids
|
8
|
+
|
9
|
+
|
10
|
+
# MyWorker defines the behaviour of workers.
|
11
|
+
# Here is where the real processing takes place
|
12
|
+
class MyWorker < ScbiMapreduce::Worker

  # Hook run one time at initialization; the place to set up any state
  # the worker needs. Nothing is required for this example.
  def starting_worker
    # Worker logs are available at any time, e.g.:
    # $WORKER_LOG.info "Starting a worker"
  end

  # Hook run only once, just after the first connection, when the initial
  # parameters arrive from the manager.
  #
  # parameters - whatever the manager sent as initial configuration
  #              (unused in this example).
  def receive_initial_config(parameters)
    # Worker logs are available at any time, e.g.:
    # $WORKER_LOG.info "Params received"

    # keep the received parameters, if any:
    # @params = parameters
  end

  # Hook run for each received chunk of work.
  #
  # objs - always an Array; iterate over it when the items must be
  #        processed independently.
  #
  # Returns objs after find_mid_without_blast has been applied to it; the
  # returned value is what work_received gets in the worker_manager
  # subclass.
  def process_object(objs)
    # alternative implementation, disabled:
    # find_mid_with_blast(objs)
    find_mid_without_blast(objs)

    # hand the processed chunk back to the manager
    objs
  end

  # Hook for worker shutdown; nothing to release in this example.
  def closing_worker
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# MyWorkerManager class is used to implement the methods
|
4
|
+
# to send and receive the data to or from workers
|
5
|
+
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Runs at the start, before any processing: opens the input and output
  # files.
  #
  # One MyWorkerManager instance is created per worker connection, so state
  # shared by all of them is kept in class variables (@@).
  #
  # fastq_file_path - path of the FastQ file to read sequences from.
  def self.init_work_manager(fastq_file_path)
    # input, opened with the scbi_fastq gem
    @@fastq_file = FastqFile.new(fastq_file_path)

    # output; the current microseconds are appended to the file name
    @@results = FastqFile.new('./results.fastq' + Time.now.usec.to_s, 'w+')

    # only used by the disabled preloading variant:
    # @@fastq_file.each do |name,fasta,qual,comments|
    #   @@cache << [name,fasta,qual,comments]
    # end
    @@cache = []
  end

  # Runs at the end, when all the processing is done: closes the files
  # opened in init_work_manager.
  def self.end_work_manager
    @@fastq_file.close
    @@results.close
  end

  # Sends initial parameters to a worker; executed once per worker.
  # This example has nothing to send.
  def worker_initial_config
  end

  # Called every time a worker needs new work.
  #
  # Returns the next sequence as [name, fasta, qual, comments], or nil when
  # no more data is available.
  def next_work
    name, fasta, qual, comments = @@fastq_file.next_seq
    # disabled preloading variant:
    # name,fasta,qual,comments=@@cache.shift

    return nil if name.nil?

    [name, fasta, qual, comments]
  end

  # Called each time a worker has finished a job.
  #
  # results - collection of [name, fasta, qual, comments] entries, each of
  #           which is written to the output file.
  def work_received(results)
    results.each do |name, fasta, qual, comments|
      # puts "comments: #{comments}\n"
      @@results.write_seq(name, fasta, qual, comments)
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
A simple scbi_mapreduce application demo
|
2
|
+
========================================
|
3
|
+
|
4
|
+
This application is only a basic template. You should modify the files
|
5
|
+
to perform the desired tasks. There are other templates available; you
|
6
|
+
can list them by issuing this command:
|
7
|
+
|
8
|
+
scbi_mapreduce
|
9
|
+
|
10
|
+
You can launch the application right now with the following command:
|
11
|
+
|
12
|
+
ruby main.rb
|
13
|
+
|
14
|
+
A server and some workers will be launched, and 200.000 1Kb strings will be
|
15
|
+
converted from lowercase to uppercase, in blocks of 100 strings.
|
16
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Launcher for the "simple" scbi_mapreduce template: starts a manager that
# hands work produced by MyWorkerManager to workers running my_worker.rb.

# load required libraries
require 'scbi_mapreduce'

# Resolve the manager next to this script (as custom_worker_file is below),
# so the launcher also works when started from another directory.
require File.expand_path('my_worker_manager.rb', File.dirname(__FILE__))

# listen on all ips at port 50000
ip = '0.0.0.0'
port = 50000

# set number of workers. You can also provide an array with worker names.
# Those workers names can be read from a file produced by the existing
# queue system, if any.
workers = 8

# we need the path to my_worker in order to launch it when necessary
custom_worker_file = File.join(File.dirname(__FILE__), 'my_worker.rb')

# initialize the work manager. Here you can pass parameters like file names
MyWorkerManager.init_work_manager

# launch processor server
mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)

# you can set additional properties
# =================================

# if you want basic checkpointing. Some performance drop should be expected
# mgr.checkpointing=true

# if you want to keep the order of input data. Some performance drop should be expected
# mgr.keep_order=true

# you can set the size of packets of data sent to workers
mgr.chunk_size = 100

# start processing
mgr.start_server

# this line is reached when all data has been processed
puts "Program finished"
|