bio-rocker 1.0.0 → 1.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ROCker +276 -96
- data/lib/rocker.rb +25 -14
- data/lib/rocker/blasthit.rb +8 -5
- data/lib/rocker/genome-set.rb +70 -0
- data/lib/rocker/protein-set.rb +90 -0
- data/lib/rocker/rocdata.rb +26 -9
- data/lib/rocker/rocwindow.rb +20 -18
- data/lib/rocker/step/build.rb +233 -200
- data/lib/rocker/step/compile.rb +11 -6
- data/lib/rocker/step/filter.rb +11 -7
- data/lib/rocker/step/plot.rb +80 -26
- data/lib/rocker/step/search.rb +27 -4
- metadata +16 -14
data/lib/rocker.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/blasthit'
|
@@ -10,20 +10,30 @@ require 'rocker/rocdata'
|
|
10
10
|
|
11
11
|
class ROCker
|
12
12
|
#================================[ Class ]
|
13
|
+
@@VERSION = "1.1.9"
|
14
|
+
@@CITATION = "Orellana, Rodriguez-R, & Konstantinidis. Under review. " +
|
15
|
+
"Detecting and quantifying functional genes in short-read metagenomic " +
|
16
|
+
"datasets: method development and application to the nitrogen cycle " +
|
17
|
+
"genes."
|
13
18
|
@@DEFAULTS = {
|
14
19
|
# General
|
15
|
-
:
|
20
|
+
q: false, r: "R", nucl: false, debug: false, thr: 2, search: :blast,
|
16
21
|
# External software
|
17
|
-
:
|
18
|
-
:
|
19
|
-
:
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
22
|
+
searchbins: "",
|
23
|
+
searchcmd: {
|
24
|
+
blast: '%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" ' +
|
25
|
+
'-num_threads %6$d -outfmt 6 -max_target_seqs 1',
|
26
|
+
diamond: '%1$sdiamond %2$s -q "%3$s" -d "%4$s" -a "%5$s.daa" -p %6$d' +
|
27
|
+
' -k 1 --min-score 20 --sensitive && %1$sdiamond view -a "%5$s"' +
|
28
|
+
' -o "%5$s"'},
|
29
|
+
makedbcmd: {
|
30
|
+
blast: '%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
|
31
|
+
diamond: '%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
|
24
32
|
}
|
25
33
|
# Returns the class-level @@DEFAULTS hash of option defaults.
def self.defaults
   @@DEFAULTS
end
|
26
34
|
# Returns the default value stored under key +k+ in @@DEFAULTS.
def self.default(k)
   @@DEFAULTS[k]
end
|
35
|
+
# Returns the class-level @@VERSION string.
def self.VERSION
   @@VERSION
end
|
36
|
+
# Returns the class-level @@CITATION string.
def self.CITATION
   @@CITATION
end
|
27
37
|
|
28
38
|
#================================[ Instance ]
|
29
39
|
attr_reader :o
|
@@ -46,7 +56,8 @@ class ROCker
|
|
46
56
|
end
|
47
57
|
# Runs +cmd+ through the shell, capturing its standard output and redirecting
# standard error into the same capture.
#
# cmd::     Shell command line to execute.
# err_msg:: Message to raise on failure. When nil, a message containing the
#           command and its captured output is raised instead.
#
# Returns true when the command exits successfully; raises RuntimeError
# otherwise.
def bash(cmd, err_msg=nil)
   o = `#{cmd} 2>&1`
   # Success is decided by the exit status, not by a sentinel appended to the
   # output: a failing command whose own output ends in "{" must still fail.
   unless $?.success?
      raise(err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg)
   end
   true
end
|
52
63
|
end
|
@@ -63,10 +74,10 @@ require 'rocker/step/plot'
|
|
63
74
|
class Numeric
   # Returns the number's string form followed by its English ordinal suffix
   # ("1st", "2nd", "3rd", "4th", "11th", ...). The suffix is chosen from the
   # last two characters of +to_s+.
   def ordinalize
      n = self.to_s
      # A '1' in the tens position (11, 12, 13, 111, ...) always takes "th".
      return n + "th" if n[-2] == '1'
      suffix = case n[-1]
               when '1' then "st"
               when '2' then "nd"
               when '3' then "rd"
               else "th"
               end
      n + suffix
   end
end
|
data/lib/rocker/blasthit.rb
CHANGED
@@ -2,12 +2,13 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-06-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
class BlastHit
|
9
|
-
attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint
|
10
|
-
# Initialize from BLAST using new(ln,aln),
|
9
|
+
attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :isfalse, :midpoint
|
10
|
+
# Initialize from BLAST using new(ln,aln),
|
11
|
+
# initialize from TABLE using new(ln)
|
11
12
|
def initialize(ln, aln=nil)
|
12
13
|
l = ln.chomp.split(/\t/)
|
13
14
|
if aln.nil?
|
@@ -16,6 +17,7 @@ class BlastHit
|
|
16
17
|
@sto = l[2].to_i
|
17
18
|
@bits = l[3].to_f
|
18
19
|
@istrue = l[4]=='1'
|
20
|
+
@istrue = l[4]=='-1'
|
19
21
|
@midpoint = l[5].to_i
|
20
22
|
else
|
21
23
|
s = aln.seq(l[1])
|
@@ -27,13 +29,14 @@ class BlastHit
|
|
27
29
|
@sto = [a,b].max
|
28
30
|
@bits = l[11].to_f
|
29
31
|
@istrue = ! /@%/.match(l[0]).nil?
|
32
|
+
@isfalse = ! /@\$/.match(l[0]).nil?
|
30
33
|
@midpoint = s.pos2col(((l[8].to_f+l[9].to_f)/2).ceil)
|
31
34
|
end
|
32
35
|
end
|
33
36
|
def to_s
|
34
37
|
self.sbj.nil? ? "" :
|
35
|
-
[
|
36
|
-
|
38
|
+
[sbj, sfrom.to_s, sto.to_s, bits.to_s,
|
39
|
+
istrue ? "1" : (isfalse ? "-1" : "0"), midpoint].join("\t") + "\n"
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-23-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
# Collection of genome IDs together with the taxon each genome is linked to.
# All remote lookups go through the +rocker+ object's ebiFetch method.
class GenomeSet
   attr_reader :rocker, :ids, :taxa

   # rocker:: Object responding to ebiFetch(db, ids, format).
   # ids::    Array of genome IDs, or nil for an empty set.
   def initialize(rocker, ids)
      @rocker = rocker
      @ids = ids.nil? ? [] : ids
      @taxa = {}
      @all_taxa = {}
   end

   # Fetches all genomes in FastA format, 200 IDs per request, and writes them
   # to +file+. The block form guarantees the file is closed even if a fetch
   # raises.
   def download(file)
      File.open(file, "w") do |ofh|
         ids.each_slice(200) do |batch|
            ofh.print rocker.ebiFetch(:embl, batch, :fasta)
         end
      end
   end

   # Registers genome +id+ as a member of +taxon+.
   def link_taxon(id, taxon)
      (@all_taxa[ taxon.to_sym ] ||= []) << id
   end

   # Resolves the taxonomy of every genome at +rank+ and keeps one randomly
   # sampled genome per taxon; the set's IDs are replaced by that selection.
   def choose_genomes!(rank)
      @taxa = {}
      get_taxonomy! rank
      @all_taxa.each_pair{ |taxon, members| @taxa[taxon] = members.sample }
      @ids = @taxa.values
   end

   # Rebuilds the taxon => genome IDs index for the given +rank+.
   def get_taxonomy!(rank)
      @all_taxa = {}
      ids.each{ |id| link_taxon(id, genome2taxon(id, rank)) }
   end

   # Loads a taxon => genome ID hash, keeping only genomes present in this
   # set. A nil +hash+ is treated as empty.
   def taxa=(hash)
      @taxa = {}
      (hash || {}).each_pair do |taxon, id|
         @taxa[taxon] = id if ids.include? id
      end
   end

   def size() ids.size end
   def empty?() ids.empty? end

   #================================[ Utilities ]

   # Returns the taxon ID (digits, as a String) of the lineage entry with the
   # given +rank+ for +genome_id+. When it cannot be determined, returns a
   # "no-taxon-" placeholder with 12 random uppercase letters.
   def genome2taxon(genome_id, rank='species')
      v = genome2taxid(genome_id)
      unless v.nil?
         xml = rocker.ebiFetch('taxonomy', [v],
            'enataxonomyxml').gsub(/\s*\n\s*/,'')
         # rank is escaped so it is matched literally inside the pattern.
         tag = xml.scan(/<taxon [^>]+>/).grep(
            /rank="#{Regexp.escape(rank.to_s)}"/).first
         v = tag.nil? ? nil : tag.sub(/.* taxId="(\d+)".*/,"\\1")
      end
      if v.nil? or v !~ /\A\d+\z/
         return "no-taxon-#{(0...12).map { (65 + rand(26)).chr }.join}"
      end
      v
   end

   # Returns the taxon ID (digits, as a String) found in the EMBL annotation
   # of +genome_id+: first from an FT db_xref="taxon:" line, otherwise from
   # an OX NCBI_TaxID= line. Returns nil when neither yields a number.
   def genome2taxid(genome_id)
      doc = rocker.ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/)
      ln = doc.grep(/^FT\s+\/db_xref="taxon:/).first
      ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
      return nil if ln.nil?
      taxid = ln.sub(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, "\\1")
      return nil unless taxid =~ /\A\d+\z/
      taxid
   end
end
|
69
|
+
|
70
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jul-20-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'rocker/alignment'
|
9
|
+
|
10
|
+
# Collection of protein IDs with the genomes and translation IDs linked to
# each of them. All remote lookups go through the +rocker+ object's ebiFetch.
class ProteinSet
   attr_reader :rocker, :ids, :aln

   # rocker::   Object responding to ebiFetch(db, ids, format).
   # ids::      Array of protein IDs (optional).
   # file::     Path to a file with one protein ID per line (optional); blank
   #            lines are ignored.
   # aln_file:: Path to a FastA alignment (optional). Its IDs are added to the
   #            set; the alignment is kept in #aln only when every ID given
   #            through +ids+/+file+ is present in it.
   def initialize(rocker, ids=nil, file=nil, aln_file=nil)
      @genomes = {}
      @tranids = {}
      @aln = nil
      @rocker = rocker
      @ids = []
      @ids += ids unless ids.nil?
      unless file.nil?
         @ids += File.readlines(file).map{ |l| l.chomp }.reject{ |l| l.empty? }
      end
      unless aln_file.nil?
         aln = Alignment.new
         aln.read_fasta aln_file
         aln_ids = aln.get_ids
         @aln = aln if (@ids - aln_ids).empty?
         @ids += aln_ids
      end
      @ids.uniq!
   end

   # Fetches all proteins in FastA format, 200 IDs per request, and writes
   # them to +file+.
   def download(file)
      File.open(file, "w") do |f|
         ids.each_slice(200) do |batch|
            f.print rocker.ebiFetch(:uniprotkb, batch, :fasta)
         end
      end
   end

   # Writes the sequences of alignment +aln+ (its to_seq_s) to +file+.
   def get_from_aln(file, aln)
      File.open(file, "w"){ |f| f.print aln.to_seq_s }
   end

   # For every protein, reads its "DR EMBL;" cross-reference lines and links
   # the 2nd field as genome ID and the 3rd field as translation ID.
   def get_genomes!
      ids.each do |id|
         doc = rocker.ebiFetch(:uniprotkb, [id], :annot).split("\n")
         doc.grep( /^DR\s+EMBL;/ ).each do |ln|
            r = ln.split('; ')
            link_genome(id, r[1])
            link_tranid(id, r[2])
         end
      end
   end

   # Links +genome_id+ to +prot_id+, without duplicates.
   def link_genome(prot_id, genome_id)
      (@genomes[prot_id] ||= []) << genome_id
      @genomes[prot_id].uniq!
   end

   # Links +transl_id+ to +prot_id+, without duplicates.
   def link_tranid(prot_id, transl_id)
      (@tranids[prot_id] ||= []) << transl_id
      @tranids[prot_id].uniq!
   end

   # All linked genome IDs, without duplicates.
   def genomes
      @genomes.values.flatten(1).uniq
   end

   # All linked translation IDs, without duplicates.
   def tranids
      @tranids.values.flatten(1).uniq
   end

   # Returns the protein IDs of this set that are located in +coords+, a hash
   # of genome => array of locations with :prot_id and/or :tran_id keys.
   # A location is resolved through :prot_id when present, otherwise through
   # the protein linked to its :tran_id; unresolvable locations emit a
   # warning.
   def in_coords(coords)
      coords.keys.map do |genome|
         coords[ genome ].map do |loc|
            if not loc[:prot_id].nil?
               loc[:prot_id] if include? loc[:prot_id]
            else
               owner = loc[:tran_id].nil? ? nil :
                  protein_for_tranid(loc[:tran_id])
               warn "Warning: Impossible to resolve protein located in " +
                  "'#{genome}' at: #{loc}." if owner.nil?
               owner
            end
         end
      end.reduce([], :+).compact.uniq
   end

   def size() ids.size end
   def empty?() ids.empty? end
   def include?(id) ids.include?(id) end

   private

   # Returns the first protein ID linked to +transl_id+, or nil. Values of
   # @tranids are arrays, so membership is tested instead of using
   # Hash#rassoc, which compares whole values.
   def protein_for_tranid(transl_id)
      pair = @tranids.find{ |_prot, trans| trans.include? transl_id }
      pair.nil? ? nil : pair.first
   end
end
|
89
|
+
|
90
|
+
|
data/lib/rocker/rocdata.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/rinterface'
|
@@ -11,11 +11,13 @@ require 'rocker/alignment'
|
|
11
11
|
require 'tmpdir'
|
12
12
|
|
13
13
|
class ROCData
|
14
|
-
attr_reader :aln, :windows, :r
|
15
|
-
# Use ROCData.new(table,aln,window) to re-compute from table, use
|
14
|
+
attr_reader :aln, :windows, :r, :refined
|
15
|
+
# Use ROCData.new(table,aln,window) to re-compute from table, use
|
16
|
+
# ROCData.new(data) to load
|
16
17
|
def initialize(val, aln=nil, window=nil)
|
17
18
|
@r = RInterface.new
|
18
19
|
@nucl = false
|
20
|
+
@refined = false
|
19
21
|
if not aln.nil?
|
20
22
|
@aln = aln
|
21
23
|
self.rrun "library('pROC');"
|
@@ -33,7 +35,9 @@ class ROCData
|
|
33
35
|
@aln.read_rocker(val)
|
34
36
|
end
|
35
37
|
end
|
36
|
-
def win_at_col(col)
|
38
|
+
# Returns the first window whose from..to range contains column +col+, or nil
# when no window covers it. The search stops at the first match.
def win_at_col(col)
   windows.find{ |w| (w.from<=col) and (w.to>=col) }
end
|
37
41
|
# Returns the current value of the @nucl flag.
def in_nucl?
   @nucl
end
|
38
42
|
# Stores +nucl+ in the @nucl flag.
def nucl=(nucl)
   @nucl = nucl
end
|
39
43
|
def refine! table
|
@@ -41,14 +45,17 @@ class ROCData
|
|
41
45
|
return false unless self.load_table! table
|
42
46
|
break if self._refine_iter(table)==0
|
43
47
|
end
|
48
|
+
@refined = true
|
44
49
|
return true
|
45
50
|
end
|
51
|
+
# Returns the current value of @refined.
def is_refined?
   @refined
end
|
46
52
|
def _refine_iter table
|
47
53
|
to_refine = []
|
48
54
|
self.windows.each do |w|
|
49
55
|
next if w.almost_empty or w.length <= 5
|
50
56
|
self.rrun "acc <- w$accuracy[w$V1==#{w.from}];"
|
51
|
-
to_refine << w if
|
57
|
+
to_refine << w if
|
58
|
+
self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
|
52
59
|
end
|
53
60
|
n = to_refine.size
|
54
61
|
return 0 unless n > 0
|
@@ -86,9 +93,17 @@ class ROCData
|
|
86
93
|
win <- which( (m>=w$V1) & (m<=w$V2))[1];
|
87
94
|
if(!is.na(win)){
|
88
95
|
if(x$V4[i] >= w$V5[win]){
|
89
|
-
if(x$V5[i]==1){
|
96
|
+
if(x$V5[i]==1){
|
97
|
+
w$tp[win] <- w$tp[win]+1
|
98
|
+
} else {
|
99
|
+
w$fp[win] <- w$fp[win]+1
|
100
|
+
}
|
90
101
|
}else{
|
91
|
-
if(x$V5[i]==1){
|
102
|
+
if(x$V5[i]==1){
|
103
|
+
w$fn[win] <- w$fn[win]+1
|
104
|
+
} else {
|
105
|
+
w$tn[win] <- w$tn[win]+1
|
106
|
+
};
|
92
107
|
}
|
93
108
|
}
|
94
109
|
}
|
@@ -106,7 +121,9 @@ class ROCData
|
|
106
121
|
end
|
107
122
|
def init_windows!(size)
|
108
123
|
@windows = []
|
109
|
-
1.step(self.aln.cols,size).each
|
124
|
+
1.step(self.aln.cols,size).each do |a|
|
125
|
+
@windows << ROCWindow.new(self, a, a+size-1)
|
126
|
+
end
|
110
127
|
end
|
111
128
|
# Forwards +cmd+ and the optional result +type+ to the R interface's run
# method and returns its result.
def rrun(cmd, type=nil)
   r.run(cmd, type)
end
|
112
129
|
def save(file)
|
@@ -115,7 +132,7 @@ class ROCData
|
|
115
132
|
f.close
|
116
133
|
end
|
117
134
|
def to_s
|
118
|
-
o =
|
135
|
+
o = "#v ROCker " + ROCker.VERSION + "\n"
|
119
136
|
self.windows.each{|w| o += w.to_s}
|
120
137
|
o += self.aln.to_s
|
121
138
|
return o
|
data/lib/rocker/rocwindow.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
class ROCWindow
|
@@ -22,16 +22,18 @@ class ROCWindow
|
|
22
22
|
@from = [a,b].min
|
23
23
|
@to = [a,b].max
|
24
24
|
@thr = nil
|
25
|
-
|
25
|
+
compute!
|
26
26
|
end
|
27
27
|
end
|
28
28
|
def compute!
|
29
|
-
|
30
|
-
@hits =
|
31
|
-
@tps =
|
32
|
-
unless
|
33
|
-
|
34
|
-
thr =
|
29
|
+
load_hits
|
30
|
+
@hits = rrun("nrow(y);", :int)
|
31
|
+
@tps = rrun("sum(y$V5==1);", :int)
|
32
|
+
unless almost_empty
|
33
|
+
rrun "rocobj <- roc(as.numeric(y$V5==1), y$V4);"
|
34
|
+
thr = rrun("coords(rocobj, 'best', ret='threshold', " +
|
35
|
+
"best.method='youden', " +
|
36
|
+
"best.weights=c(0.5, sum(y$V5==1)/nrow(y)))[1];", :float)
|
35
37
|
@thr = thr.to_f
|
36
38
|
@thr = nil if @thr==0.0 or @thr.infinite?
|
37
39
|
end
|
@@ -48,16 +50,16 @@ class ROCWindow
|
|
48
50
|
return nil if a.nil? and b.nil?
|
49
51
|
return a.thr if b.nil?
|
50
52
|
return b.thr if a.nil?
|
51
|
-
return (b.thr*(
|
53
|
+
return (b.thr*(from-a.from) - a.thr*(from-b.from))/(b.from-a.from)
|
52
54
|
end
|
53
|
-
def load_hits() self.rrun "y <- x[x$V6>=#{
|
54
|
-
def previous() (
|
55
|
-
def next() (
|
56
|
-
def thr_notnil() (@thr.nil? or @thr.infinite?) ?
|
57
|
-
def fps()
|
58
|
-
def almost_empty()
|
59
|
-
def length()
|
60
|
-
def rrun(cmd, type=nil)
|
61
|
-
def to_s() [
|
55
|
+
# Runs an R statement that assigns to y the rows of x whose V6 column lies
# between this window's from and to (inclusive).
def load_hits
   r_cmd = "y <- x[x$V6>=#{from} & x$V6<=#{to},];"
   rrun r_cmd
end
|
56
|
+
# Returns the window covering the column just before this one's start, or nil
# when this window starts at column 1.
def previous
   return nil if from == 1
   data.win_at_col(from - 1)
end
|
57
|
+
# Returns the window covering the column just after this one's end, or nil
# when this window ends at the alignment's last column.
def next
   return nil if to == data.aln.cols
   data.win_at_col(to + 1)
end
|
58
|
+
# Returns @thr, or the value of around_thr when @thr is nil or infinite.
def thr_notnil
   return around_thr if @thr.nil? || @thr.infinite?
   @thr
end
|
59
|
+
# Returns hits minus tps.
def fps
   hits - tps
end
|
60
|
+
# True when the window has fewer than 3 fps or fewer than 3 tps.
def almost_empty
   return true if fps < 3
   tps < 3
end
|
61
|
+
# Number of columns spanned by the window, counting both from and to.
def length
   (to - from) + 1
end
|
62
|
+
# Forwards +cmd+ and the optional result +type+ to data.rrun and returns its
# result.
def rrun(cmd, type=nil)
   owner = data
   owner.rrun(cmd, type)
end
|
63
|
+
# One tab-separated line with from, to, hits, tps and thr_notnil, terminated
# by a newline.
def to_s
   fields = [from, to, hits, tps, thr_notnil]
   "#{fields.join("\t")}\n"
end
|
62
64
|
end
|
63
65
|
|
data/lib/rocker/step/build.rb
CHANGED
@@ -2,21 +2,27 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-11-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'json'
|
9
|
+
require 'rocker/protein-set'
|
10
|
+
require 'rocker/genome-set'
|
9
11
|
|
10
12
|
class ROCker
|
11
13
|
#================================[ Class ]
|
12
|
-
@@EBIREST =
|
13
|
-
@@DEFAULTS.merge!({:
|
14
|
+
@@EBIREST = "http://www.ebi.ac.uk/Tools"
|
15
|
+
@@DEFAULTS.merge!({positive:[], negative:[], seqdepth:0.03, readlen:100,
|
16
|
+
minovl:50,
|
14
17
|
# Ext. Software
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
18
|
-
|
19
|
-
|
18
|
+
aligner: :clustalo, simulator: :grinder,
|
19
|
+
simulatorbin:{grinder:"grinder"},
|
20
|
+
simulatorcmd:{grinder:"%1$s -reference_file \"%2$s\" -cf \"%3$f\" " +
|
21
|
+
"-dc '-~*NnKkMmRrYySsWwBbVvHhDdXx' -md uniform 0.1 -mr 95 5 " +
|
22
|
+
"-rd %4$d uniform 5 -base_name \"%5$s\""},
|
23
|
+
alignerbin:{muscle:"muscle", clustalo:"clustalo"},
|
24
|
+
alignercmd:{muscle:"%1$s -in \"%2$s\" -out \"%3$s\" -quiet",
|
25
|
+
clustalo:"%1$s -i \"%2$s\" -o \"%3$s\" --threads=%4$d --force"}
|
20
26
|
})
|
21
27
|
@@HAS_BUILD_GEMS = nil
|
22
28
|
# Returns the class-level @@EBIREST base URL.
def self.ebirest
   @@EBIREST
end
|
@@ -33,90 +39,65 @@ class ROCker
|
|
33
39
|
end
|
34
40
|
|
35
41
|
#================================[ Utilities ]
|
36
|
-
def genes2genomes(gene_ids)
|
37
|
-
genomes = []
|
38
|
-
ids = Array.new(gene_ids)
|
39
|
-
while ids.size>0
|
40
|
-
doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
|
41
|
-
genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
|
42
|
-
r=ln.split('; ')
|
43
|
-
{:genome_id=>r[1], :transl_id=>r[2]}
|
44
|
-
end
|
45
|
-
end
|
46
|
-
genomes.uniq
|
47
|
-
end
|
48
|
-
def genome2taxid(genome_id)
|
49
|
-
ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
|
50
|
-
return ln if ln.nil?
|
51
|
-
ln.sub(/.*"taxon:(\d+)".*/, "\\1")
|
52
|
-
end
|
53
|
-
def genome2taxon(genome_id, rank='species')
|
54
|
-
xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
|
55
|
-
xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
|
56
|
-
end
|
57
42
|
# Performs an HTTP GET on +url+ (600 s timeout) and returns the response body
# as a String.
#
# url::     Address to request.
# outfile:: Optional path; when given, the body is also written to this file.
#
# Raises RuntimeError when the response code is not 200.
def restcall(url, outfile=nil)
   $stderr.puts " # Calling: #{url}" if @o[:debug]
   response = RestClient::Request.execute(:method=>:get, :url=>url,
      :timeout=>600)
   raise "Unable to reach EBI REST client, error code " +
      response.code.to_s + "." unless response.code == 200
   body = response.to_s
   # Block form closes the file even if the write raises.
   File.open(outfile, "w"){ |ohf| ohf.print body } unless outfile.nil?
   body
end
|
68
55
|
def ebiFetch(db, ids, format, outfile=nil)
|
69
|
-
url = "#{ROCker.ebirest}/dbfetch/dbfetch
|
70
|
-
|
71
|
-
|
72
|
-
ohf = File.open(outfile, 'w')
|
73
|
-
ohf.print res
|
74
|
-
ohf.close
|
75
|
-
end
|
76
|
-
res
|
56
|
+
url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
|
57
|
+
"#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
|
58
|
+
self.restcall url, outfile
|
77
59
|
end
|
78
|
-
def get_coords_from_gff3(genome_ids,
|
79
|
-
|
80
|
-
genomes_org = {}
|
60
|
+
# Scans the GFF3 annotation of each genome for features that belong to the
# protein set and writes their coordinates to +json_file+ as
# {"coords": {sequence_id: [{prot_id, tran_id, from, to, strand}, ...]}}.
#
# genome_ids:: Array of genome IDs to scan.
# pset::       Object responding to ids and tranids (known protein and
#              translation IDs).
# thread_id::  Index of the calling worker; progress is printed only by
#              worker 0 and only when @o[:q] is not set.
# json_file::  Path of the JSON file to write.
#
# A feature is kept when one of its db_xref=UniProtKB...: attributes is a
# known protein ID or one of its protein_id= attributes is a known
# translation ID.
def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
   coords = {}
   # Looked up once: neither list changes while the documents are scanned.
   known_prots = pset.ids
   known_trans = pset.tranids
   i = 0
   genome_ids.each do |genome_id|
      print " * scanning #{(i+=1).ordinalize} genome out of " +
         "#{genome_ids.size} in first thread. \r" if
         thread_id==0 and not @o[:q]
      genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
      if @o[:reuse] and File.size? genome_file
         doc = File.readlines(genome_file).grep(/^[^#]/)
      else
         # The downloaded document is only kept on disk with @o[:noclean].
         genome_file=nil unless @o[:noclean]
         doc = ebiFetch(:embl, [genome_id], :gff3,
            genome_file).split("\n").grep(/^[^#]/)
      end
      doc.each do |ln|
         r = ln.chomp.split(/\t/)
         next if r.size < 9
         attrs = r[8].split(/;/)
         prots = attrs.grep(
            /^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
         prot = prots.find{ |id| known_prots.include? id }
         trans = attrs.grep(
            /^protein_id=/){ |pid| pid.split(/=/)[1] }
         tran = trans.find{ |id| known_trans.include? id }
         next if prot.nil? and tran.nil?
         (coords[ r[0].to_sym ] ||= []) << {
            prot_id: prot,
            tran_id: tran,
            from: r[3].to_i,
            to: r[4].to_i,
            strand: r[6]
         }
      end
   end
   print "\n" if thread_id==0 and not @o[:q]
   File.open(json_file, "w"){ |ofh| ofh.print({coords:coords}.to_json) }
end
|
122
103
|
|
@@ -124,211 +105,251 @@ class ROCker
|
|
124
105
|
def build!
|
125
106
|
# Check requirements
|
126
107
|
puts "Testing environment." unless @o[:q]
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
@o[
|
131
|
-
@o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
|
132
|
-
@o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
|
108
|
+
{ searchcmd: :search, makedbcmd: :search,
|
109
|
+
alignercmd: :aligner, alignerbin: :aligner,
|
110
|
+
simulatorcmd: :simulator, simulatorbin: :simulator
|
111
|
+
}.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
|
133
112
|
@o[:nosearch]=true if @o[:nosimulate]
|
134
|
-
raise "Unsatisfied requirements, please see the help message (-h)." unless
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
aln.read_fasta @o[:aln]
|
141
|
-
@o[:positive] += aln.get_ids
|
142
|
-
end
|
143
|
-
raise "-p or -P are mandatory." if @o[:positive].size==0
|
113
|
+
raise "Unsatisfied requirements, please see the help message (-h)." unless
|
114
|
+
ROCker.has_build_gems?
|
115
|
+
protein_set = {}
|
116
|
+
protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
|
117
|
+
protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
|
118
|
+
raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
|
144
119
|
raise "-o/--baseout is mandatory." if @o[:baseout].nil?
|
145
|
-
if
|
146
|
-
warn "\nWARNING: Positive set contains only one sequence, turning
|
120
|
+
if protein_set[:+].size==1 and not @o[:noaln]
|
121
|
+
warn "\nWARNING: Positive set contains only one sequence, turning " +
|
122
|
+
"off alignment.\n\n"
|
147
123
|
@o[:noaln] = true
|
148
124
|
end
|
149
125
|
unless @o[:nosimulate]
|
150
|
-
self.bash
|
126
|
+
self.bash("#{@o[:simulatorbin]} --version",
|
127
|
+
"--simulator-bin must be executable. Is Grinder installed?") if
|
128
|
+
@o[:simulator]==:grinder
|
151
129
|
end
|
152
130
|
unless @o[:noaln]
|
153
|
-
self.bash
|
154
|
-
|
131
|
+
self.bash("#{@o[:alignerbin]} -version",
|
132
|
+
"--aligner-bin must be executable. Is Muscle installed?") if
|
133
|
+
@o[:aligner]==:muscle
|
134
|
+
self.bash("#{@o[:alignerbin]} --version",
|
135
|
+
"--aligner-bin must be executable. Is ClustalOmega installed?") if
|
136
|
+
@o[:aligner]==:clustalo
|
155
137
|
end
|
156
138
|
unless @o[:nosearch]
|
157
|
-
self.bash
|
158
|
-
|
139
|
+
self.bash("#{@o[:searchbins]}makeblastdb -version",
|
140
|
+
"--search-bins must contain executables. Is BLAST+ installed?") if
|
141
|
+
@o[:search]==:blast
|
142
|
+
self.bash("#{@o[:searchbins]}diamond --help",
|
143
|
+
"--search-bins must contain executables. Is DIAMOND installed?") if
|
144
|
+
@o[:search]==:diamond
|
159
145
|
end
|
160
146
|
|
161
147
|
# Download genes
|
162
148
|
puts "Downloading gene data." unless @o[:q]
|
163
149
|
ref_file = @o[:baseout] + ".ref.fasta"
|
164
|
-
if
|
150
|
+
if not protein_set[:+].aln.nil?
|
165
151
|
puts " * reusing aligned sequences as positive set." unless @o[:q]
|
166
|
-
|
167
|
-
f.print aln.to_seq_s
|
168
|
-
f.close
|
152
|
+
protein_set[:+].get_from_aln(ref_file, aln)
|
169
153
|
@o[:noaln] = true
|
170
154
|
elsif @o[:reuse] and File.size? ref_file
|
171
155
|
puts " * reusing positive set: #{ref_file}." unless @o[:q]
|
172
156
|
else
|
173
|
-
puts " * downloading #{
|
174
|
-
|
175
|
-
ids
|
176
|
-
|
177
|
-
while ids.size>0
|
178
|
-
f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
|
179
|
-
end
|
180
|
-
f.close
|
157
|
+
puts " * downloading #{protein_set[:+].size} sequence(s) in " +
|
158
|
+
"positive set." unless @o[:q]
|
159
|
+
$stderr.puts " # #{protein_set[:+].ids}" if @o[:debug]
|
160
|
+
protein_set[:+].download(ref_file)
|
181
161
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
puts "
|
187
|
-
|
188
|
-
r = genes2genomes(@o[set])
|
189
|
-
genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
|
190
|
-
transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
|
162
|
+
[:+, :-].each do |set|
|
163
|
+
unless protein_set[set].empty?
|
164
|
+
puts " * linking genomes from #{protein_set[set].size} " +
|
165
|
+
"[#{set.to_s}] sequence(s)." unless @o[:q]
|
166
|
+
$stderr.puts " # #{protein_set[set].ids}" if @o[:debug]
|
167
|
+
protein_set[set].get_genomes!
|
191
168
|
end
|
192
169
|
end
|
193
|
-
raise "No genomes associated with the positive set." if
|
194
|
-
|
195
|
-
|
196
|
-
|
170
|
+
raise "No genomes associated with the positive set." if
|
171
|
+
protein_set[:+].genomes.empty?
|
172
|
+
genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
|
173
|
+
:- => GenomeSet.new(self, protein_set[:-].genomes)}
|
197
174
|
|
198
175
|
# Locate genes
|
199
176
|
puts "Analyzing genome data." unless @o[:q]
|
200
177
|
coords_file = @o[:baseout] + ".src.coords"
|
201
178
|
if @o[:reuse] and File.size? coords_file
|
202
179
|
puts " * reusing coordinates: #{coords_file}." unless @o[:q]
|
203
|
-
c = JSON.parse File.read(coords_file), {:
|
180
|
+
c = JSON.parse File.read(coords_file), {symbolize_names:true}
|
204
181
|
positive_coords = c[:positive_coords]
|
205
|
-
|
182
|
+
negative_coords = c[:negative_coords]
|
183
|
+
genome_set[:+].taxa = c[:taxa_pos]
|
184
|
+
genome_set[:-].taxa = c[:taxa_neg]
|
206
185
|
else
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
thr_obj
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
o[:positive_coords].each_pair do |k,v|
|
233
|
-
positive_coords[ k ] ||= []
|
234
|
-
positive_coords[ k ] += v
|
186
|
+
all_coords = {}
|
187
|
+
[:+, :-].each do |set_type|
|
188
|
+
all_coords[set_type] = {}
|
189
|
+
next if genome_set[set_type].empty?
|
190
|
+
thrs = [@o[:thr], genome_set[set_type].size].min
|
191
|
+
puts " * downloading and parsing #{genome_set[set_type].size} " +
|
192
|
+
"GFF3 document(s) in #{thrs} threads." unless @o[:q]
|
193
|
+
$stderr.puts " # Looking for translations: " +
|
194
|
+
"#{protein_set[set_type].tranids}" if @o[:debug]
|
195
|
+
$stderr.puts " # Looking into: #{genome_set[set_type].ids}" if
|
196
|
+
@o[:debug]
|
197
|
+
# Launch threads
|
198
|
+
thr_obj = []
|
199
|
+
(0 .. (thrs-1)).each do |thr_i|
|
200
|
+
ids_to_parse = []
|
201
|
+
(0 .. (genome_set[set_type].size-1)).each do |i|
|
202
|
+
ids_to_parse << protein_set[set_type].genomes[i] if
|
203
|
+
(i % thrs) == thr_i
|
204
|
+
end
|
205
|
+
json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
|
206
|
+
thr_obj << json_file
|
207
|
+
fork do
|
208
|
+
get_coords_from_gff3(ids_to_parse, protein_set[set_type],
|
209
|
+
thr_i, json_file)
|
210
|
+
end
|
235
211
|
end
|
236
|
-
|
237
|
-
|
238
|
-
|
212
|
+
# Combine results
|
213
|
+
Process.waitall
|
214
|
+
thr_obj.each do |t|
|
215
|
+
raise "Thread failed without error trace: #{t}" unless
|
216
|
+
File.exist? t
|
217
|
+
o = JSON.parse(File.read(t), {symbolize_names:true})
|
218
|
+
o[:coords].each_pair do |k,v|
|
219
|
+
all_coords[set_type][ k ] ||= []
|
220
|
+
all_coords[set_type][ k ] += v
|
221
|
+
end
|
222
|
+
File.unlink t
|
239
223
|
end
|
240
|
-
|
241
|
-
|
224
|
+
end # [:+, :-].each
|
225
|
+
positive_coords = all_coords[:+]
|
226
|
+
negative_coords = all_coords[:-]
|
242
227
|
# Select one genome per taxon
|
243
228
|
unless @o[:pertaxon].nil?
|
244
|
-
|
229
|
+
puts " Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
|
230
|
+
[:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
|
245
231
|
end
|
246
|
-
# Save coordinates
|
232
|
+
# Save coordinates and taxa
|
247
233
|
ofh = File.open(coords_file, "w")
|
248
|
-
ofh.print JSON.pretty_generate({
|
234
|
+
ofh.print JSON.pretty_generate({
|
235
|
+
positive_coords:positive_coords,
|
236
|
+
negative_coords:negative_coords,
|
237
|
+
taxa_pos:genome_set[:+].taxa,
|
238
|
+
taxa_neg:genome_set[:-].taxa})
|
249
239
|
ofh.close
|
250
|
-
end
|
240
|
+
end # if @o[:reuse] and File.size? coords_file ... else
|
251
241
|
unless @o[:pertaxon].nil?
|
252
|
-
|
253
|
-
|
242
|
+
puts " Using " +
|
243
|
+
[:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
|
244
|
+
" genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
|
254
245
|
end
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
246
|
+
found = protein_set[:+].in_coords(positive_coords)
|
247
|
+
raise "Cannot find the genomic location of any provided sequence." if
|
248
|
+
found.nil?
|
249
|
+
missing = protein_set[:+].ids - found
|
250
|
+
warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
|
251
|
+
"sequence(s) #{missing.join(",")}.\n\n" unless missing.empty?
|
261
252
|
|
262
253
|
# Download genomes
|
263
|
-
|
254
|
+
genome_set[:all] = GenomeSet.new(self,
|
255
|
+
genome_set[ :+ ].ids + genome_set[ :- ].ids)
|
256
|
+
genomes_file = @o[:baseout] + ".src.fasta"
|
264
257
|
if @o[:reuse] and File.size? genomes_file
|
265
258
|
puts " * reusing existing file: #{genomes_file}." unless @o[:q]
|
266
259
|
else
|
267
|
-
puts " * downloading
|
268
|
-
|
269
|
-
ids
|
270
|
-
|
271
|
-
while ids.size>0
|
272
|
-
ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
|
273
|
-
end
|
274
|
-
ofh.close
|
260
|
+
puts " * downloading " + genome_set[:all].size.to_s +
|
261
|
+
" genome(s) in FastA." unless @o[:q]
|
262
|
+
$stderr.puts " # #{genome_set[:all].ids}" if @o[:debug]
|
263
|
+
genome_set[:all].download genomes_file
|
275
264
|
end
|
276
265
|
|
277
266
|
# Generate metagenome
|
278
267
|
unless @o[:nosimulate]
|
279
268
|
puts "Generating in silico metagenome" unless @o[:q]
|
280
269
|
if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
|
281
|
-
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
|
270
|
+
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
|
271
|
+
@o[:q]
|
282
272
|
else
|
283
|
-
all_src = File.readlines("#{@o[:baseout]}.src.fasta"
|
273
|
+
all_src = File.readlines("#{@o[:baseout]}.src.fasta"
|
274
|
+
).select{ |l| l =~ /^>/ }.size
|
284
275
|
thrs = [@o[:thr], all_src].min
|
285
|
-
puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
|
286
|
-
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
287
276
|
thr_obj = []
|
288
|
-
seqs_per_thr = (all_src/thrs).ceil
|
277
|
+
seqs_per_thr = (all_src.to_f/thrs).ceil
|
278
|
+
thrs = (all_src.to_f/seqs_per_thr).ceil
|
279
|
+
puts " * simulating metagenomes and tagging positive reads in " +
|
280
|
+
thrs.to_s + " threads." unless @o[:q]
|
281
|
+
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
289
282
|
(0 .. (thrs-1)).each do |thr_i|
|
290
283
|
output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
|
291
284
|
thr_obj << output
|
292
285
|
fork do
|
293
286
|
seqs_a = thr_i*seqs_per_thr + 1
|
294
|
-
seqs_b = [seqs_a + seqs_per_thr, all_src].min
|
287
|
+
seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
|
295
288
|
# Create sub-fasta
|
296
|
-
ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
|
297
|
-
ifh = File.open("#{@o[:baseout]}.src.fasta",
|
289
|
+
ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
|
290
|
+
ifh = File.open("#{@o[:baseout]}.src.fasta","r")
|
298
291
|
seq_i = 0
|
299
292
|
while l = ifh.gets
|
300
293
|
seq_i+=1 if l =~ /^>/
|
301
|
-
|
294
|
+
break if seq_i > seqs_b
|
302
295
|
ofh.print l if seq_i >= seqs_a
|
303
296
|
end
|
304
297
|
ifh.close
|
305
298
|
ofh.close
|
306
299
|
|
307
|
-
# Run simulator (except if the temporal file is already
|
308
|
-
|
309
|
-
|
310
|
-
|
300
|
+
# Run simulator (except if the temporal file is already
|
301
|
+
# there and can be reused)
|
302
|
+
bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
|
303
|
+
"#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
|
304
|
+
@o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
|
305
|
+
"#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}") unless
|
306
|
+
@o[:reuse] and
|
307
|
+
File.size? @o[:baseout] +
|
308
|
+
".mg.tmp.#{thr_i.to_s}-reads.fa"
|
311
309
|
|
312
|
-
# Tag
|
313
|
-
puts " * tagging
|
314
|
-
|
315
|
-
|
310
|
+
# Tag positive and negative reads
|
311
|
+
puts " * tagging reads [thread #{thr_i}]." unless
|
312
|
+
@o[:q]
|
313
|
+
ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
|
314
|
+
"r")
|
315
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
|
316
316
|
while l = ifh.gets
|
317
317
|
if l =~ /^>/
|
318
|
-
rd =
|
319
|
-
|
318
|
+
rd = %r{
|
319
|
+
^>(?<id>\d+)\s
|
320
|
+
reference=[A-Za-z]+\|
|
321
|
+
(?<genome_id>[A-Za-z0-9_]+)\|.*\s
|
322
|
+
position=(?<comp>complement\()?(?<from>\d+)\.\.
|
323
|
+
(?<to>\d+)\)?\s
|
324
|
+
}x.match(l)
|
325
|
+
raise "Cannot parse simulated read's defline, are " +
|
326
|
+
"you using Grinder?: #{l}" if rd.nil?
|
320
327
|
positive = false
|
321
328
|
positive_coords[rd[:genome_id].to_sym] ||= []
|
322
329
|
positive_coords[rd[:genome_id].to_sym].each do |gn|
|
323
330
|
left = rd[:to].to_i - gn[:from]
|
324
331
|
right = gn[:to] - rd[:from].to_i
|
325
|
-
if (left*right >= 0) and
|
332
|
+
if (left*right >= 0) and
|
333
|
+
([left, right].min >= @o[:minovl])
|
326
334
|
positive = true
|
327
335
|
break
|
328
336
|
end
|
329
337
|
end
|
330
|
-
|
331
|
-
|
338
|
+
negative = false
|
339
|
+
negative_coords[rd[:genome_id].to_sym] ||= []
|
340
|
+
negative_coords[rd[:genome_id].to_sym].each do |gn|
|
341
|
+
left = rd[:to].to_i - gn[:from]
|
342
|
+
right = gn[:to] - rd[:from].to_i
|
343
|
+
if (left*right >= 0) and
|
344
|
+
([left, right].min >= @o[:minovl])
|
345
|
+
negative = true
|
346
|
+
break
|
347
|
+
end
|
348
|
+
end
|
349
|
+
l = ">#{thr_i.to_s}_#{rd[:id]}" +
|
350
|
+
"#{positive ? "@%" : (negative ? "@$" : "")} " +
|
351
|
+
"ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
|
352
|
+
"#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
|
332
353
|
end
|
333
354
|
ofh.print l
|
334
355
|
end
|
@@ -338,9 +359,10 @@ class ROCker
|
|
338
359
|
end # (1 .. thrs).each
|
339
360
|
Process.waitall
|
340
361
|
# Concatenate results
|
341
|
-
ofh = File.open(@o[:baseout] + ".mg.fasta",
|
362
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
|
342
363
|
thr_obj.each do |t|
|
343
|
-
raise "Thread failed without error trace: #{t}" unless
|
364
|
+
raise "Thread failed without error trace: #{t}" unless
|
365
|
+
File.exist? t
|
344
366
|
ifh = File.open(t, "r")
|
345
367
|
while l = ifh.gets
|
346
368
|
ofh.print l
|
@@ -356,23 +378,33 @@ class ROCker
|
|
356
378
|
unless @o[:noaln]
|
357
379
|
puts "Aligning reference set." unless @o[:q]
|
358
380
|
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
|
359
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
|
381
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
|
382
|
+
@o[:q]
|
360
383
|
else
|
361
|
-
bash
|
362
|
-
|
384
|
+
bash(sprintf(@o[:alignercmd],
|
385
|
+
@o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
|
386
|
+
"#{@o[:baseout]}.ref.aln", @o[:thr]))
|
387
|
+
puts " +--\n | IMPORTANT NOTE: Manually checking the alignment " +
|
388
|
+
"before\n | the 'compile' step is *strongly* encouraged.\n " +
|
389
|
+
"+--\n" unless @o[:q]
|
363
390
|
end
|
364
391
|
end
|
365
392
|
|
366
393
|
# Run similarity search
|
367
394
|
unless @o[:nosearch]
|
368
|
-
puts "Running
|
395
|
+
puts "Running similarity search." unless @o[:q]
|
369
396
|
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
|
370
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
|
397
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
|
398
|
+
@o[:q]
|
371
399
|
else
|
372
400
|
puts " * preparing database." unless @o[:q]
|
373
|
-
bash
|
401
|
+
bash(sprintf(@o[:makedbcmd],
|
402
|
+
@o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
|
403
|
+
"#{@o[:baseout]}.ref"))
|
374
404
|
puts " * running similarity search." unless @o[:q]
|
375
|
-
bash
|
405
|
+
bash(sprintf(@o[:searchcmd],
|
406
|
+
@o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
|
407
|
+
"#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
|
376
408
|
end
|
377
409
|
end
|
378
410
|
|
@@ -382,7 +414,8 @@ class ROCker
|
|
382
414
|
sff = %w{.src.xml .src.fasta}
|
383
415
|
sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
|
384
416
|
sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
|
385
|
-
sff.each { |sf| File.unlink @o[:baseout] + sf if
|
417
|
+
sff.each { |sf| File.unlink @o[:baseout] + sf if
|
418
|
+
File.exist? @o[:baseout] + sf }
|
386
419
|
end
|
387
420
|
end # build!
|
388
421
|
end # ROCker
|