bio-rocker 1.0.0 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ROCker +276 -96
- data/lib/rocker.rb +25 -14
- data/lib/rocker/blasthit.rb +8 -5
- data/lib/rocker/genome-set.rb +70 -0
- data/lib/rocker/protein-set.rb +90 -0
- data/lib/rocker/rocdata.rb +26 -9
- data/lib/rocker/rocwindow.rb +20 -18
- data/lib/rocker/step/build.rb +233 -200
- data/lib/rocker/step/compile.rb +11 -6
- data/lib/rocker/step/filter.rb +11 -7
- data/lib/rocker/step/plot.rb +80 -26
- data/lib/rocker/step/search.rb +27 -4
- metadata +16 -14
data/lib/rocker.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/blasthit'
|
@@ -10,20 +10,30 @@ require 'rocker/rocdata'
|
|
10
10
|
|
11
11
|
class ROCker
|
12
12
|
#================================[ Class ]
|
13
|
+
@@VERSION = "1.1.9"
|
14
|
+
@@CITATION = "Orellana, Rodriguez-R, & Konstantinidis. Under review. " +
|
15
|
+
"Detecting and quantifying functional genes in short-read metagenomic " +
|
16
|
+
"datasets: method development and application to the nitrogen cycle " +
|
17
|
+
"genes."
|
13
18
|
@@DEFAULTS = {
|
14
19
|
# General
|
15
|
-
:
|
20
|
+
q: false, r: "R", nucl: false, debug: false, thr: 2, search: :blast,
|
16
21
|
# External software
|
17
|
-
:
|
18
|
-
:
|
19
|
-
:
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
22
|
+
searchbins: "",
|
23
|
+
searchcmd: {
|
24
|
+
blast: '%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" ' +
|
25
|
+
'-num_threads %6$d -outfmt 6 -max_target_seqs 1',
|
26
|
+
diamond: '%1$sdiamond %2$s -q "%3$s" -d "%4$s" -a "%5$s.daa" -p %6$d' +
|
27
|
+
' -k 1 --min-score 20 --sensitive && %1$sdiamond view -a "%5$s"' +
|
28
|
+
' -o "%5$s"'},
|
29
|
+
makedbcmd: {
|
30
|
+
blast: '%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
|
31
|
+
diamond: '%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
|
24
32
|
}
|
25
33
|
def self.defaults() @@DEFAULTS ; end
|
26
34
|
def self.default(k) @@DEFAULTS[k] ; end
|
35
|
+
def self.VERSION; @@VERSION ; end
|
36
|
+
def self.CITATION; @@CITATION ; end
|
27
37
|
|
28
38
|
#================================[ Instance ]
|
29
39
|
attr_reader :o
|
@@ -46,7 +56,8 @@ class ROCker
|
|
46
56
|
end
|
47
57
|
def bash(cmd, err_msg=nil)
|
48
58
|
o = `#{cmd} 2>&1 && echo '{'`
|
49
|
-
raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
|
59
|
+
raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
|
60
|
+
o[-2]=="{"
|
50
61
|
true
|
51
62
|
end
|
52
63
|
end
|
@@ -63,10 +74,10 @@ require 'rocker/step/plot'
|
|
63
74
|
class Numeric
|
64
75
|
def ordinalize
|
65
76
|
n= self.to_s
|
66
|
-
s= n[-2]=='1' ?
|
67
|
-
n[-1]=='1' ?
|
68
|
-
n[-1]=='2' ?
|
69
|
-
n[-1]=='3' ?
|
77
|
+
s= n[-2]=='1' ? "th" :
|
78
|
+
n[-1]=='1' ? "st" :
|
79
|
+
n[-1]=='2' ? "nd" :
|
80
|
+
n[-1]=='3' ? "rd" : "th"
|
70
81
|
n + s
|
71
82
|
end
|
72
83
|
end
|
data/lib/rocker/blasthit.rb
CHANGED
@@ -2,12 +2,13 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-06-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
class BlastHit
|
9
|
-
attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint
|
10
|
-
# Initialize from BLAST using new(ln,aln),
|
9
|
+
attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :isfalse, :midpoint
|
10
|
+
# Initialize from BLAST using new(ln,aln),
|
11
|
+
# initialize from TABLE using new(ln)
|
11
12
|
def initialize(ln, aln=nil)
|
12
13
|
l = ln.chomp.split(/\t/)
|
13
14
|
if aln.nil?
|
@@ -16,6 +17,7 @@ class BlastHit
|
|
16
17
|
@sto = l[2].to_i
|
17
18
|
@bits = l[3].to_f
|
18
19
|
@istrue = l[4]=='1'
|
20
|
+
@isfalse = l[4]=='-1' # table column 5 encodes 1 = true hit, -1 = false hit, 0 = neither (mirrors to_s)
|
19
21
|
@midpoint = l[5].to_i
|
20
22
|
else
|
21
23
|
s = aln.seq(l[1])
|
@@ -27,13 +29,14 @@ class BlastHit
|
|
27
29
|
@sto = [a,b].max
|
28
30
|
@bits = l[11].to_f
|
29
31
|
@istrue = ! /@%/.match(l[0]).nil?
|
32
|
+
@isfalse = ! /@\$/.match(l[0]).nil?
|
30
33
|
@midpoint = s.pos2col(((l[8].to_f+l[9].to_f)/2).ceil)
|
31
34
|
end
|
32
35
|
end
|
33
36
|
def to_s
|
34
37
|
self.sbj.nil? ? "" :
|
35
|
-
[
|
36
|
-
|
38
|
+
[sbj, sfrom.to_s, sto.to_s, bits.to_s,
|
39
|
+
istrue ? "1" : (isfalse ? "-1" : "0"), midpoint].join("\t") + "\n"
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-23-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
# Set of genome identifiers that can be downloaded in FastA format and
# reduced to a single, randomly chosen, representative genome per taxon.
#
# All remote access goes through +rocker+, which must respond to
# +ebiFetch(db, ids, format)+ and return the fetched document as a String.
class GenomeSet
  attr_reader :rocker, :ids, :taxa

  # @param rocker [#ebiFetch] object used to retrieve remote documents
  # @param ids [Array<String>, nil] genome IDs; nil yields an empty set
  def initialize(rocker, ids)
    @rocker = rocker
    @ids = ids.nil? ? [] : ids
    @taxa = {}     # taxon => genome ID selected for that taxon
    @all_taxa = {} # taxon => every genome ID linked to that taxon
  end

  # Writes the FastA sequences of all the genomes in the set into +file+,
  # requesting them in batches of up to 200 IDs. The block form of File.open
  # guarantees the handle is closed even when a fetch raises.
  #
  # @param file [String] path of the output file (overwritten)
  # @return [nil]
  def download(file)
    File.open(file, "w") do |ofh|
      ids.each_slice(200) do |batch|
        ofh.print rocker.ebiFetch(:embl, batch, :fasta)
      end
    end
    nil
  end

  # Registers genome +id+ as a member of +taxon+.
  def link_taxon(id, taxon)
    (@all_taxa[taxon.to_sym] ||= []) << id
  end

  # Keeps one randomly sampled genome per taxon at the given +rank+ and
  # replaces the IDs of the set with that selection.
  #
  # @param rank [String, Symbol] taxonomic rank, e.g. 'species'
  def choose_genomes!(rank)
    @taxa = {}
    get_taxonomy! rank
    @all_taxa.each_pair { |taxon, members| @taxa[taxon] = members.sample }
    @ids = @taxa.values
  end

  # Rebuilds the taxon => genomes index for every genome in the set.
  def get_taxonomy!(rank)
    @all_taxa = {}
    ids.each { |id| link_taxon(id, genome2taxon(id, rank)) }
  end

  # Loads a previously saved taxon => genome mapping, ignoring genomes that
  # are not part of this set. A nil mapping is treated as empty.
  def taxa=(hash)
    @taxa = {}
    (hash || {}).each_pair do |taxon, id|
      @taxa[taxon] = id if ids.include?(id)
    end
  end

  def size() ids.size end
  def empty?() ids.empty? end

  #================================[ Utilities ]

  # Resolves the taxon ID at +rank+ for a genome.
  #
  # @return [String] numeric taxon ID, or a unique "no-taxon-XXXXXXXXXXXX"
  #   placeholder (12 random capital letters) when it cannot be resolved, so
  #   that unresolved genomes are never grouped together
  def genome2taxon(genome_id, rank='species')
    taxon = nil
    taxid = genome2taxid(genome_id)
    unless taxid.nil?
      xml = rocker.ebiFetch('taxonomy', [taxid], 'enataxonomyxml')
      tags = xml.gsub(/\s*\n\s*/, '').scan(/<taxon [^>]+>/)
      # rank is escaped so it is always matched literally
      tag = tags.grep(/rank="#{Regexp.escape(rank.to_s)}"/).first
      taxon = tag.sub(/.* taxId="(\d+)".*/, "\\1") unless tag.nil?
    end
    if taxon.nil? or taxon !~ /\A\d+\z/
      return "no-taxon-#{(0...12).map { (65 + rand(26)).chr }.join}"
    end
    taxon
  end

  # Extracts the NCBI taxonomy ID from the annotation of a genome entry,
  # looking first for a 'db_xref="taxon:"' feature and then for an 'OX' line.
  #
  # @return [String, nil] numeric taxonomy ID, nil if none can be parsed
  def genome2taxid(genome_id)
    doc = rocker.ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/)
    ln = doc.find { |l| l =~ /^FT\s+\/db_xref="taxon:/ }
    ln = doc.find { |l| l =~ /^OX\s+NCBI_TaxID=/ } if ln.nil?
    return nil if ln.nil?
    taxid = ln.sub(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, "\\1")
    taxid =~ /\A\d+\z/ ? taxid : nil
  end
end
|
69
|
+
|
70
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jul-20-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'rocker/alignment'
|
9
|
+
|
10
|
+
# Set of protein identifiers, together with the genomes and translation
# (protein_id) identifiers cross-referenced from each protein's annotation.
#
# All remote access goes through +rocker+, which must respond to
# +ebiFetch(db, ids, format)+ and return the fetched document as a String.
class ProteinSet
  attr_reader :rocker, :ids, :aln

  # @param rocker [#ebiFetch] object used to retrieve remote documents
  # @param ids [Array<String>, nil] protein IDs
  # @param file [String, nil] path to a file with one protein ID per line;
  #   blank lines are ignored
  # @param aln_file [String, nil] path to a FastA alignment whose sequence IDs
  #   are added to the set. The alignment itself is only kept (see #aln) when
  #   it contains every ID passed through +ids+ and +file+.
  def initialize(rocker, ids=nil, file=nil, aln_file=nil)
    @genomes = {} # protein ID => Array of genome IDs
    @tranids = {} # protein ID => Array of translation IDs
    @aln = nil
    @rocker = rocker
    @ids = []
    @ids += ids unless ids.nil?
    unless file.nil?
      @ids += File.readlines(file).map { |l| l.chomp }.reject { |l| l.empty? }
    end
    unless aln_file.nil?
      aln = Alignment.new
      aln.read_fasta aln_file
      aln_ids = aln.get_ids
      @aln = aln if (@ids - aln_ids).empty?
      @ids += aln_ids
    end
    @ids.uniq!
  end

  # Writes the FastA sequences of all the proteins in the set into +file+,
  # requesting them in batches of up to 200 IDs. The block form of File.open
  # guarantees the handle is closed even when a fetch raises.
  #
  # @param file [String] path of the output file (overwritten)
  # @return [nil]
  def download(file)
    File.open(file, "w") do |f|
      ids.each_slice(200) do |batch|
        f.print rocker.ebiFetch(:uniprotkb, batch, :fasta)
      end
    end
    nil
  end

  # Writes the sequences of the alignment +aln+ (as returned by its
  # +to_seq_s+ method) into +file+.
  #
  # @return [nil]
  def get_from_aln(file, aln)
    File.open(file, "w") { |f| f.print aln.to_seq_s }
    nil
  end

  # Fetches the annotation of every protein and records the genome and
  # translation IDs found in its 'DR   EMBL;' cross-reference lines.
  def get_genomes!
    ids.each do |id|
      doc = rocker.ebiFetch(:uniprotkb, [id], :annot).split("\n")
      doc.grep(/^DR\s+EMBL;/).each do |ln|
        # fields are separated by '; ': r[1] is linked as the genome ID and
        # r[2] as the translation ID
        r = ln.split('; ')
        link_genome(id, r[1])
        link_tranid(id, r[2])
      end
    end
  end

  # Associates +genome_id+ with protein +prot_id+ (duplicates are ignored).
  def link_genome(prot_id, genome_id)
    linked = (@genomes[prot_id] ||= [])
    linked << genome_id unless linked.include?(genome_id)
  end

  # Associates translation +transl_id+ with protein +prot_id+ (duplicates are
  # ignored).
  def link_tranid(prot_id, transl_id)
    linked = (@tranids[prot_id] ||= [])
    linked << transl_id unless linked.include?(transl_id)
  end

  # @return [Array<String>] unique genome IDs linked to any protein
  def genomes
    @genomes.values.flatten(1).uniq
  end

  # @return [Array<String>] unique translation IDs linked to any protein
  def tranids
    @tranids.values.flatten(1).uniq
  end

  # Lists the proteins of this set that are located in +coords+.
  #
  # @param coords [Hash] genome => Array of location Hashes, each read
  #   through its :prot_id and :tran_id keys
  # @return [Array<String>] unique protein IDs that could be resolved
  def in_coords(coords)
    coords.keys.map do |genome|
      coords[genome].map { |loc| resolve_location(genome, loc) }
    end.reduce([], :+).compact.uniq
  end

  def size() ids.size end
  def empty?() ids.empty? end
  def include?(id) ids.include?(id) end

  private

  # Returns the protein ID for one location: its own :prot_id when that
  # protein belongs to the set, otherwise the protein linked to its :tran_id.
  # Warns and returns nil when neither identifier can be resolved.
  def resolve_location(genome, loc)
    unless loc[:prot_id].nil?
      return include?(loc[:prot_id]) ? loc[:prot_id] : nil
    end
    owner = loc[:tran_id].nil? ? nil : protein_for_tranid(loc[:tran_id])
    return owner unless owner.nil?
    warn "Warning: Impossible to resolve protein located in '#{genome}' at: #{loc}."
    nil
  end

  # Finds the protein linked to translation ID +tran_id+, or nil. @tranids
  # maps each protein to an Array of translation IDs, so each list has to be
  # searched; Hash#rassoc would compare the whole Array against the ID and
  # never match.
  def protein_for_tranid(tran_id)
    pair = @tranids.find { |_prot_id, linked| linked.include?(tran_id) }
    pair.nil? ? nil : pair.first
  end
end
|
89
|
+
|
90
|
+
|
data/lib/rocker/rocdata.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/rinterface'
|
@@ -11,11 +11,13 @@ require 'rocker/alignment'
|
|
11
11
|
require 'tmpdir'
|
12
12
|
|
13
13
|
class ROCData
|
14
|
-
attr_reader :aln, :windows, :r
|
15
|
-
# Use ROCData.new(table,aln,window) to re-compute from table, use
|
14
|
+
attr_reader :aln, :windows, :r, :refined
|
15
|
+
# Use ROCData.new(table,aln,window) to re-compute from table, use
|
16
|
+
# ROCData.new(data) to load
|
16
17
|
def initialize(val, aln=nil, window=nil)
|
17
18
|
@r = RInterface.new
|
18
19
|
@nucl = false
|
20
|
+
@refined = false
|
19
21
|
if not aln.nil?
|
20
22
|
@aln = aln
|
21
23
|
self.rrun "library('pROC');"
|
@@ -33,7 +35,9 @@ class ROCData
|
|
33
35
|
@aln.read_rocker(val)
|
34
36
|
end
|
35
37
|
end
|
36
|
-
def win_at_col(col)
|
38
|
+
def win_at_col(col)
|
39
|
+
self.windows.select{|w| (w.from<=col) and (w.to>=col)}.first
|
40
|
+
end
|
37
41
|
def in_nucl?() @nucl end
|
38
42
|
def nucl=(nucl) @nucl=nucl end
|
39
43
|
def refine! table
|
@@ -41,14 +45,17 @@ class ROCData
|
|
41
45
|
return false unless self.load_table! table
|
42
46
|
break if self._refine_iter(table)==0
|
43
47
|
end
|
48
|
+
@refined = true
|
44
49
|
return true
|
45
50
|
end
|
51
|
+
def is_refined? ; @refined ; end
|
46
52
|
def _refine_iter table
|
47
53
|
to_refine = []
|
48
54
|
self.windows.each do |w|
|
49
55
|
next if w.almost_empty or w.length <= 5
|
50
56
|
self.rrun "acc <- w$accuracy[w$V1==#{w.from}];"
|
51
|
-
to_refine << w if
|
57
|
+
to_refine << w if
|
58
|
+
self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
|
52
59
|
end
|
53
60
|
n = to_refine.size
|
54
61
|
return 0 unless n > 0
|
@@ -86,9 +93,17 @@ class ROCData
|
|
86
93
|
win <- which( (m>=w$V1) & (m<=w$V2))[1];
|
87
94
|
if(!is.na(win)){
|
88
95
|
if(x$V4[i] >= w$V5[win]){
|
89
|
-
if(x$V5[i]==1){
|
96
|
+
if(x$V5[i]==1){
|
97
|
+
w$tp[win] <- w$tp[win]+1
|
98
|
+
} else {
|
99
|
+
w$fp[win] <- w$fp[win]+1
|
100
|
+
}
|
90
101
|
}else{
|
91
|
-
if(x$V5[i]==1){
|
102
|
+
if(x$V5[i]==1){
|
103
|
+
w$fn[win] <- w$fn[win]+1
|
104
|
+
} else {
|
105
|
+
w$tn[win] <- w$tn[win]+1
|
106
|
+
};
|
92
107
|
}
|
93
108
|
}
|
94
109
|
}
|
@@ -106,7 +121,9 @@ class ROCData
|
|
106
121
|
end
|
107
122
|
def init_windows!(size)
|
108
123
|
@windows = []
|
109
|
-
1.step(self.aln.cols,size).each
|
124
|
+
1.step(self.aln.cols,size).each do |a|
|
125
|
+
@windows << ROCWindow.new(self, a, a+size-1)
|
126
|
+
end
|
110
127
|
end
|
111
128
|
def rrun(cmd, type=nil) self.r.run cmd, type end
|
112
129
|
def save(file)
|
@@ -115,7 +132,7 @@ class ROCData
|
|
115
132
|
f.close
|
116
133
|
end
|
117
134
|
def to_s
|
118
|
-
o =
|
135
|
+
o = "#v ROCker " + ROCker.VERSION + "\n"
|
119
136
|
self.windows.each{|w| o += w.to_s}
|
120
137
|
o += self.aln.to_s
|
121
138
|
return o
|
data/lib/rocker/rocwindow.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-07-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
class ROCWindow
|
@@ -22,16 +22,18 @@ class ROCWindow
|
|
22
22
|
@from = [a,b].min
|
23
23
|
@to = [a,b].max
|
24
24
|
@thr = nil
|
25
|
-
|
25
|
+
compute!
|
26
26
|
end
|
27
27
|
end
|
28
28
|
def compute!
|
29
|
-
|
30
|
-
@hits =
|
31
|
-
@tps =
|
32
|
-
unless
|
33
|
-
|
34
|
-
thr =
|
29
|
+
load_hits
|
30
|
+
@hits = rrun("nrow(y);", :int)
|
31
|
+
@tps = rrun("sum(y$V5==1);", :int)
|
32
|
+
unless almost_empty
|
33
|
+
rrun "rocobj <- roc(as.numeric(y$V5==1), y$V4);"
|
34
|
+
thr = rrun("coords(rocobj, 'best', ret='threshold', " +
|
35
|
+
"best.method='youden', " +
|
36
|
+
"best.weights=c(0.5, sum(y$V5==1)/nrow(y)))[1];", :float)
|
35
37
|
@thr = thr.to_f
|
36
38
|
@thr = nil if @thr==0.0 or @thr.infinite?
|
37
39
|
end
|
@@ -48,16 +50,16 @@ class ROCWindow
|
|
48
50
|
return nil if a.nil? and b.nil?
|
49
51
|
return a.thr if b.nil?
|
50
52
|
return b.thr if a.nil?
|
51
|
-
return (b.thr*(
|
53
|
+
return (b.thr*(from-a.from) - a.thr*(from-b.from))/(b.from-a.from)
|
52
54
|
end
|
53
|
-
def load_hits() self.rrun "y <- x[x$V6>=#{
|
54
|
-
def previous() (
|
55
|
-
def next() (
|
56
|
-
def thr_notnil() (@thr.nil? or @thr.infinite?) ?
|
57
|
-
def fps()
|
58
|
-
def almost_empty()
|
59
|
-
def length()
|
60
|
-
def rrun(cmd, type=nil)
|
61
|
-
def to_s() [
|
55
|
+
def load_hits() self.rrun "y <- x[x$V6>=#{from} & x$V6<=#{to},];" end
|
56
|
+
def previous() (from == 1) ? nil : data.win_at_col(from - 1) end
|
57
|
+
def next() (to == data.aln.cols) ? nil : data.win_at_col(to + 1) end
|
58
|
+
def thr_notnil() (@thr.nil? or @thr.infinite?) ? around_thr : @thr end
|
59
|
+
def fps() hits - tps end
|
60
|
+
def almost_empty() fps < 3 or tps < 3 end
|
61
|
+
def length() to - from + 1 end
|
62
|
+
def rrun(cmd, type=nil) data.rrun(cmd, type) end
|
63
|
+
def to_s() [from, to, hits, tps, thr_notnil].join("\t") + "\n" end
|
62
64
|
end
|
63
65
|
|
data/lib/rocker/step/build.rb
CHANGED
@@ -2,21 +2,27 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Sep-11-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'json'
|
9
|
+
require 'rocker/protein-set'
|
10
|
+
require 'rocker/genome-set'
|
9
11
|
|
10
12
|
class ROCker
|
11
13
|
#================================[ Class ]
|
12
|
-
@@EBIREST =
|
13
|
-
@@DEFAULTS.merge!({:
|
14
|
+
@@EBIREST = "http://www.ebi.ac.uk/Tools"
|
15
|
+
@@DEFAULTS.merge!({positive:[], negative:[], seqdepth:0.03, readlen:100,
|
16
|
+
minovl:50,
|
14
17
|
# Ext. Software
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
18
|
-
|
19
|
-
|
18
|
+
aligner: :clustalo, simulator: :grinder,
|
19
|
+
simulatorbin:{grinder:"grinder"},
|
20
|
+
simulatorcmd:{grinder:"%1$s -reference_file \"%2$s\" -cf \"%3$f\" " +
|
21
|
+
"-dc '-~*NnKkMmRrYySsWwBbVvHhDdXx' -md uniform 0.1 -mr 95 5 " +
|
22
|
+
"-rd %4$d uniform 5 -base_name \"%5$s\""},
|
23
|
+
alignerbin:{muscle:"muscle", clustalo:"clustalo"},
|
24
|
+
alignercmd:{muscle:"%1$s -in \"%2$s\" -out \"%3$s\" -quiet",
|
25
|
+
clustalo:"%1$s -i \"%2$s\" -o \"%3$s\" --threads=%4$d --force"}
|
20
26
|
})
|
21
27
|
@@HAS_BUILD_GEMS = nil
|
22
28
|
def self.ebirest() @@EBIREST ; end
|
@@ -33,90 +39,65 @@ class ROCker
|
|
33
39
|
end
|
34
40
|
|
35
41
|
#================================[ Utilities ]
|
36
|
-
def genes2genomes(gene_ids)
|
37
|
-
genomes = []
|
38
|
-
ids = Array.new(gene_ids)
|
39
|
-
while ids.size>0
|
40
|
-
doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
|
41
|
-
genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
|
42
|
-
r=ln.split('; ')
|
43
|
-
{:genome_id=>r[1], :transl_id=>r[2]}
|
44
|
-
end
|
45
|
-
end
|
46
|
-
genomes.uniq
|
47
|
-
end
|
48
|
-
def genome2taxid(genome_id)
|
49
|
-
ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
|
50
|
-
return ln if ln.nil?
|
51
|
-
ln.sub(/.*"taxon:(\d+)".*/, "\\1")
|
52
|
-
end
|
53
|
-
def genome2taxon(genome_id, rank='species')
|
54
|
-
xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
|
55
|
-
xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
|
56
|
-
end
|
57
42
|
def restcall(url, outfile=nil)
|
58
43
|
$stderr.puts " # Calling: #{url}" if @o[:debug]
|
59
|
-
response = RestClient::Request.execute(:method=>:get,
|
60
|
-
|
44
|
+
response = RestClient::Request.execute(:method=>:get, :url=>url,
|
45
|
+
:timeout=>600)
|
46
|
+
raise "Unable to reach EBI REST client, error code " +
|
47
|
+
response.code.to_s + "." unless response.code == 200
|
61
48
|
unless outfile.nil?
|
62
|
-
ohf = File.open(outfile,
|
49
|
+
ohf = File.open(outfile, "w")
|
63
50
|
ohf.print response.to_s
|
64
51
|
ohf.close
|
65
52
|
end
|
66
53
|
response.to_s
|
67
54
|
end
|
68
55
|
def ebiFetch(db, ids, format, outfile=nil)
|
69
|
-
url = "#{ROCker.ebirest}/dbfetch/dbfetch
|
70
|
-
|
71
|
-
|
72
|
-
ohf = File.open(outfile, 'w')
|
73
|
-
ohf.print res
|
74
|
-
ohf.close
|
75
|
-
end
|
76
|
-
res
|
56
|
+
url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
|
57
|
+
"#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
|
58
|
+
self.restcall url, outfile
|
77
59
|
end
|
78
|
-
def get_coords_from_gff3(genome_ids,
|
79
|
-
|
80
|
-
genomes_org = {}
|
60
|
+
def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
|
61
|
+
coords = {}
|
81
62
|
i = 0
|
82
63
|
genome_ids.each do |genome_id|
|
83
|
-
print " * scanning #{(i+=1).ordinalize} genome out of
|
84
|
-
|
85
|
-
|
86
|
-
genomes_org[ genome_taxon.to_sym ] ||= []
|
87
|
-
genomes_org[ genome_taxon.to_sym ] << genome_id
|
88
|
-
end
|
64
|
+
print " * scanning #{(i+=1).ordinalize} genome out of " +
|
65
|
+
"#{genome_ids.size} in first thread. \r" if
|
66
|
+
thread_id==0 and not @o[:q]
|
89
67
|
genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
|
90
68
|
if @o[:reuse] and File.size? genome_file
|
91
|
-
ifh = File.open(genome_file,
|
69
|
+
ifh = File.open(genome_file, "r")
|
92
70
|
doc = ifh.readlines.grep(/^[^#]/)
|
93
71
|
ifh.close
|
94
72
|
else
|
95
73
|
genome_file=nil unless @o[:noclean]
|
96
|
-
doc = ebiFetch(:embl, [genome_id], :gff3,
|
74
|
+
doc = ebiFetch(:embl, [genome_id], :gff3,
|
75
|
+
genome_file).split("\n").grep(/^[^#]/)
|
97
76
|
end
|
98
77
|
doc.each do |ln|
|
99
78
|
next if ln =~ /^#/
|
100
79
|
r = ln.chomp.split /\t/
|
101
80
|
next if r.size < 9
|
102
|
-
prots = r[8].split(/;/).grep(
|
103
|
-
|
104
|
-
|
105
|
-
|
81
|
+
prots = r[8].split(/;/).grep(
|
82
|
+
/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
|
83
|
+
p = prots.select{ |id| pset.ids.include? id }.first
|
84
|
+
trans = r[8].split(/;/).grep(
|
85
|
+
/^protein_id=/){ |pid| pid.split(/=/)[1] }
|
86
|
+
t = trans.select{ |id| pset.tranids.include? id }.first
|
106
87
|
next if p.nil? and t.nil?
|
107
|
-
|
108
|
-
|
109
|
-
:
|
110
|
-
:
|
111
|
-
:
|
112
|
-
:
|
113
|
-
:
|
88
|
+
coords[ r[0].to_sym ] ||= []
|
89
|
+
coords[ r[0].to_sym ] << {
|
90
|
+
prot_id: p,
|
91
|
+
tran_id: t,
|
92
|
+
from: r[3].to_i,
|
93
|
+
to: r[4].to_i,
|
94
|
+
strand: r[6]
|
114
95
|
}
|
115
96
|
end
|
116
97
|
end
|
117
98
|
print "\n" if thread_id==0 and not @o[:q]
|
118
|
-
ofh = File.open
|
119
|
-
ofh.print({:
|
99
|
+
ofh = File.open(json_file, "w")
|
100
|
+
ofh.print({coords:coords}.to_json)
|
120
101
|
ofh.close
|
121
102
|
end
|
122
103
|
|
@@ -124,211 +105,251 @@ class ROCker
|
|
124
105
|
def build!
|
125
106
|
# Check requirements
|
126
107
|
puts "Testing environment." unless @o[:q]
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
@o[
|
131
|
-
@o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
|
132
|
-
@o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
|
108
|
+
{ searchcmd: :search, makedbcmd: :search,
|
109
|
+
alignercmd: :aligner, alignerbin: :aligner,
|
110
|
+
simulatorcmd: :simulator, simulatorbin: :simulator
|
111
|
+
}.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
|
133
112
|
@o[:nosearch]=true if @o[:nosimulate]
|
134
|
-
raise "Unsatisfied requirements, please see the help message (-h)." unless
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
aln.read_fasta @o[:aln]
|
141
|
-
@o[:positive] += aln.get_ids
|
142
|
-
end
|
143
|
-
raise "-p or -P are mandatory." if @o[:positive].size==0
|
113
|
+
raise "Unsatisfied requirements, please see the help message (-h)." unless
|
114
|
+
ROCker.has_build_gems?
|
115
|
+
protein_set = {}
|
116
|
+
protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
|
117
|
+
protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
|
118
|
+
raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
|
144
119
|
raise "-o/--baseout is mandatory." if @o[:baseout].nil?
|
145
|
-
if
|
146
|
-
warn "\nWARNING: Positive set contains only one sequence, turning
|
120
|
+
if protein_set[:+].size==1 and not @o[:noaln]
|
121
|
+
warn "\nWARNING: Positive set contains only one sequence, turning " +
|
122
|
+
"off alignment.\n\n"
|
147
123
|
@o[:noaln] = true
|
148
124
|
end
|
149
125
|
unless @o[:nosimulate]
|
150
|
-
self.bash
|
126
|
+
self.bash("#{@o[:simulatorbin]} --version",
|
127
|
+
"--simulator-bin must be executable. Is Grinder installed?") if
|
128
|
+
@o[:simulator]==:grinder
|
151
129
|
end
|
152
130
|
unless @o[:noaln]
|
153
|
-
self.bash
|
154
|
-
|
131
|
+
self.bash("#{@o[:alignerbin]} -version",
|
132
|
+
"--aligner-bin must be executable. Is Muscle installed?") if
|
133
|
+
@o[:aligner]==:muscle
|
134
|
+
self.bash("#{@o[:alignerbin]} --version",
|
135
|
+
"--aligner-bin must be executable. Is ClustalOmega installed?") if
|
136
|
+
@o[:aligner]==:clustalo
|
155
137
|
end
|
156
138
|
unless @o[:nosearch]
|
157
|
-
self.bash
|
158
|
-
|
139
|
+
self.bash("#{@o[:searchbins]}makeblastdb -version",
|
140
|
+
"--search-bins must contain executables. Is BLAST+ installed?") if
|
141
|
+
@o[:search]==:blast
|
142
|
+
self.bash("#{@o[:searchbins]}diamond --help",
|
143
|
+
"--search-bins must contain executables. Is DIAMOND installed?") if
|
144
|
+
@o[:search]==:diamond
|
159
145
|
end
|
160
146
|
|
161
147
|
# Download genes
|
162
148
|
puts "Downloading gene data." unless @o[:q]
|
163
149
|
ref_file = @o[:baseout] + ".ref.fasta"
|
164
|
-
if
|
150
|
+
if not protein_set[:+].aln.nil?
|
165
151
|
puts " * reusing aligned sequences as positive set." unless @o[:q]
|
166
|
-
|
167
|
-
f.print aln.to_seq_s
|
168
|
-
f.close
|
152
|
+
protein_set[:+].get_from_aln(ref_file, aln)
|
169
153
|
@o[:noaln] = true
|
170
154
|
elsif @o[:reuse] and File.size? ref_file
|
171
155
|
puts " * reusing positive set: #{ref_file}." unless @o[:q]
|
172
156
|
else
|
173
|
-
puts " * downloading #{
|
174
|
-
|
175
|
-
ids
|
176
|
-
|
177
|
-
while ids.size>0
|
178
|
-
f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
|
179
|
-
end
|
180
|
-
f.close
|
157
|
+
puts " * downloading #{protein_set[:+].size} sequence(s) in " +
|
158
|
+
"positive set." unless @o[:q]
|
159
|
+
$stderr.puts " # #{protein_set[:+].ids}" if @o[:debug]
|
160
|
+
protein_set[:+].download(ref_file)
|
181
161
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
puts "
|
187
|
-
|
188
|
-
r = genes2genomes(@o[set])
|
189
|
-
genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
|
190
|
-
transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
|
162
|
+
[:+, :-].each do |set|
|
163
|
+
unless protein_set[set].empty?
|
164
|
+
puts " * linking genomes from #{protein_set[set].size} " +
|
165
|
+
"[#{set.to_s}] sequence(s)." unless @o[:q]
|
166
|
+
$stderr.puts " # #{protein_set[set].ids}" if @o[:debug]
|
167
|
+
protein_set[set].get_genomes!
|
191
168
|
end
|
192
169
|
end
|
193
|
-
raise "No genomes associated with the positive set." if
|
194
|
-
|
195
|
-
|
196
|
-
|
170
|
+
raise "No genomes associated with the positive set." if
|
171
|
+
protein_set[:+].genomes.empty?
|
172
|
+
genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
|
173
|
+
:- => GenomeSet.new(self, protein_set[:-].genomes)}
|
197
174
|
|
198
175
|
# Locate genes
|
199
176
|
puts "Analyzing genome data." unless @o[:q]
|
200
177
|
coords_file = @o[:baseout] + ".src.coords"
|
201
178
|
if @o[:reuse] and File.size? coords_file
|
202
179
|
puts " * reusing coordinates: #{coords_file}." unless @o[:q]
|
203
|
-
c = JSON.parse File.read(coords_file), {:
|
180
|
+
c = JSON.parse File.read(coords_file), {symbolize_names:true}
|
204
181
|
positive_coords = c[:positive_coords]
|
205
|
-
|
182
|
+
negative_coords = c[:negative_coords]
|
183
|
+
genome_set[:+].taxa = c[:taxa_pos]
|
184
|
+
genome_set[:-].taxa = c[:taxa_neg]
|
206
185
|
else
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
thr_obj
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
o[:positive_coords].each_pair do |k,v|
|
233
|
-
positive_coords[ k ] ||= []
|
234
|
-
positive_coords[ k ] += v
|
186
|
+
all_coords = {}
|
187
|
+
[:+, :-].each do |set_type|
|
188
|
+
all_coords[set_type] = {}
|
189
|
+
next if genome_set[set_type].empty?
|
190
|
+
thrs = [@o[:thr], genome_set[set_type].size].min
|
191
|
+
puts " * downloading and parsing #{genome_set[set_type].size} " +
|
192
|
+
"GFF3 document(s) in #{thrs} threads." unless @o[:q]
|
193
|
+
$stderr.puts " # Looking for translations: " +
|
194
|
+
"#{protein_set[set_type].tranids}" if @o[:debug]
|
195
|
+
$stderr.puts " # Looking into: #{genome_set[set_type].ids}" if
|
196
|
+
@o[:debug]
|
197
|
+
# Launch threads
|
198
|
+
thr_obj = []
|
199
|
+
(0 .. (thrs-1)).each do |thr_i|
|
200
|
+
ids_to_parse = []
|
201
|
+
(0 .. (genome_set[set_type].size-1)).each do |i|
|
202
|
+
ids_to_parse << protein_set[set_type].genomes[i] if
|
203
|
+
(i % thrs) == thr_i
|
204
|
+
end
|
205
|
+
json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
|
206
|
+
thr_obj << json_file
|
207
|
+
fork do
|
208
|
+
get_coords_from_gff3(ids_to_parse, protein_set[set_type],
|
209
|
+
thr_i, json_file)
|
210
|
+
end
|
235
211
|
end
|
236
|
-
|
237
|
-
|
238
|
-
|
212
|
+
# Combine results
|
213
|
+
Process.waitall
|
214
|
+
thr_obj.each do |t|
|
215
|
+
raise "Thread failed without error trace: #{t}" unless
|
216
|
+
File.exist? t
|
217
|
+
o = JSON.parse(File.read(t), {symbolize_names:true})
|
218
|
+
o[:coords].each_pair do |k,v|
|
219
|
+
all_coords[set_type][ k ] ||= []
|
220
|
+
all_coords[set_type][ k ] += v
|
221
|
+
end
|
222
|
+
File.unlink t
|
239
223
|
end
|
240
|
-
|
241
|
-
|
224
|
+
end # [:+, :-].each
|
225
|
+
positive_coords = all_coords[:+]
|
226
|
+
negative_coords = all_coords[:-]
|
242
227
|
# Select one genome per taxon
|
243
228
|
unless @o[:pertaxon].nil?
|
244
|
-
|
229
|
+
puts " Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
|
230
|
+
[:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
|
245
231
|
end
|
246
|
-
# Save coordinates
|
232
|
+
# Save coordinates and taxa
|
247
233
|
ofh = File.open(coords_file, "w")
|
248
|
-
ofh.print JSON.pretty_generate({
|
234
|
+
ofh.print JSON.pretty_generate({
|
235
|
+
positive_coords:positive_coords,
|
236
|
+
negative_coords:negative_coords,
|
237
|
+
taxa_pos:genome_set[:+].taxa,
|
238
|
+
taxa_neg:genome_set[:-].taxa})
|
249
239
|
ofh.close
|
250
|
-
end
|
240
|
+
end # if @o[:reuse] and File.size? coords_file ... else
|
251
241
|
unless @o[:pertaxon].nil?
|
252
|
-
|
253
|
-
|
242
|
+
puts " Using " +
|
243
|
+
[:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
|
244
|
+
" genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
|
254
245
|
end
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
246
|
+
found = protein_set[:+].in_coords(positive_coords)
|
247
|
+
raise "Cannot find the genomic location of any provided sequence." if
|
248
|
+
found.nil?
|
249
|
+
missing = protein_set[:+].ids - found
|
250
|
+
warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
|
251
|
+
"sequence(s) #{missing.join(",")}.\n\n" unless missing.empty?
|
261
252
|
|
262
253
|
# Download genomes
|
263
|
-
|
254
|
+
genome_set[:all] = GenomeSet.new(self,
|
255
|
+
genome_set[ :+ ].ids + genome_set[ :- ].ids)
|
256
|
+
genomes_file = @o[:baseout] + ".src.fasta"
|
264
257
|
if @o[:reuse] and File.size? genomes_file
|
265
258
|
puts " * reusing existing file: #{genomes_file}." unless @o[:q]
|
266
259
|
else
|
267
|
-
puts " * downloading
|
268
|
-
|
269
|
-
ids
|
270
|
-
|
271
|
-
while ids.size>0
|
272
|
-
ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
|
273
|
-
end
|
274
|
-
ofh.close
|
260
|
+
puts " * downloading " + genome_set[:all].size.to_s +
|
261
|
+
" genome(s) in FastA." unless @o[:q]
|
262
|
+
$stderr.puts " # #{genome_set[:all].ids}" if @o[:debug]
|
263
|
+
genome_set[:all].download genomes_file
|
275
264
|
end
|
276
265
|
|
277
266
|
# Generate metagenome
|
278
267
|
unless @o[:nosimulate]
|
279
268
|
puts "Generating in silico metagenome" unless @o[:q]
|
280
269
|
if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
|
281
|
-
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
|
270
|
+
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
|
271
|
+
@o[:q]
|
282
272
|
else
|
283
|
-
all_src = File.readlines("#{@o[:baseout]}.src.fasta"
|
273
|
+
all_src = File.readlines("#{@o[:baseout]}.src.fasta"
|
274
|
+
).select{ |l| l =~ /^>/ }.size
|
284
275
|
thrs = [@o[:thr], all_src].min
|
285
|
-
puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
|
286
|
-
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
287
276
|
thr_obj = []
|
288
|
-
seqs_per_thr = (all_src/thrs).ceil
|
277
|
+
seqs_per_thr = (all_src.to_f/thrs).ceil
|
278
|
+
thrs = (all_src.to_f/seqs_per_thr).ceil
|
279
|
+
puts " * simulating metagenomes and tagging positive reads in " +
|
280
|
+
thrs.to_s + " threads." unless @o[:q]
|
281
|
+
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
289
282
|
(0 .. (thrs-1)).each do |thr_i|
|
290
283
|
output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
|
291
284
|
thr_obj << output
|
292
285
|
fork do
|
293
286
|
seqs_a = thr_i*seqs_per_thr + 1
|
294
|
-
seqs_b = [seqs_a + seqs_per_thr, all_src].min
|
287
|
+
seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
|
295
288
|
# Create sub-fasta
|
296
|
-
ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
|
297
|
-
ifh = File.open("#{@o[:baseout]}.src.fasta",
|
289
|
+
ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
|
290
|
+
ifh = File.open("#{@o[:baseout]}.src.fasta","r")
|
298
291
|
seq_i = 0
|
299
292
|
while l = ifh.gets
|
300
293
|
seq_i+=1 if l =~ /^>/
|
301
|
-
|
294
|
+
break if seq_i > seqs_b
|
302
295
|
ofh.print l if seq_i >= seqs_a
|
303
296
|
end
|
304
297
|
ifh.close
|
305
298
|
ofh.close
|
306
299
|
|
307
|
-
# Run simulator (except if the temporal file is already
|
308
|
-
|
309
|
-
|
310
|
-
|
300
|
+
# Run simulator (except if the temporal file is already
|
301
|
+
# there and can be reused)
|
302
|
+
bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
|
303
|
+
"#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
|
304
|
+
@o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
|
305
|
+
"#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}") unless
|
306
|
+
@o[:reuse] and
|
307
|
+
File.size? @o[:baseout] +
|
308
|
+
".mg.tmp.#{thr_i.to_s}-reads.fa"
|
311
309
|
|
312
|
-
# Tag
|
313
|
-
puts " * tagging
|
314
|
-
|
315
|
-
|
310
|
+
# Tag positive and negative reads
|
311
|
+
puts " * tagging reads [thread #{thr_i}]." unless
|
312
|
+
@o[:q]
|
313
|
+
ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
|
314
|
+
"r")
|
315
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
|
316
316
|
while l = ifh.gets
|
317
317
|
if l =~ /^>/
|
318
|
-
rd =
|
319
|
-
|
318
|
+
rd = %r{
|
319
|
+
^>(?<id>\d+)\s
|
320
|
+
reference=[A-Za-z]+\|
|
321
|
+
(?<genome_id>[A-Za-z0-9_]+)\|.*\s
|
322
|
+
position=(?<comp>complement\()?(?<from>\d+)\.\.
|
323
|
+
(?<to>\d+)\)?\s
|
324
|
+
}x.match(l)
|
325
|
+
raise "Cannot parse simulated read's defline, are " +
|
326
|
+
"you using Grinder?: #{l}" if rd.nil?
|
320
327
|
positive = false
|
321
328
|
positive_coords[rd[:genome_id].to_sym] ||= []
|
322
329
|
positive_coords[rd[:genome_id].to_sym].each do |gn|
|
323
330
|
left = rd[:to].to_i - gn[:from]
|
324
331
|
right = gn[:to] - rd[:from].to_i
|
325
|
-
if (left*right >= 0) and
|
332
|
+
if (left*right >= 0) and
|
333
|
+
([left, right].min >= @o[:minovl])
|
326
334
|
positive = true
|
327
335
|
break
|
328
336
|
end
|
329
337
|
end
|
330
|
-
|
331
|
-
|
338
|
+
negative = false
|
339
|
+
negative_coords[rd[:genome_id].to_sym] ||= []
|
340
|
+
negative_coords[rd[:genome_id].to_sym].each do |gn|
|
341
|
+
left = rd[:to].to_i - gn[:from]
|
342
|
+
right = gn[:to] - rd[:from].to_i
|
343
|
+
if (left*right >= 0) and
|
344
|
+
([left, right].min >= @o[:minovl])
|
345
|
+
negative = true
|
346
|
+
break
|
347
|
+
end
|
348
|
+
end
|
349
|
+
l = ">#{thr_i.to_s}_#{rd[:id]}" +
|
350
|
+
"#{positive ? "@%" : (negative ? "@$" : "")} " +
|
351
|
+
"ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
|
352
|
+
"#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
|
332
353
|
end
|
333
354
|
ofh.print l
|
334
355
|
end
|
@@ -338,9 +359,10 @@ class ROCker
|
|
338
359
|
end # (1 .. thrs).each
|
339
360
|
Process.waitall
|
340
361
|
# Concatenate results
|
341
|
-
ofh = File.open(@o[:baseout] + ".mg.fasta",
|
362
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
|
342
363
|
thr_obj.each do |t|
|
343
|
-
raise "Thread failed without error trace: #{t}" unless
|
364
|
+
raise "Thread failed without error trace: #{t}" unless
|
365
|
+
File.exist? t
|
344
366
|
ifh = File.open(t, "r")
|
345
367
|
while l = ifh.gets
|
346
368
|
ofh.print l
|
@@ -356,23 +378,33 @@ class ROCker
|
|
356
378
|
unless @o[:noaln]
|
357
379
|
puts "Aligning reference set." unless @o[:q]
|
358
380
|
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
|
359
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
|
381
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
|
382
|
+
@o[:q]
|
360
383
|
else
|
361
|
-
bash
|
362
|
-
|
384
|
+
bash(sprintf(@o[:alignercmd],
|
385
|
+
@o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
|
386
|
+
"#{@o[:baseout]}.ref.aln", @o[:thr]))
|
387
|
+
puts " +--\n | IMPORTANT NOTE: Manually checking the alignment " +
|
388
|
+
"before\n | the 'compile' step is *strongly* encouraged.\n " +
|
389
|
+
"+--\n" unless @o[:q]
|
363
390
|
end
|
364
391
|
end
|
365
392
|
|
366
393
|
# Run similarity search
|
367
394
|
unless @o[:nosearch]
|
368
|
-
puts "Running
|
395
|
+
puts "Running similarity search." unless @o[:q]
|
369
396
|
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
|
370
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
|
397
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
|
398
|
+
@o[:q]
|
371
399
|
else
|
372
400
|
puts " * preparing database." unless @o[:q]
|
373
|
-
bash
|
401
|
+
bash(sprintf(@o[:makedbcmd],
|
402
|
+
@o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
|
403
|
+
"#{@o[:baseout]}.ref"))
|
374
404
|
puts " * running similarity search." unless @o[:q]
|
375
|
-
bash
|
405
|
+
bash(sprintf(@o[:searchcmd],
|
406
|
+
@o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
|
407
|
+
"#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
|
376
408
|
end
|
377
409
|
end
|
378
410
|
|
@@ -382,7 +414,8 @@ class ROCker
|
|
382
414
|
sff = %w{.src.xml .src.fasta}
|
383
415
|
sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
|
384
416
|
sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
|
385
|
-
sff.each { |sf| File.unlink @o[:baseout] + sf if
|
417
|
+
sff.each { |sf| File.unlink @o[:baseout] + sf if
|
418
|
+
File.exist? @o[:baseout] + sf }
|
386
419
|
end
|
387
420
|
end # build!
|
388
421
|
end # ROCker
|