bio-rocker 1.0.0 → 1.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jun-05-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,20 +10,30 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
+ @@VERSION = "1.1.9"
14
+ @@CITATION = "Orellana, Rodriguez-R, & Konstantinidis. Under review. " +
15
+ "Detecting and quantifying functional genes in short-read metagenomic " +
16
+ "datasets: method development and application to the nitrogen cycle " +
17
+ "genes."
13
18
  @@DEFAULTS = {
14
19
  # General
15
- :q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
20
+ q: false, r: "R", nucl: false, debug: false, thr: 2, search: :blast,
16
21
  # External software
17
- :searchbins=>'',
18
- :searchcmd=>{
19
- :blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
20
- :diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
21
- :makedbcmd=>{
22
- :blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
23
- :diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
22
+ searchbins: "",
23
+ searchcmd: {
24
+ blast: '%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" ' +
25
+ '-num_threads %6$d -outfmt 6 -max_target_seqs 1',
26
+ diamond: '%1$sdiamond %2$s -q "%3$s" -d "%4$s" -a "%5$s.daa" -p %6$d' +
27
+ ' -k 1 --min-score 20 --sensitive && %1$sdiamond view -a "%5$s"' +
28
+ ' -o "%5$s"'},
29
+ makedbcmd: {
30
+ blast: '%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
31
+ diamond: '%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
24
32
  }
25
33
  def self.defaults() @@DEFAULTS ; end
26
34
  def self.default(k) @@DEFAULTS[k] ; end
35
+ def self.VERSION; @@VERSION ; end
36
+ def self.CITATION; @@CITATION ; end
27
37
 
28
38
  #================================[ Instance ]
29
39
  attr_reader :o
@@ -46,7 +56,8 @@ class ROCker
46
56
  end
47
57
  def bash(cmd, err_msg=nil)
48
58
  o = `#{cmd} 2>&1 && echo '{'`
49
- raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
59
+ raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
60
+ o[-2]=="{"
50
61
  true
51
62
  end
52
63
  end
@@ -63,10 +74,10 @@ require 'rocker/step/plot'
63
74
# Adds English ordinal-suffix formatting to all numeric values.
class Numeric
  # Render the number followed by its English ordinal suffix
  # ("1st", "22nd", "113th", ...). Numbers whose last two digits are in
  # the teens ("11".."13" etc.) always take "th".
  # @return [String]
  def ordinalize
    digits = self.to_s
    suffix =
      if digits[-2] == "1"
        "th"
      else
        case digits[-1]
        when "1" then "st"
        when "2" then "nd"
        when "3" then "rd"
        else "th"
        end
      end
    digits + suffix
  end
end
@@ -2,12 +2,13 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-06-2015
6
6
  #
7
7
 
8
8
  class BlastHit
9
- attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint
10
- # Initialize from BLAST using new(ln,aln), initialize from TABLE using new(ln)
9
+ attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :isfalse, :midpoint
10
+ # Initialize from BLAST using new(ln,aln),
11
+ # initialize from TABLE using new(ln)
11
12
  def initialize(ln, aln=nil)
12
13
  l = ln.chomp.split(/\t/)
13
14
  if aln.nil?
@@ -16,6 +17,7 @@ class BlastHit
16
17
  @sto = l[2].to_i
17
18
  @bits = l[3].to_f
18
19
  @istrue = l[4]=='1'
20
+ @istrue = l[4]=='-1'
19
21
  @midpoint = l[5].to_i
20
22
  else
21
23
  s = aln.seq(l[1])
@@ -27,13 +29,14 @@ class BlastHit
27
29
  @sto = [a,b].max
28
30
  @bits = l[11].to_f
29
31
  @istrue = ! /@%/.match(l[0]).nil?
32
+ @isfalse = ! /@\$/.match(l[0]).nil?
30
33
  @midpoint = s.pos2col(((l[8].to_f+l[9].to_f)/2).ceil)
31
34
  end
32
35
  end
33
36
  def to_s
34
37
  self.sbj.nil? ? "" :
35
- [self.sbj, self.sfrom.to_s, self.sto.to_s, self.bits.to_s,
36
- self.istrue ? '1' : '0', self.midpoint].join("\t") + "\n"
38
+ [sbj, sfrom.to_s, sto.to_s, bits.to_s,
39
+ istrue ? "1" : (isfalse ? "-1" : "0"), midpoint].join("\t") + "\n"
37
40
  end
38
41
  end
39
42
 
@@ -0,0 +1,70 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-23-2015
6
+ #
7
+
8
# A collection of genome accessions (EBI/EMBL ids) tied to a ROCker run,
# with optional per-taxon bookkeeping for genome subsampling.
class GenomeSet
  attr_reader :rocker, :ids, :taxa
  # @param rocker [ROCker] owner, used for EBI REST calls
  # @param ids [Array<String>,nil] genome accessions (nil becomes [])
  def initialize(rocker, ids)
    @rocker = rocker
    @ids = ids.nil? ? [] : ids
    @taxa = {}
    @all_taxa = {}
  end
  # Fetch every genome in FastA (EBI dbfetch, 200 ids per request) into
  # +file+.
  def download(file)
    pending = Array.new(ids)
    File.open(file, "w") do |fh|
      fh.print rocker.ebiFetch(:embl, pending.shift(200), :fasta) until
        pending.empty?
    end
  end
  # Register genome +id+ under +taxon+ in the full taxon index.
  def link_taxon(id, taxon)
    (@all_taxa[taxon.to_sym] ||= []) << id
  end
  # Keep one randomly chosen genome per taxon at +rank+, replacing both
  # @taxa and @ids.
  def choose_genomes!(rank)
    @taxa = {}
    get_taxonomy! rank
    @all_taxa.each_pair { |taxon, members| @taxa[taxon] = members.sample }
    @ids = @taxa.values
  end
  # (Re)build the taxon index by resolving the taxonomy of every genome.
  def get_taxonomy!(rank)
    @all_taxa = {}
    ids.each { |id| link_taxon(id, genome2taxon(id, rank)) }
  end
  # Replace the taxon->genome map, keeping only entries whose genome
  # belongs to this set.
  def taxa=(hash)
    @taxa = {}
    hash.each_pair { |taxon, id| @taxa[taxon] = id if ids.include? id }
  end
  def size() ids.size end
  def empty?() ids.empty? end

  #================================[ Utilities ]
  # Resolve the taxon id of +genome_id+ at +rank+ via the EBI taxonomy
  # service; falls back to a random "no-taxon-XXXXXXXXXXXX" label when the
  # lookup fails or yields no numeric id.
  def genome2taxon(genome_id, rank='species')
    v = genome2taxid(genome_id)
    unless v.nil?
      xml = rocker.ebiFetch('taxonomy', [v], 'enataxonomyxml').
        gsub(/\s*\n\s*/, '')
      v = xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first
      v.sub!(/.* taxId="(\d+)".*/, "\\1") unless v.nil?
    end
    return v if !v.nil? and v =~ /^\d+$/
    "no-taxon-#{(0...12).map { (65 + rand(26)).chr }.join}"
  end
  # Extract the NCBI taxon id from the EMBL annotation of +genome_id+
  # (FT db_xref or OX NCBI_TaxID lines); nil when absent or non-numeric.
  def genome2taxid(genome_id)
    doc = rocker.ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/)
    ln = doc.grep(/^FT\s+\/db_xref="taxon:/).first
    ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
    return nil if ln.nil?
    ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, "\\1")
    ln =~ /^\d+$/ ? ln : nil
  end
end
69
+
70
+
@@ -0,0 +1,90 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jul-20-2015
6
+ #
7
+
8
+ require 'rocker/alignment'
9
+
10
# The set of reference protein ids (UniProtKB) driving a ROCker build,
# plus their linked genome accessions and translation (protein_id) ids.
class ProteinSet
  attr_reader :rocker, :ids, :aln
  # @param rocker [ROCker] owner, used for EBI REST calls
  # @param ids [Array<String>,nil] explicit UniProtKB accessions
  # @param file [String,nil] path with one accession per line
  # @param aln_file [String,nil] FastA alignment whose ids seed the set;
  #   kept as @aln only when it covers every explicit id
  def initialize(rocker, ids=nil, file=nil, aln_file=nil)
    @genomes = {}
    @tranids = {}
    @aln = nil
    @rocker = rocker
    @ids = []
    @ids += ids unless ids.nil?
    @ids += File.readlines(file).map{ |l| l.chomp } unless file.nil?
    unless aln_file.nil?
      aln = Alignment.new
      aln.read_fasta aln_file
      aln_ids = aln.get_ids
      @aln = aln if (@ids - aln_ids).empty?
      @ids += aln_ids
    end
    @ids.uniq!
  end
  # Download every sequence in FastA (200 ids per request) into +file+.
  def download(file)
    tmp_ids = Array.new(self.ids)
    f = File.open(file, "w")
    while tmp_ids.size>0
      f.print rocker.ebiFetch(:uniprotkb, tmp_ids.shift(200), :fasta)
    end
    f.close
  end
  # Write the unaligned sequences of +aln+ into +file+.
  def get_from_aln(file, aln)
    f = File.open(file, "w")
    f.print aln.to_seq_s
    f.close
  end
  # Query the EMBL cross-references of every protein and record the linked
  # genome and translation ids.
  def get_genomes!
    self.ids.each do |id|
      doc = self.rocker.ebiFetch(:uniprotkb, [id], :annot).split("\n")
      doc.grep( /^DR\s+EMBL;/ ).map do |ln|
        r=ln.split('; ')
        self.link_genome(id, r[1])
        self.link_tranid(id, r[2])
      end
    end
  end
  # Associate a genome accession with a protein (deduplicated).
  def link_genome(prot_id, genome_id)
    @genomes[prot_id] ||= []
    @genomes[prot_id] << genome_id
    @genomes[prot_id].uniq!
  end
  # Associate a translation (protein_id) with a protein (deduplicated).
  def link_tranid(prot_id, transl_id)
    @tranids[prot_id] ||= []
    @tranids[prot_id] << transl_id
    @tranids[prot_id].uniq!
  end
  # All genome ids linked to any protein in the set.
  def genomes
    return [] if @genomes.empty?
    @genomes.values.reduce(:+).uniq
  end
  # All translation ids linked to any protein in the set.
  def tranids
    return [] if @tranids.empty?
    @tranids.values.reduce(:+).uniq
  end
  # Map GFF3 coordinate entries back to protein ids of this set, resolving
  # either by direct prot_id or by linked translation id; unresolvable
  # locations produce a warning and are dropped.
  def in_coords(coords)
    coords.keys.map do |genome|
      locations = coords[ genome ]
      locations.map do |loc|
        if not loc[:prot_id].nil?
          loc[:prot_id] if self.include? loc[:prot_id]
        elsif not loc[:tran_id].nil? and
              not (hit = @tranids.find{ |_,v| v.include? loc[:tran_id] }).nil?
          # BUG FIX: this previously used @tranids.rassoc(loc[:tran_id]),
          # but Hash#rassoc compares the *whole* value with ==, and the
          # values here are Arrays of translation ids — so it could never
          # match a single id. Search inside the arrays instead.
          hit.first
        else
          warn "Warning: Impossible to resolve protein located in '#{genome}' at: #{loc}."
          nil
        end
      end
    end.reduce([], :+).compact.uniq
  end
  def size() self.ids.size end
  def empty?() self.ids.empty? end
  def include?(id) self.ids.include?(id) end
end
89
+
90
+
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/rinterface'
@@ -11,11 +11,13 @@ require 'rocker/alignment'
11
11
  require 'tmpdir'
12
12
 
13
13
  class ROCData
14
- attr_reader :aln, :windows, :r
15
- # Use ROCData.new(table,aln,window) to re-compute from table, use ROCData.new(data) to load
14
+ attr_reader :aln, :windows, :r, :refined
15
+ # Use ROCData.new(table,aln,window) to re-compute from table, use
16
+ # ROCData.new(data) to load
16
17
  def initialize(val, aln=nil, window=nil)
17
18
  @r = RInterface.new
18
19
  @nucl = false
20
+ @refined = false
19
21
  if not aln.nil?
20
22
  @aln = aln
21
23
  self.rrun "library('pROC');"
@@ -33,7 +35,9 @@ class ROCData
33
35
  @aln.read_rocker(val)
34
36
  end
35
37
  end
36
- def win_at_col(col) self.windows.select{|w| (w.from<=col) and (w.to>=col)}.first end
38
+ def win_at_col(col)
39
+ self.windows.select{|w| (w.from<=col) and (w.to>=col)}.first
40
+ end
37
41
  def in_nucl?() @nucl end
38
42
  def nucl=(nucl) @nucl=nucl end
39
43
  def refine! table
@@ -41,14 +45,17 @@ class ROCData
41
45
  return false unless self.load_table! table
42
46
  break if self._refine_iter(table)==0
43
47
  end
48
+ @refined = true
44
49
  return true
45
50
  end
51
+ def is_refined? ; @refined ; end
46
52
  def _refine_iter table
47
53
  to_refine = []
48
54
  self.windows.each do |w|
49
55
  next if w.almost_empty or w.length <= 5
50
56
  self.rrun "acc <- w$accuracy[w$V1==#{w.from}];"
51
- to_refine << w if self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
57
+ to_refine << w if
58
+ self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
52
59
  end
53
60
  n = to_refine.size
54
61
  return 0 unless n > 0
@@ -86,9 +93,17 @@ class ROCData
86
93
  win <- which( (m>=w$V1) & (m<=w$V2))[1];
87
94
  if(!is.na(win)){
88
95
  if(x$V4[i] >= w$V5[win]){
89
- if(x$V5[i]==1){ w$tp[win] <- w$tp[win]+1 }else{ w$fp[win] <- w$fp[win]+1 };
96
+ if(x$V5[i]==1){
97
+ w$tp[win] <- w$tp[win]+1
98
+ } else {
99
+ w$fp[win] <- w$fp[win]+1
100
+ }
90
101
  }else{
91
- if(x$V5[i]==1){ w$fn[win] <- w$fn[win]+1 }else{ w$tn[win] <- w$tn[win]+1 };
102
+ if(x$V5[i]==1){
103
+ w$fn[win] <- w$fn[win]+1
104
+ } else {
105
+ w$tn[win] <- w$tn[win]+1
106
+ };
92
107
  }
93
108
  }
94
109
  }
@@ -106,7 +121,9 @@ class ROCData
106
121
  end
107
122
  def init_windows!(size)
108
123
  @windows = []
109
- 1.step(self.aln.cols,size).each { |a| @windows << ROCWindow.new(self, a, a+size-1) }
124
+ 1.step(self.aln.cols,size).each do |a|
125
+ @windows << ROCWindow.new(self, a, a+size-1)
126
+ end
110
127
  end
111
128
  def rrun(cmd, type=nil) self.r.run cmd, type end
112
129
  def save(file)
@@ -115,7 +132,7 @@ class ROCData
115
132
  f.close
116
133
  end
117
134
  def to_s
118
- o = ''
135
+ o = "#v ROCker " + ROCker.VERSION + "\n"
119
136
  self.windows.each{|w| o += w.to_s}
120
137
  o += self.aln.to_s
121
138
  return o
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  class ROCWindow
@@ -22,16 +22,18 @@ class ROCWindow
22
22
  @from = [a,b].min
23
23
  @to = [a,b].max
24
24
  @thr = nil
25
- self.compute!
25
+ compute!
26
26
  end
27
27
  end
28
28
  def compute!
29
- self.load_hits
30
- @hits = self.rrun "nrow(y);", :int
31
- @tps = self.rrun "sum(y$V5);", :int
32
- unless self.almost_empty
33
- self.rrun "rocobj <- roc(y$V5, y$V4);"
34
- thr = self.rrun 'coords(rocobj, "best", ret="threshold", best.method="youden", best.weights=c(0.5, sum(y$V5)/nrow(y)))[1];', :float
29
+ load_hits
30
+ @hits = rrun("nrow(y);", :int)
31
+ @tps = rrun("sum(y$V5==1);", :int)
32
+ unless almost_empty
33
+ rrun "rocobj <- roc(as.numeric(y$V5==1), y$V4);"
34
+ thr = rrun("coords(rocobj, 'best', ret='threshold', " +
35
+ "best.method='youden', " +
36
+ "best.weights=c(0.5, sum(y$V5==1)/nrow(y)))[1];", :float)
35
37
  @thr = thr.to_f
36
38
  @thr = nil if @thr==0.0 or @thr.infinite?
37
39
  end
@@ -48,16 +50,16 @@ class ROCWindow
48
50
  return nil if a.nil? and b.nil?
49
51
  return a.thr if b.nil?
50
52
  return b.thr if a.nil?
51
- return (b.thr*(self.from-a.from) - a.thr*(self.from-b.from))/(b.from-a.from)
53
+ return (b.thr*(from-a.from) - a.thr*(from-b.from))/(b.from-a.from)
52
54
  end
53
- def load_hits() self.rrun "y <- x[x$V6>=#{self.from} & x$V6<=#{self.to},];" end
54
- def previous() (self.from == 1) ? nil : self.data.win_at_col(self.from - 1) end
55
- def next() (self.to == self.data.aln.cols) ? nil : self.data.win_at_col(self.to + 1) end
56
- def thr_notnil() (@thr.nil? or @thr.infinite?) ? self.around_thr : @thr end
57
- def fps() self.hits - self.tps end
58
- def almost_empty() self.fps < 3 or self.tps < 3 end
59
- def length() self.to - self.from + 1 end
60
- def rrun(cmd, type=nil) self.data.rrun cmd, type end
61
- def to_s() [self.from, self.to, self.hits, self.tps, self.thr_notnil].join("\t") + "\n" end
55
+ def load_hits() self.rrun "y <- x[x$V6>=#{from} & x$V6<=#{to},];" end
56
+ def previous() (from == 1) ? nil : data.win_at_col(from - 1) end
57
+ def next() (to == data.aln.cols) ? nil : data.win_at_col(to + 1) end
58
+ def thr_notnil() (@thr.nil? or @thr.infinite?) ? around_thr : @thr end
59
+ def fps() hits - tps end
60
+ def almost_empty() fps < 3 or tps < 3 end
61
+ def length() to - from + 1 end
62
+ def rrun(cmd, type=nil) data.rrun(cmd, type) end
63
+ def to_s() [from, to, hits, tps, thr_notnil].join("\t") + "\n" end
62
64
  end
63
65
 
@@ -2,21 +2,27 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jun-05-2015
5
+ # @update Sep-11-2015
6
6
  #
7
7
 
8
8
  require 'json'
9
+ require 'rocker/protein-set'
10
+ require 'rocker/genome-set'
9
11
 
10
12
  class ROCker
11
13
  #================================[ Class ]
12
- @@EBIREST = 'http://www.ebi.ac.uk/Tools'
13
- @@DEFAULTS.merge!({:positive=>[], :negative=>[], :genomefrx=>1.0, :seqdepth=>0.03, :readlen=>100, :minovl=>50,
14
+ @@EBIREST = "http://www.ebi.ac.uk/Tools"
15
+ @@DEFAULTS.merge!({positive:[], negative:[], seqdepth:0.03, readlen:100,
16
+ minovl:50,
14
17
  # Ext. Software
15
- :aligner=>:clustalo, :simulator=>:grinder,
16
- :simulatorbin=>{:grinder=>'grinder'},
17
- :simulatorcmd=>{:grinder=>'%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'},
18
- :alignerbin=>{:muscle=>'muscle', :clustalo=>'clustalo'},
19
- :alignercmd=>{:muscle=>'%1$s -in "%2$s" -out "%3$s" -quiet', :clustalo=>'%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'}
18
+ aligner: :clustalo, simulator: :grinder,
19
+ simulatorbin:{grinder:"grinder"},
20
+ simulatorcmd:{grinder:"%1$s -reference_file \"%2$s\" -cf \"%3$f\" " +
21
+ "-dc '-~*NnKkMmRrYySsWwBbVvHhDdXx' -md uniform 0.1 -mr 95 5 " +
22
+ "-rd %4$d uniform 5 -base_name \"%5$s\""},
23
+ alignerbin:{muscle:"muscle", clustalo:"clustalo"},
24
+ alignercmd:{muscle:"%1$s -in \"%2$s\" -out \"%3$s\" -quiet",
25
+ clustalo:"%1$s -i \"%2$s\" -o \"%3$s\" --threads=%4$d --force"}
20
26
  })
21
27
  @@HAS_BUILD_GEMS = nil
22
28
  def self.ebirest() @@EBIREST ; end
@@ -33,90 +39,65 @@ class ROCker
33
39
  end
34
40
 
35
41
  #================================[ Utilities ]
36
- def genes2genomes(gene_ids)
37
- genomes = []
38
- ids = Array.new(gene_ids)
39
- while ids.size>0
40
- doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
41
- genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
42
- r=ln.split('; ')
43
- {:genome_id=>r[1], :transl_id=>r[2]}
44
- end
45
- end
46
- genomes.uniq
47
- end
48
- def genome2taxid(genome_id)
49
- ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
50
- return ln if ln.nil?
51
- ln.sub(/.*"taxon:(\d+)".*/, "\\1")
52
- end
53
- def genome2taxon(genome_id, rank='species')
54
- xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
55
- xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
56
- end
57
42
  def restcall(url, outfile=nil)
58
43
  $stderr.puts " # Calling: #{url}" if @o[:debug]
59
- response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
60
- raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
44
+ response = RestClient::Request.execute(:method=>:get, :url=>url,
45
+ :timeout=>600)
46
+ raise "Unable to reach EBI REST client, error code " +
47
+ response.code.to_s + "." unless response.code == 200
61
48
  unless outfile.nil?
62
- ohf = File.open(outfile, 'w')
49
+ ohf = File.open(outfile, "w")
63
50
  ohf.print response.to_s
64
51
  ohf.close
65
52
  end
66
53
  response.to_s
67
54
  end
68
55
  def ebiFetch(db, ids, format, outfile=nil)
69
- url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
70
- res = self.restcall url
71
- unless outfile.nil?
72
- ohf = File.open(outfile, 'w')
73
- ohf.print res
74
- ohf.close
75
- end
76
- res
56
+ url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
57
+ "#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
58
+ self.restcall url, outfile
77
59
  end
78
- def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
79
- positive_coords = {}
80
- genomes_org = {}
60
+ def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
61
+ coords = {}
81
62
  i = 0
82
63
  genome_ids.each do |genome_id|
83
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids.size} in first thread. \r" if thread_id==0 and not @o[:q]
84
- unless @o[:pertaxon].nil?
85
- genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
86
- genomes_org[ genome_taxon.to_sym ] ||= []
87
- genomes_org[ genome_taxon.to_sym ] << genome_id
88
- end
64
+ print " * scanning #{(i+=1).ordinalize} genome out of " +
65
+ "#{genome_ids.size} in first thread. \r" if
66
+ thread_id==0 and not @o[:q]
89
67
  genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
90
68
  if @o[:reuse] and File.size? genome_file
91
- ifh = File.open(genome_file, 'r')
69
+ ifh = File.open(genome_file, "r")
92
70
  doc = ifh.readlines.grep(/^[^#]/)
93
71
  ifh.close
94
72
  else
95
73
  genome_file=nil unless @o[:noclean]
96
- doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
74
+ doc = ebiFetch(:embl, [genome_id], :gff3,
75
+ genome_file).split("\n").grep(/^[^#]/)
97
76
  end
98
77
  doc.each do |ln|
99
78
  next if ln =~ /^#/
100
79
  r = ln.chomp.split /\t/
101
80
  next if r.size < 9
102
- prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
103
- p = prots.select{ |id| protein_ids.include? id }.first
104
- trans = r[8].split(/;/).grep(/^protein_id=/){ |pid| pid.split(/=/)[1] }
105
- t = trans.select{ |id| transl_ids.include? id }.first
81
+ prots = r[8].split(/;/).grep(
82
+ /^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
83
+ p = prots.select{ |id| pset.ids.include? id }.first
84
+ trans = r[8].split(/;/).grep(
85
+ /^protein_id=/){ |pid| pid.split(/=/)[1] }
86
+ t = trans.select{ |id| pset.tranids.include? id }.first
106
87
  next if p.nil? and t.nil?
107
- positive_coords[ r[0].to_sym ] ||= []
108
- positive_coords[ r[0].to_sym ] << {
109
- :prot_id => p,
110
- :tran_id => t,
111
- :from => r[3].to_i,
112
- :to => r[4].to_i,
113
- :strand => r[6]
88
+ coords[ r[0].to_sym ] ||= []
89
+ coords[ r[0].to_sym ] << {
90
+ prot_id: p,
91
+ tran_id: t,
92
+ from: r[3].to_i,
93
+ to: r[4].to_i,
94
+ strand: r[6]
114
95
  }
115
96
  end
116
97
  end
117
98
  print "\n" if thread_id==0 and not @o[:q]
118
- ofh = File.open json_file, "w"
119
- ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
99
+ ofh = File.open(json_file, "w")
100
+ ofh.print({coords:coords}.to_json)
120
101
  ofh.close
121
102
  end
122
103
 
@@ -124,211 +105,251 @@ class ROCker
124
105
  def build!
125
106
  # Check requirements
126
107
  puts "Testing environment." unless @o[:q]
127
- @o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
128
- @o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
129
- @o[:alignercmd] = @o[:alignercmd][@o[:aligner]] if @o[:alignercmd].is_a? Hash
130
- @o[:simulatorcmd] = @o[:simulatorcmd][@o[:simulator]] if @o[:simulatorcmd].is_a? Hash
131
- @o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
132
- @o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
108
+ { searchcmd: :search, makedbcmd: :search,
109
+ alignercmd: :aligner, alignerbin: :aligner,
110
+ simulatorcmd: :simulator, simulatorbin: :simulator
111
+ }.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
133
112
  @o[:nosearch]=true if @o[:nosimulate]
134
- raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
135
- @o[:positive] += @o[:posori] unless @o[:posori].nil?
136
- @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
137
- @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
138
- unless @o[:aln].nil?
139
- aln = Alignment.new
140
- aln.read_fasta @o[:aln]
141
- @o[:positive] += aln.get_ids
142
- end
143
- raise "-p or -P are mandatory." if @o[:positive].size==0
113
+ raise "Unsatisfied requirements, please see the help message (-h)." unless
114
+ ROCker.has_build_gems?
115
+ protein_set = {}
116
+ protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
117
+ protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
118
+ raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
144
119
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
145
- if @o[:positive].size == 1 and not @o[:noaln]
146
- warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
120
+ if protein_set[:+].size==1 and not @o[:noaln]
121
+ warn "\nWARNING: Positive set contains only one sequence, turning " +
122
+ "off alignment.\n\n"
147
123
  @o[:noaln] = true
148
124
  end
149
125
  unless @o[:nosimulate]
150
- self.bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
126
+ self.bash("#{@o[:simulatorbin]} --version",
127
+ "--simulator-bin must be executable. Is Grinder installed?") if
128
+ @o[:simulator]==:grinder
151
129
  end
152
130
  unless @o[:noaln]
153
- self.bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
154
- self.bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
131
+ self.bash("#{@o[:alignerbin]} -version",
132
+ "--aligner-bin must be executable. Is Muscle installed?") if
133
+ @o[:aligner]==:muscle
134
+ self.bash("#{@o[:alignerbin]} --version",
135
+ "--aligner-bin must be executable. Is ClustalOmega installed?") if
136
+ @o[:aligner]==:clustalo
155
137
  end
156
138
  unless @o[:nosearch]
157
- self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
158
- self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
139
+ self.bash("#{@o[:searchbins]}makeblastdb -version",
140
+ "--search-bins must contain executables. Is BLAST+ installed?") if
141
+ @o[:search]==:blast
142
+ self.bash("#{@o[:searchbins]}diamond --help",
143
+ "--search-bins must contain executables. Is DIAMOND installed?") if
144
+ @o[:search]==:diamond
159
145
  end
160
146
 
161
147
  # Download genes
162
148
  puts "Downloading gene data." unless @o[:q]
163
149
  ref_file = @o[:baseout] + ".ref.fasta"
164
- if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
150
+ if not protein_set[:+].aln.nil?
165
151
  puts " * reusing aligned sequences as positive set." unless @o[:q]
166
- f = File.open(ref_file, "w")
167
- f.print aln.to_seq_s
168
- f.close
152
+ protein_set[:+].get_from_aln(ref_file, aln)
169
153
  @o[:noaln] = true
170
154
  elsif @o[:reuse] and File.size? ref_file
171
155
  puts " * reusing positive set: #{ref_file}." unless @o[:q]
172
156
  else
173
- puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
174
- $stderr.puts " # #{@o[:positive]}" if @o[:debug]
175
- ids = Array.new(@o[:positive])
176
- f = File.open(ref_file, "w")
177
- while ids.size>0
178
- f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
179
- end
180
- f.close
157
+ puts " * downloading #{protein_set[:+].size} sequence(s) in " +
158
+ "positive set." unless @o[:q]
159
+ $stderr.puts " # #{protein_set[:+].ids}" if @o[:debug]
160
+ protein_set[:+].download(ref_file)
181
161
  end
182
- genome_ids = {:positive=>[], :negative=>[]}
183
- transl_ids = {:positive=>[], :negative=>[]}
184
- [:positive, :negative].each do |set|
185
- unless @o[set].size==0
186
- puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
187
- $stderr.puts " # #{@o[set]}" if @o[:debug]
188
- r = genes2genomes(@o[set])
189
- genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
190
- transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
162
+ [:+, :-].each do |set|
163
+ unless protein_set[set].empty?
164
+ puts " * linking genomes from #{protein_set[set].size} " +
165
+ "[#{set.to_s}] sequence(s)." unless @o[:q]
166
+ $stderr.puts " # #{protein_set[set].ids}" if @o[:debug]
167
+ protein_set[set].get_genomes!
191
168
  end
192
169
  end
193
- raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
194
- genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
195
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
196
- all_genome_ids = genome_ids.values.reduce(:+).uniq
170
+ raise "No genomes associated with the positive set." if
171
+ protein_set[:+].genomes.empty?
172
+ genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
173
+ :- => GenomeSet.new(self, protein_set[:-].genomes)}
197
174
 
198
175
  # Locate genes
199
176
  puts "Analyzing genome data." unless @o[:q]
200
177
  coords_file = @o[:baseout] + ".src.coords"
201
178
  if @o[:reuse] and File.size? coords_file
202
179
  puts " * reusing coordinates: #{coords_file}." unless @o[:q]
203
- c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
180
+ c = JSON.parse File.read(coords_file), {symbolize_names:true}
204
181
  positive_coords = c[:positive_coords]
205
- genome_org = c[:genome_org]
182
+ negative_coords = c[:negative_coords]
183
+ genome_set[:+].taxa = c[:taxa_pos]
184
+ genome_set[:-].taxa = c[:taxa_neg]
206
185
  else
207
- thrs = [@o[:thr], genome_ids[:positive].size].min
208
- puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
209
- $stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
210
- $stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
211
- $stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
212
- thr_obj = []
213
- (0 .. (thrs-1)).each do |thr_i|
214
- ids_to_parse = []
215
- (0 .. (genome_ids[:positive].size-1)).each do |i|
216
- ids_to_parse << genome_ids[:positive][i] if (i % thrs)==thr_i
217
- end
218
- json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
219
- thr_obj << json_file
220
- fork do
221
- get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
222
- end
223
- end
224
- Process.waitall
225
- # Combine results
226
- positive_coords = {}
227
- genomes_org = {}
228
- genome_org = {}
229
- thr_obj.each do |t|
230
- raise "Thread failed without error trace: #{t}" unless File.exist? t
231
- o = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
232
- o[:positive_coords].each_pair do |k,v|
233
- positive_coords[ k ] ||= []
234
- positive_coords[ k ] += v
186
+ all_coords = {}
187
+ [:+, :-].each do |set_type|
188
+ all_coords[set_type] = {}
189
+ next if genome_set[set_type].empty?
190
+ thrs = [@o[:thr], genome_set[set_type].size].min
191
+ puts " * downloading and parsing #{genome_set[set_type].size} " +
192
+ "GFF3 document(s) in #{thrs} threads." unless @o[:q]
193
+ $stderr.puts " # Looking for translations: " +
194
+ "#{protein_set[set_type].tranids}" if @o[:debug]
195
+ $stderr.puts " # Looking into: #{genome_set[set_type].ids}" if
196
+ @o[:debug]
197
+ # Launch threads
198
+ thr_obj = []
199
+ (0 .. (thrs-1)).each do |thr_i|
200
+ ids_to_parse = []
201
+ (0 .. (genome_set[set_type].size-1)).each do |i|
202
+ ids_to_parse << protein_set[set_type].genomes[i] if
203
+ (i % thrs) == thr_i
204
+ end
205
+ json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
206
+ thr_obj << json_file
207
+ fork do
208
+ get_coords_from_gff3(ids_to_parse, protein_set[set_type],
209
+ thr_i, json_file)
210
+ end
235
211
  end
236
- o[:genomes_org].each_pair do |k,v|
237
- genomes_org[ k ] ||= []
238
- genomes_org[ k ] << v
212
+ # Combine results
213
+ Process.waitall
214
+ thr_obj.each do |t|
215
+ raise "Thread failed without error trace: #{t}" unless
216
+ File.exist? t
217
+ o = JSON.parse(File.read(t), {symbolize_names:true})
218
+ o[:coords].each_pair do |k,v|
219
+ all_coords[set_type][ k ] ||= []
220
+ all_coords[set_type][ k ] += v
221
+ end
222
+ File.unlink t
239
223
  end
240
- File.unlink t
241
- end
224
+ end # [:+, :-].each
225
+ positive_coords = all_coords[:+]
226
+ negative_coords = all_coords[:-]
242
227
  # Select one genome per taxon
243
228
  unless @o[:pertaxon].nil?
244
- genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
229
+ puts " Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
230
+ [:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
245
231
  end
246
- # Save coordinates
232
+ # Save coordinates and taxa
247
233
  ofh = File.open(coords_file, "w")
248
- ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
234
+ ofh.print JSON.pretty_generate({
235
+ positive_coords:positive_coords,
236
+ negative_coords:negative_coords,
237
+ taxa_pos:genome_set[:+].taxa,
238
+ taxa_neg:genome_set[:-].taxa})
249
239
  ofh.close
250
- end
240
+ end # if @o[:reuse] and File.size? coords_file ... else
251
241
  unless @o[:pertaxon].nil?
252
- genome_ids[:positive] = genome_org.values
253
- puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
242
+ puts " Using " +
243
+ [:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
244
+ " genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
254
245
  end
255
- all_genome_ids = genome_ids.values.reduce(:+).uniq
256
- found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+).compact.uniq
257
- unknown_pid = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil } }.reduce(:+).compact.uniq
258
- raise "Cannot find the genomic location of any provided sequence." if found.nil?
259
- missing = @o[:positive] - found
260
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0
246
+ found = protein_set[:+].in_coords(positive_coords)
247
+ raise "Cannot find the genomic location of any provided sequence." if
248
+ found.nil?
249
+ missing = protein_set[:+].ids - found
250
+ warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
251
+ "sequence(s) #{missing.join(",")}.\n\n" unless missing.empty?
261
252
 
262
253
  # Download genomes
263
- genomes_file = @o[:baseout] + '.src.fasta'
254
+ genome_set[:all] = GenomeSet.new(self,
255
+ genome_set[ :+ ].ids + genome_set[ :- ].ids)
256
+ genomes_file = @o[:baseout] + ".src.fasta"
264
257
  if @o[:reuse] and File.size? genomes_file
265
258
  puts " * reusing existing file: #{genomes_file}." unless @o[:q]
266
259
  else
267
- puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
268
- $stderr.puts " # #{all_genome_ids}" if @o[:debug]
269
- ids = Array.new(all_genome_ids)
270
- ofh = File.open(genomes_file, 'w')
271
- while ids.size>0
272
- ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
273
- end
274
- ofh.close
260
+ puts " * downloading " + genome_set[:all].size.to_s +
261
+ " genome(s) in FastA." unless @o[:q]
262
+ $stderr.puts " # #{genome_set[:all].ids}" if @o[:debug]
263
+ genome_set[:all].download genomes_file
275
264
  end
276
265
 
277
266
  # Generate metagenome
278
267
  unless @o[:nosimulate]
279
268
  puts "Generating in silico metagenome" unless @o[:q]
280
269
  if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
281
- puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
270
+ puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
271
+ @o[:q]
282
272
  else
283
- all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
273
+ all_src = File.readlines("#{@o[:baseout]}.src.fasta"
274
+ ).select{ |l| l =~ /^>/ }.size
284
275
  thrs = [@o[:thr], all_src].min
285
- puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
286
- $stderr.puts " # #{positive_coords}" if @o[:debug]
287
276
  thr_obj = []
288
- seqs_per_thr = (all_src/thrs).ceil
277
+ seqs_per_thr = (all_src.to_f/thrs).ceil
278
+ thrs = (all_src.to_f/seqs_per_thr).ceil
279
+ puts " * simulating metagenomes and tagging positive reads in " +
280
+ thrs.to_s + " threads." unless @o[:q]
281
+ $stderr.puts " # #{positive_coords}" if @o[:debug]
289
282
  (0 .. (thrs-1)).each do |thr_i|
290
283
  output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
291
284
  thr_obj << output
292
285
  fork do
293
286
  seqs_a = thr_i*seqs_per_thr + 1
294
- seqs_b = [seqs_a + seqs_per_thr, all_src].min
287
+ seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
295
288
  # Create sub-fasta
296
- ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w")
297
- ifh = File.open("#{@o[:baseout]}.src.fasta", "r")
289
+ ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
290
+ ifh = File.open("#{@o[:baseout]}.src.fasta","r")
298
291
  seq_i = 0
299
292
  while l = ifh.gets
300
293
  seq_i+=1 if l =~ /^>/
301
- break if seq_i > seqs_b
294
+ break if seq_i > seqs_b
302
295
  ofh.print l if seq_i >= seqs_a
303
296
  end
304
297
  ifh.close
305
298
  ofh.close
306
299
 
307
- # Run simulator (except if the temporal file is already there and can be reused)
308
- unless @o[:reuse] and File.size? @o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa"
309
- bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
310
- end
300
+ # Run simulator (except if the temporal file is already
301
+ # there and can be reused)
302
+ bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
303
+ "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
304
+ @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
305
+ "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}") unless
306
+ @o[:reuse] and
307
+ File.size? @o[:baseout] +
308
+ ".mg.tmp.#{thr_i.to_s}-reads.fa"
311
309
 
312
- # Tag positives
313
- puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
314
- ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
315
- ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
310
+ # Tag positive and negative reads
311
+ puts " * tagging reads [thread #{thr_i}]." unless
312
+ @o[:q]
313
+ ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
314
+ "r")
315
+ ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
316
316
  while l = ifh.gets
317
317
  if l =~ /^>/
318
- rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
319
- raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
318
+ rd = %r{
319
+ ^>(?<id>\d+)\s
320
+ reference=[A-Za-z]+\|
321
+ (?<genome_id>[A-Za-z0-9_]+)\|.*\s
322
+ position=(?<comp>complement\()?(?<from>\d+)\.\.
323
+ (?<to>\d+)\)?\s
324
+ }x.match(l)
325
+ raise "Cannot parse simulated read's defline, are " +
326
+ "you using Grinder?: #{l}" if rd.nil?
320
327
  positive = false
321
328
  positive_coords[rd[:genome_id].to_sym] ||= []
322
329
  positive_coords[rd[:genome_id].to_sym].each do |gn|
323
330
  left = rd[:to].to_i - gn[:from]
324
331
  right = gn[:to] - rd[:from].to_i
325
- if (left*right >= 0) and ([left, right].min >= @o[:minovl])
332
+ if (left*right >= 0) and
333
+ ([left, right].min >= @o[:minovl])
326
334
  positive = true
327
335
  break
328
336
  end
329
337
  end
330
- l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
331
- "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
338
+ negative = false
339
+ negative_coords[rd[:genome_id].to_sym] ||= []
340
+ negative_coords[rd[:genome_id].to_sym].each do |gn|
341
+ left = rd[:to].to_i - gn[:from]
342
+ right = gn[:to] - rd[:from].to_i
343
+ if (left*right >= 0) and
344
+ ([left, right].min >= @o[:minovl])
345
+ negative = true
346
+ break
347
+ end
348
+ end
349
+ l = ">#{thr_i.to_s}_#{rd[:id]}" +
350
+ "#{positive ? "@%" : (negative ? "@$" : "")} " +
351
+ "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
352
+ "#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
332
353
  end
333
354
  ofh.print l
334
355
  end
@@ -338,9 +359,10 @@ class ROCker
338
359
  end # (1 .. thrs).each
339
360
  Process.waitall
340
361
  # Concatenate results
341
- ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
362
+ ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
342
363
  thr_obj.each do |t|
343
- raise "Thread failed without error trace: #{t}" unless File.exist? t
364
+ raise "Thread failed without error trace: #{t}" unless
365
+ File.exist? t
344
366
  ifh = File.open(t, "r")
345
367
  while l = ifh.gets
346
368
  ofh.print l
@@ -356,23 +378,33 @@ class ROCker
356
378
  unless @o[:noaln]
357
379
  puts "Aligning reference set." unless @o[:q]
358
380
  if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
359
- puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
381
+ puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
382
+ @o[:q]
360
383
  else
361
- bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
362
- puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
384
+ bash(sprintf(@o[:alignercmd],
385
+ @o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
386
+ "#{@o[:baseout]}.ref.aln", @o[:thr]))
387
+ puts " +--\n | IMPORTANT NOTE: Manually checking the alignment " +
388
+ "before\n | the 'compile' step is *strongly* encouraged.\n " +
389
+ "+--\n" unless @o[:q]
363
390
  end
364
391
  end
365
392
 
366
393
  # Run similarity search
367
394
  unless @o[:nosearch]
368
- puts "Running homology search." unless @o[:q]
395
+ puts "Running similarity search." unless @o[:q]
369
396
  if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
370
- puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
397
+ puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
398
+ @o[:q]
371
399
  else
372
400
  puts " * preparing database." unless @o[:q]
373
- bash sprintf(@o[:makedbcmd][@o[:search]], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
401
+ bash(sprintf(@o[:makedbcmd][@o[:search]],
402
+ @o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
403
+ "#{@o[:baseout]}.ref"))
374
404
  puts " * running similarity search." unless @o[:q]
375
- bash sprintf(@o[:searchcmd][@o[:search]], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
405
+ bash(sprintf(@o[:searchcmd][@o[:search]],
406
+ @o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
407
+ "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
376
408
  end
377
409
  end
378
410
 
@@ -382,7 +414,8 @@ class ROCker
382
414
  sff = %w{.src.xml .src.fasta}
383
415
  sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
384
416
  sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
385
- sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
417
+ sff.each { |sf| File.unlink @o[:baseout] + sf if
418
+ File.exist? @o[:baseout] + sf }
386
419
  end
387
420
  end # build!
388
421
  end # ROCker