bio-rocker 1.0.0 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jun-05-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,20 +10,30 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
+ @@VERSION = "1.1.9"
14
+ @@CITATION = "Orellana, Rodriguez-R, & Konstantinidis. Under review. " +
15
+ "Detecting and quantifying functional genes in short-read metagenomic " +
16
+ "datasets: method development and application to the nitrogen cycle " +
17
+ "genes."
13
18
  @@DEFAULTS = {
14
19
  # General
15
- :q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
20
+ q: false, r: "R", nucl: false, debug: false, thr: 2, search: :blast,
16
21
  # External software
17
- :searchbins=>'',
18
- :searchcmd=>{
19
- :blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
20
- :diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
21
- :makedbcmd=>{
22
- :blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
23
- :diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
22
+ searchbins: "",
23
+ searchcmd: {
24
+ blast: '%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" ' +
25
+ '-num_threads %6$d -outfmt 6 -max_target_seqs 1',
26
+ diamond: '%1$sdiamond %2$s -q "%3$s" -d "%4$s" -a "%5$s.daa" -p %6$d' +
27
+ ' -k 1 --min-score 20 --sensitive && %1$sdiamond view -a "%5$s"' +
28
+ ' -o "%5$s"'},
29
+ makedbcmd: {
30
+ blast: '%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
31
+ diamond: '%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
24
32
  }
25
33
  def self.defaults() @@DEFAULTS ; end
26
34
  def self.default(k) @@DEFAULTS[k] ; end
35
+ def self.VERSION; @@VERSION ; end
36
+ def self.CITATION; @@CITATION ; end
27
37
 
28
38
  #================================[ Instance ]
29
39
  attr_reader :o
@@ -46,7 +56,8 @@ class ROCker
46
56
  end
47
57
  def bash(cmd, err_msg=nil)
48
58
  o = `#{cmd} 2>&1 && echo '{'`
49
- raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
59
+ raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
60
+ o[-2]=="{"
50
61
  true
51
62
  end
52
63
  end
@@ -63,10 +74,10 @@ require 'rocker/step/plot'
63
74
  class Numeric
64
75
  def ordinalize
65
76
  n= self.to_s
66
- s= n[-2]=='1' ? 'th' :
67
- n[-1]=='1' ? 'st' :
68
- n[-1]=='2' ? 'nd' :
69
- n[-1]=='3' ? 'rd' : 'th'
77
+ s= n[-2]=='1' ? "th" :
78
+ n[-1]=='1' ? "st" :
79
+ n[-1]=='2' ? "nd" :
80
+ n[-1]=='3' ? "rd" : "th"
70
81
  n + s
71
82
  end
72
83
  end
@@ -2,12 +2,13 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-06-2015
6
6
  #
7
7
 
8
8
  class BlastHit
9
- attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint
10
- # Initialize from BLAST using new(ln,aln), initialize from TABLE using new(ln)
9
+ attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :isfalse, :midpoint
10
+ # Initialize from BLAST using new(ln,aln),
11
+ # initialize from TABLE using new(ln)
11
12
  def initialize(ln, aln=nil)
12
13
  l = ln.chomp.split(/\t/)
13
14
  if aln.nil?
@@ -16,6 +17,7 @@ class BlastHit
16
17
  @sto = l[2].to_i
17
18
  @bits = l[3].to_f
18
19
  @istrue = l[4]=='1'
20
+ @isfalse = l[4]=='-1'
19
21
  @midpoint = l[5].to_i
20
22
  else
21
23
  s = aln.seq(l[1])
@@ -27,13 +29,14 @@ class BlastHit
27
29
  @sto = [a,b].max
28
30
  @bits = l[11].to_f
29
31
  @istrue = ! /@%/.match(l[0]).nil?
32
+ @isfalse = ! /@\$/.match(l[0]).nil?
30
33
  @midpoint = s.pos2col(((l[8].to_f+l[9].to_f)/2).ceil)
31
34
  end
32
35
  end
33
36
  def to_s
34
37
  self.sbj.nil? ? "" :
35
- [self.sbj, self.sfrom.to_s, self.sto.to_s, self.bits.to_s,
36
- self.istrue ? '1' : '0', self.midpoint].join("\t") + "\n"
38
+ [sbj, sfrom.to_s, sto.to_s, bits.to_s,
39
+ istrue ? "1" : (isfalse ? "-1" : "0"), midpoint].join("\t") + "\n"
37
40
  end
38
41
  end
39
42
 
@@ -0,0 +1,70 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-23-2015
6
+ #
7
+
8
+ class GenomeSet
9
+ attr_reader :rocker, :ids, :taxa
10
+ def initialize(rocker, ids)
11
+ @rocker = rocker
12
+ @ids = ids
13
+ @ids = [] if ids.nil?
14
+ @taxa = {}
15
+ @all_taxa = {}
16
+ end
17
+ def download(file)
18
+ tmp_ids = Array.new(self.ids)
19
+ ofh = File.open(file, "w")
20
+ while tmp_ids.size>0
21
+ ofh.print rocker.ebiFetch(:embl, tmp_ids.shift(200), :fasta)
22
+ end
23
+ ofh.close
24
+ end
25
+ def link_taxon(id, taxon)
26
+ @all_taxa[ taxon.to_sym ] ||= []
27
+ @all_taxa[ taxon.to_sym ] << id
28
+ end
29
+ def choose_genomes!(rank)
30
+ @taxa = {}
31
+ self.get_taxonomy! rank
32
+ @all_taxa.each_pair{ |taxon,ids| @taxa[taxon] = ids.sample }
33
+ @ids = @taxa.values
34
+ end
35
+ def get_taxonomy!(rank)
36
+ @all_taxa = {}
37
+ ids.each do |id|
38
+ self.link_taxon(id, genome2taxon(id, rank))
39
+ end
40
+ end
41
+ def taxa=(hash)
42
+ @taxa = {}
43
+ hash.each_pair{ |taxon, id| @taxa[taxon] = id if self.ids.include? id }
44
+ end
45
+ def size() self.ids.size end
46
+ def empty?() self.ids.empty? end
47
+
48
+ #================================[ Utilities ]
49
+ def genome2taxon(genome_id, rank='species')
50
+ v = genome2taxid(genome_id)
51
+ unless v.nil?
52
+ xml = rocker.ebiFetch('taxonomy', [v], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
53
+ v = xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first
54
+ v.sub!(/.* taxId="(\d+)".*/,"\\1") unless v.nil?
55
+ end
56
+ return "no-taxon-#{(0...12).map { (65 + rand(26)).chr }.join}" if v.nil? or v !~ /^\d+$/
57
+ v
58
+ end
59
+ def genome2taxid(genome_id)
60
+ doc = rocker.ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/)
61
+ ln = doc.grep(/^FT\s+\/db_xref="taxon:/).first
62
+ ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
63
+ return nil if ln.nil?
64
+ ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, "\\1")
65
+ return nil unless ln =~ /^\d+$/
66
+ ln
67
+ end
68
+ end
69
+
70
+
@@ -0,0 +1,90 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jul-20-2015
6
+ #
7
+
8
+ require 'rocker/alignment'
9
+
10
+ class ProteinSet
11
+ attr_reader :rocker, :ids, :aln
12
+ def initialize(rocker, ids=nil, file=nil, aln_file=nil)
13
+ @genomes = {}
14
+ @tranids = {}
15
+ @aln = nil
16
+ @rocker = rocker
17
+ @ids = []
18
+ @ids += ids unless ids.nil?
19
+ @ids += File.readlines(file).map{ |l| l.chomp } unless file.nil?
20
+ unless aln_file.nil?
21
+ aln = Alignment.new
22
+ aln.read_fasta aln_file
23
+ aln_ids = aln.get_ids
24
+ @aln = aln if (@ids - aln_ids).empty?
25
+ @ids += aln_ids
26
+ end
27
+ @ids.uniq!
28
+ end
29
+ def download(file)
30
+ tmp_ids = Array.new(self.ids)
31
+ f = File.open(file, "w")
32
+ while tmp_ids.size>0
33
+ f.print rocker.ebiFetch(:uniprotkb, tmp_ids.shift(200), :fasta)
34
+ end
35
+ f.close
36
+ end
37
+ def get_from_aln(file, aln)
38
+ f = File.open(file, "w")
39
+ f.print aln.to_seq_s
40
+ f.close
41
+ end
42
+ def get_genomes!
43
+ self.ids.each do |id|
44
+ doc = self.rocker.ebiFetch(:uniprotkb, [id], :annot).split("\n")
45
+ doc.grep( /^DR\s+EMBL;/ ).map do |ln|
46
+ r=ln.split('; ')
47
+ self.link_genome(id, r[1])
48
+ self.link_tranid(id, r[2])
49
+ end
50
+ end
51
+ end
52
+ def link_genome(prot_id, genome_id)
53
+ @genomes[prot_id] ||= []
54
+ @genomes[prot_id] << genome_id
55
+ @genomes[prot_id].uniq!
56
+ end
57
+ def link_tranid(prot_id, transl_id)
58
+ @tranids[prot_id] ||= []
59
+ @tranids[prot_id] << transl_id
60
+ @tranids[prot_id].uniq!
61
+ end
62
+ def genomes
63
+ return [] if @genomes.empty?
64
+ @genomes.values.reduce(:+).uniq
65
+ end
66
+ def tranids
67
+ return [] if @tranids.empty?
68
+ @tranids.values.reduce(:+).uniq
69
+ end
70
+ def in_coords(coords)
71
+ coords.keys.map do |genome|
72
+ locations = coords[ genome ]
73
+ locations.map do |loc|
74
+ if not loc[:prot_id].nil?
75
+ loc[:prot_id] if self.include? loc[:prot_id]
76
+ elsif not loc[:tran_id].nil? and not @tranids.rassoc(loc[:tran_id]).nil?
77
+ @tranids.rassoc(loc[:tran_id]).first
78
+ else
79
+ warn "Warning: Impossible to resolve protein located in '#{genome}' at: #{loc}."
80
+ nil
81
+ end
82
+ end
83
+ end.reduce([], :+).compact.uniq
84
+ end
85
+ def size() self.ids.size end
86
+ def empty?() self.ids.empty? end
87
+ def include?(id) self.ids.include?(id) end
88
+ end
89
+
90
+
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/rinterface'
@@ -11,11 +11,13 @@ require 'rocker/alignment'
11
11
  require 'tmpdir'
12
12
 
13
13
  class ROCData
14
- attr_reader :aln, :windows, :r
15
- # Use ROCData.new(table,aln,window) to re-compute from table, use ROCData.new(data) to load
14
+ attr_reader :aln, :windows, :r, :refined
15
+ # Use ROCData.new(table,aln,window) to re-compute from table, use
16
+ # ROCData.new(data) to load
16
17
  def initialize(val, aln=nil, window=nil)
17
18
  @r = RInterface.new
18
19
  @nucl = false
20
+ @refined = false
19
21
  if not aln.nil?
20
22
  @aln = aln
21
23
  self.rrun "library('pROC');"
@@ -33,7 +35,9 @@ class ROCData
33
35
  @aln.read_rocker(val)
34
36
  end
35
37
  end
36
- def win_at_col(col) self.windows.select{|w| (w.from<=col) and (w.to>=col)}.first end
38
+ def win_at_col(col)
39
+ self.windows.select{|w| (w.from<=col) and (w.to>=col)}.first
40
+ end
37
41
  def in_nucl?() @nucl end
38
42
  def nucl=(nucl) @nucl=nucl end
39
43
  def refine! table
@@ -41,14 +45,17 @@ class ROCData
41
45
  return false unless self.load_table! table
42
46
  break if self._refine_iter(table)==0
43
47
  end
48
+ @refined = true
44
49
  return true
45
50
  end
51
+ def is_refined? ; @refined ; end
46
52
  def _refine_iter table
47
53
  to_refine = []
48
54
  self.windows.each do |w|
49
55
  next if w.almost_empty or w.length <= 5
50
56
  self.rrun "acc <- w$accuracy[w$V1==#{w.from}];"
51
- to_refine << w if self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
57
+ to_refine << w if
58
+ self.rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
52
59
  end
53
60
  n = to_refine.size
54
61
  return 0 unless n > 0
@@ -86,9 +93,17 @@ class ROCData
86
93
  win <- which( (m>=w$V1) & (m<=w$V2))[1];
87
94
  if(!is.na(win)){
88
95
  if(x$V4[i] >= w$V5[win]){
89
- if(x$V5[i]==1){ w$tp[win] <- w$tp[win]+1 }else{ w$fp[win] <- w$fp[win]+1 };
96
+ if(x$V5[i]==1){
97
+ w$tp[win] <- w$tp[win]+1
98
+ } else {
99
+ w$fp[win] <- w$fp[win]+1
100
+ }
90
101
  }else{
91
- if(x$V5[i]==1){ w$fn[win] <- w$fn[win]+1 }else{ w$tn[win] <- w$tn[win]+1 };
102
+ if(x$V5[i]==1){
103
+ w$fn[win] <- w$fn[win]+1
104
+ } else {
105
+ w$tn[win] <- w$tn[win]+1
106
+ };
92
107
  }
93
108
  }
94
109
  }
@@ -106,7 +121,9 @@ class ROCData
106
121
  end
107
122
  def init_windows!(size)
108
123
  @windows = []
109
- 1.step(self.aln.cols,size).each { |a| @windows << ROCWindow.new(self, a, a+size-1) }
124
+ 1.step(self.aln.cols,size).each do |a|
125
+ @windows << ROCWindow.new(self, a, a+size-1)
126
+ end
110
127
  end
111
128
  def rrun(cmd, type=nil) self.r.run cmd, type end
112
129
  def save(file)
@@ -115,7 +132,7 @@ class ROCData
115
132
  f.close
116
133
  end
117
134
  def to_s
118
- o = ''
135
+ o = "#v ROCker " + ROCker.VERSION + "\n"
119
136
  self.windows.each{|w| o += w.to_s}
120
137
  o += self.aln.to_s
121
138
  return o
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update Sep-07-2015
6
6
  #
7
7
 
8
8
  class ROCWindow
@@ -22,16 +22,18 @@ class ROCWindow
22
22
  @from = [a,b].min
23
23
  @to = [a,b].max
24
24
  @thr = nil
25
- self.compute!
25
+ compute!
26
26
  end
27
27
  end
28
28
  def compute!
29
- self.load_hits
30
- @hits = self.rrun "nrow(y);", :int
31
- @tps = self.rrun "sum(y$V5);", :int
32
- unless self.almost_empty
33
- self.rrun "rocobj <- roc(y$V5, y$V4);"
34
- thr = self.rrun 'coords(rocobj, "best", ret="threshold", best.method="youden", best.weights=c(0.5, sum(y$V5)/nrow(y)))[1];', :float
29
+ load_hits
30
+ @hits = rrun("nrow(y);", :int)
31
+ @tps = rrun("sum(y$V5==1);", :int)
32
+ unless almost_empty
33
+ rrun "rocobj <- roc(as.numeric(y$V5==1), y$V4);"
34
+ thr = rrun("coords(rocobj, 'best', ret='threshold', " +
35
+ "best.method='youden', " +
36
+ "best.weights=c(0.5, sum(y$V5==1)/nrow(y)))[1];", :float)
35
37
  @thr = thr.to_f
36
38
  @thr = nil if @thr==0.0 or @thr.infinite?
37
39
  end
@@ -48,16 +50,16 @@ class ROCWindow
48
50
  return nil if a.nil? and b.nil?
49
51
  return a.thr if b.nil?
50
52
  return b.thr if a.nil?
51
- return (b.thr*(self.from-a.from) - a.thr*(self.from-b.from))/(b.from-a.from)
53
+ return (b.thr*(from-a.from) - a.thr*(from-b.from))/(b.from-a.from)
52
54
  end
53
- def load_hits() self.rrun "y <- x[x$V6>=#{self.from} & x$V6<=#{self.to},];" end
54
- def previous() (self.from == 1) ? nil : self.data.win_at_col(self.from - 1) end
55
- def next() (self.to == self.data.aln.cols) ? nil : self.data.win_at_col(self.to + 1) end
56
- def thr_notnil() (@thr.nil? or @thr.infinite?) ? self.around_thr : @thr end
57
- def fps() self.hits - self.tps end
58
- def almost_empty() self.fps < 3 or self.tps < 3 end
59
- def length() self.to - self.from + 1 end
60
- def rrun(cmd, type=nil) self.data.rrun cmd, type end
61
- def to_s() [self.from, self.to, self.hits, self.tps, self.thr_notnil].join("\t") + "\n" end
55
+ def load_hits() self.rrun "y <- x[x$V6>=#{from} & x$V6<=#{to},];" end
56
+ def previous() (from == 1) ? nil : data.win_at_col(from - 1) end
57
+ def next() (to == data.aln.cols) ? nil : data.win_at_col(to + 1) end
58
+ def thr_notnil() (@thr.nil? or @thr.infinite?) ? around_thr : @thr end
59
+ def fps() hits - tps end
60
+ def almost_empty() fps < 3 or tps < 3 end
61
+ def length() to - from + 1 end
62
+ def rrun(cmd, type=nil) data.rrun(cmd, type) end
63
+ def to_s() [from, to, hits, tps, thr_notnil].join("\t") + "\n" end
62
64
  end
63
65
 
@@ -2,21 +2,27 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jun-05-2015
5
+ # @update Sep-11-2015
6
6
  #
7
7
 
8
8
  require 'json'
9
+ require 'rocker/protein-set'
10
+ require 'rocker/genome-set'
9
11
 
10
12
  class ROCker
11
13
  #================================[ Class ]
12
- @@EBIREST = 'http://www.ebi.ac.uk/Tools'
13
- @@DEFAULTS.merge!({:positive=>[], :negative=>[], :genomefrx=>1.0, :seqdepth=>0.03, :readlen=>100, :minovl=>50,
14
+ @@EBIREST = "http://www.ebi.ac.uk/Tools"
15
+ @@DEFAULTS.merge!({positive:[], negative:[], seqdepth:0.03, readlen:100,
16
+ minovl:50,
14
17
  # Ext. Software
15
- :aligner=>:clustalo, :simulator=>:grinder,
16
- :simulatorbin=>{:grinder=>'grinder'},
17
- :simulatorcmd=>{:grinder=>'%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'},
18
- :alignerbin=>{:muscle=>'muscle', :clustalo=>'clustalo'},
19
- :alignercmd=>{:muscle=>'%1$s -in "%2$s" -out "%3$s" -quiet', :clustalo=>'%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'}
18
+ aligner: :clustalo, simulator: :grinder,
19
+ simulatorbin:{grinder:"grinder"},
20
+ simulatorcmd:{grinder:"%1$s -reference_file \"%2$s\" -cf \"%3$f\" " +
21
+ "-dc '-~*NnKkMmRrYySsWwBbVvHhDdXx' -md uniform 0.1 -mr 95 5 " +
22
+ "-rd %4$d uniform 5 -base_name \"%5$s\""},
23
+ alignerbin:{muscle:"muscle", clustalo:"clustalo"},
24
+ alignercmd:{muscle:"%1$s -in \"%2$s\" -out \"%3$s\" -quiet",
25
+ clustalo:"%1$s -i \"%2$s\" -o \"%3$s\" --threads=%4$d --force"}
20
26
  })
21
27
  @@HAS_BUILD_GEMS = nil
22
28
  def self.ebirest() @@EBIREST ; end
@@ -33,90 +39,65 @@ class ROCker
33
39
  end
34
40
 
35
41
  #================================[ Utilities ]
36
- def genes2genomes(gene_ids)
37
- genomes = []
38
- ids = Array.new(gene_ids)
39
- while ids.size>0
40
- doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
41
- genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
42
- r=ln.split('; ')
43
- {:genome_id=>r[1], :transl_id=>r[2]}
44
- end
45
- end
46
- genomes.uniq
47
- end
48
- def genome2taxid(genome_id)
49
- ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
50
- return ln if ln.nil?
51
- ln.sub(/.*"taxon:(\d+)".*/, "\\1")
52
- end
53
- def genome2taxon(genome_id, rank='species')
54
- xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
55
- xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
56
- end
57
42
  def restcall(url, outfile=nil)
58
43
  $stderr.puts " # Calling: #{url}" if @o[:debug]
59
- response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
60
- raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
44
+ response = RestClient::Request.execute(:method=>:get, :url=>url,
45
+ :timeout=>600)
46
+ raise "Unable to reach EBI REST client, error code " +
47
+ response.code.to_s + "." unless response.code == 200
61
48
  unless outfile.nil?
62
- ohf = File.open(outfile, 'w')
49
+ ohf = File.open(outfile, "w")
63
50
  ohf.print response.to_s
64
51
  ohf.close
65
52
  end
66
53
  response.to_s
67
54
  end
68
55
  def ebiFetch(db, ids, format, outfile=nil)
69
- url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
70
- res = self.restcall url
71
- unless outfile.nil?
72
- ohf = File.open(outfile, 'w')
73
- ohf.print res
74
- ohf.close
75
- end
76
- res
56
+ url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
57
+ "#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
58
+ self.restcall url, outfile
77
59
  end
78
- def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
79
- positive_coords = {}
80
- genomes_org = {}
60
+ def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
61
+ coords = {}
81
62
  i = 0
82
63
  genome_ids.each do |genome_id|
83
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids.size} in first thread. \r" if thread_id==0 and not @o[:q]
84
- unless @o[:pertaxon].nil?
85
- genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
86
- genomes_org[ genome_taxon.to_sym ] ||= []
87
- genomes_org[ genome_taxon.to_sym ] << genome_id
88
- end
64
+ print " * scanning #{(i+=1).ordinalize} genome out of " +
65
+ "#{genome_ids.size} in first thread. \r" if
66
+ thread_id==0 and not @o[:q]
89
67
  genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
90
68
  if @o[:reuse] and File.size? genome_file
91
- ifh = File.open(genome_file, 'r')
69
+ ifh = File.open(genome_file, "r")
92
70
  doc = ifh.readlines.grep(/^[^#]/)
93
71
  ifh.close
94
72
  else
95
73
  genome_file=nil unless @o[:noclean]
96
- doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
74
+ doc = ebiFetch(:embl, [genome_id], :gff3,
75
+ genome_file).split("\n").grep(/^[^#]/)
97
76
  end
98
77
  doc.each do |ln|
99
78
  next if ln =~ /^#/
100
79
  r = ln.chomp.split /\t/
101
80
  next if r.size < 9
102
- prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
103
- p = prots.select{ |id| protein_ids.include? id }.first
104
- trans = r[8].split(/;/).grep(/^protein_id=/){ |pid| pid.split(/=/)[1] }
105
- t = trans.select{ |id| transl_ids.include? id }.first
81
+ prots = r[8].split(/;/).grep(
82
+ /^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
83
+ p = prots.select{ |id| pset.ids.include? id }.first
84
+ trans = r[8].split(/;/).grep(
85
+ /^protein_id=/){ |pid| pid.split(/=/)[1] }
86
+ t = trans.select{ |id| pset.tranids.include? id }.first
106
87
  next if p.nil? and t.nil?
107
- positive_coords[ r[0].to_sym ] ||= []
108
- positive_coords[ r[0].to_sym ] << {
109
- :prot_id => p,
110
- :tran_id => t,
111
- :from => r[3].to_i,
112
- :to => r[4].to_i,
113
- :strand => r[6]
88
+ coords[ r[0].to_sym ] ||= []
89
+ coords[ r[0].to_sym ] << {
90
+ prot_id: p,
91
+ tran_id: t,
92
+ from: r[3].to_i,
93
+ to: r[4].to_i,
94
+ strand: r[6]
114
95
  }
115
96
  end
116
97
  end
117
98
  print "\n" if thread_id==0 and not @o[:q]
118
- ofh = File.open json_file, "w"
119
- ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
99
+ ofh = File.open(json_file, "w")
100
+ ofh.print({coords:coords}.to_json)
120
101
  ofh.close
121
102
  end
122
103
 
@@ -124,211 +105,251 @@ class ROCker
124
105
  def build!
125
106
  # Check requirements
126
107
  puts "Testing environment." unless @o[:q]
127
- @o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
128
- @o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
129
- @o[:alignercmd] = @o[:alignercmd][@o[:aligner]] if @o[:alignercmd].is_a? Hash
130
- @o[:simulatorcmd] = @o[:simulatorcmd][@o[:simulator]] if @o[:simulatorcmd].is_a? Hash
131
- @o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
132
- @o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
108
+ { searchcmd: :search, makedbcmd: :search,
109
+ alignercmd: :aligner, alignerbin: :aligner,
110
+ simulatorcmd: :simulator, simulatorbin: :simulator
111
+ }.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
133
112
  @o[:nosearch]=true if @o[:nosimulate]
134
- raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
135
- @o[:positive] += @o[:posori] unless @o[:posori].nil?
136
- @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
137
- @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
138
- unless @o[:aln].nil?
139
- aln = Alignment.new
140
- aln.read_fasta @o[:aln]
141
- @o[:positive] += aln.get_ids
142
- end
143
- raise "-p or -P are mandatory." if @o[:positive].size==0
113
+ raise "Unsatisfied requirements, please see the help message (-h)." unless
114
+ ROCker.has_build_gems?
115
+ protein_set = {}
116
+ protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
117
+ protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
118
+ raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
144
119
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
145
- if @o[:positive].size == 1 and not @o[:noaln]
146
- warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
120
+ if protein_set[:+].size==1 and not @o[:noaln]
121
+ warn "\nWARNING: Positive set contains only one sequence, turning " +
122
+ "off alignment.\n\n"
147
123
  @o[:noaln] = true
148
124
  end
149
125
  unless @o[:nosimulate]
150
- self.bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
126
+ self.bash("#{@o[:simulatorbin]} --version",
127
+ "--simulator-bin must be executable. Is Grinder installed?") if
128
+ @o[:simulator]==:grinder
151
129
  end
152
130
  unless @o[:noaln]
153
- self.bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
154
- self.bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
131
+ self.bash("#{@o[:alignerbin]} -version",
132
+ "--aligner-bin must be executable. Is Muscle installed?") if
133
+ @o[:aligner]==:muscle
134
+ self.bash("#{@o[:alignerbin]} --version",
135
+ "--aligner-bin must be executable. Is ClustalOmega installed?") if
136
+ @o[:aligner]==:clustalo
155
137
  end
156
138
  unless @o[:nosearch]
157
- self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
158
- self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
139
+ self.bash("#{@o[:searchbins]}makeblastdb -version",
140
+ "--search-bins must contain executables. Is BLAST+ installed?") if
141
+ @o[:search]==:blast
142
+ self.bash("#{@o[:searchbins]}diamond --help",
143
+ "--search-bins must contain executables. Is DIAMOND installed?") if
144
+ @o[:search]==:diamond
159
145
  end
160
146
 
161
147
  # Download genes
162
148
  puts "Downloading gene data." unless @o[:q]
163
149
  ref_file = @o[:baseout] + ".ref.fasta"
164
- if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
150
+ if not protein_set[:+].aln.nil?
165
151
  puts " * reusing aligned sequences as positive set." unless @o[:q]
166
- f = File.open(ref_file, "w")
167
- f.print aln.to_seq_s
168
- f.close
152
+ protein_set[:+].get_from_aln(ref_file, protein_set[:+].aln)
169
153
  @o[:noaln] = true
170
154
  elsif @o[:reuse] and File.size? ref_file
171
155
  puts " * reusing positive set: #{ref_file}." unless @o[:q]
172
156
  else
173
- puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
174
- $stderr.puts " # #{@o[:positive]}" if @o[:debug]
175
- ids = Array.new(@o[:positive])
176
- f = File.open(ref_file, "w")
177
- while ids.size>0
178
- f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
179
- end
180
- f.close
157
+ puts " * downloading #{protein_set[:+].size} sequence(s) in " +
158
+ "positive set." unless @o[:q]
159
+ $stderr.puts " # #{protein_set[:+].ids}" if @o[:debug]
160
+ protein_set[:+].download(ref_file)
181
161
  end
182
- genome_ids = {:positive=>[], :negative=>[]}
183
- transl_ids = {:positive=>[], :negative=>[]}
184
- [:positive, :negative].each do |set|
185
- unless @o[set].size==0
186
- puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
187
- $stderr.puts " # #{@o[set]}" if @o[:debug]
188
- r = genes2genomes(@o[set])
189
- genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
190
- transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
162
+ [:+, :-].each do |set|
163
+ unless protein_set[set].empty?
164
+ puts " * linking genomes from #{protein_set[set].size} " +
165
+ "[#{set.to_s}] sequence(s)." unless @o[:q]
166
+ $stderr.puts " # #{protein_set[set].ids}" if @o[:debug]
167
+ protein_set[set].get_genomes!
191
168
  end
192
169
  end
193
- raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
194
- genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
195
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
196
- all_genome_ids = genome_ids.values.reduce(:+).uniq
170
+ raise "No genomes associated with the positive set." if
171
+ protein_set[:+].genomes.empty?
172
+ genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
173
+ :- => GenomeSet.new(self, protein_set[:-].genomes)}
197
174
 
198
175
  # Locate genes
199
176
  puts "Analyzing genome data." unless @o[:q]
200
177
  coords_file = @o[:baseout] + ".src.coords"
201
178
  if @o[:reuse] and File.size? coords_file
202
179
  puts " * reusing coordinates: #{coords_file}." unless @o[:q]
203
- c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
180
+ c = JSON.parse File.read(coords_file), {symbolize_names:true}
204
181
  positive_coords = c[:positive_coords]
205
- genome_org = c[:genome_org]
182
+ negative_coords = c[:negative_coords]
183
+ genome_set[:+].taxa = c[:taxa_pos]
184
+ genome_set[:-].taxa = c[:taxa_neg]
206
185
  else
207
- thrs = [@o[:thr], genome_ids[:positive].size].min
208
- puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
209
- $stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
210
- $stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
211
- $stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
212
- thr_obj = []
213
- (0 .. (thrs-1)).each do |thr_i|
214
- ids_to_parse = []
215
- (0 .. (genome_ids[:positive].size-1)).each do |i|
216
- ids_to_parse << genome_ids[:positive][i] if (i % thrs)==thr_i
217
- end
218
- json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
219
- thr_obj << json_file
220
- fork do
221
- get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
222
- end
223
- end
224
- Process.waitall
225
- # Combine results
226
- positive_coords = {}
227
- genomes_org = {}
228
- genome_org = {}
229
- thr_obj.each do |t|
230
- raise "Thread failed without error trace: #{t}" unless File.exist? t
231
- o = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
232
- o[:positive_coords].each_pair do |k,v|
233
- positive_coords[ k ] ||= []
234
- positive_coords[ k ] += v
186
+ all_coords = {}
187
+ [:+, :-].each do |set_type|
188
+ all_coords[set_type] = {}
189
+ next if genome_set[set_type].empty?
190
+ thrs = [@o[:thr], genome_set[set_type].size].min
191
+ puts " * downloading and parsing #{genome_set[set_type].size} " +
192
+ "GFF3 document(s) in #{thrs} threads." unless @o[:q]
193
+ $stderr.puts " # Looking for translations: " +
194
+ "#{protein_set[set_type].tranids}" if @o[:debug]
195
+ $stderr.puts " # Looking into: #{genome_set[set_type].ids}" if
196
+ @o[:debug]
197
+ # Launch threads
198
+ thr_obj = []
199
+ (0 .. (thrs-1)).each do |thr_i|
200
+ ids_to_parse = []
201
+ (0 .. (genome_set[set_type].size-1)).each do |i|
202
+ ids_to_parse << protein_set[set_type].genomes[i] if
203
+ (i % thrs) == thr_i
204
+ end
205
+ json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
206
+ thr_obj << json_file
207
+ fork do
208
+ get_coords_from_gff3(ids_to_parse, protein_set[set_type],
209
+ thr_i, json_file)
210
+ end
235
211
  end
236
- o[:genomes_org].each_pair do |k,v|
237
- genomes_org[ k ] ||= []
238
- genomes_org[ k ] << v
212
+ # Combine results
213
+ Process.waitall
214
+ thr_obj.each do |t|
215
+ raise "Thread failed without error trace: #{t}" unless
216
+ File.exist? t
217
+ o = JSON.parse(File.read(t), {symbolize_names:true})
218
+ o[:coords].each_pair do |k,v|
219
+ all_coords[set_type][ k ] ||= []
220
+ all_coords[set_type][ k ] += v
221
+ end
222
+ File.unlink t
239
223
  end
240
- File.unlink t
241
- end
224
+ end # [:+, :-].each
225
+ positive_coords = all_coords[:+]
226
+ negative_coords = all_coords[:-]
242
227
  # Select one genome per taxon
243
228
  unless @o[:pertaxon].nil?
244
- genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
229
+ puts " Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
230
+ [:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
245
231
  end
246
- # Save coordinates
232
+ # Save coordinates and taxa
247
233
  ofh = File.open(coords_file, "w")
248
- ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
234
+ ofh.print JSON.pretty_generate({
235
+ positive_coords:positive_coords,
236
+ negative_coords:negative_coords,
237
+ taxa_pos:genome_set[:+].taxa,
238
+ taxa_neg:genome_set[:-].taxa})
249
239
  ofh.close
250
- end
240
+ end # if @o[:reuse] and File.size? coords_file ... else
251
241
  unless @o[:pertaxon].nil?
252
- genome_ids[:positive] = genome_org.values
253
- puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
242
+ puts " Using " +
243
+ [:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
244
+ " genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
254
245
  end
255
- all_genome_ids = genome_ids.values.reduce(:+).uniq
256
- found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+).compact.uniq
257
- unknown_pid = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil } }.reduce(:+).compact.uniq
258
- raise "Cannot find the genomic location of any provided sequence." if found.nil?
259
- missing = @o[:positive] - found
260
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0
246
+ found = protein_set[:+].in_coords(positive_coords)
247
+ raise "Cannot find the genomic location of any provided sequence." if
248
+ found.nil?
249
+ missing = protein_set[:+].ids - found
250
+ warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
251
+ "sequence(s) #{missing.join(",")}.\n\n" unless missing.empty?
261
252
 
262
253
  # Download genomes
263
- genomes_file = @o[:baseout] + '.src.fasta'
254
+ genome_set[:all] = GenomeSet.new(self,
255
+ genome_set[ :+ ].ids + genome_set[ :- ].ids)
256
+ genomes_file = @o[:baseout] + ".src.fasta"
264
257
  if @o[:reuse] and File.size? genomes_file
265
258
  puts " * reusing existing file: #{genomes_file}." unless @o[:q]
266
259
  else
267
- puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
268
- $stderr.puts " # #{all_genome_ids}" if @o[:debug]
269
- ids = Array.new(all_genome_ids)
270
- ofh = File.open(genomes_file, 'w')
271
- while ids.size>0
272
- ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
273
- end
274
- ofh.close
260
+ puts " * downloading " + genome_set[:all].size.to_s +
261
+ " genome(s) in FastA." unless @o[:q]
262
+ $stderr.puts " # #{genome_set[:all].ids}" if @o[:debug]
263
+ genome_set[:all].download genomes_file
275
264
  end
276
265
 
277
266
  # Generate metagenome
278
267
  unless @o[:nosimulate]
279
268
  puts "Generating in silico metagenome" unless @o[:q]
280
269
  if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
281
- puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
270
+ puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
271
+ @o[:q]
282
272
  else
283
- all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
273
+ all_src = File.readlines("#{@o[:baseout]}.src.fasta"
274
+ ).select{ |l| l =~ /^>/ }.size
284
275
  thrs = [@o[:thr], all_src].min
285
- puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
286
- $stderr.puts " # #{positive_coords}" if @o[:debug]
287
276
  thr_obj = []
288
- seqs_per_thr = (all_src/thrs).ceil
277
+ seqs_per_thr = (all_src.to_f/thrs).ceil
278
+ thrs = (all_src.to_f/seqs_per_thr).ceil
279
+ puts " * simulating metagenomes and tagging positive reads in " +
280
+ thrs.to_s + " threads." unless @o[:q]
281
+ $stderr.puts " # #{positive_coords}" if @o[:debug]
289
282
  (0 .. (thrs-1)).each do |thr_i|
290
283
  output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
291
284
  thr_obj << output
292
285
  fork do
293
286
  seqs_a = thr_i*seqs_per_thr + 1
294
- seqs_b = [seqs_a + seqs_per_thr, all_src].min
287
+ seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
295
288
  # Create sub-fasta
296
- ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w")
297
- ifh = File.open("#{@o[:baseout]}.src.fasta", "r")
289
+ ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
290
+ ifh = File.open("#{@o[:baseout]}.src.fasta","r")
298
291
  seq_i = 0
299
292
  while l = ifh.gets
300
293
  seq_i+=1 if l =~ /^>/
301
- break if seq_i > seqs_b
294
+ break if seq_i > seqs_b
302
295
  ofh.print l if seq_i >= seqs_a
303
296
  end
304
297
  ifh.close
305
298
  ofh.close
306
299
 
307
- # Run simulator (except if the temporal file is already there and can be reused)
308
- unless @o[:reuse] and File.size? @o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa"
309
- bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
310
- end
300
+ # Run simulator (except if the temporal file is already
301
+ # there and can be reused)
302
+ bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
303
+ "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
304
+ @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
305
+ "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}") unless
306
+ @o[:reuse] and
307
+ File.size? @o[:baseout] +
308
+ ".mg.tmp.#{thr_i.to_s}-reads.fa"
311
309
 
312
- # Tag positives
313
- puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
314
- ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
315
- ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
310
+ # Tag positive and negative reads
311
+ puts " * tagging reads [thread #{thr_i}]." unless
312
+ @o[:q]
313
+ ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
314
+ "r")
315
+ ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
316
316
  while l = ifh.gets
317
317
  if l =~ /^>/
318
- rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
319
- raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
318
+ rd = %r{
319
+ ^>(?<id>\d+)\s
320
+ reference=[A-Za-z]+\|
321
+ (?<genome_id>[A-Za-z0-9_]+)\|.*\s
322
+ position=(?<comp>complement\()?(?<from>\d+)\.\.
323
+ (?<to>\d+)\)?\s
324
+ }x.match(l)
325
+ raise "Cannot parse simulated read's defline, are " +
326
+ "you using Grinder?: #{l}" if rd.nil?
320
327
  positive = false
321
328
  positive_coords[rd[:genome_id].to_sym] ||= []
322
329
  positive_coords[rd[:genome_id].to_sym].each do |gn|
323
330
  left = rd[:to].to_i - gn[:from]
324
331
  right = gn[:to] - rd[:from].to_i
325
- if (left*right >= 0) and ([left, right].min >= @o[:minovl])
332
+ if (left*right >= 0) and
333
+ ([left, right].min >= @o[:minovl])
326
334
  positive = true
327
335
  break
328
336
  end
329
337
  end
330
- l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
331
- "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
338
+ negative = false
339
+ negative_coords[rd[:genome_id].to_sym] ||= []
340
+ negative_coords[rd[:genome_id].to_sym].each do |gn|
341
+ left = rd[:to].to_i - gn[:from]
342
+ right = gn[:to] - rd[:from].to_i
343
+ if (left*right >= 0) and
344
+ ([left, right].min >= @o[:minovl])
345
+ negative = true
346
+ break
347
+ end
348
+ end
349
+ l = ">#{thr_i.to_s}_#{rd[:id]}" +
350
+ "#{positive ? "@%" : (negative ? "@$" : "")} " +
351
+ "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
352
+ "#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
332
353
  end
333
354
  ofh.print l
334
355
  end
@@ -338,9 +359,10 @@ class ROCker
338
359
  end # (1 .. thrs).each
339
360
  Process.waitall
340
361
  # Concatenate results
341
- ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
362
+ ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
342
363
  thr_obj.each do |t|
343
- raise "Thread failed without error trace: #{t}" unless File.exist? t
364
+ raise "Thread failed without error trace: #{t}" unless
365
+ File.exist? t
344
366
  ifh = File.open(t, "r")
345
367
  while l = ifh.gets
346
368
  ofh.print l
@@ -356,23 +378,33 @@ class ROCker
356
378
  unless @o[:noaln]
357
379
  puts "Aligning reference set." unless @o[:q]
358
380
  if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
359
- puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
381
+ puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless
382
+ @o[:q]
360
383
  else
361
- bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
362
- puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
384
+ bash(sprintf(@o[:alignercmd],
385
+ @o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
386
+ "#{@o[:baseout]}.ref.aln", @o[:thr]))
387
+ puts " +--\n | IMPORTANT NOTE: Manually checking the alignment " +
388
+ "before\n | the 'compile' step is *strongly* encouraged.\n " +
389
+ "+--\n" unless @o[:q]
363
390
  end
364
391
  end
365
392
 
366
393
  # Run similarity search
367
394
  unless @o[:nosearch]
368
- puts "Running homology search." unless @o[:q]
395
+ puts "Running similarity search." unless @o[:q]
369
396
  if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
370
- puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
397
+ puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless
398
+ @o[:q]
371
399
  else
372
400
  puts " * preparing database." unless @o[:q]
373
- bash sprintf(@o[:makedbcmd][@o[:search]], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
401
+ bash(sprintf(@o[:makedbcmd],
402
+ @o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
403
+ "#{@o[:baseout]}.ref"))
374
404
  puts " * running similarity search." unless @o[:q]
375
- bash sprintf(@o[:searchcmd][@o[:search]], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
405
+ bash(sprintf(@o[:searchcmd],
406
+ @o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
407
+ "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
376
408
  end
377
409
  end
378
410
 
@@ -382,7 +414,8 @@ class ROCker
382
414
  sff = %w{.src.xml .src.fasta}
383
415
  sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
384
416
  sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
385
- sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
417
+ sff.each { |sf| File.unlink @o[:baseout] + sf if
418
+ File.exist? @o[:baseout] + sf }
386
419
  end
387
420
  end # build!
388
421
  end # ROCker