bio-rocker 0.2.5 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ROCker +36 -26
- data/lib/rocker.rb +18 -431
- data/lib/rocker/step/build.rb +389 -0
- data/lib/rocker/step/compile.rb +53 -0
- data/lib/rocker/step/filter.rb +32 -0
- data/lib/rocker/step/plot.rb +93 -0
- data/lib/rocker/step/search.rb +20 -0
- metadata +23 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8a10cdc85d8b7b54c21d26f12b90c0b3dff4f82
|
4
|
+
data.tar.gz: c837b3c6687f6705dbfc7c959824dd530e7ee932
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 869cdadfed2dad125fc11c03133e2f56df53074a54b0f35d8ea5c6674029e7069332e4c35c486b1f39a417aeff01932a7eee30da44e15de036ce1a2d878d15d4
|
7
|
+
data.tar.gz: 823b30e7923c243f8dc8bb122f50426898e2cdda634516cc53e0325b150946b0699e0a9257a55c06ed8868a0f843f0274ed1d23bdf8ef2de9629fafb66f33552
|
data/bin/ROCker
CHANGED
@@ -16,8 +16,8 @@ require 'optparse'
|
|
16
16
|
$t = {
|
17
17
|
'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
|
18
18
|
'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
|
19
|
-
'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
|
20
19
|
'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
|
20
|
+
'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
|
21
21
|
'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
|
22
22
|
}
|
23
23
|
task = (ARGV.size > 0 ? ARGV.shift : '').downcase
|
@@ -43,49 +43,51 @@ opts = OptionParser.new do |opt|
|
|
43
43
|
opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
|
44
44
|
opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
|
45
45
|
opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
|
46
|
-
#opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
|
47
46
|
opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
|
48
47
|
opt.separator ""
|
49
48
|
opt.separator "+ ADVANCED BUILDING ARGUMENTS"
|
50
49
|
opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
|
51
50
|
opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
|
52
51
|
opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
|
53
|
-
opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
|
54
|
-
opt.on("-
|
55
|
-
opt.on(
|
56
|
-
opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."
|
57
|
-
|
58
|
-
opt.on( "--
|
59
|
-
opt.on( "--
|
52
|
+
opt.on("-s", "--seqdepth NUMBER", "Sequencing depth (reads/bp) to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
|
53
|
+
opt.on("-l", "--readlen INTEGER", "Average read length of in silico metagenome (in bp). By default: '#{ROCker.default :readlen}'."){ |v| o[:readlen]=v.to_i }
|
54
|
+
opt.on("-v", "--overlap INTEGER", "Minimum overlap (in bp) with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_i }
|
55
|
+
opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."){ |v| o[:pertaxon]=v.downcase }
|
56
|
+
opt.on( "--genome-frx NUMBER", "Fraction to subsample genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}."){ |v| o[:genomefrx]=v.to_f }
|
57
|
+
opt.on( "--nosimulate", "Do not simulate metagenome. Implies --nosearch. By default, metagenome is simulated."){ |v| o[:nosimulate]=v }
|
58
|
+
opt.on( "--nosearch", "Do not execute similarity search. By default, it is executed."){ |v| o[:nosearch]=v }
|
60
59
|
opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
|
61
60
|
opt.on( "--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
|
62
61
|
opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
|
63
62
|
opt.separator ""
|
64
63
|
opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
|
65
|
-
opt.on("
|
66
|
-
opt.on("
|
67
|
-
opt.on("
|
68
|
-
opt.on( "--
|
69
|
-
|
70
|
-
opt.on("--
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
64
|
+
opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
|
65
|
+
opt.on( "--simulator STR", "In silico metagenome simulator to use. Supported: 'grinder'. By default: '#{ROCker.default :simulator}'.") { |v| o[:ssimulator]=v.to_sym }
|
66
|
+
opt.on( "--aligner STR", "Multiple alignment algorithm to use. Supported: 'clustalo' and 'muscle'. By default: '#{ROCker.default :aligner}'.") { |v| o[:aligner]=v.to_sym }
|
67
|
+
opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
|
68
|
+
opt.on( "--simulator-bin PATH", "Path to the simulator executable. By default in the $PATH: '#{ROCker.default(:simulatorbin).values.join("' or '")}'.") { |v| o[:simulatorbin]=v }
|
69
|
+
opt.on( "--aligner-bin PATH", "Path to the aligner executable. By default in the $PATH: '#{ROCker.default(:alignerbin).values.join("' or '")}'.") { |v| o[:alignerbin]=v }
|
70
|
+
opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
|
71
|
+
*ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
|
72
|
+
opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
|
73
|
+
*ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
|
74
|
+
opt.on( "--simulator-cmd STR", "Command calling simulator, where %1$s: binary, %2$s: input, %3$s: seq. depth (X), %4$d: read len., %5$s: output.",
|
75
|
+
*ROCker.default(:simulatorcmd).keys.map{|k| "By default if --simulator #{k}: '#{ROCker.default(:simulatorcmd)[k]}'."}){ |v| o[:simulatorcmd]=v }
|
76
|
+
opt.on("--aligner-cmd STR", "Command calling aligner, where %1$s: binary, %2$s: input, %3$s: output, %4$d: threads.",
|
77
|
+
*ROCker.default(:alignercmd).keys.map{|k| "By default if --aligner #{k}: '#{ROCker.default(:alignercmd)[k]}'."}){ |v| o[:alignercmd]=v }
|
76
78
|
when 'compile'
|
77
79
|
opt.separator "+ COMPILATION ARGUMENTS"
|
78
80
|
opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
|
79
81
|
opt.on("-b", "--ref-blast PATH",
|
80
82
|
"Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
|
81
83
|
opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
|
82
|
-
opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
|
83
84
|
opt.separator ""
|
84
85
|
opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
|
85
86
|
opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
|
86
87
|
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
|
87
88
|
opt.on( "--norefine", "Do not refine windows."){ o[:refine]=false }
|
88
89
|
opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
|
90
|
+
opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
|
89
91
|
opt.separator ""
|
90
92
|
opt.separator "+ INPUT/OUTPUT"
|
91
93
|
opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
|
@@ -107,15 +109,23 @@ opts = OptionParser.new do |opt|
|
|
107
109
|
opt.separator " 5. Bit score threshold set for the window."
|
108
110
|
opt.separator " The file also contains the alignment (commented with #:)."
|
109
111
|
opt.separator ""
|
112
|
+
when 'search'
|
113
|
+
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
114
|
+
opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
|
115
|
+
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
116
|
+
opt.separator ""
|
117
|
+
opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
|
118
|
+
opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
|
119
|
+
opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
|
120
|
+
opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
|
121
|
+
*ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
|
122
|
+
opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
|
123
|
+
*ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
|
110
124
|
when 'filter'
|
111
125
|
opt.separator "+ FILTERING ARGUMENTS"
|
112
126
|
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
113
127
|
opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
|
114
128
|
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
115
|
-
when 'search'
|
116
|
-
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
117
|
-
opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
|
118
|
-
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
119
129
|
when 'plot'
|
120
130
|
opt.separator "+ PLOTTING ARGUMENTS"
|
121
131
|
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
@@ -127,7 +137,7 @@ opts = OptionParser.new do |opt|
|
|
127
137
|
opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
|
128
138
|
opt.on( "--color", "Color alignment by amino acid."){ o[:color]=true }
|
129
139
|
opt.on( "--no-transparency", "Do not use (semi-)transparencies."){ |v| o[:transparency] = v }
|
130
|
-
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
|
140
|
+
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}."){ |v| o[:minscore]=v.to_f }
|
131
141
|
opt.on( "--stats-impact", "Plot impact on statistics, instead of absolute values per window."){ o[:impact]=true }
|
132
142
|
opt.on( "--stats-ylim STRING", "Limits of the Y-axis in the bottom panel. By default: '-2,.1' if --stats-impact is set, '50,100' otherwise."){ |v| o[:ylim]=v }
|
133
143
|
opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
|
data/lib/rocker.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Jun-05-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/blasthit'
|
@@ -10,40 +10,20 @@ require 'rocker/rocdata'
|
|
10
10
|
|
11
11
|
class ROCker
|
12
12
|
#================================[ Class ]
|
13
|
-
@@EBIREST = 'http://www.ebi.ac.uk/Tools'
|
14
13
|
@@DEFAULTS = {
|
15
14
|
# General
|
16
|
-
:q=>false, :r=>'R', :nucl=>false, :debug=>false,
|
17
|
-
#
|
18
|
-
:
|
19
|
-
|
20
|
-
:
|
21
|
-
:
|
22
|
-
|
23
|
-
:
|
24
|
-
:
|
25
|
-
# Compile
|
26
|
-
:refine=>true, :win=>20, :minscore=>0,
|
27
|
-
# Filter
|
28
|
-
:sbj=>[],
|
29
|
-
# Plot
|
30
|
-
:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
|
15
|
+
:q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
|
16
|
+
# External software
|
17
|
+
:searchbins=>'',
|
18
|
+
:searchcmd=>{
|
19
|
+
:blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
|
20
|
+
:diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
|
21
|
+
:makedbcmd=>{
|
22
|
+
:blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
|
23
|
+
:diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
|
31
24
|
}
|
32
|
-
@@HAS_BUILD_GEMS = nil
|
33
|
-
def self.ebirest() @@EBIREST ; end
|
34
25
|
def self.defaults() @@DEFAULTS ; end
|
35
26
|
def self.default(k) @@DEFAULTS[k] ; end
|
36
|
-
def self.has_build_gems?
|
37
|
-
return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
|
38
|
-
@@HAS_BUILD_GEMS = TRUE
|
39
|
-
begin
|
40
|
-
require 'rubygems'
|
41
|
-
require 'restclient'
|
42
|
-
rescue LoadError
|
43
|
-
@@HAS_BUILD_GEMS = FALSE
|
44
|
-
end
|
45
|
-
@@HAS_BUILD_GEMS
|
46
|
-
end
|
47
27
|
|
48
28
|
#================================[ Instance ]
|
49
29
|
attr_reader :o
|
@@ -53,374 +33,6 @@ class ROCker
|
|
53
33
|
RInterface.R_BIN = opts[:r] unless opts[:r].nil?
|
54
34
|
end
|
55
35
|
|
56
|
-
#================================[ Build ]
|
57
|
-
def build!
|
58
|
-
# Check requirements
|
59
|
-
puts "Testing environment." unless @o[:q]
|
60
|
-
@o[:noblast]=true if @o[:nomg]
|
61
|
-
raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
|
62
|
-
@o[:positive] += @o[:posori] unless @o[:posori].nil?
|
63
|
-
@o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
|
64
|
-
@o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
|
65
|
-
unless @o[:aln].nil?
|
66
|
-
aln = Alignment.new
|
67
|
-
aln.read_fasta @o[:aln]
|
68
|
-
@o[:positive] += aln.get_ids
|
69
|
-
end
|
70
|
-
raise "-p or -P are mandatory." if @o[:positive].size==0
|
71
|
-
raise "-o/--baseout is mandatory." if @o[:baseout].nil?
|
72
|
-
if @o[:positive].size == 1 and not @o[:noaln]
|
73
|
-
warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
|
74
|
-
@o[:noaln] = true
|
75
|
-
end
|
76
|
-
self.bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
|
77
|
-
self.bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
|
78
|
-
self.bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]
|
79
|
-
# Download genes
|
80
|
-
puts "Downloading gene data." unless @o[:q]
|
81
|
-
f = File.open(@o[:baseout] + '.ref.fasta', 'w')
|
82
|
-
if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
|
83
|
-
puts " * re-using aligned sequences as positive set." unless @o[:q]
|
84
|
-
f.print aln.to_seq_s
|
85
|
-
@o[:noaln] = true
|
86
|
-
else
|
87
|
-
puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
|
88
|
-
$stderr.puts " # #{@o[:positive]}" if @o[:debug]
|
89
|
-
ids = Array.new(@o[:positive])
|
90
|
-
while ids.size>0
|
91
|
-
f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
|
92
|
-
end
|
93
|
-
end
|
94
|
-
f.close
|
95
|
-
genome_ids = {:positive=>[], :negative=>[]}
|
96
|
-
[:positive, :negative].each do |set|
|
97
|
-
unless @o[set].size==0
|
98
|
-
puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
|
99
|
-
$stderr.puts " # #{@o[set]}" if @o[:debug]
|
100
|
-
genome_ids[set] = genes2genomes(@o[set])
|
101
|
-
end
|
102
|
-
end
|
103
|
-
raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
|
104
|
-
genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
|
105
|
-
raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
|
106
|
-
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
107
|
-
|
108
|
-
# Locate genes
|
109
|
-
puts "Analyzing genome data." unless @o[:q]
|
110
|
-
puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
|
111
|
-
$stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
|
112
|
-
positive_coords = {}
|
113
|
-
genome_org = {}
|
114
|
-
i = 0
|
115
|
-
genome_ids[:positive].each do |genome_id|
|
116
|
-
print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
|
117
|
-
unless @o[:pertaxon].nil?
|
118
|
-
genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
|
119
|
-
next unless genome_org[ genome_taxon ].nil?
|
120
|
-
genome_org[ genome_taxon ] = genome_id
|
121
|
-
end
|
122
|
-
$stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
|
123
|
-
genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
|
124
|
-
if @o[:reuse] and File.exist? genome_file
|
125
|
-
puts " * reusing existing file: #{genome_file}." unless @o[:q]
|
126
|
-
ifh = File.open(genome_file, 'r')
|
127
|
-
doc = ifh.readlines.grep(/^[^#]/)
|
128
|
-
ifh.close
|
129
|
-
else
|
130
|
-
genome_file=nil unless @o[:noclean]
|
131
|
-
res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
|
132
|
-
doc = res.split("\n").grep(/^[^#]/)
|
133
|
-
end
|
134
|
-
doc.each do |ln|
|
135
|
-
next if ln =~ /^#/
|
136
|
-
r = ln.chomp.split /\t/
|
137
|
-
next if r.size < 9
|
138
|
-
prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
|
139
|
-
p = prots.select{ |p| @o[:positive].include? p }.first
|
140
|
-
next if p.nil?
|
141
|
-
positive_coords[ r[0] ] ||= []
|
142
|
-
positive_coords[ r[0] ] << {
|
143
|
-
#:strand => r[6],
|
144
|
-
:prot_id => p,
|
145
|
-
:from => r[3].to_i,
|
146
|
-
:to => r[4].to_i
|
147
|
-
}
|
148
|
-
end
|
149
|
-
end
|
150
|
-
print "\n" unless @o[:q]
|
151
|
-
unless @o[:pertaxon].nil?
|
152
|
-
genome_ids[:positive] = genome_org.values
|
153
|
-
puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
|
154
|
-
end
|
155
|
-
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
156
|
-
found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
|
157
|
-
raise "Cannot find the genomic location of any provided sequence." if found.nil?
|
158
|
-
missing = @o[:positive] - found
|
159
|
-
warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
|
160
|
-
|
161
|
-
# Download genomes
|
162
|
-
genomes_file = @o[:baseout] + '.src.fasta'
|
163
|
-
if @o[:reuse] and File.exist? genomes_file
|
164
|
-
puts " * reusing existing file: #{genomes_file}." unless @o[:q]
|
165
|
-
else
|
166
|
-
puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
|
167
|
-
$stderr.puts " # #{all_genome_ids}" if @o[:debug]
|
168
|
-
ids = Array.new(all_genome_ids)
|
169
|
-
ofh = File.open(genomes_file, 'w')
|
170
|
-
while ids.size>0
|
171
|
-
ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
|
172
|
-
end
|
173
|
-
ofh.close
|
174
|
-
end
|
175
|
-
|
176
|
-
# Generate metagenome
|
177
|
-
unless @o[:nomg]
|
178
|
-
puts "Generating in silico metagenome" unless @o[:q]
|
179
|
-
if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
|
180
|
-
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
|
181
|
-
else
|
182
|
-
all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
|
183
|
-
thrs = [@o[:thr], all_src].min
|
184
|
-
puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
|
185
|
-
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
186
|
-
thr_obj = []
|
187
|
-
seqs_per_thr = (all_src/thrs).ceil
|
188
|
-
(0 .. (thrs-1)).each do |thr_i|
|
189
|
-
thr_obj << Thread.new do
|
190
|
-
Thread.current[:seqs_a] = thr_i*seqs_per_thr + 1
|
191
|
-
Thread.current[:seqs_b] = [Thread.current[:seqs_a] + seqs_per_thr, all_src].min
|
192
|
-
# Create sub-fasta
|
193
|
-
Thread.current[:ofh] = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", 'w')
|
194
|
-
Thread.current[:ifh] = File.open("#{@o[:baseout]}.src.fasta", 'r')
|
195
|
-
Thread.current[:seq_i] = 0
|
196
|
-
while Thread.current[:l] = Thread.current[:ifh].gets
|
197
|
-
Thread.current[:seq_i]+=1 if Thread.current[:l] =~ /^>/
|
198
|
-
break if Thread.current[:seq_i] > Thread.current[:seqs_b]
|
199
|
-
Thread.current[:ofh].print Thread.current[:l] if Thread.current[:seq_i] >= Thread.current[:seqs_a]
|
200
|
-
end
|
201
|
-
Thread.current[:ifh].close
|
202
|
-
Thread.current[:ofh].close
|
203
|
-
bash sprintf(@o[:grindercmd], @o[:grinder], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
|
204
|
-
# Tag positives
|
205
|
-
puts " * tagging positive reads." unless @o[:q]
|
206
|
-
Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
|
207
|
-
Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
|
208
|
-
while Thread.current[:l]=Thread.current[:ifh].gets
|
209
|
-
Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
|
210
|
-
unless Thread.current[:rd].nil?
|
211
|
-
Thread.current[:positive] = false
|
212
|
-
positive_coords[Thread.current[:rd][:genome_id]] ||= []
|
213
|
-
positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
|
214
|
-
Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
|
215
|
-
Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
|
216
|
-
if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
|
217
|
-
Thread.current[:positive] = true
|
218
|
-
break
|
219
|
-
end
|
220
|
-
end
|
221
|
-
Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
|
222
|
-
end
|
223
|
-
Thread.current[:ofh].print Thread.current[:l]
|
224
|
-
end
|
225
|
-
Thread.current[:ofh].close
|
226
|
-
Thread.current[:ifh].close
|
227
|
-
Thread.current[:output] = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
|
228
|
-
end # Thread.new do
|
229
|
-
end # (1 .. thrs).each
|
230
|
-
# Concatenate results
|
231
|
-
ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
|
232
|
-
thr_obj.each do |t|
|
233
|
-
t.join
|
234
|
-
raise "Thread failed without error trace: #{t}" if t[:output].nil?
|
235
|
-
ifh = File.open(t[:output], 'r')
|
236
|
-
while l = ifh.gets
|
237
|
-
ofh.print l
|
238
|
-
end
|
239
|
-
ifh.close
|
240
|
-
File.unlink t[:output]
|
241
|
-
end
|
242
|
-
ofh.close
|
243
|
-
end
|
244
|
-
end # unless @o[:nomg]
|
245
|
-
# Align references
|
246
|
-
unless @o[:noaln]
|
247
|
-
puts "Aligning reference set." unless @o[:q]
|
248
|
-
if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
|
249
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
|
250
|
-
else
|
251
|
-
bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
|
252
|
-
puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
|
253
|
-
end
|
254
|
-
end
|
255
|
-
# Run BLAST
|
256
|
-
unless @o[:noblast]
|
257
|
-
puts "Running homology search." unless @o[:q]
|
258
|
-
if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
|
259
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
|
260
|
-
else
|
261
|
-
puts " * preparing database." unless @o[:q]
|
262
|
-
bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
|
263
|
-
puts " * running BLAST." unless @o[:q]
|
264
|
-
bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
|
265
|
-
end
|
266
|
-
end
|
267
|
-
# Clean
|
268
|
-
unless @o[:noclean]
|
269
|
-
puts "Cleaning." unless @o[:q]
|
270
|
-
sff = %w{.src.xml .src.fasta}
|
271
|
-
sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nomg]
|
272
|
-
sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:noblast]
|
273
|
-
sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
|
274
|
-
end
|
275
|
-
end # build!
|
276
|
-
|
277
|
-
#================================[ Compile ]
|
278
|
-
def compile!
|
279
|
-
raise "-a/--alignment is mandatory." if @o[:aln].nil?
|
280
|
-
raise "-a/--alignment must exist." unless File.exist? @o[:aln]
|
281
|
-
if @o[:table].nil?
|
282
|
-
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
|
283
|
-
@o[:table] = "#{@o[:blast]}.table"
|
284
|
-
end
|
285
|
-
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
286
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
287
|
-
|
288
|
-
puts "Testing environment." unless @o[:q]
|
289
|
-
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
290
|
-
bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."
|
291
|
-
|
292
|
-
puts "Reading files." unless @o[:q]
|
293
|
-
puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
|
294
|
-
aln = Alignment.new
|
295
|
-
aln.read_fasta @o[:aln]
|
296
|
-
|
297
|
-
if File.exist? @o[:table]
|
298
|
-
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
299
|
-
else
|
300
|
-
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
301
|
-
blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
|
302
|
-
end
|
303
|
-
|
304
|
-
puts "Analyzing data." unless @o[:q]
|
305
|
-
puts " * computing windows." unless @o[:q]
|
306
|
-
data = ROCData.new(@o[:table], aln, @o[:win])
|
307
|
-
data.nucl = @o[:nucl]
|
308
|
-
if @o[:refine]
|
309
|
-
puts " * refining windows." unless @o[:q]
|
310
|
-
warn "Insufficient hits to refine results." unless data.refine! @o[:table]
|
311
|
-
end
|
312
|
-
puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
|
313
|
-
data.save @o[:rocker]
|
314
|
-
end # compile!
|
315
|
-
|
316
|
-
#================================[ Filter ]
|
317
|
-
def filter!
|
318
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
319
|
-
raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
|
320
|
-
raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
|
321
|
-
|
322
|
-
puts "Reading ROCker file." unless @o[:q]
|
323
|
-
data = ROCData.new @o[:rocker]
|
324
|
-
|
325
|
-
puts "Filtering BLAST." unless @o[:q]
|
326
|
-
ih = File.open(@o[:qblast], 'r')
|
327
|
-
oh = File.open(@o[:oblast], 'w')
|
328
|
-
while ln = ih.gets
|
329
|
-
bh = BlastHit.new(ln, data.aln)
|
330
|
-
oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
|
331
|
-
end
|
332
|
-
ih.close
|
333
|
-
oh.close
|
334
|
-
end # filter!
|
335
|
-
#================================[ Search ]
|
336
|
-
def search!
|
337
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
338
|
-
raise "Code Under development..."
|
339
|
-
# ToDo
|
340
|
-
# [ ... ]
|
341
|
-
end # search!
|
342
|
-
|
343
|
-
#================================[ Plot ]
|
344
|
-
def plot!
|
345
|
-
raise "-k/--rocker is mandatory." if o[:rocker].nil?
|
346
|
-
if @o[:table].nil?
|
347
|
-
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
|
348
|
-
@o[:table] = "#{@o[:blast]}.table"
|
349
|
-
end
|
350
|
-
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
351
|
-
|
352
|
-
puts "Testing environment." unless @o[:q]
|
353
|
-
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
354
|
-
|
355
|
-
puts "Reading files." unless @o[:q]
|
356
|
-
puts " * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
|
357
|
-
data = ROCData.new @o[:rocker]
|
358
|
-
if File.exist? @o[:table]
|
359
|
-
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
360
|
-
else
|
361
|
-
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
362
|
-
blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
|
363
|
-
end
|
364
|
-
|
365
|
-
puts "Plotting matches." unless @o[:q]
|
366
|
-
extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
|
367
|
-
@o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
|
368
|
-
data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
|
369
|
-
data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
|
370
|
-
some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
|
371
|
-
data.rrun "par(mar=c(0,4,0,0.5)+.1);"
|
372
|
-
data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
|
373
|
-
data.rrun "noise <- runif(ncol(x),-.2,.2)"
|
374
|
-
data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
|
375
|
-
data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"
|
376
|
-
|
377
|
-
puts "Plotting windows." unless @o[:q]
|
378
|
-
if some_thr
|
379
|
-
data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
|
380
|
-
data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
|
381
|
-
end
|
382
|
-
data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
|
383
|
-
"lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"
|
384
|
-
|
385
|
-
puts "Plotting alignment." unless @o[:q]
|
386
|
-
data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
|
387
|
-
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
|
388
|
-
i = 0
|
389
|
-
data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
|
390
|
-
data.aln.seqs.values.each do |s|
|
391
|
-
color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
|
392
|
-
data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
|
393
|
-
i += 1
|
394
|
-
end
|
395
|
-
|
396
|
-
puts "Plotting statistics." unless @o[:q]
|
397
|
-
data.rrun "par(mar=c(5,4,0,0.5)+.1);"
|
398
|
-
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
|
399
|
-
if some_thr
|
400
|
-
sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
|
401
|
-
sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
|
402
|
-
ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
|
403
|
-
unless @o[:q]
|
404
|
-
puts " * sensitivity: #{sn}%"
|
405
|
-
puts " * specificity: #{sp}%"
|
406
|
-
puts " * accuracy: #{ac}%"
|
407
|
-
end
|
408
|
-
data.rrun "pos <- (w$V1+w$V2)/2"
|
409
|
-
if @o[:impact]
|
410
|
-
data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
411
|
-
data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
412
|
-
data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
413
|
-
else
|
414
|
-
data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
415
|
-
data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
416
|
-
data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
417
|
-
end
|
418
|
-
#data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
|
419
|
-
end
|
420
|
-
data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
|
421
|
-
data.rrun "dev.off();"
|
422
|
-
end # plot!
|
423
|
-
|
424
36
|
#================================[ Utilities ]
|
425
37
|
def blast2table(blast_f, table_f, aln, minscore)
|
426
38
|
ifh = File.open(blast_f, "r")
|
@@ -432,39 +44,6 @@ class ROCker
|
|
432
44
|
ifh.close
|
433
45
|
ofh.close
|
434
46
|
end
|
435
|
-
def genes2genomes(gene_ids)
|
436
|
-
genomes = []
|
437
|
-
ids = Array.new(gene_ids)
|
438
|
-
while ids.size>0
|
439
|
-
doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
|
440
|
-
genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
|
441
|
-
end
|
442
|
-
genomes.uniq
|
443
|
-
end
|
444
|
-
def genome2taxid(genome_id)
|
445
|
-
ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
|
446
|
-
return ln if ln.nil?
|
447
|
-
ln.sub(/.*"taxon:(\d+)".*/, "\\1")
|
448
|
-
end
|
449
|
-
def genome2taxon(genome_id, rank='species')
|
450
|
-
xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
|
451
|
-
xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
|
452
|
-
end
|
453
|
-
def restcall(url, outfile=nil)
|
454
|
-
response = RestClient.get url
|
455
|
-
raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
|
456
|
-
unless outfile.nil?
|
457
|
-
ohf = File.open(outfile, 'w')
|
458
|
-
ohf.print response.to_s
|
459
|
-
ohf.close
|
460
|
-
end
|
461
|
-
response.to_s
|
462
|
-
end
|
463
|
-
def ebiFetch(db, ids, format, outfile=nil)
|
464
|
-
url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
|
465
|
-
$stderr.puts " # Calling: #{url}" if @o[:debug]
|
466
|
-
self.restcall url
|
467
|
-
end
|
468
47
|
def bash(cmd, err_msg=nil)
|
469
48
|
o = `#{cmd} 2>&1 && echo '{'`
|
470
49
|
raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
|
@@ -473,6 +52,14 @@ class ROCker
|
|
473
52
|
end
|
474
53
|
|
475
54
|
#================================[ Extensions ]
|
55
|
+
# To ROCker
|
56
|
+
require 'rocker/step/build'
|
57
|
+
require 'rocker/step/compile'
|
58
|
+
require 'rocker/step/search'
|
59
|
+
require 'rocker/step/filter'
|
60
|
+
require 'rocker/step/plot'
|
61
|
+
|
62
|
+
# To other
|
476
63
|
class Numeric
|
477
64
|
def ordinalize
|
478
65
|
n= self.to_s
|
@@ -0,0 +1,389 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'json'
|
9
|
+
|
10
|
+
class ROCker
|
11
|
+
#================================[ Class ]
|
12
|
+
# Root URL of the EBI web services; ebiFetch appends the dbfetch path to it.
@@EBIREST = 'http://www.ebi.ac.uk/Tools'

# Defaults of the 'build' task, merged into the shared option defaults.
# The *cmd entries are sprintf templates filled by build!:
#   simulatorcmd: 1=binary, 2=reference FastA, 3=seqdepth*readlen, 4=read length, 5=output base name
#   alignercmd:   1=binary, 2=input FastA, 3=output alignment, 4=number of threads
@@DEFAULTS.merge!(
  # Training sets and simulation parameters
  :positive  => [],
  :negative  => [],
  :genomefrx => 1.0,
  :seqdepth  => 0.03,
  :readlen   => 100,
  :minovl    => 50,
  # Ext. Software
  :aligner      => :clustalo,
  :simulator    => :grinder,
  :simulatorbin => {:grinder=>'grinder'},
  :simulatorcmd => {:grinder=>'%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'},
  :alignerbin   => {:muscle=>'muscle', :clustalo=>'clustalo'},
  :alignercmd   => {
    :muscle   => '%1$s -in "%2$s" -out "%3$s" -quiet',
    :clustalo => '%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'
  }
)
|
21
|
+
# Memoized answer of ROCker.has_build_gems? (nil until the first check).
@@HAS_BUILD_GEMS = nil

# Root URL of the EBI web services.
def self.ebirest() @@EBIREST ; end

# Checks, once, whether the gems needed by the 'build' task can be loaded.
#
# The `true`/`false` literals are used instead of the legacy TRUE/FALSE
# constants, which are deprecated and no longer defined by current Ruby
# releases (referencing them raises NameError).
#
# @return [Boolean] true if 'rubygems' and 'restclient' could be required.
def self.has_build_gems?
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
  @@HAS_BUILD_GEMS = begin
    require 'rubygems'
    require 'restclient'
    true
  rescue LoadError
    false
  end
end
|
34
|
+
|
35
|
+
#================================[ Utilities ]
|
36
|
+
# Links UniProtKB entries to the EMBL records cross-referenced by them.
#
# The IDs are queried in batches of up to 200 per ebiFetch call, and every
# "DR   EMBL;" line of the returned annotation yields one link.
#
# @param gene_ids [Array<String>] UniProtKB IDs.
# @return [Array<Hash>] unique {:genome_id, :transl_id} pairs, taken from the
#   second and third '; '-separated fields of each cross-reference line.
def genes2genomes(gene_ids)
  links = gene_ids.each_slice(200).flat_map do |batch|
    annotation = ebiFetch(:uniprotkb, batch, :annot)
    annotation.split("\n").grep(/^DR\s+EMBL;/).map do |xref_line|
      fields = xref_line.split('; ')
      { :genome_id => fields[1], :transl_id => fields[2] }
    end
  end
  links.uniq
end
|
48
|
+
# Extracts the NCBI taxonomy ID from the annotation of an EMBL record.
#
# @param genome_id [String] EMBL record ID.
# @return [String, nil] digits of the first /db_xref="taxon:N" feature
#   qualifier, or nil if the annotation has no such line.
def genome2taxid(genome_id)
  annotation = ebiFetch('embl', [genome_id], 'annot')
  xref = annotation.split(/[\n\r]/).find { |line| line =~ /^FT\s+\/db_xref="taxon:/ }
  return nil if xref.nil?
  xref.sub(/.*"taxon:(\d+)".*/, "\\1")
end
|
53
|
+
# Finds the taxonomy ID of the taxon of a given rank that an EMBL record
# belongs to, by scanning the <taxon ...> tags of the ENA taxonomy XML.
#
# @param genome_id [String] EMBL record ID.
# @param rank [String] taxonomic rank to look for (e.g. 'species', 'genus').
# @return [String] value of the taxId attribute of the first matching tag.
# @raise [RuntimeError] if the record has no taxonomy cross-reference, or no
#   taxon of the requested rank is present (previously an opaque
#   NoMethodError on nil).
def genome2taxon(genome_id, rank='species')
  taxid = genome2taxid(genome_id)
  raise "Cannot find the taxonomy ID of genome #{genome_id}." if taxid.nil?
  xml = ebiFetch('taxonomy', [taxid], 'enataxonomyxml').gsub(/\s*\n\s*/, '')
  # The rank is matched literally, so regexp metacharacters cannot leak in.
  rank_re = /rank="#{Regexp.escape(rank.to_s)}"/
  tag = xml.scan(/<taxon [^>]+>/).grep(rank_re).first
  raise "Cannot find a taxon of rank '#{rank}' for genome #{genome_id}." if tag.nil?
  tag.sub(/.* taxId="(\d+)".*/, "\\1")
end
|
57
|
+
# Performs an HTTP GET (600 s timeout) and returns the response body.
#
# @param url [String] URL to request.
# @param outfile [String, nil] if given, the body is also written to this path.
# @return [String] response body.
# @raise [RuntimeError] if the response code is not 200.
def restcall(url, outfile=nil)
  $stderr.puts " # Calling: #{url}" if @o[:debug]
  response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
  raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
  body = response.to_s
  # Block form closes the handle even if the write fails.
  File.open(outfile, 'w') { |ofh| ofh.print body } unless outfile.nil?
  body
end
|
68
|
+
# Retrieves one or more entries from the EBI dbfetch service.
#
# @param db [#to_s] database name (e.g. :uniprotkb, :embl, 'taxonomy').
# @param ids [Array<String>] entry IDs, sent as a comma-separated list.
# @param format [#to_s] output format (e.g. :fasta, :annot, :gff3).
# @param outfile [String, nil] if given, the result is also saved to this path.
# @return [String] whatever restcall returns for the dbfetch URL.
def ebiFetch(db, ids, format, outfile=nil)
  url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
  res = restcall(url)
  # Block form closes the handle even if the write fails.
  File.open(outfile, 'w') { |ofh| ofh.print res } unless outfile.nil?
  res
end
|
78
|
+
# Scans the GFF3 annotation of a set of genomes and records the coordinates
# of every feature linked to a target protein, saving the result as JSON.
#
# A feature is kept if one of its `db_xref=UniProtKB...:ID` attributes is in
# protein_ids, or one of its `protein_id=ID` attributes is in transl_ids.
# If @o[:pertaxon] is set, genomes are additionally grouped by the taxon
# returned by genome2taxon for that rank.
#
# @param genome_ids [Array<String>] EMBL IDs of the genomes to scan.
# @param protein_ids [Array<String>] target UniProtKB IDs.
# @param transl_ids [Array<String>] target translation (protein_id) IDs.
# @param thread_id [Integer] worker index; only worker 0 reports progress.
# @param json_file [String] path where
#   {:positive_coords=>{seq=>[hit,...]}, :genomes_org=>{taxon=>[genome,...]}}
#   is written.
# @return [nil]
def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
  positive_coords = {}
  genomes_org = {}
  report = (thread_id == 0 && !@o[:q])
  genome_ids.each_with_index do |genome_id, idx|
    print " * scanning #{(idx + 1).ordinalize} genome out of #{genome_ids.size} in first thread. \r" if report
    unless @o[:pertaxon].nil?
      taxon = genome2taxon(genome_id, @o[:pertaxon]).to_sym
      (genomes_org[taxon] ||= []) << genome_id
    end
    genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
    if @o[:reuse] && File.size?(genome_file)
      doc = File.readlines(genome_file)
    else
      # The downloaded GFF3 is only kept on disk when cleaning is disabled.
      genome_file = nil unless @o[:noclean]
      doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n")
    end
    doc.each do |ln|
      next if ln =~ /^#/ # comments and directives
      r = ln.chomp.split(/\t/)
      next if r.size < 9 # blank or malformed lines
      attrs = r[8].split(/;/)
      prots = attrs.grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/) { |xref| xref.split(/:/)[1] }
      prot_id = prots.find { |id| protein_ids.include? id }
      trans = attrs.grep(/^protein_id=/) { |pid| pid.split(/=/)[1] }
      tran_id = trans.find { |id| transl_ids.include? id }
      next if prot_id.nil? && tran_id.nil?
      (positive_coords[r[0].to_sym] ||= []) << {
        :prot_id => prot_id,
        :tran_id => tran_id,
        :from    => r[3].to_i,
        :to      => r[4].to_i,
        :strand  => r[6]
      }
    end
  end
  print "\n" if report
  # Block form closes the handle even if serialization fails.
  File.open(json_file, "w") do |ofh|
    ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
  end
  nil
end
|
122
|
+
|
123
|
+
#================================[ Build ]
|
124
|
+
# Runs the 'build' task, driven by the options in @o. In order:
#   1. checks the environment (gems and external binaries);
#   2. downloads the positive protein set       -> <baseout>.ref.fasta
#   3. links proteins to genomes and locates the target genes in the GFF3
#      annotations, one forked process per slice -> <baseout>.src.coords
#   4. downloads the genomes                     -> <baseout>.src.fasta
#   5. simulates reads and tags those overlapping a target gene
#      with "@%"                                 -> <baseout>.mg.fasta
#   6. aligns the reference proteins             -> <baseout>.ref.aln
#   7. searches the reads against the references -> <baseout>.ref.blast
#   8. removes intermediate files unless @o[:noclean].
# Steps 5-7 can be disabled with :nosimulate, :noaln and :nosearch, and
# existing outputs are reused when @o[:reuse] is set.
#
# @raise [RuntimeError] on missing options or requirements, when no genome or
#   gene location can be found, or when a forked worker leaves no output.
def build!
  # Check requirements
  puts "Testing environment." unless @o[:q]
  # Options given as {tool => value} tables are collapsed to the value of the
  # selected tool; from here on each of them holds a single value.
  per_tool = {
    :searchcmd => :search, :makedbcmd => :search,
    :alignercmd => :aligner, :simulatorcmd => :simulator,
    :alignerbin => :aligner, :simulatorbin => :simulator
  }
  per_tool.each do |opt, tool|
    @o[opt] = @o[opt][@o[tool]] if @o[opt].is_a? Hash
  end
  @o[:nosearch] = true if @o[:nosimulate]
  raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
  @o[:positive] += @o[:posori] unless @o[:posori].nil?
  @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
  @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
  unless @o[:aln].nil?
    aln = Alignment.new
    aln.read_fasta @o[:aln]
    @o[:positive] += aln.get_ids
  end
  raise "-p or -P are mandatory." if @o[:positive].size==0
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
  if @o[:positive].size == 1 && !@o[:noaln]
    warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
    @o[:noaln] = true
  end
  unless @o[:nosimulate]
    bash("#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?") if @o[:simulator]==:grinder
  end
  unless @o[:noaln]
    bash("#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?") if @o[:aligner]==:muscle
    bash("#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?") if @o[:aligner]==:clustalo
  end
  unless @o[:nosearch]
    bash("#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?") if @o[:search]==:blast
    bash("#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?") if @o[:search]==:diamond
  end

  # Download genes
  puts "Downloading gene data." unless @o[:q]
  ref_file = @o[:baseout] + ".ref.fasta"
  if @o[:posori].nil? && @o[:posfile].nil? && !@o[:aln].nil?
    puts " * reusing aligned sequences as positive set." unless @o[:q]
    File.open(ref_file, "w") { |f| f.print aln.to_seq_s }
    @o[:noaln] = true
  elsif @o[:reuse] && File.size?(ref_file)
    puts " * reusing positive set: #{ref_file}." unless @o[:q]
  else
    puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
    $stderr.puts " # #{@o[:positive]}" if @o[:debug]
    File.open(ref_file, "w") do |f|
      @o[:positive].each_slice(200) { |batch| f.print ebiFetch(:uniprotkb, batch, :fasta) }
    end
  end
  genome_ids = {:positive=>[], :negative=>[]}
  transl_ids = {:positive=>[], :negative=>[]}
  [:positive, :negative].each do |set|
    next if @o[set].size==0
    puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
    $stderr.puts " # #{@o[set]}" if @o[:debug]
    r = genes2genomes(@o[set])
    genome_ids[set] = r.map{ |i| i[:genome_id] }.uniq
    transl_ids[set] = r.map{ |i| i[:transl_id] }.uniq
  end
  raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
  genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
  raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?

  # Locate genes
  puts "Analyzing genome data." unless @o[:q]
  coords_file = @o[:baseout] + ".src.coords"
  if @o[:reuse] && File.size?(coords_file)
    puts " * reusing coordinates: #{coords_file}." unless @o[:q]
    c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
    positive_coords = c[:positive_coords]
    genome_org = c[:genome_org]
  else
    thrs = [@o[:thr], genome_ids[:positive].size].min
    puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
    $stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
    $stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
    $stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
    thr_obj = []
    (0 .. (thrs-1)).each do |thr_i|
      # Genomes are dealt round-robin to the worker processes.
      ids_to_parse = []
      genome_ids[:positive].each_with_index do |id, i|
        ids_to_parse << id if (i % thrs)==thr_i
      end
      json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
      thr_obj << json_file
      fork do
        get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
      end
    end
    Process.waitall
    # Combine results
    positive_coords = {}
    genomes_org = {}
    genome_org = {}
    thr_obj.each do |t|
      raise "Thread failed without error trace: #{t}" unless File.exist? t
      o = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
      o[:positive_coords].each_pair do |k,v|
        positive_coords[ k ] ||= []
        positive_coords[ k ] += v
      end
      # Each worker's genome list is stored as one element (a list of lists).
      o[:genomes_org].each_pair do |k,v|
        genomes_org[ k ] ||= []
        genomes_org[ k ] << v
      end
      File.unlink t
    end
    # Select one genome per taxon: the first genome of a randomly chosen
    # worker's list.
    unless @o[:pertaxon].nil?
      genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
    end
    # Save coordinates
    File.open(coords_file, "w") do |ofh|
      ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
    end
  end
  unless @o[:pertaxon].nil?
    genome_ids[:positive] = genome_org.values
    puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
  end
  all_genome_ids = genome_ids.values.reduce(:+).uniq
  # Flatten first: with no located gene at all, the previous reduce(:+)
  # returned nil and crashed before this check could report the problem.
  hits = positive_coords.values.flatten(1)
  raise "Cannot find the genomic location of any provided sequence." if hits.empty?
  found = hits.map{ |b| b[:prot_id] }.compact.uniq
  unknown_pid = hits.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil }.compact.uniq
  missing = @o[:positive] - found
  warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0

  # Download genomes
  genomes_file = @o[:baseout] + '.src.fasta'
  if @o[:reuse] && File.size?(genomes_file)
    puts " * reusing existing file: #{genomes_file}." unless @o[:q]
  else
    puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
    $stderr.puts " # #{all_genome_ids}" if @o[:debug]
    File.open(genomes_file, 'w') do |ofh|
      all_genome_ids.each_slice(200) { |batch| ofh.print ebiFetch('embl', batch, 'fasta') }
    end
  end

  # Generate metagenome
  unless @o[:nosimulate]
    puts "Generating in silico metagenome" unless @o[:q]
    if @o[:reuse] && File.size?(@o[:baseout] + ".mg.fasta")
      puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
    else
      all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
      raise "No sequences found in #{@o[:baseout]}.src.fasta." if all_src == 0
      thrs = [@o[:thr], all_src].min
      puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
      $stderr.puts " # #{positive_coords}" if @o[:debug]
      thr_obj = []
      (0 .. (thrs-1)).each do |thr_i|
        output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
        thr_obj << output
        # Contiguous, non-overlapping and non-empty (thrs <= all_src) slices
        # of the 1-based sequence indexes, covering all of them exactly once.
        seqs_a = (thr_i * all_src) / thrs + 1
        seqs_b = ((thr_i + 1) * all_src) / thrs
        fork do
          # Create sub-fasta
          File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w") do |ofh|
            File.open("#{@o[:baseout]}.src.fasta", "r") do |ifh|
              seq_i = 0
              while l = ifh.gets
                seq_i+=1 if l =~ /^>/
                break if seq_i > seqs_b
                ofh.print l if seq_i >= seqs_a
              end
            end
          end

          # Run simulator (except if the temporal file is already there and can be reused)
          unless @o[:reuse] && File.size?(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa")
            bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
          end

          # Tag positives
          puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
          File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r') do |ifh|
            File.open(output, 'w') do |ofh|
              while l = ifh.gets
                if l =~ /^>/
                  rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
                  raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
                  positive = false
                  positive_coords[rd[:genome_id].to_sym] ||= []
                  positive_coords[rd[:genome_id].to_sym].each do |gn|
                    # A read is positive if it overlaps a target gene by at
                    # least @o[:minovl] positions on both sides.
                    left  = rd[:to].to_i - gn[:from]
                    right = gn[:to] - rd[:from].to_i
                    if (left*right >= 0) && ([left, right].min >= @o[:minovl])
                      positive = true
                      break
                    end
                  end
                  l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
                    "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
                end
                ofh.print l
              end
            end
          end
        end # fork
      end # (0 .. thrs-1).each
      Process.waitall
      # Concatenate results
      File.open(@o[:baseout] + ".mg.fasta", 'w') do |ofh|
        thr_obj.each do |t|
          raise "Thread failed without error trace: #{t}" unless File.exist? t
          File.open(t, "r") do |ifh|
            while l = ifh.gets
              ofh.print l
            end
          end
          File.unlink t
        end
      end
    end
  end # unless @o[:nosimulate]

  # Align references
  unless @o[:noaln]
    puts "Aligning reference set." unless @o[:q]
    if @o[:reuse] && File.size?("#{@o[:baseout]}.ref.aln")
      puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
    else
      bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
      puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
    end
  end

  # Run similarity search
  unless @o[:nosearch]
    puts "Running homology search." unless @o[:q]
    if @o[:reuse] && File.size?("#{@o[:baseout]}.ref.blast")
      puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
    else
      # :makedbcmd and :searchcmd were already collapsed to the template of
      # the selected search tool at the top of this method; indexing them
      # again by tool name raised a TypeError (String indexed by Symbol).
      puts " * preparing database." unless @o[:q]
      bash sprintf(@o[:makedbcmd], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
      puts " * running similarity search." unless @o[:q]
      bash sprintf(@o[:searchcmd], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
    end
  end

  # Clean
  # NOTE(review): the per-process intermediates created above
  # (.src.fasta.N and the simulator outputs named .mg.tmp.N-*) do not match
  # any of these suffixes and are left behind - confirm whether they should
  # be removed here as well.
  unless @o[:noclean]
    puts "Cleaning." unless @o[:q]
    sff = %w{.src.xml .src.fasta}
    sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
    sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
    sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
  end
end # build!
|
388
|
+
end # ROCker
|
389
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
  #================================[ Class ]
  # Defaults specific to the 'compile' task.
  @@DEFAULTS.merge!({:refine=>true, :win=>20, :minscore=>0})

  #================================[ Compile ]
  # Runs the 'compile' task: reads the alignment in @o[:aln], obtains the
  # hits table (reusing @o[:table] or generating "<blast>.table" from
  # @o[:blast]), computes the windows with ROCData, optionally refines them,
  # and saves the result to @o[:rocker].
  #
  # @raise [RuntimeError] if a mandatory option is missing, the alignment
  #   does not exist, or R / the pROC library cannot be run (via bash).
  def compile!
    say = lambda { |msg| puts msg unless @o[:q] }

    # Validate options
    raise "-a/--alignment is mandatory." if @o[:aln].nil?
    raise "-a/--alignment must exist." unless File.exist?(@o[:aln])
    if @o[:table].nil?
      blast_available = !@o[:blast].nil? && File.exist?(@o[:blast])
      raise "-t/--table is mandatory unless -b is provided." unless blast_available
      @o[:table] = "#{@o[:blast]}.table"
    else
      # An explicitly given table is always reused when it exists.
      @o[:reuse] = true
    end
    raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? && !File.exist?(@o[:table])
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?

    say.call "Testing environment."
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
    bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

    say.call "Reading files."
    say.call " * loading alignment: #{@o[:aln]}."
    aln = Alignment.new
    aln.read_fasta(@o[:aln])

    if @o[:reuse] && File.exist?(@o[:table])
      say.call " * reusing existing file: #{@o[:table]}."
    else
      say.call " * generating table: #{@o[:table]}."
      blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
    end

    say.call "Analyzing data."
    say.call " * computing windows."
    data = ROCData.new(@o[:table], aln, @o[:win])
    data.nucl = @o[:nucl]
    if @o[:refine]
      say.call " * refining windows."
      refined = data.refine!(@o[:table])
      warn "Insufficient hits to refine results." unless refined
    end
    say.call " * saving ROCker file: #{@o[:rocker]}."
    data.save(@o[:rocker])
  end # compile!
end # ROCker
|
53
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Filter ]
  # Runs the 'filter' task: copies from @o[:qblast] to @o[:oblast] every line
  # whose hit has a subject position (sfrom) and a bit-score at or above the
  # threshold of the ROCker window at the hit's midpoint.
  #
  # Both files are opened in block form, so they are closed even if parsing
  # a line raises.
  #
  # @raise [RuntimeError] if :rocker, :qblast or :oblast are not set.
  def filter!
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
    raise "-o/--out-blast is mandatory." if @o[:oblast].nil?

    puts "Reading ROCker file." unless @o[:q]
    data = ROCData.new @o[:rocker]

    puts "Filtering BLAST." unless @o[:q]
    File.open(@o[:qblast], 'r') do |ih|
      File.open(@o[:oblast], 'w') do |oh|
        while ln = ih.gets
          bh = BlastHit.new(ln, data.aln)
          next if bh.sfrom.nil?
          oh.print ln if bh.bits >= data.win_at_col(bh.midpoint).thr
        end
      end
    end
  end # filter!
end # ROCker
|
32
|
+
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
|
9
|
+
#================================[ Class ]
|
10
|
+
@@DEFAULTS.merge!({:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true, :sbj=>[]})
|
11
|
+
|
12
|
+
#================================[ Plot ]
|
13
|
+
def plot!
|
14
|
+
raise "-k/--rocker is mandatory." if o[:rocker].nil?
|
15
|
+
if @o[:table].nil?
|
16
|
+
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
|
17
|
+
@o[:table] = "#{@o[:blast]}.table"
|
18
|
+
end
|
19
|
+
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
20
|
+
|
21
|
+
puts "Testing environment." unless @o[:q]
|
22
|
+
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
23
|
+
|
24
|
+
puts "Reading files." unless @o[:q]
|
25
|
+
puts " * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
|
26
|
+
data = ROCData.new @o[:rocker]
|
27
|
+
if File.exist? @o[:table]
|
28
|
+
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
29
|
+
else
|
30
|
+
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
31
|
+
blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
|
32
|
+
end
|
33
|
+
|
34
|
+
puts "Plotting matches." unless @o[:q]
|
35
|
+
extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
|
36
|
+
@o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
|
37
|
+
data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
|
38
|
+
data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
|
39
|
+
some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
|
40
|
+
data.rrun "par(mar=c(0,4,0,0.5)+.1);"
|
41
|
+
data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
|
42
|
+
data.rrun "noise <- runif(ncol(x),-.2,.2)"
|
43
|
+
data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
|
44
|
+
data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"
|
45
|
+
|
46
|
+
puts "Plotting windows." unless @o[:q]
|
47
|
+
if some_thr
|
48
|
+
data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
|
49
|
+
data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
|
50
|
+
end
|
51
|
+
data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
|
52
|
+
"lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"
|
53
|
+
|
54
|
+
puts "Plotting alignment." unless @o[:q]
|
55
|
+
data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
|
56
|
+
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
|
57
|
+
i = 0
|
58
|
+
data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
|
59
|
+
data.aln.seqs.values.each do |s|
|
60
|
+
color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
|
61
|
+
data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
|
62
|
+
i += 1
|
63
|
+
end
|
64
|
+
|
65
|
+
puts "Plotting statistics." unless @o[:q]
|
66
|
+
data.rrun "par(mar=c(5,4,0,0.5)+.1);"
|
67
|
+
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
|
68
|
+
if some_thr
|
69
|
+
sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
|
70
|
+
sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
|
71
|
+
ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
|
72
|
+
unless @o[:q]
|
73
|
+
puts " * sensitivity: #{sn}%"
|
74
|
+
puts " * specificity: #{sp}%"
|
75
|
+
puts " * accuracy: #{ac}%"
|
76
|
+
end
|
77
|
+
data.rrun "pos <- (w$V1+w$V2)/2"
|
78
|
+
if @o[:impact]
|
79
|
+
data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
80
|
+
data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
81
|
+
data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
82
|
+
else
|
83
|
+
data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
84
|
+
data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
85
|
+
data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
86
|
+
end
|
87
|
+
#data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
|
88
|
+
end
|
89
|
+
data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
|
90
|
+
data.rrun "dev.off();"
|
91
|
+
end # plot!
|
92
|
+
end # ROCker
|
93
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
  #================================[ Class ]
  # The search step registers no defaults of its own yet; the merge call is
  # kept commented out as a placeholder.
  #@@DEFAULTS.merge!({ })

  #================================[ Search ]
  ##
  # Entry point of the 'search' step. It is not implemented yet, so every call
  # ends in a RuntimeError: first for a missing +@o[:rocker]+, otherwise with
  # the under-development notice.
  def search!
    if @o[:rocker].nil?
      raise "-k/--rocker is mandatory."
    end
    # ToDo
    # [ ... ]
    raise "Code Under development..."
  end # search!
end # ROCker
|
20
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-rocker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis (Coto) Orellana
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-05
|
12
|
+
date: 2015-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rest-client
|
@@ -25,6 +25,20 @@ dependencies:
|
|
25
25
|
- - ~>
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 1.7.3
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: json
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ~>
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 1.8.1
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ~>
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 1.8.1
|
28
42
|
description: Detecting and quantifying functional genes in short-read metagenomic
|
29
43
|
datasets
|
30
44
|
email: lhorellana@gatech.edu
|
@@ -40,6 +54,11 @@ files:
|
|
40
54
|
- lib/rocker/rocwindow.rb
|
41
55
|
- lib/rocker/rocdata.rb
|
42
56
|
- lib/rocker/rinterface.rb
|
57
|
+
- lib/rocker/step/build.rb
|
58
|
+
- lib/rocker/step/compile.rb
|
59
|
+
- lib/rocker/step/search.rb
|
60
|
+
- lib/rocker/step/filter.rb
|
61
|
+
- lib/rocker/step/plot.rb
|
43
62
|
- bin/ROCker
|
44
63
|
homepage: http://enve-omics.ce.gatech.edu/rocker
|
45
64
|
licenses:
|
@@ -51,9 +70,9 @@ require_paths:
|
|
51
70
|
- lib
|
52
71
|
required_ruby_version: !ruby/object:Gem::Requirement
|
53
72
|
requirements:
|
54
|
-
- -
|
73
|
+
- - ~>
|
55
74
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0'
|
75
|
+
version: '2.0'
|
57
76
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
77
|
requirements:
|
59
78
|
- - '>='
|