bio-rocker 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ROCker +36 -26
- data/lib/rocker.rb +18 -431
- data/lib/rocker/step/build.rb +389 -0
- data/lib/rocker/step/compile.rb +53 -0
- data/lib/rocker/step/filter.rb +32 -0
- data/lib/rocker/step/plot.rb +93 -0
- data/lib/rocker/step/search.rb +20 -0
- metadata +23 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8a10cdc85d8b7b54c21d26f12b90c0b3dff4f82
|
4
|
+
data.tar.gz: c837b3c6687f6705dbfc7c959824dd530e7ee932
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 869cdadfed2dad125fc11c03133e2f56df53074a54b0f35d8ea5c6674029e7069332e4c35c486b1f39a417aeff01932a7eee30da44e15de036ce1a2d878d15d4
|
7
|
+
data.tar.gz: 823b30e7923c243f8dc8bb122f50426898e2cdda634516cc53e0325b150946b0699e0a9257a55c06ed8868a0f843f0274ed1d23bdf8ef2de9629fafb66f33552
|
data/bin/ROCker
CHANGED
@@ -16,8 +16,8 @@ require 'optparse'
|
|
16
16
|
$t = {
|
17
17
|
'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
|
18
18
|
'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
|
19
|
-
'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
|
20
19
|
'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
|
20
|
+
'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
|
21
21
|
'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
|
22
22
|
}
|
23
23
|
task = (ARGV.size > 0 ? ARGV.shift : '').downcase
|
@@ -43,49 +43,51 @@ opts = OptionParser.new do |opt|
|
|
43
43
|
opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
|
44
44
|
opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
|
45
45
|
opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
|
46
|
-
#opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
|
47
46
|
opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
|
48
47
|
opt.separator ""
|
49
48
|
opt.separator "+ ADVANCED BUILDING ARGUMENTS"
|
50
49
|
opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
|
51
50
|
opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
|
52
51
|
opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
|
53
|
-
opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
|
54
|
-
opt.on("-
|
55
|
-
opt.on(
|
56
|
-
opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."
|
57
|
-
|
58
|
-
opt.on( "--
|
59
|
-
opt.on( "--
|
52
|
+
opt.on("-s", "--seqdepth NUMBER", "Sequencing depth (reads/bp) to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
|
53
|
+
opt.on("-l", "--readlen INTEGER", "Average read length of in silico metagenome (in bp). By default: '#{ROCker.default :readlen}'."){ |v| o[:readlen]=v.to_i }
|
54
|
+
opt.on("-v", "--overlap INTEGER", "Minimum overlap (in bp) with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_i }
|
55
|
+
opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."){ |v| o[:pertaxon]=v.downcase }
|
56
|
+
opt.on( "--genome-frx NUMBER", "Fraction to subsample genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}."){ |v| o[:genomefrx]=v.to_f }
|
57
|
+
opt.on( "--nosimulate", "Do not simulate metagenome. Implies --nosearch. By default, metagenome is simulated."){ |v| o[:nosimulate]=v }
|
58
|
+
opt.on( "--nosearch", "Do not execute similarity search. By default, it is executed."){ |v| o[:nosearch]=v }
|
60
59
|
opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
|
61
60
|
opt.on( "--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
|
62
61
|
opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
|
63
62
|
opt.separator ""
|
64
63
|
opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
|
65
|
-
opt.on("
|
66
|
-
opt.on("
|
67
|
-
opt.on("
|
68
|
-
opt.on( "--
|
69
|
-
|
70
|
-
opt.on("--
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
64
|
+
opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
|
65
|
+
opt.on( "--simulator STR", "In silico metagenome simulator to use. Supported: 'grinder'. By default: '#{ROCker.default :simulator}'.") { |v| o[:simulator]=v.to_sym } # store under :simulator (was :ssimulator), the key the help text reads via ROCker.default and the pattern used by --search/--aligner
|
66
|
+
opt.on( "--aligner STR", "Multiple alignment algorithm to use. Supported: 'clustalo' and 'muscle'. By default: '#{ROCker.default :aligner}'.") { |v| o[:aligner]=v.to_sym }
|
67
|
+
opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
|
68
|
+
opt.on( "--simulator-bin PATH", "Path to the simulator executable. By default in the $PATH: '#{ROCker.default(:simulatorbin).values.join("' or '")}'.") { |v| o[:simulatorbin]=v }
|
69
|
+
opt.on( "--aligner-bin PATH", "Path to the aligner executable. By default in the $PATH: '#{ROCker.default(:alignerbin).values.join("' or '")}'.") { |v| o[:alignerbin]=v }
|
70
|
+
opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
|
71
|
+
*ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
|
72
|
+
opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
|
73
|
+
*ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
|
74
|
+
opt.on( "--simulator-cmd STR", "Command calling simulator, where %1$s: binary, %2$s: input, %3$s: seq. depth (X), %4$d: read len., %5$s: output.",
|
75
|
+
*ROCker.default(:simulatorcmd).keys.map{|k| "By default if --simulator #{k}: '#{ROCker.default(:simulatorcmd)[k]}'."}){ |v| o[:simulatorcmd]=v }
|
76
|
+
opt.on("--aligner-cmd STR", "Command calling aligner, where %1$s: binary, %2$s: input, %3$s: output, %4$d: threads.",
|
77
|
+
*ROCker.default(:alignercmd).keys.map{|k| "By default if --aligner #{k}: '#{ROCker.default(:alignercmd)[k]}'."}){ |v| o[:alignercmd]=v }
|
76
78
|
when 'compile'
|
77
79
|
opt.separator "+ COMPILATION ARGUMENTS"
|
78
80
|
opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
|
79
81
|
opt.on("-b", "--ref-blast PATH",
|
80
82
|
"Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
|
81
83
|
opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
|
82
|
-
opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
|
83
84
|
opt.separator ""
|
84
85
|
opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
|
85
86
|
opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
|
86
87
|
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
|
87
88
|
opt.on( "--norefine", "Do not refine windows."){ o[:refine]=false }
|
88
89
|
opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
|
90
|
+
opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
|
89
91
|
opt.separator ""
|
90
92
|
opt.separator "+ INPUT/OUTPUT"
|
91
93
|
opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
|
@@ -107,15 +109,23 @@ opts = OptionParser.new do |opt|
|
|
107
109
|
opt.separator " 5. Bit score threshold set for the window."
|
108
110
|
opt.separator " The file also contains the alignment (commented with #:)."
|
109
111
|
opt.separator ""
|
112
|
+
when 'search'
|
113
|
+
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
114
|
+
opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
|
115
|
+
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
116
|
+
opt.separator ""
|
117
|
+
opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
|
118
|
+
opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
|
119
|
+
opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
|
120
|
+
opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
|
121
|
+
*ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
|
122
|
+
opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
|
123
|
+
*ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
|
110
124
|
when 'filter'
|
111
125
|
opt.separator "+ FILTERING ARGUMENTS"
|
112
126
|
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
113
127
|
opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
|
114
128
|
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
115
|
-
when 'search'
|
116
|
-
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
117
|
-
opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
|
118
|
-
opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
|
119
129
|
when 'plot'
|
120
130
|
opt.separator "+ PLOTTING ARGUMENTS"
|
121
131
|
opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
|
@@ -127,7 +137,7 @@ opts = OptionParser.new do |opt|
|
|
127
137
|
opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
|
128
138
|
opt.on( "--color", "Color alignment by amino acid."){ o[:color]=true }
|
129
139
|
opt.on( "--no-transparency", "Do not use (semi-)transparencies."){ |v| o[:transparency] = v }
|
130
|
-
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
|
140
|
+
opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}."){ |v| o[:minscore]=v.to_f }
|
131
141
|
opt.on( "--stats-impact", "Plot impact on statistics, instead of absolute values per window."){ o[:impact]=true }
|
132
142
|
opt.on( "--stats-ylim STRING", "Limits of the Y-axis in the bottom panel. By default: '-2,.1' if --stats-impact is set, '50,100' otherwise."){ |v| o[:ylim]=v }
|
133
143
|
opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
|
data/lib/rocker.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
3
|
# @author Luis (Coto) Orellana
|
4
4
|
# @license artistic license 2.0
|
5
|
-
# @update
|
5
|
+
# @update Jun-05-2015
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'rocker/blasthit'
|
@@ -10,40 +10,20 @@ require 'rocker/rocdata'
|
|
10
10
|
|
11
11
|
class ROCker
|
12
12
|
#================================[ Class ]
|
13
|
-
@@EBIREST = 'http://www.ebi.ac.uk/Tools'
|
14
13
|
@@DEFAULTS = {
|
15
14
|
# General
|
16
|
-
:q=>false, :r=>'R', :nucl=>false, :debug=>false,
|
17
|
-
#
|
18
|
-
:
|
19
|
-
|
20
|
-
:
|
21
|
-
:
|
22
|
-
|
23
|
-
:
|
24
|
-
:
|
25
|
-
# Compile
|
26
|
-
:refine=>true, :win=>20, :minscore=>0,
|
27
|
-
# Filter
|
28
|
-
:sbj=>[],
|
29
|
-
# Plot
|
30
|
-
:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
|
15
|
+
:q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
|
16
|
+
# External software
|
17
|
+
:searchbins=>'',
|
18
|
+
:searchcmd=>{
|
19
|
+
:blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
|
20
|
+
:diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
|
21
|
+
:makedbcmd=>{
|
22
|
+
:blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
|
23
|
+
:diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
|
31
24
|
}
|
32
|
-
@@HAS_BUILD_GEMS = nil
|
33
|
-
def self.ebirest() @@EBIREST ; end
|
34
25
|
def self.defaults() @@DEFAULTS ; end
|
35
26
|
def self.default(k) @@DEFAULTS[k] ; end
|
36
|
-
def self.has_build_gems?
|
37
|
-
return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
|
38
|
-
@@HAS_BUILD_GEMS = TRUE
|
39
|
-
begin
|
40
|
-
require 'rubygems'
|
41
|
-
require 'restclient'
|
42
|
-
rescue LoadError
|
43
|
-
@@HAS_BUILD_GEMS = FALSE
|
44
|
-
end
|
45
|
-
@@HAS_BUILD_GEMS
|
46
|
-
end
|
47
27
|
|
48
28
|
#================================[ Instance ]
|
49
29
|
attr_reader :o
|
@@ -53,374 +33,6 @@ class ROCker
|
|
53
33
|
RInterface.R_BIN = opts[:r] unless opts[:r].nil?
|
54
34
|
end
|
55
35
|
|
56
|
-
#================================[ Build ]
|
57
|
-
def build!
|
58
|
-
# Check requirements
|
59
|
-
puts "Testing environment." unless @o[:q]
|
60
|
-
@o[:noblast]=true if @o[:nomg]
|
61
|
-
raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
|
62
|
-
@o[:positive] += @o[:posori] unless @o[:posori].nil?
|
63
|
-
@o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
|
64
|
-
@o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
|
65
|
-
unless @o[:aln].nil?
|
66
|
-
aln = Alignment.new
|
67
|
-
aln.read_fasta @o[:aln]
|
68
|
-
@o[:positive] += aln.get_ids
|
69
|
-
end
|
70
|
-
raise "-p or -P are mandatory." if @o[:positive].size==0
|
71
|
-
raise "-o/--baseout is mandatory." if @o[:baseout].nil?
|
72
|
-
if @o[:positive].size == 1 and not @o[:noaln]
|
73
|
-
warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
|
74
|
-
@o[:noaln] = true
|
75
|
-
end
|
76
|
-
self.bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
|
77
|
-
self.bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
|
78
|
-
self.bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]
|
79
|
-
# Download genes
|
80
|
-
puts "Downloading gene data." unless @o[:q]
|
81
|
-
f = File.open(@o[:baseout] + '.ref.fasta', 'w')
|
82
|
-
if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
|
83
|
-
puts " * re-using aligned sequences as positive set." unless @o[:q]
|
84
|
-
f.print aln.to_seq_s
|
85
|
-
@o[:noaln] = true
|
86
|
-
else
|
87
|
-
puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
|
88
|
-
$stderr.puts " # #{@o[:positive]}" if @o[:debug]
|
89
|
-
ids = Array.new(@o[:positive])
|
90
|
-
while ids.size>0
|
91
|
-
f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
|
92
|
-
end
|
93
|
-
end
|
94
|
-
f.close
|
95
|
-
genome_ids = {:positive=>[], :negative=>[]}
|
96
|
-
[:positive, :negative].each do |set|
|
97
|
-
unless @o[set].size==0
|
98
|
-
puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
|
99
|
-
$stderr.puts " # #{@o[set]}" if @o[:debug]
|
100
|
-
genome_ids[set] = genes2genomes(@o[set])
|
101
|
-
end
|
102
|
-
end
|
103
|
-
raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
|
104
|
-
genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
|
105
|
-
raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
|
106
|
-
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
107
|
-
|
108
|
-
# Locate genes
|
109
|
-
puts "Analyzing genome data." unless @o[:q]
|
110
|
-
puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
|
111
|
-
$stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
|
112
|
-
positive_coords = {}
|
113
|
-
genome_org = {}
|
114
|
-
i = 0
|
115
|
-
genome_ids[:positive].each do |genome_id|
|
116
|
-
print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
|
117
|
-
unless @o[:pertaxon].nil?
|
118
|
-
genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
|
119
|
-
next unless genome_org[ genome_taxon ].nil?
|
120
|
-
genome_org[ genome_taxon ] = genome_id
|
121
|
-
end
|
122
|
-
$stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
|
123
|
-
genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
|
124
|
-
if @o[:reuse] and File.exist? genome_file
|
125
|
-
puts " * reusing existing file: #{genome_file}." unless @o[:q]
|
126
|
-
ifh = File.open(genome_file, 'r')
|
127
|
-
doc = ifh.readlines.grep(/^[^#]/)
|
128
|
-
ifh.close
|
129
|
-
else
|
130
|
-
genome_file=nil unless @o[:noclean]
|
131
|
-
res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
|
132
|
-
doc = res.split("\n").grep(/^[^#]/)
|
133
|
-
end
|
134
|
-
doc.each do |ln|
|
135
|
-
next if ln =~ /^#/
|
136
|
-
r = ln.chomp.split /\t/
|
137
|
-
next if r.size < 9
|
138
|
-
prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
|
139
|
-
p = prots.select{ |p| @o[:positive].include? p }.first
|
140
|
-
next if p.nil?
|
141
|
-
positive_coords[ r[0] ] ||= []
|
142
|
-
positive_coords[ r[0] ] << {
|
143
|
-
#:strand => r[6],
|
144
|
-
:prot_id => p,
|
145
|
-
:from => r[3].to_i,
|
146
|
-
:to => r[4].to_i
|
147
|
-
}
|
148
|
-
end
|
149
|
-
end
|
150
|
-
print "\n" unless @o[:q]
|
151
|
-
unless @o[:pertaxon].nil?
|
152
|
-
genome_ids[:positive] = genome_org.values
|
153
|
-
puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
|
154
|
-
end
|
155
|
-
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
156
|
-
found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
|
157
|
-
raise "Cannot find the genomic location of any provided sequence." if found.nil?
|
158
|
-
missing = @o[:positive] - found
|
159
|
-
warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
|
160
|
-
|
161
|
-
# Download genomes
|
162
|
-
genomes_file = @o[:baseout] + '.src.fasta'
|
163
|
-
if @o[:reuse] and File.exist? genomes_file
|
164
|
-
puts " * reusing existing file: #{genomes_file}." unless @o[:q]
|
165
|
-
else
|
166
|
-
puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
|
167
|
-
$stderr.puts " # #{all_genome_ids}" if @o[:debug]
|
168
|
-
ids = Array.new(all_genome_ids)
|
169
|
-
ofh = File.open(genomes_file, 'w')
|
170
|
-
while ids.size>0
|
171
|
-
ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
|
172
|
-
end
|
173
|
-
ofh.close
|
174
|
-
end
|
175
|
-
|
176
|
-
# Generate metagenome
|
177
|
-
unless @o[:nomg]
|
178
|
-
puts "Generating in silico metagenome" unless @o[:q]
|
179
|
-
if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
|
180
|
-
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
|
181
|
-
else
|
182
|
-
all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
|
183
|
-
thrs = [@o[:thr], all_src].min
|
184
|
-
puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
|
185
|
-
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
186
|
-
thr_obj = []
|
187
|
-
seqs_per_thr = (all_src/thrs).ceil
|
188
|
-
(0 .. (thrs-1)).each do |thr_i|
|
189
|
-
thr_obj << Thread.new do
|
190
|
-
Thread.current[:seqs_a] = thr_i*seqs_per_thr + 1
|
191
|
-
Thread.current[:seqs_b] = [Thread.current[:seqs_a] + seqs_per_thr, all_src].min
|
192
|
-
# Create sub-fasta
|
193
|
-
Thread.current[:ofh] = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", 'w')
|
194
|
-
Thread.current[:ifh] = File.open("#{@o[:baseout]}.src.fasta", 'r')
|
195
|
-
Thread.current[:seq_i] = 0
|
196
|
-
while Thread.current[:l] = Thread.current[:ifh].gets
|
197
|
-
Thread.current[:seq_i]+=1 if Thread.current[:l] =~ /^>/
|
198
|
-
break if Thread.current[:seq_i] > Thread.current[:seqs_b]
|
199
|
-
Thread.current[:ofh].print Thread.current[:l] if Thread.current[:seq_i] >= Thread.current[:seqs_a]
|
200
|
-
end
|
201
|
-
Thread.current[:ifh].close
|
202
|
-
Thread.current[:ofh].close
|
203
|
-
bash sprintf(@o[:grindercmd], @o[:grinder], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
|
204
|
-
# Tag positives
|
205
|
-
puts " * tagging positive reads." unless @o[:q]
|
206
|
-
Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
|
207
|
-
Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
|
208
|
-
while Thread.current[:l]=Thread.current[:ifh].gets
|
209
|
-
Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
|
210
|
-
unless Thread.current[:rd].nil?
|
211
|
-
Thread.current[:positive] = false
|
212
|
-
positive_coords[Thread.current[:rd][:genome_id]] ||= []
|
213
|
-
positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
|
214
|
-
Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
|
215
|
-
Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
|
216
|
-
if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
|
217
|
-
Thread.current[:positive] = true
|
218
|
-
break
|
219
|
-
end
|
220
|
-
end
|
221
|
-
Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
|
222
|
-
end
|
223
|
-
Thread.current[:ofh].print Thread.current[:l]
|
224
|
-
end
|
225
|
-
Thread.current[:ofh].close
|
226
|
-
Thread.current[:ifh].close
|
227
|
-
Thread.current[:output] = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
|
228
|
-
end # Thread.new do
|
229
|
-
end # (1 .. thrs).each
|
230
|
-
# Concatenate results
|
231
|
-
ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
|
232
|
-
thr_obj.each do |t|
|
233
|
-
t.join
|
234
|
-
raise "Thread failed without error trace: #{t}" if t[:output].nil?
|
235
|
-
ifh = File.open(t[:output], 'r')
|
236
|
-
while l = ifh.gets
|
237
|
-
ofh.print l
|
238
|
-
end
|
239
|
-
ifh.close
|
240
|
-
File.unlink t[:output]
|
241
|
-
end
|
242
|
-
ofh.close
|
243
|
-
end
|
244
|
-
end # unless @o[:nomg]
|
245
|
-
# Align references
|
246
|
-
unless @o[:noaln]
|
247
|
-
puts "Aligning reference set." unless @o[:q]
|
248
|
-
if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
|
249
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
|
250
|
-
else
|
251
|
-
bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
|
252
|
-
puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
|
253
|
-
end
|
254
|
-
end
|
255
|
-
# Run BLAST
|
256
|
-
unless @o[:noblast]
|
257
|
-
puts "Running homology search." unless @o[:q]
|
258
|
-
if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
|
259
|
-
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
|
260
|
-
else
|
261
|
-
puts " * preparing database." unless @o[:q]
|
262
|
-
bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
|
263
|
-
puts " * running BLAST." unless @o[:q]
|
264
|
-
bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
|
265
|
-
end
|
266
|
-
end
|
267
|
-
# Clean
|
268
|
-
unless @o[:noclean]
|
269
|
-
puts "Cleaning." unless @o[:q]
|
270
|
-
sff = %w{.src.xml .src.fasta}
|
271
|
-
sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nomg]
|
272
|
-
sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:noblast]
|
273
|
-
sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
|
274
|
-
end
|
275
|
-
end # build!
|
276
|
-
|
277
|
-
#================================[ Compile ]
|
278
|
-
def compile!
|
279
|
-
raise "-a/--alignment is mandatory." if @o[:aln].nil?
|
280
|
-
raise "-a/--alignment must exist." unless File.exist? @o[:aln]
|
281
|
-
if @o[:table].nil?
|
282
|
-
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
|
283
|
-
@o[:table] = "#{@o[:blast]}.table"
|
284
|
-
end
|
285
|
-
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
286
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
287
|
-
|
288
|
-
puts "Testing environment." unless @o[:q]
|
289
|
-
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
290
|
-
bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."
|
291
|
-
|
292
|
-
puts "Reading files." unless @o[:q]
|
293
|
-
puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
|
294
|
-
aln = Alignment.new
|
295
|
-
aln.read_fasta @o[:aln]
|
296
|
-
|
297
|
-
if File.exist? @o[:table]
|
298
|
-
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
299
|
-
else
|
300
|
-
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
301
|
-
blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
|
302
|
-
end
|
303
|
-
|
304
|
-
puts "Analyzing data." unless @o[:q]
|
305
|
-
puts " * computing windows." unless @o[:q]
|
306
|
-
data = ROCData.new(@o[:table], aln, @o[:win])
|
307
|
-
data.nucl = @o[:nucl]
|
308
|
-
if @o[:refine]
|
309
|
-
puts " * refining windows." unless @o[:q]
|
310
|
-
warn "Insufficient hits to refine results." unless data.refine! @o[:table]
|
311
|
-
end
|
312
|
-
puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
|
313
|
-
data.save @o[:rocker]
|
314
|
-
end # compile!
|
315
|
-
|
316
|
-
#================================[ Filter ]
|
317
|
-
def filter!
|
318
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
319
|
-
raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
|
320
|
-
raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
|
321
|
-
|
322
|
-
puts "Reading ROCker file." unless @o[:q]
|
323
|
-
data = ROCData.new @o[:rocker]
|
324
|
-
|
325
|
-
puts "Filtering BLAST." unless @o[:q]
|
326
|
-
ih = File.open(@o[:qblast], 'r')
|
327
|
-
oh = File.open(@o[:oblast], 'w')
|
328
|
-
while ln = ih.gets
|
329
|
-
bh = BlastHit.new(ln, data.aln)
|
330
|
-
oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
|
331
|
-
end
|
332
|
-
ih.close
|
333
|
-
oh.close
|
334
|
-
end # filter!
|
335
|
-
#================================[ Search ]
|
336
|
-
def search!
|
337
|
-
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
338
|
-
raise "Code Under development..."
|
339
|
-
# ToDo
|
340
|
-
# [ ... ]
|
341
|
-
end # search!
|
342
|
-
|
343
|
-
#================================[ Plot ]
|
344
|
-
def plot!
|
345
|
-
raise "-k/--rocker is mandatory." if o[:rocker].nil?
|
346
|
-
if @o[:table].nil?
|
347
|
-
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
|
348
|
-
@o[:table] = "#{@o[:blast]}.table"
|
349
|
-
end
|
350
|
-
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
351
|
-
|
352
|
-
puts "Testing environment." unless @o[:q]
|
353
|
-
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
354
|
-
|
355
|
-
puts "Reading files." unless @o[:q]
|
356
|
-
puts " * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
|
357
|
-
data = ROCData.new @o[:rocker]
|
358
|
-
if File.exist? @o[:table]
|
359
|
-
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
360
|
-
else
|
361
|
-
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
362
|
-
blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
|
363
|
-
end
|
364
|
-
|
365
|
-
puts "Plotting matches." unless @o[:q]
|
366
|
-
extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
|
367
|
-
@o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
|
368
|
-
data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
|
369
|
-
data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
|
370
|
-
some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
|
371
|
-
data.rrun "par(mar=c(0,4,0,0.5)+.1);"
|
372
|
-
data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
|
373
|
-
data.rrun "noise <- runif(ncol(x),-.2,.2)"
|
374
|
-
data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
|
375
|
-
data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"
|
376
|
-
|
377
|
-
puts "Plotting windows." unless @o[:q]
|
378
|
-
if some_thr
|
379
|
-
data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
|
380
|
-
data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
|
381
|
-
end
|
382
|
-
data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
|
383
|
-
"lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"
|
384
|
-
|
385
|
-
puts "Plotting alignment." unless @o[:q]
|
386
|
-
data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
|
387
|
-
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
|
388
|
-
i = 0
|
389
|
-
data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
|
390
|
-
data.aln.seqs.values.each do |s|
|
391
|
-
color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
|
392
|
-
data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
|
393
|
-
i += 1
|
394
|
-
end
|
395
|
-
|
396
|
-
puts "Plotting statistics." unless @o[:q]
|
397
|
-
data.rrun "par(mar=c(5,4,0,0.5)+.1);"
|
398
|
-
data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
|
399
|
-
if some_thr
|
400
|
-
sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
|
401
|
-
sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
|
402
|
-
ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
|
403
|
-
unless @o[:q]
|
404
|
-
puts " * sensitivity: #{sn}%"
|
405
|
-
puts " * specificity: #{sp}%"
|
406
|
-
puts " * accuracy: #{ac}%"
|
407
|
-
end
|
408
|
-
data.rrun "pos <- (w$V1+w$V2)/2"
|
409
|
-
if @o[:impact]
|
410
|
-
data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
411
|
-
data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
412
|
-
data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
413
|
-
else
|
414
|
-
data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
|
415
|
-
data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
|
416
|
-
data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
|
417
|
-
end
|
418
|
-
#data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
|
419
|
-
end
|
420
|
-
data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
|
421
|
-
data.rrun "dev.off();"
|
422
|
-
end # plot!
|
423
|
-
|
424
36
|
#================================[ Utilities ]
|
425
37
|
def blast2table(blast_f, table_f, aln, minscore)
|
426
38
|
ifh = File.open(blast_f, "r")
|
@@ -432,39 +44,6 @@ class ROCker
|
|
432
44
|
ifh.close
|
433
45
|
ofh.close
|
434
46
|
end
|
435
|
-
def genes2genomes(gene_ids)
|
436
|
-
genomes = []
|
437
|
-
ids = Array.new(gene_ids)
|
438
|
-
while ids.size>0
|
439
|
-
doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
|
440
|
-
genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
|
441
|
-
end
|
442
|
-
genomes.uniq
|
443
|
-
end
|
444
|
-
def genome2taxid(genome_id)
|
445
|
-
ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
|
446
|
-
return ln if ln.nil?
|
447
|
-
ln.sub(/.*"taxon:(\d+)".*/, "\\1")
|
448
|
-
end
|
449
|
-
def genome2taxon(genome_id, rank='species')
|
450
|
-
xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
|
451
|
-
xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
|
452
|
-
end
|
453
|
-
def restcall(url, outfile=nil)
|
454
|
-
response = RestClient.get url
|
455
|
-
raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
|
456
|
-
unless outfile.nil?
|
457
|
-
ohf = File.open(outfile, 'w')
|
458
|
-
ohf.print response.to_s
|
459
|
-
ohf.close
|
460
|
-
end
|
461
|
-
response.to_s
|
462
|
-
end
|
463
|
-
def ebiFetch(db, ids, format, outfile=nil)
|
464
|
-
url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
|
465
|
-
$stderr.puts " # Calling: #{url}" if @o[:debug]
|
466
|
-
self.restcall url
|
467
|
-
end
|
468
47
|
def bash(cmd, err_msg=nil)
|
469
48
|
o = `#{cmd} 2>&1 && echo '{'`
|
470
49
|
raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
|
@@ -473,6 +52,14 @@ class ROCker
|
|
473
52
|
end
|
474
53
|
|
475
54
|
#================================[ Extensions ]
|
55
|
+
# To ROCker
|
56
|
+
require 'rocker/step/build'
|
57
|
+
require 'rocker/step/compile'
|
58
|
+
require 'rocker/step/search'
|
59
|
+
require 'rocker/step/filter'
|
60
|
+
require 'rocker/step/plot'
|
61
|
+
|
62
|
+
# To other
|
476
63
|
class Numeric
|
477
64
|
def ordinalize
|
478
65
|
n= self.to_s
|
@@ -0,0 +1,389 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'json'
|
9
|
+
|
10
|
+
class ROCker
|
11
|
+
#================================[ Class ]
|
12
|
+
@@EBIREST = 'http://www.ebi.ac.uk/Tools'
|
13
|
+
@@DEFAULTS.merge!({:positive=>[], :negative=>[], :genomefrx=>1.0, :seqdepth=>0.03, :readlen=>100, :minovl=>50,
|
14
|
+
# Ext. Software
|
15
|
+
:aligner=>:clustalo, :simulator=>:grinder,
|
16
|
+
:simulatorbin=>{:grinder=>'grinder'},
|
17
|
+
:simulatorcmd=>{:grinder=>'%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'},
|
18
|
+
:alignerbin=>{:muscle=>'muscle', :clustalo=>'clustalo'},
|
19
|
+
:alignercmd=>{:muscle=>'%1$s -in "%2$s" -out "%3$s" -quiet', :clustalo=>'%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'}
|
20
|
+
})
|
21
|
+
@@HAS_BUILD_GEMS = nil
|
22
|
+
def self.ebirest() @@EBIREST ; end
|
23
|
+
def self.has_build_gems?
|
24
|
+
return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
|
25
|
+
@@HAS_BUILD_GEMS = TRUE
|
26
|
+
begin
|
27
|
+
require 'rubygems'
|
28
|
+
require 'restclient'
|
29
|
+
rescue LoadError
|
30
|
+
@@HAS_BUILD_GEMS = FALSE
|
31
|
+
end
|
32
|
+
@@HAS_BUILD_GEMS
|
33
|
+
end
|
34
|
+
|
35
|
+
#================================[ Utilities ]
|
36
|
+
def genes2genomes(gene_ids)
|
37
|
+
genomes = []
|
38
|
+
ids = Array.new(gene_ids)
|
39
|
+
while ids.size>0
|
40
|
+
doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
|
41
|
+
genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
|
42
|
+
r=ln.split('; ')
|
43
|
+
{:genome_id=>r[1], :transl_id=>r[2]}
|
44
|
+
end
|
45
|
+
end
|
46
|
+
genomes.uniq
|
47
|
+
end
|
48
|
+
def genome2taxid(genome_id)
|
49
|
+
ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
|
50
|
+
return ln if ln.nil?
|
51
|
+
ln.sub(/.*"taxon:(\d+)".*/, "\\1")
|
52
|
+
end
|
53
|
+
def genome2taxon(genome_id, rank='species')
|
54
|
+
xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
|
55
|
+
xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
|
56
|
+
end
|
57
|
+
def restcall(url, outfile=nil)
|
58
|
+
$stderr.puts " # Calling: #{url}" if @o[:debug]
|
59
|
+
response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
|
60
|
+
raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
|
61
|
+
unless outfile.nil?
|
62
|
+
ohf = File.open(outfile, 'w')
|
63
|
+
ohf.print response.to_s
|
64
|
+
ohf.close
|
65
|
+
end
|
66
|
+
response.to_s
|
67
|
+
end
|
68
|
+
def ebiFetch(db, ids, format, outfile=nil)
|
69
|
+
url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
|
70
|
+
res = self.restcall url
|
71
|
+
unless outfile.nil?
|
72
|
+
ohf = File.open(outfile, 'w')
|
73
|
+
ohf.print res
|
74
|
+
ohf.close
|
75
|
+
end
|
76
|
+
res
|
77
|
+
end
|
78
|
+
def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
|
79
|
+
positive_coords = {}
|
80
|
+
genomes_org = {}
|
81
|
+
i = 0
|
82
|
+
genome_ids.each do |genome_id|
|
83
|
+
print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids.size} in first thread. \r" if thread_id==0 and not @o[:q]
|
84
|
+
unless @o[:pertaxon].nil?
|
85
|
+
genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
|
86
|
+
genomes_org[ genome_taxon.to_sym ] ||= []
|
87
|
+
genomes_org[ genome_taxon.to_sym ] << genome_id
|
88
|
+
end
|
89
|
+
genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
|
90
|
+
if @o[:reuse] and File.size? genome_file
|
91
|
+
ifh = File.open(genome_file, 'r')
|
92
|
+
doc = ifh.readlines.grep(/^[^#]/)
|
93
|
+
ifh.close
|
94
|
+
else
|
95
|
+
genome_file=nil unless @o[:noclean]
|
96
|
+
doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
|
97
|
+
end
|
98
|
+
doc.each do |ln|
|
99
|
+
next if ln =~ /^#/
|
100
|
+
r = ln.chomp.split /\t/
|
101
|
+
next if r.size < 9
|
102
|
+
prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
|
103
|
+
p = prots.select{ |id| protein_ids.include? id }.first
|
104
|
+
trans = r[8].split(/;/).grep(/^protein_id=/){ |pid| pid.split(/=/)[1] }
|
105
|
+
t = trans.select{ |id| transl_ids.include? id }.first
|
106
|
+
next if p.nil? and t.nil?
|
107
|
+
positive_coords[ r[0].to_sym ] ||= []
|
108
|
+
positive_coords[ r[0].to_sym ] << {
|
109
|
+
:prot_id => p,
|
110
|
+
:tran_id => t,
|
111
|
+
:from => r[3].to_i,
|
112
|
+
:to => r[4].to_i,
|
113
|
+
:strand => r[6]
|
114
|
+
}
|
115
|
+
end
|
116
|
+
end
|
117
|
+
print "\n" if thread_id==0 and not @o[:q]
|
118
|
+
ofh = File.open json_file, "w"
|
119
|
+
ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
|
120
|
+
ofh.close
|
121
|
+
end
|
122
|
+
|
123
|
+
#================================[ Build ]
|
124
|
+
def build!
|
125
|
+
# Check requirements
|
126
|
+
puts "Testing environment." unless @o[:q]
|
127
|
+
@o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
|
128
|
+
@o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
|
129
|
+
@o[:alignercmd] = @o[:alignercmd][@o[:aligner]] if @o[:alignercmd].is_a? Hash
|
130
|
+
@o[:simulatorcmd] = @o[:simulatorcmd][@o[:simulator]] if @o[:simulatorcmd].is_a? Hash
|
131
|
+
@o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
|
132
|
+
@o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
|
133
|
+
@o[:nosearch]=true if @o[:nosimulate]
|
134
|
+
raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
|
135
|
+
@o[:positive] += @o[:posori] unless @o[:posori].nil?
|
136
|
+
@o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
|
137
|
+
@o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
|
138
|
+
unless @o[:aln].nil?
|
139
|
+
aln = Alignment.new
|
140
|
+
aln.read_fasta @o[:aln]
|
141
|
+
@o[:positive] += aln.get_ids
|
142
|
+
end
|
143
|
+
raise "-p or -P are mandatory." if @o[:positive].size==0
|
144
|
+
raise "-o/--baseout is mandatory." if @o[:baseout].nil?
|
145
|
+
if @o[:positive].size == 1 and not @o[:noaln]
|
146
|
+
warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
|
147
|
+
@o[:noaln] = true
|
148
|
+
end
|
149
|
+
unless @o[:nosimulate]
|
150
|
+
self.bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
|
151
|
+
end
|
152
|
+
unless @o[:noaln]
|
153
|
+
self.bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
|
154
|
+
self.bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
|
155
|
+
end
|
156
|
+
unless @o[:nosearch]
|
157
|
+
self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
|
158
|
+
self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
|
159
|
+
end
|
160
|
+
|
161
|
+
# Download genes
|
162
|
+
puts "Downloading gene data." unless @o[:q]
|
163
|
+
ref_file = @o[:baseout] + ".ref.fasta"
|
164
|
+
if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
|
165
|
+
puts " * reusing aligned sequences as positive set." unless @o[:q]
|
166
|
+
f = File.open(ref_file, "w")
|
167
|
+
f.print aln.to_seq_s
|
168
|
+
f.close
|
169
|
+
@o[:noaln] = true
|
170
|
+
elsif @o[:reuse] and File.size? ref_file
|
171
|
+
puts " * reusing positive set: #{ref_file}." unless @o[:q]
|
172
|
+
else
|
173
|
+
puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
|
174
|
+
$stderr.puts " # #{@o[:positive]}" if @o[:debug]
|
175
|
+
ids = Array.new(@o[:positive])
|
176
|
+
f = File.open(ref_file, "w")
|
177
|
+
while ids.size>0
|
178
|
+
f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
|
179
|
+
end
|
180
|
+
f.close
|
181
|
+
end
|
182
|
+
genome_ids = {:positive=>[], :negative=>[]}
|
183
|
+
transl_ids = {:positive=>[], :negative=>[]}
|
184
|
+
[:positive, :negative].each do |set|
|
185
|
+
unless @o[set].size==0
|
186
|
+
puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
|
187
|
+
$stderr.puts " # #{@o[set]}" if @o[:debug]
|
188
|
+
r = genes2genomes(@o[set])
|
189
|
+
genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
|
190
|
+
transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
|
191
|
+
end
|
192
|
+
end
|
193
|
+
raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
|
194
|
+
genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
|
195
|
+
raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
|
196
|
+
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
197
|
+
|
198
|
+
# Locate genes
|
199
|
+
puts "Analyzing genome data." unless @o[:q]
|
200
|
+
coords_file = @o[:baseout] + ".src.coords"
|
201
|
+
if @o[:reuse] and File.size? coords_file
|
202
|
+
puts " * reusing coordinates: #{coords_file}." unless @o[:q]
|
203
|
+
c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
|
204
|
+
positive_coords = c[:positive_coords]
|
205
|
+
genome_org = c[:genome_org]
|
206
|
+
else
|
207
|
+
thrs = [@o[:thr], genome_ids[:positive].size].min
|
208
|
+
puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
|
209
|
+
$stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
|
210
|
+
$stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
|
211
|
+
$stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
|
212
|
+
thr_obj = []
|
213
|
+
(0 .. (thrs-1)).each do |thr_i|
|
214
|
+
ids_to_parse = []
|
215
|
+
(0 .. (genome_ids[:positive].size-1)).each do |i|
|
216
|
+
ids_to_parse << genome_ids[:positive][i] if (i % thrs)==thr_i
|
217
|
+
end
|
218
|
+
json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
|
219
|
+
thr_obj << json_file
|
220
|
+
fork do
|
221
|
+
get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
|
222
|
+
end
|
223
|
+
end
|
224
|
+
Process.waitall
|
225
|
+
# Combine results
|
226
|
+
positive_coords = {}
|
227
|
+
genomes_org = {}
|
228
|
+
genome_org = {}
|
229
|
+
thr_obj.each do |t|
|
230
|
+
raise "Thread failed without error trace: #{t}" unless File.exist? t
|
231
|
+
o = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
|
232
|
+
o[:positive_coords].each_pair do |k,v|
|
233
|
+
positive_coords[ k ] ||= []
|
234
|
+
positive_coords[ k ] += v
|
235
|
+
end
|
236
|
+
o[:genomes_org].each_pair do |k,v|
|
237
|
+
genomes_org[ k ] ||= []
|
238
|
+
genomes_org[ k ] << v
|
239
|
+
end
|
240
|
+
File.unlink t
|
241
|
+
end
|
242
|
+
# Select one genome per taxon
|
243
|
+
unless @o[:pertaxon].nil?
|
244
|
+
genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
|
245
|
+
end
|
246
|
+
# Save coordinates
|
247
|
+
ofh = File.open(coords_file, "w")
|
248
|
+
ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
|
249
|
+
ofh.close
|
250
|
+
end
|
251
|
+
unless @o[:pertaxon].nil?
|
252
|
+
genome_ids[:positive] = genome_org.values
|
253
|
+
puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
|
254
|
+
end
|
255
|
+
all_genome_ids = genome_ids.values.reduce(:+).uniq
|
256
|
+
found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+).compact.uniq
|
257
|
+
unknown_pid = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil } }.reduce(:+).compact.uniq
|
258
|
+
raise "Cannot find the genomic location of any provided sequence." if found.nil?
|
259
|
+
missing = @o[:positive] - found
|
260
|
+
warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0
|
261
|
+
|
262
|
+
# Download genomes
|
263
|
+
genomes_file = @o[:baseout] + '.src.fasta'
|
264
|
+
if @o[:reuse] and File.size? genomes_file
|
265
|
+
puts " * reusing existing file: #{genomes_file}." unless @o[:q]
|
266
|
+
else
|
267
|
+
puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
|
268
|
+
$stderr.puts " # #{all_genome_ids}" if @o[:debug]
|
269
|
+
ids = Array.new(all_genome_ids)
|
270
|
+
ofh = File.open(genomes_file, 'w')
|
271
|
+
while ids.size>0
|
272
|
+
ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
|
273
|
+
end
|
274
|
+
ofh.close
|
275
|
+
end
|
276
|
+
|
277
|
+
# Generate metagenome
|
278
|
+
unless @o[:nosimulate]
|
279
|
+
puts "Generating in silico metagenome" unless @o[:q]
|
280
|
+
if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
|
281
|
+
puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
|
282
|
+
else
|
283
|
+
all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
|
284
|
+
thrs = [@o[:thr], all_src].min
|
285
|
+
puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
|
286
|
+
$stderr.puts " # #{positive_coords}" if @o[:debug]
|
287
|
+
thr_obj = []
|
288
|
+
seqs_per_thr = (all_src/thrs).ceil
|
289
|
+
(0 .. (thrs-1)).each do |thr_i|
|
290
|
+
output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
|
291
|
+
thr_obj << output
|
292
|
+
fork do
|
293
|
+
seqs_a = thr_i*seqs_per_thr + 1
|
294
|
+
seqs_b = [seqs_a + seqs_per_thr, all_src].min
|
295
|
+
# Create sub-fasta
|
296
|
+
ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w")
|
297
|
+
ifh = File.open("#{@o[:baseout]}.src.fasta", "r")
|
298
|
+
seq_i = 0
|
299
|
+
while l = ifh.gets
|
300
|
+
seq_i+=1 if l =~ /^>/
|
301
|
+
break if seq_i > seqs_b
|
302
|
+
ofh.print l if seq_i >= seqs_a
|
303
|
+
end
|
304
|
+
ifh.close
|
305
|
+
ofh.close
|
306
|
+
|
307
|
+
# Run simulator (except if the temporal file is already there and can be reused)
|
308
|
+
unless @o[:reuse] and File.size? @o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa"
|
309
|
+
bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
|
310
|
+
end
|
311
|
+
|
312
|
+
# Tag positives
|
313
|
+
puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
|
314
|
+
ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
|
315
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
|
316
|
+
while l = ifh.gets
|
317
|
+
if l =~ /^>/
|
318
|
+
rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
|
319
|
+
raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
|
320
|
+
positive = false
|
321
|
+
positive_coords[rd[:genome_id].to_sym] ||= []
|
322
|
+
positive_coords[rd[:genome_id].to_sym].each do |gn|
|
323
|
+
left = rd[:to].to_i - gn[:from]
|
324
|
+
right = gn[:to] - rd[:from].to_i
|
325
|
+
if (left*right >= 0) and ([left, right].min >= @o[:minovl])
|
326
|
+
positive = true
|
327
|
+
break
|
328
|
+
end
|
329
|
+
end
|
330
|
+
l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
|
331
|
+
"ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
|
332
|
+
end
|
333
|
+
ofh.print l
|
334
|
+
end
|
335
|
+
ofh.close
|
336
|
+
ifh.close
|
337
|
+
end # fork
|
338
|
+
end # (1 .. thrs).each
|
339
|
+
Process.waitall
|
340
|
+
# Concatenate results
|
341
|
+
ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
|
342
|
+
thr_obj.each do |t|
|
343
|
+
raise "Thread failed without error trace: #{t}" unless File.exist? t
|
344
|
+
ifh = File.open(t, "r")
|
345
|
+
while l = ifh.gets
|
346
|
+
ofh.print l
|
347
|
+
end
|
348
|
+
ifh.close
|
349
|
+
File.unlink t
|
350
|
+
end
|
351
|
+
ofh.close
|
352
|
+
end
|
353
|
+
end # unless @o[:nosimulate]
|
354
|
+
|
355
|
+
# Align references
|
356
|
+
unless @o[:noaln]
|
357
|
+
puts "Aligning reference set." unless @o[:q]
|
358
|
+
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
|
359
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
|
360
|
+
else
|
361
|
+
bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
|
362
|
+
puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
|
363
|
+
end
|
364
|
+
end
|
365
|
+
|
366
|
+
# Run similarity search
|
367
|
+
unless @o[:nosearch]
|
368
|
+
puts "Running homology search." unless @o[:q]
|
369
|
+
if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
|
370
|
+
puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
|
371
|
+
else
|
372
|
+
puts " * preparing database." unless @o[:q]
|
373
|
+
bash sprintf(@o[:makedbcmd][@o[:search]], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
|
374
|
+
puts " * running similarity search." unless @o[:q]
|
375
|
+
bash sprintf(@o[:searchcmd][@o[:search]], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
|
376
|
+
end
|
377
|
+
end
|
378
|
+
|
379
|
+
# Clean
|
380
|
+
unless @o[:noclean]
|
381
|
+
puts "Cleaning." unless @o[:q]
|
382
|
+
sff = %w{.src.xml .src.fasta}
|
383
|
+
sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
|
384
|
+
sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
|
385
|
+
sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
|
386
|
+
end
|
387
|
+
end # build!
|
388
|
+
end # ROCker
|
389
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
|
9
|
+
#================================[ Class ]
|
10
|
+
@@DEFAULTS.merge!({:refine=>true, :win=>20, :minscore=>0})
|
11
|
+
|
12
|
+
#================================[ Compile ]
|
13
|
+
def compile!
|
14
|
+
raise "-a/--alignment is mandatory." if @o[:aln].nil?
|
15
|
+
raise "-a/--alignment must exist." unless File.exist? @o[:aln]
|
16
|
+
if @o[:table].nil?
|
17
|
+
raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil? or not File.exist? @o[:blast]
|
18
|
+
@o[:table] = "#{@o[:blast]}.table"
|
19
|
+
else
|
20
|
+
@o[:reuse] = true
|
21
|
+
end
|
22
|
+
raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
|
23
|
+
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
24
|
+
|
25
|
+
puts "Testing environment." unless @o[:q]
|
26
|
+
bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
|
27
|
+
bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."
|
28
|
+
|
29
|
+
puts "Reading files." unless @o[:q]
|
30
|
+
puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
|
31
|
+
aln = Alignment.new
|
32
|
+
aln.read_fasta @o[:aln]
|
33
|
+
|
34
|
+
if @o[:reuse] and File.exist? @o[:table]
|
35
|
+
puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
|
36
|
+
else
|
37
|
+
puts " * generating table: #{@o[:table]}." unless @o[:q]
|
38
|
+
blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
|
39
|
+
end
|
40
|
+
|
41
|
+
puts "Analyzing data." unless @o[:q]
|
42
|
+
puts " * computing windows." unless @o[:q]
|
43
|
+
data = ROCData.new(@o[:table], aln, @o[:win])
|
44
|
+
data.nucl = @o[:nucl]
|
45
|
+
if @o[:refine]
|
46
|
+
puts " * refining windows." unless @o[:q]
|
47
|
+
warn "Insufficient hits to refine results." unless data.refine! @o[:table]
|
48
|
+
end
|
49
|
+
puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
|
50
|
+
data.save @o[:rocker]
|
51
|
+
end # compile!
|
52
|
+
end # ROCker
|
53
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
|
9
|
+
#================================[ Class ]
|
10
|
+
#@@DEFAULTS.merge!({ })
|
11
|
+
|
12
|
+
#================================[ Filter ]
|
13
|
+
def filter!
|
14
|
+
raise "-k/--rocker is mandatory." if @o[:rocker].nil?
|
15
|
+
raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
|
16
|
+
raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
|
17
|
+
|
18
|
+
puts "Reading ROCker file." unless @o[:q]
|
19
|
+
data = ROCData.new @o[:rocker]
|
20
|
+
|
21
|
+
puts "Filtering BLAST." unless @o[:q]
|
22
|
+
ih = File.open(@o[:qblast], 'r')
|
23
|
+
oh = File.open(@o[:oblast], 'w')
|
24
|
+
while ln = ih.gets
|
25
|
+
bh = BlastHit.new(ln, data.aln)
|
26
|
+
oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
|
27
|
+
end
|
28
|
+
ih.close
|
29
|
+
oh.close
|
30
|
+
end # filter!
|
31
|
+
end # ROCker
|
32
|
+
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
  #================================[ Class ]
  # Options specific to the plot step, merged into the shared defaults.
  @@DEFAULTS.merge!({:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true, :sbj=>[]})

  #================================[ Plot ]
  ##
  # Draws a three-panel figure (alignment, matches with windows, and
  # per-window statistics) by sending R commands through +ROCData#rrun+.
  #
  # Options read from +@o+:
  # * +:rocker+ (mandatory): path to the ROCker file to load.
  # * +:table+: path to the hits table; defaults to "<blast>.table" and is
  #   generated with +blast2table+ when the file does not exist yet.
  # * +:blast+: search result; mandatory whenever the table must be built.
  # * +:r+: R executable, probed before anything else is done.
  # * +:gformat+, +:gout+, +:width+, +:height+: graphic device and output
  #   file (+:gout+ defaults to "<rocker>.<gformat>").
  # * +:sbj+, +:minscore+, +:color+, +:transparency+, +:impact+, +:ylim+,
  #   +:q+: plot tuning and quiet mode.
  #
  # Raises RuntimeError when a mandatory option is missing.
  def plot!
    # Use @o directly, like every other access in this method, instead of
    # relying on an +o+ reader being defined elsewhere.
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    if @o[:table].nil?
      raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
      @o[:table] = "#{@o[:blast]}.table"
    end
    raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? && !File.exist?(@o[:table])

    puts "Testing environment." unless @o[:q]
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"

    puts "Reading files." unless @o[:q]
    puts "  * loading ROCker file: #{@o[:rocker]}." unless @o[:q]
    data = ROCData.new @o[:rocker]
    if File.exist? @o[:table]
      puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
    else
      puts "  * generating table: #{@o[:table]}." unless @o[:q]
      blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
    end

    puts "Plotting matches." unless @o[:q]
    # Non-PDF devices get explicit units and resolution.
    extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
    @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
    data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
    data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
    # Presumably loads the hits into R as `x` and the windows as `w`, and
    # returns a truthy value when thresholds are available -- confirm in ROCData.
    some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
    span_alpha = @o[:transparency] ? ".2" : "1"
    point_alpha = @o[:transparency] ? ".5" : "1"
    data.rrun "par(mar=c(0,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
    # NOTE(review): the jitter vector has ncol(x) entries and is recycled over
    # the rows of x; nrow(x) looks like the intended length -- confirm before changing.
    data.rrun "noise <- runif(ncol(x),-.2,.2)"
    data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{span_alpha}), rgb(.5,0,0,#{span_alpha})), length=0);"
    data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{point_alpha}), rgb(.5,0,0,#{point_alpha})), pch=19, cex=1/4);"

    puts "Plotting windows." unless @o[:q]
    if some_thr
      data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
      data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
    end
    data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
      "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

    puts "Plotting alignment." unless @o[:q]
    data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
    data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
    # One row of rectangles per aligned sequence; `row` is the lower edge.
    data.aln.seqs.values.each_with_index do |s, row|
      in_sbj = @o[:sbj].include?(s.id)
      color = s.aln.split(//).map do |c|
        if c=="-"
          "'grey80'"
        elsif in_sbj
          "'red'"
        elsif @o[:color]
          "clr[#{c.ord-64}]"
        else
          "'black'"
        end
      end.join(',')
      span = data.aln.cols-1
      data.rrun "rect((1:#{span})-0.5, rep(#{row}, #{span}), (1:#{span})+0.5, rep(#{row+1}, #{span}), col=c(#{color}), border=NA);"
    end

    puts "Plotting statistics." unless @o[:q]
    # An explicit :ylim wins; otherwise the range depends on :impact.
    ylim = @o[:ylim]
    ylim = (@o[:impact] ? "-2,.1" : "50,100") if ylim.nil?
    data.rrun "par(mar=c(5,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{ylim}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
    if some_thr
      # Overall statistics computed in R from the per-window counts.
      sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
      sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
      ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
      unless @o[:q]
        puts "  * sensitivity: #{sn}%"
        puts "  * specificity: #{sp}%"
        puts "  * accuracy: #{ac}%"
      end
      data.rrun "pos <- (w$V1+w$V2)/2"
      if @o[:impact]
        # Impact mode: per-window deviation from the overall value, weighted
        # by the window's share of the corresponding counts.
        data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      else
        data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      end
      #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
    end
    data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
    data.rrun "dev.off();"
  end # plot!
end # ROCker
|
93
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#
|
2
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
3
|
+
# @author Luis (Coto) Orellana
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jun-04-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Search ]
  ##
  # Placeholder for the search step. It checks that the +:rocker+ option is
  # set and then always raises, because the step is not implemented yet.
  #
  # Raises RuntimeError in every case: first for a missing +:rocker+ option,
  # otherwise with the "under development" notice.
  def search!
    if @o[:rocker].nil?
      raise "-k/--rocker is mandatory."
    end
    raise "Code Under development..."
    # ToDo
    # [ ... ]
  end # search!
end # ROCker
|
20
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-rocker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis (Coto) Orellana
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-05
|
12
|
+
date: 2015-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rest-client
|
@@ -25,6 +25,20 @@ dependencies:
|
|
25
25
|
- - ~>
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 1.7.3
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: json
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ~>
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 1.8.1
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ~>
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 1.8.1
|
28
42
|
description: Detecting and quantifying functional genes in short-read metagenomic
|
29
43
|
datasets
|
30
44
|
email: lhorellana@gatech.edu
|
@@ -40,6 +54,11 @@ files:
|
|
40
54
|
- lib/rocker/rocwindow.rb
|
41
55
|
- lib/rocker/rocdata.rb
|
42
56
|
- lib/rocker/rinterface.rb
|
57
|
+
- lib/rocker/step/build.rb
|
58
|
+
- lib/rocker/step/compile.rb
|
59
|
+
- lib/rocker/step/search.rb
|
60
|
+
- lib/rocker/step/filter.rb
|
61
|
+
- lib/rocker/step/plot.rb
|
43
62
|
- bin/ROCker
|
44
63
|
homepage: http://enve-omics.ce.gatech.edu/rocker
|
45
64
|
licenses:
|
@@ -51,9 +70,9 @@ require_paths:
|
|
51
70
|
- lib
|
52
71
|
required_ruby_version: !ruby/object:Gem::Requirement
|
53
72
|
requirements:
|
54
|
-
- -
|
73
|
+
- - ~>
|
55
74
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0'
|
75
|
+
version: '2.0'
|
57
76
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
77
|
requirements:
|
59
78
|
- - '>='
|