bio-rocker 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 51e493216ea815f9919322e9b1319f9914d824c7
4
+ data.tar.gz: 8c7bf556507ef0b11c5f2477fad589a62c016f11
5
+ SHA512:
6
+ metadata.gz: b57faf770219b8ab364b25f59696852f219a204f22b12be5fb89d7932e0cb10429316e0b7645bf7e7a18c56590a669f605f444e667cb4a9467626db100df0b34
7
+ data.tar.gz: 47223060bd29a689406067e9bff41352ffa841c8475a25943e83dc6cfc2901ae7ad8f3b8eea1a44c5cfbe0b6fccac45dd1cf493eb0000ed19877f451bf0463e8
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @author Luis (Coto) Orellana
5
+ # @license artistic license 2.0
6
+ # @update Jan-22-2015
7
+ #
8
+
9
+ require 'rocker'
10
+ require 'optparse'
11
+
12
+
13
+ #================================[ Options parsing ]
14
# Maps every supported task name to the one-line description printed in the help.
$t = {
  'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
  'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
  'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
  'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
  'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
}
# The first command-line argument selects the task; if nothing else is left, show the help.
task = (ARGV.size > 0 ? ARGV.shift : '').downcase
ARGV << '-h' if ARGV.size==0

# Options collected from the command line; handed to ROCker.new further below.
o = {}
opts = OptionParser.new do |opt|
  if $t.keys.include? task
    opt.banner = "Usage: ROCker.rb #{task} [options]"
    opt.separator ""
    opt.separator $t[task]
    opt.separator ""
  end
  # Each task registers its own switches; unknown tasks only get the task list.
  case task
  when 'build'
    unless ROCker.has_build_gems?
      opt.separator "+ UNSATISFIED REQUIREMENTS"
      opt.separator " The building task requires uninstalled gems, please install them executing:"
      opt.separator " gem install rest_client"
      opt.separator " gem install nokogiri"
      opt.separator ""
    end
    opt.separator "+ BUILDING ARGUMENTS"
    opt.on("-p", "--positive GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
    opt.on("-n", "--negative GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
    opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
    opt.on("--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
    opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
    opt.separator ""
    opt.separator "+ ADVANCED BUILDING ARGUMENTS"
    opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one GI per line. If used, -p is not required."){ |v| o[:posfile]=v }
    opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one GI per line."){ |v| o[:negfile]=v }
    opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain GI numbers. If used, -p is not required."){ |v| o[:aln]=v }
    opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
    opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
    opt.on("--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
    opt.on("--per-genus", "If selected, only one genome per genus is used to build the metagenome."){ o[:pergenus]=true }
    opt.on("--per-species", "If selected, only one genome per species is used to build the metagenome."){ o[:perspecies]=true }
    opt.on("--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
    opt.on("--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
    opt.on("--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
    opt.on("--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
    opt.on("--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
    opt.separator ""
    opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
    opt.on("-G", "--grinder PATH", "Path to the grinder executable. By default: '#{ROCker.default :grinder}' (in the $PATH)."){ |v| o[:grinder]=v }
    opt.on("-M", "--muscle PATH", "Path to the muscle executable. By default: '#{ROCker.default :muscle}' (in the $PATH)."){ |v| o[:muscle]=v }
    opt.on("-B", "--blastbins PATH", "Path to the Blast+ executables. By default: '#{ROCker.default :blastbins}' (in the $PATH)."){ |v| o[:blastbins]=v }
    opt.on("--grinder-cmd STR", "Command calling grinder, where %1$s: grinder bin, %2$s: input, %3$s: seq. depth, %4$s: output.",
      "By default: '#{ROCker.default :grindercmd}'."){ |v| o[:grindercmd]=v }
    opt.on("--muscle-cmd STR", "Command calling muscle, where %1$s: muscle bin, %2$s: input, %3$s: output.",
      "By default: '#{ROCker.default :musclecmd}'."){ |v| o[:musclecmd]=v }
    opt.on("--blast-cmd STR", "Command calling BLAST search, where %1$s: blast bins, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
      "By default: '#{ROCker.default :blastcmd}'."){ |v| o[:blastcmd]=v }
    opt.on("--makedb-cmd STR", "Command calling BLAST format, where %1$s: blast bins, %2$s: dbtype, %3$s: input, %4$s: database.",
      "By default: '#{ROCker.default :makedbcmd}'."){ |v| o[:makedbcmd]=v }
  when 'compile'
    opt.separator "+ COMPILATION ARGUMENTS"
    opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
    opt.on("-b", "--ref-blast PATH",
      "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
    opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
    opt.on("--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
    opt.separator ""
    opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
    opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
    opt.on("--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
    opt.on("--norefine", "Do not refine windows."){ o[:refine]=false }
    opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
    opt.separator ""
    opt.separator "+ INPUT/OUTPUT"
    opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
    opt.separator " coincide with those from the BLAST (-b)."
    opt.separator " o The input BLAST (-b) MUST be in tabular format. True positives must"
    opt.separator " contain the string '@%' somewhere in the query ID."
    opt.separator " o The table file (-t) should be tab-delimited and contain six columns:"
    opt.separator " 1. Subject ID."
    opt.separator " 2. Start of alignment in subject (translated to alignment column)."
    opt.separator " 3. End of alignment in subject (translated to alignment column)."
    opt.separator " 4. Bit score."
    opt.separator " 5. A number indicating if it was a true (1) or a false (0) positive."
    opt.separator " 6. Mid-point of the alignment in the reference sequence."
    opt.separator " o The ROCker file (-k) is a tab-delimited file containing five columns:"
    opt.separator " 1. First column of the window in the alignment."
    opt.separator " 2. Last column of the window in the alignment."
    opt.separator " 3. Number of positives in the window (hits)."
    opt.separator " 4. Number of true positives in the window."
    opt.separator " 5. Bit score threshold set for the window."
    opt.separator " The file also contains the alignment (commented with #:)."
    opt.separator ""
  when 'filter'
    opt.separator "+ FILTERING ARGUMENTS"
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
    opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
  when 'search'
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
    opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
  when 'plot'
    opt.separator "+ PLOTTING ARGUMENTS"
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-b", "--ref-blast PATH",
      "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
    opt.on("-o", "--plot-file PATH", "File to be created with the plot. By default: value of -k + '.' + value of -f."){ |v| o[:gout]=v }
    opt.separator ""
    opt.separator "+ ADVANCED PLOTTING ARGUMENTS"
    opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
    opt.on("--color", "Color alignment by amino acid."){ o[:color]=true }
    opt.on("--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
    opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
      "Plot only information regarding this(ese) subject(s). If multiple, separate by comma. By default, all hits are plotted."){ |v| o[:sbj]=v }
    opt.on("-f", "--plot-format STRING",
      "Format of the plot file. Supported values: pdf (default), png, jpeg, and tiff."){ |v| o[:gformat]=v }
    opt.on("-W", "--width NUMBER", "Width of the plot in inches. By default: #{ROCker.default :width}."){ |v| o[:width]=v.to_f }
    # The height must be stored under :height; it used to overwrite :width.
    opt.on("-H", "--height NUMBER", "Height of the plot in inches. By default: #{ROCker.default :height}."){ |v| o[:height]=v.to_f }
  else
    opt.banner = "Usage: ROCker.rb [task] [options]"
    opt.separator ""
    opt.separator "Please specify one of the following tasks:"
    $t.keys.each{ |t| opt.separator " #{t}:\t#{$t[t]}" }
  end
  opt.separator ""
  opt.separator "+ GENERAL ARGUMENTS"
  opt.on("-R", "--path-to-r PATH", "Path to the R executable to be used. By default: '#{ROCker.default :r}'."){ |v| o[:r]=v }
  opt.on("-q", "--quiet", "Run quietly."){ |v| o[:q]=true }
  opt.on("-d", "--debug", "Display debugging information."){ |v| o[:debug]=true }
  opt.on("-h", "--help","Display this screen") do
    puts opt
    exit
  end
  opt.separator ""
  # Without a recognized task there is nothing to run: print the task list and stop.
  unless $t.include? task
    puts opt
    exit
  end
end
opts.parse!
157
+
158
+
159
+ #================================[ Main ]
160
# Build the ROCker object from the parsed options and run the requested task.
rocker = ROCker.new(o)
begin
  case task
  when 'build'
    rocker.build!
  when 'compile'
    rocker.compile!
  when 'filter'
    rocker.filter!
  when 'search'
    rocker.search!
  when 'plot'
    rocker.plot!
  end
rescue => err
  # Report the failure with its backtrace on stderr...
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # ...and make it visible to the calling shell or pipeline through the exit status.
  exit 1
end
179
+
180
+
@@ -0,0 +1,503 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/blasthit'
9
+ require 'rocker/rocdata'
10
+
11
+ class ROCker
12
+ #================================[ Class ]
13
+ @@EUTILS = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'
14
+ @@DEFAULTS = {
15
+ # General
16
+ :q=>false, :r=>'R', :nucl=>false, :debug=>false,
17
+ # Build
18
+ :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0, :pergenus=>false, :perspecies=>false,
19
+ # ext. software
20
+ :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
21
+ :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "poly4 3e-3 3.3e-8" -mr "95 5" -rd "100 uniform 5"',
22
+ :musclecmd=>'%1$s -in "%2$s" -out "%3$s" -quiet',
23
+ :blastcmd=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
24
+ :makedbcmd=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
25
+ # Compile
26
+ :refine=>true, :win=>20, :minscore=>0,
27
+ # Filter
28
+ :sbj=>[],
29
+ # Plot
30
+ :color=>false, :gformat=>'pdf', :width=>9, :height=>9
31
+ }
32
+ @@HAS_BUILD_GEMS = nil
33
+ def self.eutils() @@EUTILS end
34
+ def self.defaults() @@DEFAULTS end
35
+ def self.default(k) @@DEFAULTS[k] end
36
# Tells whether the gems required by the 'build' task can be loaded.
# The answer is computed on the first call and cached in @@HAS_BUILD_GEMS.
#
# @return [Boolean] true if rubygems, restclient and nokogiri were all required
#   without a LoadError, false otherwise.
def self.has_build_gems?
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
  # Use the true/false literals: the TRUE/FALSE constants no longer exist in
  # current Ruby versions.
  @@HAS_BUILD_GEMS = begin
    require 'rubygems'
    require 'restclient'
    require 'nokogiri'
    true
  rescue LoadError
    false
  end
end
48
+
49
+ #================================[ Instance ]
50
+ attr_reader :o
51
# Creates a ROCker object whose options are the class defaults overridden by opts.
#
# @param opts [Hash] option values keyed by symbol; if :r is given it is also
#   assigned to RInterface.R_BIN.
def initialize(opts)
  # Work on a copy: writing into the hash returned by ROCker.defaults would
  # change the class-wide defaults for every later instance and for ROCker.default.
  @o = ROCker.defaults.dup
  opts.each{ |k,v| @o[k] = v }
  RInterface.R_BIN = opts[:r] unless opts[:r].nil?
end
56
+
57
+ #================================[ Build ]
58
# Simulates and tags the reads of one slice of the source genomes (helper of build!).
#
# Copies sequences number seqs_a to seqs_b (1-based, inclusive) from
# "<baseout>.src.fasta" into "<baseout>.src.fasta.<thr_i>", runs the configured
# grinder command on that file, and rewrites the deflines of the resulting
# "<baseout>.mg.tmp.<thr_i>-reads.fa" into "<baseout>.mg.fasta.<thr_i>", adding
# the '@%' mark to reads that overlap a positive gene.
#
# @param thr_i [Integer] index of the slice, used as suffix of the files.
# @param seqs_a [Integer] first sequence of the slice.
# @param seqs_b [Integer] last sequence of the slice.
# @param positive_coords [Hash] genome GI => array of {:gi, :from, :to} hashes.
# @return [String] path to the tagged FastA file of this slice.
def build_metagenome_chunk(thr_i, seqs_a, seqs_b, positive_coords)
  base = @o[:baseout]
  sub_fasta = "#{base}.src.fasta.#{thr_i}"
  # Create sub-fasta
  File.open(sub_fasta, 'w') do |sub_out|
    File.open("#{base}.src.fasta", 'r') do |src_in|
      seq_i = 0
      while ln = src_in.gets
        seq_i += 1 if ln =~ /^>/
        break if seq_i > seqs_b
        sub_out.print ln if seq_i >= seqs_a
      end
    end
  end
  bash sprintf(@o[:grindercmd], @o[:grinder], sub_fasta, @o[:seqdepth], "#{base}.mg.tmp.#{thr_i}")
  # Tag positives
  puts " * tagging positive reads." unless @o[:q]
  output = "#{base}.mg.fasta.#{thr_i}"
  File.open(output, 'w') do |tag_out|
    File.foreach("#{base}.mg.tmp.#{thr_i}-reads.fa") do |ln|
      rd = /^>(?<id>\d+) reference=gi\|(?<gi>\d+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(ln)
      unless rd.nil?
        rd_from = rd[:from].to_i
        rd_to = rd[:to].to_i
        # Read-only lookup: the hash is shared by all threads.
        positive = (positive_coords[rd[:gi]] || []).any? do |gn|
          left = rd_to - gn[:from]
          right = gn[:to] - rd_from
          # Float division: the overlap is a fraction of the read length, and
          # must be comparable with fractional thresholds such as 0.75.
          (left*right >= 0) && ([left, right].min.to_f/(rd_to - rd_from) >= @o[:minovl])
        end
        ln = ">#{rd[:id]}#{positive ? "@%" : ""} ref=#{rd[:gi]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
      end
      tag_out.print ln
    end
  end
  output
end

# Runs the 'build' task: gathers the reference sequences and their genomes from
# NCBI, simulates a metagenome with grinder (tagging the reads derived from the
# positive set), aligns the references with muscle and searches the metagenome
# against them with BLAST. Every step honours the corresponding @o switches
# (:nomg, :noaln, :noblast, :reuse, :noclean, :q, :debug).
#
# Raises a RuntimeError if requirements or mandatory options are missing, if no
# genome can be associated to the positive set, or if an external command fails.
def build!
  # Check requirements
  puts "Testing environment." unless @o[:q]
  @o[:noblast]=true if @o[:nomg]
  raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
  @o[:positive] += @o[:posori] unless @o[:posori].nil?
  @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
  @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
  unless @o[:aln].nil?
    aln = Alignment.new
    aln.read_fasta @o[:aln]
    @o[:positive] += aln.get_gis
  end
  raise "-p or -P are mandatory." if @o[:positive].size==0
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
  if @o[:positive].size == 1 and not @o[:noaln]
    warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
    @o[:noaln] = true
  end
  bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
  bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
  bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]
  # Download genes
  puts "Downloading gene data." unless @o[:q]
  File.open(@o[:baseout] + '.ref.fasta', 'w') do |f|
    if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
      puts " * re-using aligned sequences as positive set." unless @o[:q]
      f.print aln.to_seq_s
      @o[:noaln] = true
    else
      puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
      $stderr.puts " # #{@o[:positive]}" if @o[:debug]
      # Request the sequences in batches of 200 IDs.
      ids = Array.new(@o[:positive])
      while ids.size>0
        f.print efetch({:db=>(@o[:nucl] ? 'nuccore' : 'protein'), :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
      end
    end
  end
  genome_gis = {:positive=>[], :negative=>[]}
  [:positive, :negative].each do |set|
    unless @o[set].size==0
      puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
      $stderr.puts " # #{@o[set]}" if @o[:debug]
      genome_gis[set] = genes2genomes(@o[set], @o[:nucl])
    end
  end
  raise "No genomes associated with the positive set." if genome_gis[:positive].size==0
  genome_gis[:positive] = genome_gis[:positive].sample( (genome_gis[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
  raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_gis[:positive].empty?
  all_gis = genome_gis.values.reduce(:+).uniq

  # Locate genes
  puts "Analyzing genome data." unless @o[:q]
  puts " * downloading and parsing #{genome_gis[:positive].size} XML file(s)." unless @o[:q]
  $stderr.puts " # #{genome_gis[:positive]}" if @o[:debug]
  positive_coords = {}
  genome_org = {}
  i = 0
  genome_gis[:positive].each do |gi|
    # Count every genome, also in quiet mode: the counter names the XML file below.
    i += 1
    print " * scanning #{i.ordinalize} genome out of #{genome_gis[:positive].size}. \r" unless @o[:q]
    $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
    genome_file = @o[:baseout] + '.src.' + i.to_s + '.xml'
    if @o[:reuse] and File.exist? genome_file
      puts " * reusing existing file: #{genome_file}." unless @o[:q]
      doc = File.open(genome_file, 'r') { |ifh| Nokogiri::XML( ifh ) }
    else
      # The XML is only saved to disk when intermediate files are to be kept.
      genome_file=nil unless @o[:noclean]
      res = efetch({:db=>'nuccore', :id=>gi, :rettype=>'xml', :retmode=>'text'}, genome_file)
      doc = Nokogiri::XML( res )
    end
    incomplete = true
    doc.xpath('//Bioseq-set/Bioseq-set_seq-set/Seq-entry').each do |genome|
      genome_gi = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_seq-set/Seq-entry/Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_gi')
      if !genome_gi.nil? and gi==genome_gi.content
        incomplete = false
        positive_coords[gi] ||= []
        $stderr.puts "\n # got #{gi}, scanning" if @o[:debug]
        if @o[:pergenus] or @o[:perspecies]
          name = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/OrgName_name_binomial/BinomialOrgName')
          unless name.nil?
            name_g = name.at_xpath('./BinomialOrgName_genus')
            name_s = name.at_xpath('./BinomialOrgName_species')
            if name_g.nil? or (name_s.nil? and @o[:perspecies])
              name = nil
            else
              name = @o[:perspecies] ? name_g.content + " " + name_s.content : name_g.content
            end
          end
          if name.nil?
            warn "WARNING: Cannot find binomial name of #{gi}, using genome regardless of taxonomy."
            # A random key, so the genome never collides with another taxon.
            name = rand(36**100).to_s(36)
          end
          # Only the first genome of each genus/species is scanned.
          break unless genome_org[ name ].nil?
          genome_org[ name ] = gi
        end
        $stderr.puts " # traversing #{gi}" if @o[:debug]
        genome.xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_annot/Seq-annot/Seq-annot_data/Seq-annot_data_ftable/Seq-feat').each do |pr|
          pr_gi = pr.at_xpath('./Seq-feat_product/Seq-loc/Seq-loc_whole/Seq-id/Seq-id_gi')
          next if pr_gi.nil?
          if @o[:positive].include? pr_gi.content
            $stderr.puts " # found #{pr_gi.content}" if @o[:debug]
            pr_loc = pr.at_xpath('./Seq-feat_location/Seq-loc/Seq-loc_int/Seq-interval')
            if pr_loc.nil?
              # No single interval: look for the intervals of a composite location.
              pr_loc = pr.xpath('./Seq-feat_location/Seq-loc/Seq-loc_mix//Seq-loc/Seq-loc_int/Seq-interval')
              # xpath returns a (possibly empty) node set, never nil.
              if pr_loc.empty?
                warn "WARNING: Impossible to find location of '#{pr_gi.content}' in '#{gi}'."
                incomplete = true
              else
                pr_loc.each do |loc_int|
                  positive_coords[gi] << {
                    :gi => pr_gi.content,
                    :from => loc_int.at_xpath('./Seq-interval_from').content.to_i,
                    :to => loc_int.at_xpath('./Seq-interval_to').content.to_i
                    #, :strand => loc_int.at_xpath('./Seq-interval_strand/Na-strand/@value').content
                  }
                end
              end
            else
              positive_coords[gi] << {
                :gi => pr_gi.content,
                :from => pr_loc.at_xpath('./Seq-interval_from').content.to_i,
                :to => pr_loc.at_xpath('./Seq-interval_to').content.to_i
                #, :strand => pr_loc.at_xpath('./Seq-interval_strand/Na-strand/@value').content
              }
            end
          end
        end
        break
      end
    end
    doc = nil
    warn "WARNING: Cannot find GI '#{gi}'." if incomplete
  end
  genome_gis[:positive] = genome_org.values if @o[:pergenus] or @o[:perspecies]
  all_gis = genome_gis.values.reduce(:+).uniq
  print "\n" unless @o[:q]
  # flat_map yields [] when no coordinates were collected at all.
  missing = @o[:positive] - positive_coords.values.flat_map{ |a| a.map{ |b| b[:gi] } }
  warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or @o[:pergenus] or @o[:perspecies]

  # Download genomes
  genomes_file = @o[:baseout] + '.src.fasta'
  if @o[:reuse] and File.exist? genomes_file
    puts " * reusing existing file: #{genomes_file}." unless @o[:q]
  else
    puts " * downloading #{all_gis.size} genome(s) in FastA." unless @o[:q]
    $stderr.puts " # #{all_gis}" if @o[:debug]
    ids = Array.new(all_gis)
    File.open(genomes_file, 'w') do |ofh|
      while ids.size>0
        ofh.print efetch({:db=>'nuccore', :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
      end
    end
  end

  # Generate metagenome
  tmp_files = [] # per-thread intermediate files, removed in the cleaning step
  unless @o[:nomg]
    puts "Generating in silico metagenome" unless @o[:q]
    if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
      puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
    else
      all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
      thrs = [@o[:thr], all_src].min
      raise "Cannot generate the metagenome: no source sequences found or invalid number of threads (-t)." if thrs < 1
      puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
      $stderr.puts " # #{positive_coords}" if @o[:debug]
      thr_obj = (0 .. (thrs-1)).map do |thr_i|
        tmp_files += ["#{@o[:baseout]}.src.fasta.#{thr_i}", "#{@o[:baseout]}.mg.tmp.#{thr_i}-reads.fa", "#{@o[:baseout]}.mg.tmp.#{thr_i}-ranks.txt"]
        # Contiguous, non-overlapping slices that together cover sequences
        # 1..all_src; none is empty because thrs <= all_src.
        seqs_a = thr_i*all_src/thrs + 1
        seqs_b = (thr_i+1)*all_src/thrs
        Thread.new(thr_i, seqs_a, seqs_b) do |slice_i, slice_a, slice_b|
          Thread.current[:output] = build_metagenome_chunk(slice_i, slice_a, slice_b, positive_coords)
        end
      end
      # Concatenate results
      File.open(@o[:baseout] + ".mg.fasta", 'w') do |ofh|
        thr_obj.each do |t|
          t.join
          raise "Thread failed without error trace: #{t}" if t[:output].nil?
          File.foreach(t[:output]) { |l| ofh.print l }
          File.unlink t[:output]
        end
      end
    end
  end # unless @o[:nomg]
  # Align references
  unless @o[:noaln]
    puts "Aligning reference set." unless @o[:q]
    if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
      puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
    else
      bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
      puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
    end
  end
  # Run BLAST
  unless @o[:noblast]
    puts "Running homology search." unless @o[:q]
    if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
      puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
    else
      puts " * preparing database." unless @o[:q]
      bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
      puts " * running BLAST." unless @o[:q]
      bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
    end
  end
  # Clean
  unless @o[:noclean]
    puts "Cleaning." unless @o[:q]
    sff = %w{.src.xml .src.fasta}
    sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nomg]
    sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:noblast]
    (sff.map{ |sf| @o[:baseout] + sf } + tmp_files).each { |f| File.unlink f if File.exist? f }
  end
end # build!
314
+
315
+ #================================[ Compile ]
316
# Runs the 'compile' task: loads the reference alignment, creates (or reuses)
# the hits table, computes the windows with ROCData, optionally refines them,
# and saves the ROCker file.
#
# Raises a RuntimeError if a mandatory option is missing, if the alignment file
# does not exist, or if R or its pROC library cannot be run.
def compile!
  raise "-a/--alignment is mandatory." if @o[:aln].nil?
  raise "-a/--alignment must exist." unless File.exist? @o[:aln]
  if @o[:table].nil?
    raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
    # Default location of the table: next to the BLAST file.
    @o[:table] = "#{@o[:blast]}.table"
  end
  raise "-b/--ref-blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?

  puts "Testing environment." unless @o[:q]
  bash "echo '' | #{@o[:r]} --vanilla", "-R/--path-to-r must be executable. Is R installed?"
  bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

  puts "Reading files." unless @o[:q]
  puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
  aln = Alignment.new
  aln.read_fasta @o[:aln]

  if File.exist? @o[:table]
    puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
  else
    puts " * generating table: #{@o[:table]}." unless @o[:q]
    blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
  end

  puts "Analyzing data." unless @o[:q]
  puts " * computing windows." unless @o[:q]
  data = ROCData.new(@o[:table], aln, @o[:win])
  data.nucl = @o[:nucl]
  if @o[:refine]
    puts " * refining windows." unless @o[:q]
    warn "Insufficient hits to refine results." unless data.refine! @o[:table]
  end
  puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
  data.save @o[:rocker]
end # compile!
353
+
354
+ #================================[ Filter ]
355
# Runs the 'filter' task: copies from the query BLAST (@o[:qblast]) to the
# output BLAST (@o[:oblast]) the lines whose hit has a subject position and a
# bit score at or above the threshold of the ROCker window at its mid-point.
#
# Raises a RuntimeError if -k, -x or -o were not provided.
def filter!
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?
  raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
  raise "-o/--out-blast is mandatory." if @o[:oblast].nil?

  puts "Reading ROCker file." unless @o[:q]
  data = ROCData.new @o[:rocker]

  puts "Filtering BLAST." unless @o[:q]
  # Block form: both files are closed even if a line cannot be processed.
  File.open(@o[:qblast], 'r') do |ih|
    File.open(@o[:oblast], 'w') do |oh|
      ih.each_line do |ln|
        bh = BlastHit.new(ln, data.aln)
        oh.print ln if !bh.sfrom.nil? && bh.bits >= data.win_at_col(bh.midpoint).thr
      end
    end
  end
  nil
end # filter!
373
+ #================================[ Search ]
374
# Placeholder of the 'search' task: it validates -k and then always fails,
# since the task is not implemented yet.
def search!
  if @o[:rocker].nil?
    raise "-k/--rocker is mandatory."
  end
  # ToDo
  # [ ... ]
  raise "Code Under development..."
end # search!
380
+
381
+ #================================[ Plot ]
382
# Runs the 'plot' task: loads the ROCker file, creates (or reuses) the hits
# table, and sends R the commands that draw three panels into @o[:gout]: the
# hits with the window thresholds, the alignment, and the per-window statistics.
#
# Raises a RuntimeError if a mandatory option is missing or R cannot be run.
def plot!
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?
  if @o[:table].nil?
    raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
    # Default location of the table: next to the BLAST file.
    @o[:table] = "#{@o[:blast]}.table"
  end
  raise "-b/--ref-blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]

  puts "Testing environment." unless @o[:q]
  bash "echo '' | #{@o[:r]} --vanilla", "-R/--path-to-r must be executable. Is R installed?"

  puts "Reading files." unless @o[:q]
  puts " * loading ROCker file: #{@o[:rocker]}." unless @o[:q]
  data = ROCData.new @o[:rocker]
  if File.exist? @o[:table]
    puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
  else
    puts " * generating table: #{@o[:table]}." unless @o[:q]
    blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
  end

  puts "Plotting hits." unless @o[:q]
  # Formats other than pdf get explicit units and resolution.
  extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
  @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
  data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
  data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
  some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
  data.rrun "par(mar=c(0,4,0,0.5)+.1);"
  data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
  # One jitter value per hit (row of x), shared by the span and the mid-point of the hit.
  data.rrun "noise <- runif(nrow(x),-.2,.2)"
  data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,.2), rgb(.5,0,0,.2)), length=0);"
  data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,.5), rgb(.5,0,0,.5)), pch=19, cex=1/4);"

  puts "Plotting windows." unless @o[:q]
  if some_thr
    data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
    data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
  end
  data.rrun "legend('bottomright',legend=c('Hit span','Hit mid-point','Reference','Non-reference')," +
    "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

  puts "Plotting alignment." unless @o[:q]
  data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
  data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
  i = 0
  data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
  data.aln.seqs.values.each do |s|
    # One colour per alignment column: grey for gaps, red for the selected
    # subjects, and otherwise black or (with --color) a colour per character.
    color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
    data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
    i += 1
  end

  puts "Plotting statistics." unless @o[:q]
  data.rrun "par(mar=c(5,4,0,0.5)+.1);"
  unless @o[:q] or not some_thr
    puts " * sensitivity: #{data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float}%"
    puts " * specificity: #{data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float}%"
    puts " * accuracy: #{data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float}%"
  end
  data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(50,100),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
  if some_thr
    data.rrun "pos <- (w$V1+w$V2)/2"
    data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
    data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
    data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
    #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
  end
  data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
  data.rrun "dev.off();"
end # plot!
452
+
453
+ #================================[ Utilities ]
454
# Converts a tabular BLAST file into the hits table used by ROCData.
#
# @param blast_f [String] path to the tabular BLAST to read.
# @param table_f [String] path to the table to be created.
# @param aln [Alignment] alignment passed to each BlastHit.
# @param minscore [Numeric] hits with a lower bit score are not written.
# @return [nil]
def blast2table(blast_f, table_f, aln, minscore)
  # The input is opened first, so a missing BLAST file does not leave an empty
  # table behind; the block form closes both files even if a line fails.
  File.open(blast_f, "r") do |ifh|
    File.open(table_f, "w") do |ofh|
      ifh.each_line do |ln|
        bh = BlastHit.new(ln, aln)
        ofh.print bh.to_s if bh.bits >= minscore
      end
    end
  end
  nil
end
464
# Finds the nuccore entries linked to a list of gene GIs through elink,
# querying 200 GIs at a time.
#
# @param gis [Array<String>] GIs of the genes (left untouched).
# @param nucl [Boolean] true if the GIs are from nuccore, false if from protein.
# @return [Array<String>] the linked IDs, without duplicates.
def genes2genomes(gis, nucl=false)
  source_db = nucl ? 'nuccore' : 'protein'
  linked = []
  gis.each_slice(200) do |batch|
    xml = Nokogiri::XML( elink({:dbfrom=>source_db, :db=>'nuccore', :id=>batch.join(',')}) )
    xml.xpath('/eLinkResult/LinkSet/LinkSetDb/Link/Id').each { |node| linked << node.content }
  end
  linked.uniq
end
473
# Calls one NCBI EUtils script and returns the body of the response.
#
# @param script [String] name of the script, appended to ROCker.eutils.
# @param params [Hash] parameters of the request.
# @param outfile [String, nil] if given, the response is also saved there.
# @return [String] the response as a string.
# Raises a RuntimeError if the response code is not 200.
def eutils(script, params={}, outfile=nil)
  url = "#{ROCker.eutils}/#{script}"
  response = RestClient.get url, {:params=>params}
  unless response.code == 200
    raise "Unable to reach NCBI EUtils, error code #{response.code}."
  end
  File.open(outfile, 'w') { |fh| fh.print response.to_s } unless outfile.nil?
  response.to_s
end
483
# Shortcut of eutils for the 'efetch.fcgi' script; takes the remaining
# arguments of eutils (parameters and optional output file).
def efetch(*etc)
  eutils('efetch.fcgi', *etc)
end

# Shortcut of eutils for the 'elink.fcgi' script; takes the remaining
# arguments of eutils (parameters and optional output file).
def elink(*etc)
  eutils('elink.fcgi', *etc)
end
485
# Runs a command in the shell and fails unless it succeeds.
#
# @param cmd [String] the command to run.
# @param err_msg [String, nil] message of the error raised on failure; by
#   default, the command and its output.
# @return [true]
def bash(cmd, err_msg=nil)
  # A '{' is echoed only if the command succeeds, so it closes the captured
  # output (followed by a line break) exactly in that case.
  output = `#{cmd} 2>&1 && echo '{'`
  succeeded = (output[-2] == '{')
  unless succeeded
    raise(err_msg.nil? ? "Error executing: #{cmd}\n\n#{output}" : err_msg)
  end
  true
end
490
+ end
491
+
492
+ #================================[ Extensions ]
493
class Numeric
  # Returns the number as a string followed by its English ordinal suffix
  # ('st', 'nd', 'rd' or 'th'), e.g. "1st", "12th", "23rd".
  def ordinalize
    digits = self.to_s
    suffix =
      if digits[-2] == '1'
        # 10 to 19 at the end (11th, 12th, 113th...) always take 'th'.
        'th'
      else
        case digits[-1]
        when '1' then 'st'
        when '2' then 'nd'
        when '3' then 'rd'
        else 'th'
        end
      end
    digits + suffix
  end
end
503
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/sequence'
9
+
10
+
11
# A set of aligned sequences, indexed by sequence id.
class Alignment
  # seqs:: Hash of id => sequence object.
  # cols:: Number of alignment columns (nil until the first sequence is added).
  attr_reader :seqs, :cols

  def initialize
    @seqs = {}
  end

  # Loads the sequences from a FastA file.
  def read_fasta(file)
    read_file(file, false)
  end

  # Loads the sequences embedded in a ROCker file (lines prefixed with '#:').
  def read_rocker(file)
    read_file(file, true)
  end

  # Parses +file+ and adds every sequence found to the alignment.
  #
  # file::      Path to the file to read.
  # is_rocker:: When true, only lines starting with '#:' are considered, and
  #             that prefix is removed before parsing them as FastA.
  #
  # The file is opened in block form so the handle is always closed.
  # Raises RuntimeError (from #<<) if a sequence has a different length.
  def read_file(file, is_rocker)
    id = nil
    sq = ""
    File.open(file, 'r') do |f|
      while ln = f.gets
        if is_rocker
          prefixed = /^#:(.*)/.match(ln)
          next if prefixed.nil?
          ln = prefixed[1]
        end
        header = /^>(\S+)/.match(ln)
        if header.nil?
          sq += ln
        else
          # A new header closes the previous entry (if any).
          self << Sequence.new(id, sq) unless id.nil?
          id = header[1]
          sq = ""
        end
      end
    end
    self << Sequence.new(id, sq) unless id.nil?
  end

  # Adds +seq+ to the alignment, replacing any sequence with the same id.
  #
  # The first sequence added fixes the number of columns. The length is
  # validated before storing, so a rejected sequence is never kept.
  # Returns the alignment itself.
  # Raises RuntimeError if +seq+ has a different number of columns.
  def <<(seq)
    @cols = seq.cols if cols.nil?
    raise "Aligned sequence #{seq.id} has a different length (#{seq.cols} vs #{self.cols})" unless seq.cols == cols
    @seqs[seq.id] = seq
    self
  end

  # Extracts the numeric GI from each sequence id, trying a list of known id
  # layouts in order and keeping the first match. Ids matching none of the
  # layouts are ignored. Returns an Array of Strings.
  def get_gis
    regexps = [/^gi\|(\d+)\|/, /^(\d+)\|/, /^(\d+)$/, /^gi\|(\d+)$/, /\|gi\|(\d+)\|/, /\|gi\|(\d+)$/]
    gis = []
    seqs.each_key do |id|
      regexps.each do |regexp|
        m = regexp.match(id)
        next if m.nil?
        gis << m[1]
        break
      end
    end
    gis
  end

  # Returns the sequence with the given id, or nil.
  def seq(id)
    @seqs[id]
  end

  # Number of sequences in the alignment.
  def size
    seqs.size
  end

  # Concatenation of every sequence's #to_seq_s, plus a trailing newline.
  def to_seq_s
    seqs.values.map { |s| s.to_seq_s }.join + "\n"
  end

  # Concatenation of every sequence's #to_s, plus a trailing newline.
  def to_s
    seqs.values.map { |s| s.to_s }.join + "\n"
  end
end
63
+
@@ -0,0 +1,39 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# A single search hit, expressed in alignment columns.
class BlastHit
  attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint

  # Builds a hit from one tab-delimited line.
  #
  # With +aln+, +ln+ is read as a tabular BLAST line: the subject (2nd
  # field) is looked up in +aln+ and its coordinates (9th and 10th fields)
  # are translated to alignment columns. If the subject is not in +aln+
  # every attribute is left unset.
  #
  # Without +aln+, +ln+ is read as a line of the table written by #to_s.
  def initialize(ln, aln=nil)
    fields = ln.chomp.split(/\t/)
    if aln.nil?
      load_table_fields(fields)
    else
      load_blast_fields(fields, aln)
    end
  end

  # Tab-delimited table line (subject, from, to, bit-score, 1/0 flag,
  # midpoint) ending in a newline, or an empty String if the hit has no
  # subject.
  def to_s
    return "" if sbj.nil?
    flag = istrue ? '1' : '0'
    [sbj, sfrom.to_s, sto.to_s, bits.to_s, flag, midpoint].join("\t") + "\n"
  end

  private

  # Populates the hit from the fields of a table line.
  def load_table_fields(fields)
    @sbj = fields[0]
    @sfrom = fields[1].to_i
    @sto = fields[2].to_i
    @bits = fields[3].to_f
    @istrue = (fields[4] == '1')
    @midpoint = fields[5].to_i
  end

  # Populates the hit from the fields of a tabular BLAST line.
  def load_blast_fields(fields, aln)
    subject = aln.seq(fields[1])
    return nil if subject.nil?
    @sbj = subject.id
    col_a = subject.pos2col(fields[8].to_i)
    col_b = subject.pos2col(fields[9].to_i)
    @sfrom, @sto = [col_a, col_b].min, [col_a, col_b].max
    @bits = fields[11].to_f
    # Queries whose name contains '@%' are flagged as true hits.
    @istrue = !/@%/.match(fields[0]).nil?
    center = ((fields[8].to_f + fields[9].to_f) / 2).ceil
    @midpoint = subject.pos2col(center)
  end
end
39
+
@@ -0,0 +1,39 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# Thin wrapper around an R process driven through a bidirectional pipe.
class RInterface
  # Command used to launch R.
  @@R_BIN = "R"

  # Sets the command used to launch R by instances created afterwards.
  def self.R_BIN=(rbin)
    @@R_BIN = rbin
  end

  # The IO object connected to the R process.
  attr_reader :handler

  # Launches R in slave mode, with stderr merged into stdout.
  def initialize
    @handler = IO.popen("#{@@R_BIN} --slave 2>&1", "w+")
  end

  # Sends +cmd+ to R and collects everything printed until an end marker.
  #
  # cmd::  R code to execute.
  # type:: :float or :int to parse the first value of an R vector printout
  #        ("[1] value"); anything else returns the raw output.
  #
  # Returns a Float (Float::INFINITY for 'Inf'), an Integer, or the output
  # String without its trailing newline.
  # Raises RuntimeError if the pipe reaches EOF before the marker, or if
  # the output cannot be parsed as the requested type.
  def run(cmd, type=nil)
    @handler.puts cmd
    # Marker printed by R once the command above has completed.
    @handler.puts "cat('---FIN---\n')"
    o = ""
    loop do
      l = @handler.gets
      raise "R failed on command:\n#{cmd}\n\nError:\n#{o}" if l.nil?
      break if l =~ /^---FIN---/
      o += l
    end
    o.chomp!
    case type
    when :float
      m = /^\s*\[1\]\s+([0-9\.Ee+-]+|Inf).*/.match(o)
      raise "R error: expecting float, got #{o}" if m.nil?
      m[1] == 'Inf' ? Float::INFINITY : m[1].to_f
    when :int
      m = /^\s*\[1\]\s+([0-9\.Ee+-]+).*/.match(o)
      raise "R error: expecting integer, got #{o}" if m.nil?
      m[1].to_i
    else
      o
    end
  end
end
39
+
@@ -0,0 +1,124 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/rinterface'
9
+ require 'rocker/rocwindow'
10
+ require 'rocker/alignment'
11
+ require 'tmpdir'
12
+
13
# Set of ROC windows over an alignment, backed by an R session.
#
# The R session holds the hit table in the data frame 'x' and, after
# #load_table!, the per-window statistics in the data frame 'w'.
class ROCData
  attr_reader :aln, :windows, :r

  # Use ROCData.new(table,aln,window) to re-compute from table, use ROCData.new(data) to load
  #
  # val::    Path to the hit table (with +aln+) or to a saved ROCker file.
  # aln::    Alignment the hits refer to; nil to load everything from +val+.
  # window:: Initial window size in columns (only used with +aln+).
  def initialize(val, aln=nil, window=nil)
    @r = RInterface.new
    @nucl = false
    if aln.nil?
      @windows = []
      # Window lines come first; the embedded alignment ('#:' lines) follows.
      File.open(val, "r") do |f|
        while ln = f.gets
          break unless /^#:/.match(ln).nil?
          @windows << ROCWindow.new(self, ln)
        end
      end
      @aln = Alignment.new
      @aln.read_rocker(val)
    else
      @aln = aln
      rrun "library('pROC');"
      rrun "x <- read.table('#{val}', sep='\\t', h=F);"
      init_windows! window
    end
  end

  # First window containing the alignment column +col+, or nil.
  def win_at_col(col)
    windows.find { |w| (w.from <= col) && (w.to >= col) }
  end

  def in_nucl?
    @nucl
  end

  def nucl=(nucl)
    @nucl = nucl
  end

  # Repeatedly reloads +table+ and splits low-accuracy windows until an
  # iteration refines nothing. Returns false as soon as #load_table! fails,
  # true otherwise.
  def refine! table
    loop do
      return false unless load_table! table
      break if _refine_iter(table) == 0
    end
    true
  end

  # One refinement pass: windows longer than 5 columns, not almost empty and
  # with accuracy below 95% are split at their midpoint, unless either half
  # would be almost empty. Returns the number of windows actually split.
  #
  # Note that both halves include the midpoint column.
  def _refine_iter table
    to_refine = []
    windows.each do |w|
      next if w.almost_empty || w.length <= 5
      rrun "acc <- w$accuracy[w$V1==#{w.from}];"
      # Windows without an accuracy estimate (NA) are treated as perfect.
      to_refine << w if rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
    end
    n = to_refine.size
    return 0 unless n > 0
    to_refine.each do |w|
      mid = (w.from + w.to) / 2
      w1 = ROCWindow.new(self, w.from, mid)
      w2 = ROCWindow.new(self, mid, w.to)
      if w1.almost_empty || w2.almost_empty
        n -= 1
      else
        @windows << w1
        @windows << w2
        @windows.delete w
      end
    end
    @windows.sort! { |x, y| x.from <=> y.from }
    n
  end

  # Loads the hit table into R ('x') and evaluates the current windows
  # against it ('w'): true/false positives/negatives per window plus
  # sensitivity, specificity, accuracy and precision (as percentages).
  #
  # table::     Path to the tab-delimited hit table.
  # sbj::       If not empty, only hits to these subjects are kept.
  # min_score:: If greater than zero, only hits with at least this
  #             bit-score are kept.
  #
  # Returns false (after a warning) if no window has a threshold, true
  # otherwise.
  def load_table! table, sbj=[], min_score=0
    rrun "x <- read.table('#{table}', sep='\\t', h=F);"
    rrun "x <- x[x$V1 %in% c('#{sbj.join("','")}'),];" if sbj.size > 0
    rrun "x <- x[x$V4 >= #{min_score.to_s},];" if min_score > 0
    # The windows are handed to R through a temporary copy of the ROCker file.
    Dir.mktmpdir do |dir|
      save(dir + "/rocker")
      rrun "w <- read.table('#{dir}/rocker', sep='\\t', h=F);"
    end
    rrun "w <- w[!is.na(w$V5),];"
    if rrun("nrow(w)", :int) == 0
      warn "\nWARNING: Insufficient windows with estimated thresholds.\n\n"
      return false
    end
    # Classify each hit by the window containing its midpoint (V6).
    rrun <<-EOC
      w$tp<-0; w$fp<-0; w$tn<-0; w$fn<-0;
      for(i in 1:nrow(x)){
        m <- x$V6[i];
        win <- which( (m>=w$V1) & (m<=w$V2))[1];
        if(!is.na(win)){
          if(x$V4[i] >= w$V5[win]){
            if(x$V5[i]==1){ w$tp[win] <- w$tp[win]+1 }else{ w$fp[win] <- w$fp[win]+1 };
          }else{
            if(x$V5[i]==1){ w$fn[win] <- w$fn[win]+1 }else{ w$tn[win] <- w$tn[win]+1 };
          }
        }
      }
    EOC
    rrun <<-EOC
      w$p <- w$tp + w$fp;
      w$n <- w$tn + w$fn;
      w$sensitivity <- 100*w$tp/(w$tp+w$fn);
      w$specificity <- 100*w$tn/(w$fp+w$tn);
      w$accuracy <- 100*(w$tp+w$tn)/(w$p+w$n);
      w$precision <- 100*w$tp/(w$tp+w$fp);
    EOC

    true
  end

  # Replaces the windows with consecutive windows of +size+ columns covering
  # the alignment.
  def init_windows!(size)
    @windows = []
    1.step(aln.cols, size) { |a| @windows << ROCWindow.new(self, a, a + size - 1) }
  end

  # Runs +cmd+ in the R session (see RInterface#run).
  def rrun(cmd, type=nil)
    r.run cmd, type
  end

  # Writes #to_s to +file+.
  def save(file)
    File.open(file, "w") { |f| f.print to_s }
  end

  # The windows, one per line, followed by the alignment.
  def to_s
    windows.map { |w| w.to_s }.join + aln.to_s
  end
end
124
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ class ROCWindow
9
+ attr_reader :data, :from, :to, :hits, :tps, :thr
10
+ def initialize(data, from=nil, to=nil)
11
+ @data = data
12
+ if from.is_a? String
13
+ r = from.split(/\t/)
14
+ @from = r[0].to_i
15
+ @to = r[1].to_i
16
+ @hits = r[2].to_i
17
+ @tps = r[3].to_i
18
+ @thr = r[4].to_f
19
+ else
20
+ a = from.nil? ? 1 : [from,1].max
21
+ b = to.nil? ? data.aln.cols : [to,data.aln.cols].min
22
+ @from = [a,b].min
23
+ @to = [a,b].max
24
+ @thr = nil
25
+ self.compute!
26
+ end
27
+ end
28
+ def compute!
29
+ self.load_hits
30
+ @hits = self.rrun "nrow(y);", :int
31
+ @tps = self.rrun "sum(y$V5);", :int
32
+ unless self.almost_empty
33
+ self.rrun "rocobj <- roc(y$V5, y$V4);"
34
+ thr = self.rrun 'coords(rocobj, "best", ret="threshold", best.method="youden", best.weights=c(0.5, sum(y$V5)/nrow(y)))[1];', :float
35
+ @thr = thr.to_f
36
+ @thr = nil if @thr==0.0 or @thr.infinite?
37
+ end
38
+ end
39
+ def around_thr
40
+ a = self.previous
41
+ b = self.next
42
+ while not a.nil? and a.thr.nil?
43
+ a = a.previous
44
+ end
45
+ while not b.nil? and b.thr.nil?
46
+ b = b.next
47
+ end
48
+ return nil if a.nil? and b.nil?
49
+ return a.thr if b.nil?
50
+ return b.thr if a.nil?
51
+ return (b.thr*(self.from-a.from) - a.thr*(self.from-b.from))/(b.from-a.from)
52
+ end
53
+ def load_hits() self.rrun "y <- x[x$V6>=#{self.from} & x$V6<=#{self.to},];" end
54
+ def previous() (self.from == 1) ? nil : self.data.win_at_col(self.from - 1) end
55
+ def next() (self.to == self.data.aln.cols) ? nil : self.data.win_at_col(self.to + 1) end
56
+ def thr_notnil() (@thr.nil? or @thr.infinite?) ? self.around_thr : @thr end
57
+ def fps() self.hits - self.tps end
58
+ def almost_empty() self.fps < 3 or self.tps < 3 end
59
+ def length() self.to - self.from + 1 end
60
+ def rrun(cmd, type=nil) self.data.rrun cmd, type end
61
+ def to_s() [self.from, self.to, self.hits, self.tps, self.thr_notnil].join("\t") + "\n" end
62
+ end
63
+
@@ -0,0 +1,38 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# One sequence of an alignment, kept both aligned and ungapped.
class Sequence
  # id::  Sequence identifier.
  # seq:: Ungapped sequence (letters only, upper case).
  # aln:: Aligned sequence (letters and '-', upper case).
  attr_reader :id, :seq, :aln

  # id::  Sequence identifier.
  # aln:: Aligned sequence; '.' is read as a gap and any character other
  #       than letters and gaps (e.g. line breaks) is dropped.
  def initialize(id, aln)
    @id = id
    @aln = aln.tr('.', '-').gsub(/[^A-Za-z-]/, '').upcase
    @seq = aln.gsub(/[^A-Za-z]/, '').upcase
  end

  # Alignment column (1-based) of the +pos+-th residue of the sequence.
  # Returns the number of columns if the sequence has fewer residues.
  def pos2col(pos)
    remaining = pos
    aln.each_char.with_index(1) do |chr, column|
      remaining -= 1 unless chr == '-'
      return column if remaining == 0
    end
    aln.length
  end

  # Translates the alignment column +col+ (1-based) into a sequence
  # position: one plus the number of residues up to and including +col+.
  #
  # NOTE(review): for a column holding a residue this is that residue's
  # position plus one (column 1 of 'A' gives 2), which is not the inverse
  # of #pos2col -- confirm the offset is intended.
  def col2pos(col)
    position = 1
    left = col
    aln.each_char do |chr|
      left -= 1
      position += 1 unless chr == '-'
      return position if left == 0
    end
    position
  end

  # Number of alignment columns.
  def cols
    aln.length
  end

  # Number of residues.
  def length
    seq.length
  end

  # FastA entry with the ungapped sequence.
  def to_seq_s
    ">#{id}\n#{seq}\n"
  end

  # Aligned entry in ROCker format (FastA lines prefixed with '#:').
  def to_s
    "#:>#{id}\n#:#{aln}\n"
  end
end
38
+
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-rocker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Luis (Coto) Orellana
8
+ - Luis M. Rodriguez-R
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-01-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Detecting and quantifying functional genes in short-read metagenomic
15
+ datasets
16
+ email: lhorellana@gatech.edu
17
+ executables:
18
+ - ROCker
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/rocker.rb
23
+ - lib/rocker/sequence.rb
24
+ - lib/rocker/alignment.rb
25
+ - lib/rocker/blasthit.rb
26
+ - lib/rocker/rocwindow.rb
27
+ - lib/rocker/rocdata.rb
28
+ - lib/rocker/rinterface.rb
29
+ - bin/ROCker
30
+ homepage: http://enve-omics.ce.gatech.edu/rocker
31
+ licenses:
32
+ - artistic 2.0
33
+ metadata: {}
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 2.0.14
51
+ signing_key:
52
+ specification_version: 4
53
+ summary: ROCker
54
+ test_files: []