bio-rocker 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 51e493216ea815f9919322e9b1319f9914d824c7
4
+ data.tar.gz: 8c7bf556507ef0b11c5f2477fad589a62c016f11
5
+ SHA512:
6
+ metadata.gz: b57faf770219b8ab364b25f59696852f219a204f22b12be5fb89d7932e0cb10429316e0b7645bf7e7a18c56590a669f605f444e667cb4a9467626db100df0b34
7
+ data.tar.gz: 47223060bd29a689406067e9bff41352ffa841c8475a25943e83dc6cfc2901ae7ad8f3b8eea1a44c5cfbe0b6fccac45dd1cf493eb0000ed19877f451bf0463e8
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @author Luis (Coto) Orellana
5
+ # @license artistic license 2.0
6
+ # @update Jan-22-2015
7
+ #
8
+
9
+ require 'rocker'
10
+ require 'optparse'
11
+
12
+
13
+ #================================[ Options parsing ]
14
# Supported tasks, each mapped to the one-line description shown in the help.
$t = {
  'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
  'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
  'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
  'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
  'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
}
# The first command-line argument names the task. It is removed from ARGV so
# only options remain for the parser; a missing task becomes the empty string.
task = (ARGV.shift || '').downcase
# With nothing left to parse, ask for the help screen.
ARGV << '-h' if ARGV.empty?
23
+
24
# Options collected from the command line; later handed to ROCker.new.
o = {}
# Builds a parser whose switches depend on the selected task. When the task is
# not one of the keys of $t, the list of tasks is printed and the script exits.
opts = OptionParser.new do |opt|
  if $t.keys.include? task
    opt.banner = "Usage: ROCker.rb #{task} [options]"
    opt.separator ""
    opt.separator $t[task]
    opt.separator ""
  end
  case task
  when 'build'
    unless ROCker.has_build_gems?
      opt.separator "+ UNSATISFIED REQUIREMENTS"
      opt.separator " The building task requires uninstalled gems, please install them executing:"
      opt.separator " gem install rest_client"
      opt.separator " gem install nokogiri"
      opt.separator ""
    end
    opt.separator "+ BUILDING ARGUMENTS"
    opt.on("-p", "--positive GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
    opt.on("-n", "--negative GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
    opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
    opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
    opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
    opt.separator ""
    opt.separator "+ ADVANCED BUILDING ARGUMENTS"
    opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one GI per line. If used, -p is not required."){ |v| o[:posfile]=v }
    opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one GI per line."){ |v| o[:negfile]=v }
    opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain GI numbers. If used, -p is not required."){ |v| o[:aln]=v }
    opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
    opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
    opt.on( "--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
    opt.on( "--per-genus", "If selected, only one genome per genus is used to build the metagenome."){ o[:pergenus]=true }
    opt.on( "--per-species", "If selected, only one genome per species is used to build the metagenome."){ o[:perspecies]=true }
    opt.on( "--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
    opt.on( "--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
    opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
    opt.on( "--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
    opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
    opt.separator ""
    opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
    opt.on("-G", "--grinder PATH", "Path to the grinder executable. By default: '#{ROCker.default :grinder}' (in the $PATH)."){ |v| o[:grinder]=v }
    opt.on("-M", "--muscle PATH", "Path to the muscle executable. By default: '#{ROCker.default :muscle}' (in the $PATH)."){ |v| o[:muscle]=v }
    opt.on("-B", "--blastbins PATH", "Path to the Blast+ executables. By default: '#{ROCker.default :blastbins}' (in the $PATH)."){ |v| o[:blastbins]=v }
    opt.on( "--grinder-cmd STR", "Command calling grinder, where %1$s: grinder bin, %2$s: input, %3$s: seq. depth, %4$s: output.",
      "By default: '#{ROCker.default :grindercmd}'."){ |v| o[:grindercmd]=v }
    opt.on("--muscle-cmd STR", "Command calling muscle, where %1$s: muscle bin, %2$s: input, %3$s: output.",
      "By default: '#{ROCker.default :musclecmd}'."){ |v| o[:musclecmd]=v }
    opt.on("--blast-cmd STR", "Command calling BLAST search, where %1$s: blast bins, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
      "By default: '#{ROCker.default :blastcmd}'."){ |v| o[:blastcmd]=v }
    opt.on("--makedb-cmd STR", "Command calling BLAST format, where %1$s: blast bins, %2$s: dbtype, %3$s: input, %4$s: database.",
      "By default: '#{ROCker.default :makedbcmd}'."){ |v| o[:makedbcmd]=v }
  when 'compile'
    opt.separator "+ COMPILATION ARGUMENTS"
    opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
    opt.on("-b", "--ref-blast PATH",
      "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
    opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
    opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
    opt.separator ""
    opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
    opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
    opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
    opt.on( "--norefine", "Do not refine windows."){ o[:refine]=false }
    opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
    opt.separator ""
    opt.separator "+ INPUT/OUTPUT"
    opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
    opt.separator " coincide with those from the BLAST (-b)."
    opt.separator " o The input BLAST (-b) MUST be in tabular format. True positives must"
    opt.separator " contain the string '@%' somewhere in the query ID."
    opt.separator " o The table file (-t) should be tab-delimited and contain six columns:"
    opt.separator " 1. Subject ID."
    opt.separator " 2. Start of alignment in subject (translated to alignment column)."
    opt.separator " 3. End of alignment in subject (translated to alignment column)."
    opt.separator " 4. Bit score."
    opt.separator " 5. A number indicating if it was a true (1) or a false (0) positive."
    opt.separator " 6. Mid-point of the alignment in the reference sequence."
    opt.separator " o The ROCker file (-k) is a tab-delimited file containing five columns:"
    opt.separator " 1. First column of the window in the alignment."
    opt.separator " 2. Last column of the window in the alignment."
    opt.separator " 3. Number of positives in the window (hits)."
    opt.separator " 4. Number of true positives in the window."
    opt.separator " 5. Bit score threshold set for the window."
    opt.separator " The file also contains the alignment (commented with #:)."
    opt.separator ""
  when 'filter'
    opt.separator "+ FILTERING ARGUMENTS"
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
    opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
  when 'search'
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
    opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
  when 'plot'
    opt.separator "+ PLOTTING ARGUMENTS"
    opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
    opt.on("-b", "--ref-blast PATH",
      "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
    opt.on("-o", "--plot-file PATH", "File to be created with the plot. By default: value of -k + '.' + value of -f."){ |v| o[:gout]=v }
    opt.separator ""
    opt.separator "+ ADVANCED PLOTTING ARGUMENTS"
    opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
    opt.on( "--color", "Color alignment by amino acid."){ o[:color]=true }
    opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
    opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
      "Plot only information regarding this(ese) subject(s). If multiple, separate by comma. By default, all hits are plotted."){ |v| o[:sbj]=v }
    opt.on("-f", "--plot-format STRING",
      "Format of the plot file. Supported values: pdf (default), png, jpeg, and tiff."){ |v| o[:gformat]=v }
    opt.on("-W", "--width NUMBER", "Width of the plot in inches. By default: #{ROCker.default :width}."){ |v| o[:width]=v.to_f }
    # The height must be stored under :height; writing it to :width silently
    # overrode -W and left the height at its default.
    opt.on("-H", "--height NUMBER", "Height of the plot in inches. By default: #{ROCker.default :height}."){ |v| o[:height]=v.to_f }
  else
    opt.banner = "Usage: ROCker.rb [task] [options]"
    opt.separator ""
    opt.separator "Please specify one of the following tasks:"
    $t.keys.each{ |t| opt.separator " #{t}:\t#{$t[t]}" }
  end
  opt.separator ""
  opt.separator "+ GENERAL ARGUMENTS"
  opt.on("-R", "--path-to-r PATH", "Path to the R executable to be used. By default: '#{ROCker.default :r}'."){ |v| o[:r]=v }
  opt.on("-q", "--quiet", "Run quietly."){ |v| o[:q]=true }
  opt.on("-d", "--debug", "Display debugging information."){ |v| o[:debug]=true }
  opt.on("-h", "--help","Display this screen") do
    puts opt
    exit
  end
  opt.separator ""
  # Unknown (or missing) task: show the task list and stop before parsing.
  unless $t.include? task
    puts opt
    exit
  end
end
opts.parse!
157
+
158
+
159
+ #================================[ Main ]
160
# Runs the selected task. Any StandardError is reported on stderr together
# with its backtrace, and the process exits with a non-zero status so that
# shells and pipelines can detect the failure.
rocker = ROCker.new(o)
begin
  case task
  when 'build'
    rocker.build!
  when 'compile'
    rocker.compile!
  when 'filter'
    rocker.filter!
  when 'search'
    rocker.search!
  when 'plot'
    rocker.plot!
  end
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  exit 1
end
179
+
180
+
@@ -0,0 +1,503 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/blasthit'
9
+ require 'rocker/rocdata'
10
+
11
+ class ROCker
12
+ #================================[ Class ]
13
# Base URL of the NCBI E-utilities web service (served over HTTPS).
@@EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
# Default value of every option, grouped by the task that reads it.
@@DEFAULTS = {
  # General
  :q=>false, :r=>'R', :nucl=>false, :debug=>false,
  # Build
  :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0, :pergenus=>false, :perspecies=>false,
  # ext. software
  :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
  :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "poly4 3e-3 3.3e-8" -mr "95 5" -rd "100 uniform 5"',
  :musclecmd=>'%1$s -in "%2$s" -out "%3$s" -quiet',
  :blastcmd=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
  :makedbcmd=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
  # Compile
  :refine=>true, :win=>20, :minscore=>0,
  # Filter
  :sbj=>[],
  # Plot
  :color=>false, :gformat=>'pdf', :width=>9, :height=>9
}
# Memoized result of has_build_gems?; nil until the first check.
@@HAS_BUILD_GEMS = nil
# Returns the E-utilities base URL.
def self.eutils() @@EUTILS end
# Returns the hash holding all default option values.
def self.defaults() @@DEFAULTS end
# Returns the default value of option +k+ (nil when +k+ is unknown).
def self.default(k) @@DEFAULTS[k] end
36
# Reports whether the gems needed by the build task can be loaded.
#
# The requires are attempted once; the outcome is cached in @@HAS_BUILD_GEMS
# and returned on later calls. Uses the true/false literals because the
# TRUE/FALSE constants no longer exist in current Ruby versions.
#
# @return [Boolean] true when rubygems, restclient and nokogiri all load.
def self.has_build_gems?
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
  @@HAS_BUILD_GEMS =
    begin
      require 'rubygems'
      require 'restclient'
      require 'nokogiri'
      true
    rescue LoadError
      false
    end
end
48
+
49
+ #================================[ Instance ]
50
+ attr_reader :o
51
# Creates a ROCker instance from the parsed command-line options.
#
# The options start from a copy of the class defaults so that values set for
# this instance never leak back into ROCker.defaults (and from there into
# other instances or the help text).
#
# @param opts [Hash] option values keyed by symbol; they override defaults.
def initialize(opts)
  @o = ROCker.defaults.merge(opts)
  RInterface.R_BIN = opts[:r] unless opts[:r].nil?
end
56
+
57
+ #================================[ Build ]
58
# Runs the build task: downloads the reference genes and the genomes linked to
# them, locates the positive genes in those genomes, simulates a metagenome
# with grinder (tagging reads that overlap positive genes with '@%'), aligns
# the references with muscle, searches the reads against the references with
# BLAST and finally removes intermediate files.
#
# Reads its configuration from @o; all outputs are written with the prefix
# @o[:baseout].
#
# @raise [RuntimeError] when required gems, options, external programs or
#   genomes are missing.
def build!
  # Check requirements
  puts "Testing environment." unless @o[:q]
  @o[:noblast]=true if @o[:nomg]
  raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
  @o[:positive] += @o[:posori] unless @o[:posori].nil?
  @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
  @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
  unless @o[:aln].nil?
    aln = Alignment.new
    aln.read_fasta @o[:aln]
    @o[:positive] += aln.get_gis
  end
  raise "-p or -P are mandatory." if @o[:positive].size==0
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
  if @o[:positive].size == 1 and not @o[:noaln]
    warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
    @o[:noaln] = true
  end
  self.bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
  self.bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
  self.bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]

  # Download genes
  puts "Downloading gene data." unless @o[:q]
  File.open(@o[:baseout] + '.ref.fasta', 'w') do |f|
    if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
      puts " * re-using aligned sequences as positive set." unless @o[:q]
      f.print aln.to_seq_s
      @o[:noaln] = true
    else
      puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
      $stderr.puts " # #{@o[:positive]}" if @o[:debug]
      ids = Array.new(@o[:positive])
      # Requests are sent in batches of 200 identifiers.
      while ids.size>0
        f.print efetch({:db=>(@o[:nucl] ? 'nuccore' : 'protein'), :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
      end
    end
  end
  genome_gis = {:positive=>[], :negative=>[]}
  [:positive, :negative].each do |set|
    unless @o[set].size==0
      puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
      $stderr.puts " # #{@o[set]}" if @o[:debug]
      genome_gis[set] = genes2genomes(@o[set], @o[:nucl])
    end
  end
  raise "No genomes associated with the positive set." if genome_gis[:positive].size==0
  genome_gis[:positive] = genome_gis[:positive].sample( (genome_gis[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
  raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_gis[:positive].empty?

  # Locate genes
  puts "Analyzing genome data." unless @o[:q]
  puts " * downloading and parsing #{genome_gis[:positive].size} XML file(s)." unless @o[:q]
  $stderr.puts " # #{genome_gis[:positive]}" if @o[:debug]
  positive_coords = {}
  genome_org = {}
  i = 0
  genome_gis[:positive].each do |gi|
    # The counter also names the per-genome XML file, so it must advance even
    # when progress output is silenced.
    i += 1
    print " * scanning #{i.ordinalize} genome out of #{genome_gis[:positive].size}. \r" unless @o[:q]
    $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
    genome_file = @o[:baseout] + '.src.' + i.to_s + '.xml'
    if @o[:reuse] and File.exist? genome_file
      puts " * reusing existing file: #{genome_file}." unless @o[:q]
      doc = File.open(genome_file, 'r') { |ifh| Nokogiri::XML( ifh ) }
    else
      # The XML is only saved to disk when intermediate files are kept.
      genome_file=nil unless @o[:noclean]
      res = efetch({:db=>'nuccore', :id=>gi, :rettype=>'xml', :retmode=>'text'}, genome_file)
      doc = Nokogiri::XML( res )
    end
    incomplete = true
    doc.xpath('//Bioseq-set/Bioseq-set_seq-set/Seq-entry').each do |genome|
      genome_gi = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_seq-set/Seq-entry/Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_gi')
      if !genome_gi.nil? and gi==genome_gi.content
        incomplete = false
        positive_coords[gi] ||= []
        $stderr.puts "\n # got #{gi}, scanning" if @o[:debug]
        if @o[:pergenus] or @o[:perspecies]
          name = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/OrgName_name_binomial/BinomialOrgName')
          unless name.nil?
            name_g = name.at_xpath('./BinomialOrgName_genus')
            name_s = name.at_xpath('./BinomialOrgName_species')
            if name_g.nil? or (name_s.nil? and @o[:perspecies])
              name = nil
            else
              name = @o[:perspecies] ? name_g.content + " " + name_s.content : name_g.content
            end
          end
          if name.nil?
            warn "WARNING: Cannot find binomial name of #{gi}, using genome regardless of taxonomy."
            # A random key keeps this genome from colliding with any taxon.
            name = rand(36**100).to_s(36)
          end
          # Only the first genome seen for each genus/species is kept.
          break unless genome_org[ name ].nil?
          genome_org[ name ] = gi
        end
        $stderr.puts " # traversing #{gi}" if @o[:debug]
        genome.xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_annot/Seq-annot/Seq-annot_data/Seq-annot_data_ftable/Seq-feat').each do |pr|
          pr_gi = pr.at_xpath('./Seq-feat_product/Seq-loc/Seq-loc_whole/Seq-id/Seq-id_gi')
          next if pr_gi.nil?
          if @o[:positive].include? pr_gi.content
            $stderr.puts " # found #{pr_gi.content}" if @o[:debug]
            pr_loc = pr.at_xpath('./Seq-feat_location/Seq-loc/Seq-loc_int/Seq-interval')
            if pr_loc.nil?
              # No single interval: look for the intervals of a mixed location.
              pr_loc = pr.xpath('./Seq-feat_location/Seq-loc/Seq-loc_mix//Seq-loc/Seq-loc_int/Seq-interval')
              # xpath returns a (possibly empty) node set, never nil.
              if pr_loc.empty?
                warn "WARNING: Impossible to find location of '#{pr_gi.content}' in '#{gi}'."
                incomplete = true
              else
                pr_loc.each do |loc_int|
                  positive_coords[gi] << {
                    :gi => pr_gi.content,
                    :from => loc_int.at_xpath('./Seq-interval_from').content.to_i,
                    :to => loc_int.at_xpath('./Seq-interval_to').content.to_i
                    #, :strand => loc_int.at_xpath('./Seq-interval_strand/Na-strand/@value').content
                  }
                end
              end
            else
              positive_coords[gi] << {
                :gi => pr_gi.content,
                :from => pr_loc.at_xpath('./Seq-interval_from').content.to_i,
                :to => pr_loc.at_xpath('./Seq-interval_to').content.to_i
                #, :strand => pr_loc.at_xpath('./Seq-interval_strand/Na-strand/@value').content
              }
            end
          end
        end
        break
      end
    end
    doc = nil
    warn "WARNING: Cannot find GI '#{gi}'." if incomplete
  end
  genome_gis[:positive] = genome_org.values if @o[:pergenus] or @o[:perspecies]
  all_gis = genome_gis.values.reduce(:+).uniq
  print "\n" unless @o[:q]
  # flatten copes with an empty coordinate table (reduce(:+) would yield nil).
  located = positive_coords.values.flatten.map{ |b| b[:gi] }
  missing = @o[:positive] - located
  warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or @o[:pergenus] or @o[:perspecies]

  # Download genomes
  genomes_file = @o[:baseout] + '.src.fasta'
  if @o[:reuse] and File.exist? genomes_file
    puts " * reusing existing file: #{genomes_file}." unless @o[:q]
  else
    puts " * downloading #{all_gis.size} genome(s) in FastA." unless @o[:q]
    $stderr.puts " # #{all_gis}" if @o[:debug]
    ids = Array.new(all_gis)
    File.open(genomes_file, 'w') do |ofh|
      while ids.size>0
        ofh.print efetch({:db=>'nuccore', :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
      end
    end
  end

  # Generate metagenome
  unless @o[:nomg]
    puts "Generating in silico metagenome" unless @o[:q]
    if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
      puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
    else
      all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
      raise "No source genomes found in #{@o[:baseout]}.src.fasta." if all_src == 0
      # Split the genomes into contiguous, non-overlapping slices. Float
      # division makes the ceiling effective, and the thread count is
      # recomputed so that no thread receives an empty slice.
      max_thrs = [[@o[:thr], all_src].min, 1].max
      seqs_per_thr = (all_src.to_f/max_thrs).ceil
      thrs = (all_src.to_f/seqs_per_thr).ceil
      puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
      $stderr.puts " # #{positive_coords}" if @o[:debug]
      thr_obj = (0 ... thrs).map do |thr_i|
        first_seq = thr_i*seqs_per_thr + 1
        last_seq = [first_seq + seqs_per_thr - 1, all_src].min
        Thread.new(thr_i, first_seq, last_seq) do |slice, from_seq, to_seq|
          Thread.current[:output] = build_metagenome_slice(slice, from_seq, to_seq, positive_coords)
        end
      end
      # Concatenate results
      File.open(@o[:baseout] + ".mg.fasta", 'w') do |ofh|
        thr_obj.each do |t|
          t.join
          raise "Thread failed without error trace: #{t}" if t[:output].nil?
          File.open(t[:output], 'r') do |ifh|
            while l = ifh.gets
              ofh.print l
            end
          end
          File.unlink t[:output]
        end
      end
    end
  end # unless @o[:nomg]

  # Align references
  unless @o[:noaln]
    puts "Aligning reference set." unless @o[:q]
    if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
      puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
    else
      bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
      puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
    end
  end

  # Run BLAST
  unless @o[:noblast]
    puts "Running homology search." unless @o[:q]
    if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
      puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
    else
      puts " * preparing database." unless @o[:q]
      bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
      puts " * running BLAST." unless @o[:q]
      bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
    end
  end

  # Clean
  unless @o[:noclean]
    puts "Cleaning." unless @o[:q]
    sff = %w{.src.xml .src.fasta}
    unless @o[:nomg]
      sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt}
      # Per-thread intermediates written while simulating the metagenome.
      (0 ... [@o[:thr], 1].max).each do |n|
        sff += [".src.fasta.#{n}", ".mg.tmp.#{n}-reads.fa", ".mg.tmp.#{n}-ranks.txt"]
      end
    end
    # Protein and nucleotide BLAST database files.
    sff += %w{.ref.phr .ref.pin .ref.psq .ref.nhr .ref.nin .ref.nsq} unless @o[:noblast]
    sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
  end
end # build!

# Simulates reads for one slice of the source genomes and tags the positives.
#
# Copies genomes number first_seq..last_seq (1-based order in
# <baseout>.src.fasta) into a sub-FastA, runs grinder on it and rewrites the
# read deflines, appending '@%' to the ID of every read whose overlap with a
# positive gene reaches @o[:minovl].
#
# @param thr_i [Integer] slice index, used to name the intermediate files.
# @param first_seq [Integer] rank of the first genome of the slice.
# @param last_seq [Integer] rank of the last genome of the slice.
# @param positive_coords [Hash] genome GI => array of {:gi, :from, :to}.
# @return [String] path of the tagged FastA produced for this slice.
def build_metagenome_slice(thr_i, first_seq, last_seq, positive_coords)
  src_fasta = "#{@o[:baseout]}.src.fasta"
  sub_fasta = "#{src_fasta}.#{thr_i.to_s}"
  # Create sub-fasta
  File.open(sub_fasta, 'w') do |ofh|
    File.open(src_fasta, 'r') do |ifh|
      seq_i = 0
      while l = ifh.gets
        seq_i += 1 if l =~ /^>/
        break if seq_i > last_seq
        ofh.print l if seq_i >= first_seq
      end
    end
  end
  bash sprintf(@o[:grindercmd], @o[:grinder], sub_fasta, @o[:seqdepth], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
  # Tag positives
  puts " * tagging positive reads." unless @o[:q]
  tagged = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
  File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r') do |ifh|
    File.open(tagged, 'w') do |ofh|
      while l = ifh.gets
        rd = /^>(?<id>\d+) reference=gi\|(?<gi>\d+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
        unless rd.nil?
          rd_from = rd[:from].to_i
          rd_to = rd[:to].to_i
          positive = (positive_coords[rd[:gi]] || []).any? do |gn|
            left = rd_to - gn[:from]
            right = gn[:to] - rd_from
            # Float division: with integers the ratio truncates to zero for
            # every partial overlap and the threshold becomes meaningless.
            (left*right >= 0) && ([left, right].min.to_f/(rd_to-rd_from) >= @o[:minovl])
          end
          l = ">#{rd[:id]}#{positive ? "@%" : ""} ref=#{rd[:gi]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
        end
        ofh.print l
      end
    end
  end
  tagged
end
private :build_metagenome_slice
314
+
315
+ #================================[ Compile ]
316
# Runs the compile task: validates the options, checks that R and its pROC
# library are usable, converts the BLAST into a table (unless the table file
# already exists), computes and optionally refines the windows, and saves the
# ROCker file.
#
# @raise [RuntimeError] when a mandatory option is missing or R is unusable.
def compile!
  say = lambda { |msg| puts msg unless @o[:q] }

  raise "-a/--alignment is mandatory." if @o[:aln].nil?
  raise "-a/--alignment must exist." unless File.exist? @o[:aln]
  if @o[:table].nil?
    raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
    # Without -t the table is written next to the BLAST file.
    @o[:table] = "#{@o[:blast]}.table"
  end
  raise "-b/--blast is mandatory unless -t exists." unless !@o[:blast].nil? || File.exist?(@o[:table])
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?

  say.call "Testing environment."
  bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
  bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

  say.call "Reading files."
  say.call " * loading alignment: #{@o[:aln]}."
  aln = Alignment.new
  aln.read_fasta @o[:aln]

  if File.exist? @o[:table]
    say.call " * reusing existing file: #{@o[:table]}."
  else
    say.call " * generating table: #{@o[:table]}."
    blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
  end

  say.call "Analyzing data."
  say.call " * computing windows."
  data = ROCData.new(@o[:table], aln, @o[:win])
  data.nucl = @o[:nucl]
  if @o[:refine]
    say.call " * refining windows."
    refined = data.refine! @o[:table]
    warn "Insufficient hits to refine results." unless refined
  end
  say.call " * saving ROCker file: #{@o[:rocker]}."
  data.save @o[:rocker]
end # compile!
353
+
354
+ #================================[ Filter ]
355
# Runs the filter task: copies to @o[:oblast] every line of the tabular BLAST
# @o[:qblast] whose hit maps onto the alignment (sfrom is not nil) and whose
# bit score reaches the threshold of the window at the hit's mid-point.
#
# Both files are opened in block form so they are closed even when a line
# fails to parse.
#
# @raise [RuntimeError] when -k, -x or -o is missing.
def filter!
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?
  raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
  raise "-o/--out-blast is mandatory." if @o[:oblast].nil?

  puts "Reading ROCker file." unless @o[:q]
  data = ROCData.new @o[:rocker]

  puts "Filtering BLAST." unless @o[:q]
  File.open(@o[:qblast], 'r') do |ih|
    File.open(@o[:oblast], 'w') do |oh|
      while ln = ih.gets
        bh = BlastHit.new(ln, data.aln)
        oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
      end
    end
  end
end # filter!
373
+ #================================[ Search ]
374
# Placeholder for the search task. It always raises: first about a missing
# ROCker file, otherwise to signal that the task is not implemented yet.
#
# @raise [RuntimeError] always.
def search!
  message = @o[:rocker].nil? ? "-k/--rocker is mandatory." : "Code Under development..."
  raise message
  # ToDo
  # [ ... ]
end # search!
380
+
381
+ #================================[ Plot ]
382
# Runs the plot task: loads the ROCker file and the hits table (generating the
# table from the BLAST when it does not exist) and drives R to draw three
# stacked panels -- hits with window thresholds, the alignment, and the
# per-window statistics -- into @o[:gout].
#
# @raise [RuntimeError] when a mandatory option is missing or R is unusable.
def plot!
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?
  if @o[:table].nil?
    raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
    # Without -t the table is written next to the BLAST file.
    @o[:table] = "#{@o[:blast]}.table"
  end
  raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]

  puts "Testing environment." unless @o[:q]
  bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"

  puts "Reading files." unless @o[:q]
  puts " * loading ROCker file: #{@o[:rocker]}." unless @o[:q]
  data = ROCData.new @o[:rocker]
  if File.exist? @o[:table]
    puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
  else
    puts " * generating table: #{@o[:table]}." unless @o[:q]
    blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
  end

  puts "Plotting hits." unless @o[:q]
  # Every device other than pdf gets explicit units and resolution.
  extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
  @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
  data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
  data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
  some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
  data.rrun "par(mar=c(0,4,0,0.5)+.1);"
  data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
  # One jitter value per hit (row of x); ncol(x) would give one per column,
  # recycled across the hits.
  data.rrun "noise <- runif(nrow(x),-.2,.2)"
  data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,.2), rgb(.5,0,0,.2)), length=0);"
  data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,.5), rgb(.5,0,0,.5)), pch=19, cex=1/4);"

  puts "Plotting windows." unless @o[:q]
  if some_thr
    data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
    data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
  end
  data.rrun "legend('bottomright',legend=c('Hit span','Hit mid-point','Reference','Non-reference')," +
    "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

  puts "Plotting alignment." unless @o[:q]
  data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
  data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
  i = 0
  data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
  data.aln.seqs.values.each do |s|
    # Gaps are grey, selected subjects red, otherwise per-residue color (when
    # --color is set) or black.
    color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
    data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
    i += 1
  end

  puts "Plotting statistics." unless @o[:q]
  data.rrun "par(mar=c(5,4,0,0.5)+.1);"
  unless @o[:q] or not some_thr
    puts " * sensitivity: #{data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float}%"
    puts " * specificity: #{data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float}%"
    puts " * accuracy: #{data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float}%"
  end
  data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(50,100),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
  if some_thr
    data.rrun "pos <- (w$V1+w$V2)/2"
    data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
    data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
    data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
    #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
  end
  data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
  data.rrun "dev.off();"
end # plot!
452
+
453
+ #================================[ Utilities ]
454
# Converts a tabular BLAST file into the table format used by ROCker.
#
# Each input line is parsed as a BlastHit against +aln+; hits whose bit score
# is at least +minscore+ are written with BlastHit#to_s. Files are opened in
# block form so both handles are closed even if parsing a line raises.
#
# @param blast_f [String] path of the tabular BLAST to read.
# @param table_f [String] path of the table to create.
# @param aln [Alignment] alignment passed to every BlastHit.
# @param minscore [Numeric] minimum bit score of the hits to keep.
def blast2table(blast_f, table_f, aln, minscore)
  File.open(blast_f, "r") do |ifh|
    File.open(table_f, "w") do |ofh|
      while ln = ifh.gets
        bh = BlastHit.new(ln, aln)
        ofh.print bh.to_s if bh.bits >= minscore
      end
    end
  end
end
464
# Finds the nuccore entries linked to a list of gene/protein GIs.
#
# The identifiers are sent to elink in batches of 200 and the linked IDs of
# every response are collected.
#
# @param gis [Array<String>] identifiers to look up.
# @param nucl [Boolean] true to link from 'nuccore' instead of 'protein'.
# @return [Array<String>] linked IDs without duplicates.
def genes2genomes(gis, nucl=false)
  source_db = nucl ? 'nuccore' : 'protein'
  linked = gis.each_slice(200).map do |batch|
    xml = Nokogiri::XML( elink({:dbfrom=>source_db, :db=>'nuccore', :id=>batch.join(',')}) )
    xml.xpath('/eLinkResult/LinkSet/LinkSetDb/Link/Id').map{ |id| id.content }
  end
  linked.flatten.uniq
end
473
# Sends a GET request to one of the E-utilities scripts.
#
# @param script [String] script name appended to the E-utilities base URL.
# @param params [Hash] query parameters of the request.
# @param outfile [String, nil] when given, the response body is also saved
#   to this path.
# @return [String] the response body.
# @raise [RuntimeError] when the response code is not 200.
def eutils(script, params={}, outfile=nil)
  url = "#{ROCker.eutils}/#{script}"
  response = RestClient.get url, {:params=>params}
  unless response.code == 200
    raise "Unable to reach NCBI EUtils, error code #{response.code}."
  end
  if outfile
    File.open(outfile, 'w') { |fh| fh.print response.to_s }
  end
  response.to_s
end
483
# Calls the efetch script; the arguments are those of eutils after +script+.
def efetch(*etc)
  eutils('efetch.fcgi', *etc)
end

# Calls the elink script; the arguments are those of eutils after +script+.
def elink(*etc)
  eutils('elink.fcgi', *etc)
end
485
# Runs a shell command and raises when it fails.
#
# A '{' marker is echoed only if the command succeeds; finding it right
# before the final character of the captured output means success.
#
# @param cmd [String] command line to execute (stderr is merged into stdout).
# @param err_msg [String, nil] message to raise on failure; by default the
#   command and its output are reported.
# @return [true] when the command succeeds.
# @raise [RuntimeError] when the command fails.
def bash(cmd, err_msg=nil)
  output = `#{cmd} 2>&1 && echo '{'`
  return true if output[-2] == '{'
  if err_msg.nil?
    raise "Error executing: #{cmd}\n\n#{output}"
  else
    raise err_msg
  end
end
490
+ end
491
+
492
+ #================================[ Extensions ]
493
class Numeric
  # Returns the number followed by its English ordinal suffix.
  #
  # The suffix is picked from the textual form of the number: 'th' when the
  # second-to-last character is '1' (11th, 12th, 113th), otherwise 'st', 'nd'
  # or 'rd' for a final 1, 2 or 3, and 'th' in any other case.
  #
  # @return [String] e.g. "1st", "22nd", "13th".
  def ordinalize
    digits = to_s
    suffix =
      if digits[-2] == '1'
        'th'
      else
        case digits[-1]
        when '1' then 'st'
        when '2' then 'nd'
        when '3' then 'rd'
        else 'th'
        end
      end
    digits + suffix
  end
end
503
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/sequence'
9
+
10
+
11
# A set of aligned sequences, indexed by sequence ID, that all span the same
# number of alignment columns.
class Alignment
  # seqs: Hash of ID => Sequence. cols: number of columns shared by all the
  # sequences (nil until the first sequence is added).
  attr_reader :seqs, :cols

  def initialize
    @seqs = {}
  end

  # Loads the sequences of a FASTA file.
  def read_fasta(file)
    read_file(file, false)
  end

  # Loads the sequences embedded in a ROCker file (lines prefixed with '#:').
  def read_rocker(file)
    read_file(file, true)
  end

  # Reads FASTA-formatted entries from +file+ and adds them to the alignment.
  # When +is_rocker+ is true only lines starting with '#:' are considered,
  # and that prefix is removed before parsing.
  #
  # @raise [RuntimeError] if a sequence differs in length from the alignment
  def read_file(file, is_rocker)
    id = nil
    sq = ""
    # Block form guarantees the handle is closed, also when << raises.
    File.open(file, 'r') do |f|
      f.each_line do |ln|
        if is_rocker
          prefixed = /^#:(.*)/.match(ln)
          next if prefixed.nil?
          ln = prefixed[1]
        end
        header = /^>(\S+)/.match(ln)
        if header.nil?
          sq += ln
        else
          # A new header closes the entry being accumulated, if any.
          self << Sequence.new(id, sq) unless id.nil?
          id = header[1]
          sq = ""
        end
      end
    end
    self << Sequence.new(id, sq) unless id.nil?
  end

  # Adds +seq+ to the alignment. The first sequence added defines the number
  # of columns; later sequences must match it.
  #
  # @raise [RuntimeError] if +seq+ has a different number of columns; in that
  #   case the alignment is left untouched
  def <<(seq)
    expected = cols.nil? ? seq.cols : cols
    # Validate before storing, so a rejected sequence is never registered.
    unless seq.cols == expected
      raise "Aligned sequence #{seq.id} has a different length (#{seq.cols} vs #{expected})"
    end
    @cols = expected
    @seqs[seq.id] = seq
    nil
  end

  # Extracts the numeric GI from each sequence ID, trying a list of known ID
  # layouts in order. IDs matching none of them are skipped.
  #
  # @return [Array<String>] the GIs found, in the order of the sequences
  def get_gis
    regexps = [/^gi\|(\d+)\|/, /^(\d+)\|/, /^(\d+)$/, /^gi\|(\d+)$/, /\|gi\|(\d+)\|/, /\|gi\|(\d+)$/]
    found = seqs.keys.map do |id|
      hit = nil
      regexps.find { |regexp| hit = regexp.match(id) }
      hit.nil? ? nil : hit[1]
    end
    found.compact
  end

  # Returns the sequence with the given ID, or nil.
  def seq(id)
    @seqs[id]
  end

  # Number of sequences in the alignment.
  def size
    seqs.size
  end

  # Concatenation of the to_seq_s form of every sequence, plus a newline.
  def to_seq_s
    seqs.values.map { |s| s.to_seq_s }.join + "\n"
  end

  # Concatenation of the to_s form of every sequence, plus a newline.
  def to_s
    seqs.values.map { |s| s.to_s }.join + "\n"
  end
end
63
+
@@ -0,0 +1,39 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# A single hit, expressed in alignment columns.
class BlastHit
  attr_reader :sbj, :sfrom, :sto, :bits, :istrue, :midpoint

  # Builds a hit from one tab-delimited line.
  #
  # Without +aln+ the line is a ROCker table row: subject, from, to,
  # bit-score, true-positive flag ('1' means true) and midpoint.
  # With +aln+ the line is a tabular BLAST row: the subject (2nd field) is
  # looked up in the alignment and the subject coordinates (9th and 10th
  # fields) are translated to alignment columns; the bit-score is the 12th
  # field, and the hit is flagged as true when the query name (1st field)
  # contains '@%'. If the subject is not in the alignment every attribute
  # is left as nil.
  def initialize(ln, aln=nil)
    fields = ln.chomp.split(/\t/)
    if aln.nil?
      @sbj = fields[0]
      @sfrom, @sto = fields.values_at(1, 2).map { |v| v.to_i }
      @bits = fields[3].to_f
      @istrue = (fields[4] == '1')
      @midpoint = fields[5].to_i
    else
      subject = aln.seq(fields[1])
      return nil if subject.nil?
      @sbj = subject.id
      first_col = subject.pos2col(fields[8].to_i)
      second_col = subject.pos2col(fields[9].to_i)
      # The two ends may come in either order; keep them sorted.
      @sfrom, @sto = [first_col, second_col].minmax
      @bits = fields[11].to_f
      @istrue = !/@%/.match(fields[0]).nil?
      center = ((fields[8].to_f + fields[9].to_f) / 2).ceil
      @midpoint = subject.pos2col(center)
    end
  end

  # Returns the hit as a tab-delimited table row ending in a newline, or an
  # empty string if the hit has no subject.
  def to_s
    return "" if sbj.nil?
    flag = istrue ? '1' : '0'
    [sbj, sfrom.to_s, sto.to_s, bits.to_s, flag, midpoint].join("\t") + "\n"
  end
end
39
+
@@ -0,0 +1,39 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# Minimal bridge to an R process: commands are written to the standard input
# of the process and its output (standard output and error) is read back.
class RInterface
  # Command used to launch R; can be changed with RInterface.R_BIN=.
  @@R_BIN = "R"

  def self.R_BIN=(rbin)
    @@R_BIN = rbin
  end

  # The IO object connected to the R process.
  attr_reader :handler

  # Launches the R process in a bidirectional pipe.
  def initialize
    @handler = IO.popen("#{@@R_BIN} --slave 2>&1", "w+")
  end

  # Runs +cmd+ in R and returns what R printed while executing it.
  #
  # @param cmd  R code to execute
  # @param type nil to get the raw output as a String; :float or :int to
  #             parse the first element ('[1] ...') of the printed value
  # @return [String, Float, Integer]
  # @raise [RuntimeError] if the R process closes its output before
  #   finishing the command, or if the output cannot be parsed as +type+
  def run(cmd, type=nil)
    @handler.puts cmd
    # Ask R to print a marker, so the end of the output can be recognized.
    @handler.puts "cat('---FIN---\n')"
    o = ""
    loop do
      l = @handler.gets
      raise "R failed on command:\n#{cmd}\n\nError:\n#{o}" if l.nil?
      break if l.start_with?('---FIN---')
      o += l
    end
    o.chomp!
    case type
    when :float
      # '-?Inf' goes first: otherwise the numeric class would capture only
      # the sign of '-Inf', which then parses as 0.0.
      m = /^\s*\[1\]\s+(-?Inf|[0-9\.Ee+-]+).*/.match(o)
      raise "R error: expecting float, got #{o}" if m.nil?
      return Float::INFINITY if m[1] == 'Inf'
      return -Float::INFINITY if m[1] == '-Inf'
      m[1].to_f
    when :int
      m = /^\s*\[1\]\s+([0-9\.Ee+-]+).*/.match(o)
      raise "R error: expecting integer, got #{o}" if m.nil?
      # The pattern admits scientific notation (e.g. '1e+05'), for which
      # String#to_i stops at the 'e'; go through Float in that case.
      m[1] =~ /[.Ee]/ ? m[1].to_f.to_i : m[1].to_i
    else
      o
    end
  end
end
39
+
@@ -0,0 +1,124 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
+ require 'rocker/rinterface'
9
+ require 'rocker/rocwindow'
10
+ require 'rocker/alignment'
11
+ require 'tmpdir'
12
+
13
# A set of ROC windows along an alignment, together with the R session used
# to compute them.
class ROCData
  attr_reader :aln, :windows, :r

  # Use ROCData.new(table,aln,window) to re-compute from table, use
  # ROCData.new(data) to load.
  #
  # @param val    path to a hits table (when +aln+ is given) or to a
  #               previously saved ROCker file (when +aln+ is nil)
  # @param aln    the Alignment the hits refer to, or nil to load from +val+
  # @param window window size, in alignment columns, used to create the
  #               initial windows (only when +aln+ is given)
  def initialize(val, aln=nil, window=nil)
    @r = RInterface.new
    @nucl = false
    if aln.nil?
      @windows = []
      # Window rows come first; the first line starting with '#:' marks the
      # beginning of the embedded alignment. The block form closes the
      # handle even if a row fails to parse.
      File.open(val, "r") do |f|
        f.each_line do |ln|
          break unless /^#:/.match(ln).nil?
          @windows << ROCWindow.new(self, ln)
        end
      end
      @aln = Alignment.new
      @aln.read_rocker(val)
    else
      @aln = aln
      rrun "library('pROC');"
      rrun "x <- read.table('#{val}', sep='\\t', h=F);"
      init_windows! window
    end
  end

  # Returns the first window spanning the alignment column +col+, or nil.
  def win_at_col(col)
    windows.find { |w| (w.from <= col) && (w.to >= col) }
  end

  def in_nucl?() @nucl end
  def nucl=(nucl) @nucl = nucl end

  # Repeatedly evaluates the windows against +table+ and splits those with
  # low accuracy, until no window is split.
  #
  # @return [Boolean] false if the table could not be evaluated (see
  #   load_table!), true otherwise
  def refine!(table)
    loop do
      return false unless load_table! table
      break if _refine_iter(table) == 0
    end
    true
  end

  # One refinement pass. Windows longer than 5 columns that are not almost
  # empty and whose accuracy (as computed by load_table!) is below 95 are
  # split in two, unless one of the halves would be almost empty. An
  # accuracy of NA counts as 100.
  #
  # @return [Integer] the number of windows that were split
  def _refine_iter(table)
    to_refine = []
    windows.each do |w|
      next if w.almost_empty || w.length <= 5
      rrun "acc <- w$accuracy[w$V1==#{w.from}];"
      to_refine << w if rrun("ifelse(is.na(acc), 100, acc)", :float) < 95.0
    end
    n = to_refine.size
    return 0 unless n > 0
    to_refine.each do |w|
      # NOTE(review): both halves include the middle column — confirm this
      # overlap is intended.
      middle = (w.from + w.to) / 2
      w1 = ROCWindow.new(self, w.from, middle)
      w2 = ROCWindow.new(self, middle, w.to)
      if w1.almost_empty || w2.almost_empty
        n -= 1
      else
        @windows << w1
        @windows << w2
        @windows.delete w
      end
    end
    @windows.sort! { |x, y| x.from <=> y.from }
    n
  end

  # Loads a hits table into R (as 'x') together with the current windows
  # (as 'w'), and computes per-window counts of true/false positives and
  # negatives, sensitivity, specificity, accuracy and precision.
  #
  # @param table     path to the hits table
  # @param sbj       if not empty, only hits to these subjects are used
  # @param min_score if greater than zero, only hits with at least this
  #                  bit-score are used
  # @return [Boolean] false (after printing a warning) if no window has an
  #   estimated threshold, true otherwise
  def load_table!(table, sbj=[], min_score=0)
    rrun "x <- read.table('#{table}', sep='\\t', h=F);"
    rrun "x <- x[x$V1 %in% c('#{sbj.join("','")}'),];" if sbj.size > 0
    rrun "x <- x[x$V4 >= #{min_score.to_s},];" if min_score > 0
    # The windows are handed over to R through a temporary file.
    Dir.mktmpdir do |dir|
      save(dir + "/rocker")
      rrun "w <- read.table('#{dir}/rocker', sep='\\t', h=F);"
    end
    rrun "w <- w[!is.na(w$V5),];"
    if rrun("nrow(w)", :int) == 0
      warn "\nWARNING: Insufficient windows with estimated thresholds.\n\n"
      return false
    end
    rrun <<-EOC
      w$tp<-0; w$fp<-0; w$tn<-0; w$fn<-0;
      for(i in 1:nrow(x)){
        m <- x$V6[i];
        win <- which( (m>=w$V1) & (m<=w$V2))[1];
        if(!is.na(win)){
          if(x$V4[i] >= w$V5[win]){
            if(x$V5[i]==1){ w$tp[win] <- w$tp[win]+1 }else{ w$fp[win] <- w$fp[win]+1 };
          }else{
            if(x$V5[i]==1){ w$fn[win] <- w$fn[win]+1 }else{ w$tn[win] <- w$tn[win]+1 };
          }
        }
      }
    EOC
    rrun <<-EOC
      w$p <- w$tp + w$fp;
      w$n <- w$tn + w$fn;
      w$sensitivity <- 100*w$tp/(w$tp+w$fn);
      w$specificity <- 100*w$tn/(w$fp+w$tn);
      w$accuracy <- 100*(w$tp+w$tn)/(w$p+w$n);
      w$precision <- 100*w$tp/(w$tp+w$fp);
    EOC

    true
  end

  # Creates consecutive windows of +size+ columns covering the alignment.
  def init_windows!(size)
    @windows = []
    1.step(aln.cols, size).each { |a| @windows << ROCWindow.new(self, a, a + size - 1) }
  end

  # Runs +cmd+ in the R session (see RInterface#run).
  def rrun(cmd, type=nil)
    r.run cmd, type
  end

  # Writes the windows and the alignment (see to_s) to +file+.
  def save(file)
    File.open(file, "w") { |f| f.print to_s }
  end

  # The rows of all the windows followed by the alignment.
  def to_s
    windows.map { |w| w.to_s }.join + aln.to_s
  end
end
124
+
@@ -0,0 +1,63 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# A range of alignment columns with its hit counts and bit-score threshold.
class ROCWindow
  attr_reader :data, :from, :to, :hits, :tps, :thr

  # Builds a window that belongs to +data+.
  #
  # When +from+ is a String it is parsed as a tab-delimited row holding
  # from, to, hits, true positives and threshold. Otherwise +from+ and +to+
  # are column numbers (clamped to the alignment, defaulting to its first
  # and last columns) and the counts and threshold are computed in R.
  def initialize(data, from=nil, to=nil)
    @data = data
    if from.is_a? String
      fields = from.split(/\t/)
      @from, @to, @hits, @tps = fields.values_at(0, 1, 2, 3).map { |v| v.to_i }
      @thr = fields[4].to_f
    else
      last_col = data.aln.cols
      lower = from.nil? ? 1 : [from, 1].max
      upper = to.nil? ? last_col : [to, last_col].min
      @from, @to = [lower, upper].minmax
      @thr = nil
      compute!
    end
  end

  # Counts the hits and true positives whose midpoint falls in the window
  # and, unless the window is almost empty, estimates the threshold from a
  # ROC curve built in R with pROC. A threshold of zero or infinity is
  # discarded (set to nil).
  def compute!
    load_hits
    @hits = rrun "nrow(y);", :int
    @tps = rrun "sum(y$V5);", :int
    return if almost_empty
    rrun "rocobj <- roc(y$V5, y$V4);"
    best = rrun 'coords(rocobj, "best", ret="threshold", best.method="youden", best.weights=c(0.5, sum(y$V5)/nrow(y)))[1];', :float
    @thr = best.to_f
    @thr = nil if @thr == 0.0 || @thr.infinite?
  end

  # Estimates a threshold from the closest windows, on each side, that have
  # one: the threshold of the only side available, a linear interpolation
  # (on the windows' first columns) when both sides are available, or nil
  # when neither is.
  def around_thr
    before = self.previous
    before = before.previous until before.nil? || !before.thr.nil?
    after = self.next
    after = after.next until after.nil? || !after.thr.nil?
    if before.nil?
      return after.nil? ? nil : after.thr
    end
    return before.thr if after.nil?
    (after.thr * (from - before.from) - before.thr * (from - after.from)) / (after.from - before.from)
  end

  # Selects, in R, the hits (rows of 'x') with midpoint within the window.
  def load_hits
    rrun "y <- x[x$V6>=#{from} & x$V6<=#{to},];"
  end

  # The window spanning the column before this one, or nil at column 1.
  def previous
    return nil if from == 1
    data.win_at_col(from - 1)
  end

  # The window spanning the column after this one, or nil at the last one.
  def next
    return nil if to == data.aln.cols
    data.win_at_col(to + 1)
  end

  # The threshold of the window or, if it has none, the estimate from the
  # surrounding windows.
  def thr_notnil
    return around_thr if @thr.nil? || @thr.infinite?
    @thr
  end

  # Number of hits that are not true positives.
  def fps
    hits - tps
  end

  # True when there are fewer than 3 true positives or fewer than 3 other
  # hits.
  def almost_empty
    fps < 3 || tps < 3
  end

  # Number of columns in the window.
  def length
    to - from + 1
  end

  # Runs +cmd+ in the R session of the owner data set.
  def rrun(cmd, type=nil)
    data.rrun cmd, type
  end

  # The window as a tab-delimited row ending in a newline.
  def to_s
    [from, to, hits, tps, thr_notnil].join("\t") + "\n"
  end
end
63
+
@@ -0,0 +1,38 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jan-22-2015
6
+ #
7
+
8
# An aligned sequence. +aln+ is the aligned form (letters and '-' gaps, in
# upper case) and +seq+ the ungapped form (letters only, in upper case).
class Sequence
  attr_reader :id, :seq, :aln

  # @param id  identifier of the sequence
  # @param aln aligned sequence; '.' is read as a gap, and any character
  #            other than letters and gaps is discarded
  def initialize(id, aln)
    @id = id
    @aln = aln.tr('.', '-').gsub(/[^A-Za-z-]/, '').upcase
    @seq = aln.gsub(/[^A-Za-z]/, '').upcase
  end

  # Translates a position in the ungapped sequence (1-based) into the
  # alignment column holding it. If the position is never reached, the
  # number of columns is returned.
  def pos2col(pos)
    remaining = pos
    aln.each_char.with_index(1) do |chr, column|
      remaining -= 1 unless chr == '-'
      return column if remaining == 0
    end
    aln.length
  end

  # Returns one plus the number of residues found up to, and including,
  # column +col+ (or in the whole alignment, if +col+ is never reached).
  # NOTE(review): for a residue column this is one more than the position
  # that pos2col maps to it — confirm this offset is intended.
  def col2pos(col)
    pos = 1
    aln.each_char.with_index(1) do |chr, column|
      pos += 1 unless chr == '-'
      return pos if column == col
    end
    pos
  end

  # Number of alignment columns.
  def cols
    aln.length
  end

  # Number of residues.
  def length
    seq.length
  end

  # FASTA entry with the ungapped sequence.
  def to_seq_s
    ">#{id}\n#{seq}\n"
  end

  # FASTA entry with the aligned sequence, every line prefixed with '#:'.
  def to_s
    "#:>#{id}\n#:#{aln}\n"
  end
end
38
+
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-rocker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Luis (Coto) Orellana
8
+ - Luis M. Rodriguez-R
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-01-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Detecting and quantifying functional genes in short-read metagenomic
15
+ datasets
16
+ email: lhorellana@gatech.edu
17
+ executables:
18
+ - ROCker
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/rocker.rb
23
+ - lib/rocker/sequence.rb
24
+ - lib/rocker/alignment.rb
25
+ - lib/rocker/blasthit.rb
26
+ - lib/rocker/rocwindow.rb
27
+ - lib/rocker/rocdata.rb
28
+ - lib/rocker/rinterface.rb
29
+ - bin/ROCker
30
+ homepage: http://enve-omics.ce.gatech.edu/rocker
31
+ licenses:
32
+ - artistic 2.0
33
+ metadata: {}
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 2.0.14
51
+ signing_key:
52
+ specification_version: 4
53
+ summary: ROCker
54
+ test_files: []