bio-rocker 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 14d82f0e8c6f1cf052b52c82de99c5616ebfd2a3
4
- data.tar.gz: 175ffb75e14ecfa7f12073ef8fddff2d8f8bda5d
3
+ metadata.gz: b8a10cdc85d8b7b54c21d26f12b90c0b3dff4f82
4
+ data.tar.gz: c837b3c6687f6705dbfc7c959824dd530e7ee932
5
5
  SHA512:
6
- metadata.gz: d8ce626b7731d7293339c74edbc8ac06005c6b4f988d7cb45e191b24ccc2194091d214d2352c5cfd9e34018abc4b94ed3ffa1b734a5438bc6246fc4f564f6abf
7
- data.tar.gz: 0d7187d853a4ac73b91808f52757c94f7d61bc4f19762a7ba4ec97363c55c1eeb32791f78b31ae5d91a7dd72b6779b3bc4cd34b9783e3509bae72be33e4e523a
6
+ metadata.gz: 869cdadfed2dad125fc11c03133e2f56df53074a54b0f35d8ea5c6674029e7069332e4c35c486b1f39a417aeff01932a7eee30da44e15de036ce1a2d878d15d4
7
+ data.tar.gz: 823b30e7923c243f8dc8bb122f50426898e2cdda634516cc53e0325b150946b0699e0a9257a55c06ed8868a0f843f0274ed1d23bdf8ef2de9629fafb66f33552
data/bin/ROCker CHANGED
@@ -16,8 +16,8 @@ require 'optparse'
16
16
  $t = {
17
17
  'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
18
18
  'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
19
- 'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
20
19
  'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
20
+ 'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
21
21
  'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
22
22
  }
23
23
  task = (ARGV.size > 0 ? ARGV.shift : '').downcase
@@ -43,49 +43,51 @@ opts = OptionParser.new do |opt|
43
43
  opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
44
44
  opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
45
45
  opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
46
- #opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
47
46
  opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
48
47
  opt.separator ""
49
48
  opt.separator "+ ADVANCED BUILDING ARGUMENTS"
50
49
  opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
51
50
  opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
52
51
  opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
53
- opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
54
- opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
55
- opt.on( "--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
56
- opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum.",
57
- "This option replaces --per-genus and --per-species from v0.1.*."){ |v| o[:pertaxon]=v.downcase }
58
- opt.on( "--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
59
- opt.on( "--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
52
+ opt.on("-s", "--seqdepth NUMBER", "Sequencing depth (reads/bp) to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
53
+ opt.on("-l", "--readlen INTEGER", "Average read length of in silico metagenome (in bp). By default: '#{ROCker.default :readlen}'."){ |v| o[:readlen]=v.to_i }
54
+ opt.on("-v", "--overlap INTEGER", "Minimum overlap (in bp) with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_i }
55
+ opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."){ |v| o[:pertaxon]=v.downcase }
56
+ opt.on( "--genome-frx NUMBER", "Fraction to subsample genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}."){ |v| o[:genomefrx]=v.to_f }
57
+ opt.on( "--nosimulate", "Do not simulate metagenome. Implies --nosearch. By default, metagenome is simulated."){ |v| o[:nosimulate]=v }
58
+ opt.on( "--nosearch", "Do not execute similarity search. By default, it is executed."){ |v| o[:nosearch]=v }
60
59
  opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
61
60
  opt.on( "--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
62
61
  opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
63
62
  opt.separator ""
64
63
  opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
65
- opt.on("-G", "--grinder PATH", "Path to the grinder executable. By default: '#{ROCker.default :grinder}' (in the $PATH)."){ |v| o[:grinder]=v }
66
- opt.on("-M", "--muscle PATH", "Path to the muscle executable. By default: '#{ROCker.default :muscle}' (in the $PATH)."){ |v| o[:muscle]=v }
67
- opt.on("-B", "--blastbins PATH", "Path to the Blast+ executables. By default: '#{ROCker.default :blastbins}' (in the $PATH)."){ |v| o[:blastbins]=v }
68
- opt.on( "--grinder-cmd STR", "Command calling grinder, where %1$s: grinder bin, %2$s: input, %3$s: seq. depth, %4$s: output.",
69
- "By default: '#{ROCker.default :grindercmd}'."){ |v| o[:grindercmd]=v }
70
- opt.on("--muscle-cmd STR", "Command calling muscle, where %1$s: muscle bin, %2$s: input, %3$s: output.",
71
- "By default: '#{ROCker.default :musclecmd}'."){ |v| o[:musclecmd]=v }
72
- opt.on("--blast-cmd STR", "Command calling BLAST search, where %1$s: blast bins, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
73
- "By default: '#{ROCker.default :blastcmd}'."){ |v| o[:blastcmd]=v }
74
- opt.on("--makedb-cmd STR", "Command calling BLAST format, where %1$s: blast bins, %2$s: dbtype, %3$s: input, %4$s: database.",
75
- "By default: '#{ROCker.default :makedbcmd}'."){ |v| o[:makedbcmd]=v }
64
+ opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
65
+ opt.on( "--simulator STR", "In silico metagenome simulator to use. Supported: 'grinder'. By default: '#{ROCker.default :simulator}'.") { |v| o[:ssimulator]=v.to_sym }
66
+ opt.on( "--aligner STR", "Multiple alignment algorithm to use. Supported: 'clustalo' and 'muscle'. By default: '#{ROCker.default :aligner}'.") { |v| o[:aligner]=v.to_sym }
67
+ opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
68
+ opt.on( "--simulator-bin PATH", "Path to the simulator executable. By default in the $PATH: '#{ROCker.default(:simulatorbin).values.join("' or '")}'.") { |v| o[:simulatorbin]=v }
69
+ opt.on( "--aligner-bin PATH", "Path to the aligner executable. By default in the $PATH: '#{ROCker.default(:alignerbin).values.join("' or '")}'.") { |v| o[:alignerbin]=v }
70
+ opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
71
+ *ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
72
+ opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
73
+ *ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
74
+ opt.on( "--simulator-cmd STR", "Command calling simulator, where %1$s: binary, %2$s: input, %3$s: seq. depth (X), %4$d: read len., %5$s: output.",
75
+ *ROCker.default(:simulatorcmd).keys.map{|k| "By default if --simulator #{k}: '#{ROCker.default(:simulatorcmd)[k]}'."}){ |v| o[:simulatorcmd]=v }
76
+ opt.on("--aligner-cmd STR", "Command calling aligner, where %1$s: binary, %2$s: input, %3$s: output, %4$d: threads.",
77
+ *ROCker.default(:alignercmd).keys.map{|k| "By default if --aligner #{k}: '#{ROCker.default(:alignercmd)[k]}'."}){ |v| o[:alignercmd]=v }
76
78
  when 'compile'
77
79
  opt.separator "+ COMPILATION ARGUMENTS"
78
80
  opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
79
81
  opt.on("-b", "--ref-blast PATH",
80
82
  "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
81
83
  opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
82
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
83
84
  opt.separator ""
84
85
  opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
85
86
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
86
87
  opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
87
88
  opt.on( "--norefine", "Do not refine windows."){ o[:refine]=false }
88
89
  opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
90
+ opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
89
91
  opt.separator ""
90
92
  opt.separator "+ INPUT/OUTPUT"
91
93
  opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
@@ -107,15 +109,23 @@ opts = OptionParser.new do |opt|
107
109
  opt.separator " 5. Bit score threshold set for the window."
108
110
  opt.separator " The file also contains the alignment (commented with #:)."
109
111
  opt.separator ""
112
+ when 'search'
113
+ opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
114
+ opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
115
+ opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
116
+ opt.separator ""
117
+ opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
118
+ opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
119
+ opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
120
+ opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
121
+ *ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
122
+ opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
123
+ *ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
110
124
  when 'filter'
111
125
  opt.separator "+ FILTERING ARGUMENTS"
112
126
  opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
113
127
  opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
114
128
  opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
115
- when 'search'
116
- opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
117
- opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
118
- opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
119
129
  when 'plot'
120
130
  opt.separator "+ PLOTTING ARGUMENTS"
121
131
  opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
@@ -127,7 +137,7 @@ opts = OptionParser.new do |opt|
127
137
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
128
138
  opt.on( "--color", "Color alignment by amino acid."){ o[:color]=true }
129
139
  opt.on( "--no-transparency", "Do not use (semi-)transparencies."){ |v| o[:transparency] = v }
130
- opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
140
+ opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}."){ |v| o[:minscore]=v.to_f }
131
141
  opt.on( "--stats-impact", "Plot impact on statistics, instead of absolute values per window."){ o[:impact]=true }
132
142
  opt.on( "--stats-ylim STRING", "Limits of the Y-axis in the bottom panel. By default: '-2,.1' if --stats-impact is set, '50,100' otherwise."){ |v| o[:ylim]=v }
133
143
  opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update May-14-2015
5
+ # @update Jun-05-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,40 +10,20 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
- @@EBIREST = 'http://www.ebi.ac.uk/Tools'
14
13
  @@DEFAULTS = {
15
14
  # General
16
- :q=>false, :r=>'R', :nucl=>false, :debug=>false,
17
- # Build
18
- :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0,
19
- # ext. software
20
- :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
21
- :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "uniform 0.1" -mr "95 5" -rd "100 uniform 5"',
22
- :musclecmd=>'%1$s -in "%2$s" -out "%3$s" -quiet',
23
- :blastcmd=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
24
- :makedbcmd=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
25
- # Compile
26
- :refine=>true, :win=>20, :minscore=>0,
27
- # Filter
28
- :sbj=>[],
29
- # Plot
30
- :color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
15
+ :q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
16
+ # External software
17
+ :searchbins=>'',
18
+ :searchcmd=>{
19
+ :blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
20
+ :diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
21
+ :makedbcmd=>{
22
+ :blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
23
+ :diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
31
24
  }
32
- @@HAS_BUILD_GEMS = nil
33
- def self.ebirest() @@EBIREST ; end
34
25
  def self.defaults() @@DEFAULTS ; end
35
26
  def self.default(k) @@DEFAULTS[k] ; end
36
- def self.has_build_gems?
37
- return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
38
- @@HAS_BUILD_GEMS = TRUE
39
- begin
40
- require 'rubygems'
41
- require 'restclient'
42
- rescue LoadError
43
- @@HAS_BUILD_GEMS = FALSE
44
- end
45
- @@HAS_BUILD_GEMS
46
- end
47
27
 
48
28
  #================================[ Instance ]
49
29
  attr_reader :o
@@ -53,374 +33,6 @@ class ROCker
53
33
  RInterface.R_BIN = opts[:r] unless opts[:r].nil?
54
34
  end
55
35
 
56
- #================================[ Build ]
57
- def build!
58
- # Check requirements
59
- puts "Testing environment." unless @o[:q]
60
- @o[:noblast]=true if @o[:nomg]
61
- raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
62
- @o[:positive] += @o[:posori] unless @o[:posori].nil?
63
- @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
64
- @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
65
- unless @o[:aln].nil?
66
- aln = Alignment.new
67
- aln.read_fasta @o[:aln]
68
- @o[:positive] += aln.get_ids
69
- end
70
- raise "-p or -P are mandatory." if @o[:positive].size==0
71
- raise "-o/--baseout is mandatory." if @o[:baseout].nil?
72
- if @o[:positive].size == 1 and not @o[:noaln]
73
- warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
74
- @o[:noaln] = true
75
- end
76
- self.bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
77
- self.bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
78
- self.bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]
79
- # Download genes
80
- puts "Downloading gene data." unless @o[:q]
81
- f = File.open(@o[:baseout] + '.ref.fasta', 'w')
82
- if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
83
- puts " * re-using aligned sequences as positive set." unless @o[:q]
84
- f.print aln.to_seq_s
85
- @o[:noaln] = true
86
- else
87
- puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
88
- $stderr.puts " # #{@o[:positive]}" if @o[:debug]
89
- ids = Array.new(@o[:positive])
90
- while ids.size>0
91
- f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
92
- end
93
- end
94
- f.close
95
- genome_ids = {:positive=>[], :negative=>[]}
96
- [:positive, :negative].each do |set|
97
- unless @o[set].size==0
98
- puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
99
- $stderr.puts " # #{@o[set]}" if @o[:debug]
100
- genome_ids[set] = genes2genomes(@o[set])
101
- end
102
- end
103
- raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
104
- genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
105
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
106
- all_genome_ids = genome_ids.values.reduce(:+).uniq
107
-
108
- # Locate genes
109
- puts "Analyzing genome data." unless @o[:q]
110
- puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
111
- $stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
112
- positive_coords = {}
113
- genome_org = {}
114
- i = 0
115
- genome_ids[:positive].each do |genome_id|
116
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
117
- unless @o[:pertaxon].nil?
118
- genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
119
- next unless genome_org[ genome_taxon ].nil?
120
- genome_org[ genome_taxon ] = genome_id
121
- end
122
- $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
123
- genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
124
- if @o[:reuse] and File.exist? genome_file
125
- puts " * reusing existing file: #{genome_file}." unless @o[:q]
126
- ifh = File.open(genome_file, 'r')
127
- doc = ifh.readlines.grep(/^[^#]/)
128
- ifh.close
129
- else
130
- genome_file=nil unless @o[:noclean]
131
- res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
132
- doc = res.split("\n").grep(/^[^#]/)
133
- end
134
- doc.each do |ln|
135
- next if ln =~ /^#/
136
- r = ln.chomp.split /\t/
137
- next if r.size < 9
138
- prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
139
- p = prots.select{ |p| @o[:positive].include? p }.first
140
- next if p.nil?
141
- positive_coords[ r[0] ] ||= []
142
- positive_coords[ r[0] ] << {
143
- #:strand => r[6],
144
- :prot_id => p,
145
- :from => r[3].to_i,
146
- :to => r[4].to_i
147
- }
148
- end
149
- end
150
- print "\n" unless @o[:q]
151
- unless @o[:pertaxon].nil?
152
- genome_ids[:positive] = genome_org.values
153
- puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
154
- end
155
- all_genome_ids = genome_ids.values.reduce(:+).uniq
156
- found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
157
- raise "Cannot find the genomic location of any provided sequence." if found.nil?
158
- missing = @o[:positive] - found
159
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
160
-
161
- # Download genomes
162
- genomes_file = @o[:baseout] + '.src.fasta'
163
- if @o[:reuse] and File.exist? genomes_file
164
- puts " * reusing existing file: #{genomes_file}." unless @o[:q]
165
- else
166
- puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
167
- $stderr.puts " # #{all_genome_ids}" if @o[:debug]
168
- ids = Array.new(all_genome_ids)
169
- ofh = File.open(genomes_file, 'w')
170
- while ids.size>0
171
- ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
172
- end
173
- ofh.close
174
- end
175
-
176
- # Generate metagenome
177
- unless @o[:nomg]
178
- puts "Generating in silico metagenome" unless @o[:q]
179
- if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
180
- puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
181
- else
182
- all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
183
- thrs = [@o[:thr], all_src].min
184
- puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
185
- $stderr.puts " # #{positive_coords}" if @o[:debug]
186
- thr_obj = []
187
- seqs_per_thr = (all_src/thrs).ceil
188
- (0 .. (thrs-1)).each do |thr_i|
189
- thr_obj << Thread.new do
190
- Thread.current[:seqs_a] = thr_i*seqs_per_thr + 1
191
- Thread.current[:seqs_b] = [Thread.current[:seqs_a] + seqs_per_thr, all_src].min
192
- # Create sub-fasta
193
- Thread.current[:ofh] = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", 'w')
194
- Thread.current[:ifh] = File.open("#{@o[:baseout]}.src.fasta", 'r')
195
- Thread.current[:seq_i] = 0
196
- while Thread.current[:l] = Thread.current[:ifh].gets
197
- Thread.current[:seq_i]+=1 if Thread.current[:l] =~ /^>/
198
- break if Thread.current[:seq_i] > Thread.current[:seqs_b]
199
- Thread.current[:ofh].print Thread.current[:l] if Thread.current[:seq_i] >= Thread.current[:seqs_a]
200
- end
201
- Thread.current[:ifh].close
202
- Thread.current[:ofh].close
203
- bash sprintf(@o[:grindercmd], @o[:grinder], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
204
- # Tag positives
205
- puts " * tagging positive reads." unless @o[:q]
206
- Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
207
- Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
208
- while Thread.current[:l]=Thread.current[:ifh].gets
209
- Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
210
- unless Thread.current[:rd].nil?
211
- Thread.current[:positive] = false
212
- positive_coords[Thread.current[:rd][:genome_id]] ||= []
213
- positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
214
- Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
215
- Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
216
- if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
217
- Thread.current[:positive] = true
218
- break
219
- end
220
- end
221
- Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
222
- end
223
- Thread.current[:ofh].print Thread.current[:l]
224
- end
225
- Thread.current[:ofh].close
226
- Thread.current[:ifh].close
227
- Thread.current[:output] = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
228
- end # Thread.new do
229
- end # (1 .. thrs).each
230
- # Concatenate results
231
- ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
232
- thr_obj.each do |t|
233
- t.join
234
- raise "Thread failed without error trace: #{t}" if t[:output].nil?
235
- ifh = File.open(t[:output], 'r')
236
- while l = ifh.gets
237
- ofh.print l
238
- end
239
- ifh.close
240
- File.unlink t[:output]
241
- end
242
- ofh.close
243
- end
244
- end # unless @o[:nomg]
245
- # Align references
246
- unless @o[:noaln]
247
- puts "Aligning reference set." unless @o[:q]
248
- if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
249
- puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
250
- else
251
- bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
252
- puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
253
- end
254
- end
255
- # Run BLAST
256
- unless @o[:noblast]
257
- puts "Running homology search." unless @o[:q]
258
- if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
259
- puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
260
- else
261
- puts " * preparing database." unless @o[:q]
262
- bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
263
- puts " * running BLAST." unless @o[:q]
264
- bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
265
- end
266
- end
267
- # Clean
268
- unless @o[:noclean]
269
- puts "Cleaning." unless @o[:q]
270
- sff = %w{.src.xml .src.fasta}
271
- sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nomg]
272
- sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:noblast]
273
- sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
274
- end
275
- end # build!
276
-
277
- #================================[ Compile ]
278
- def compile!
279
- raise "-a/--alignment is mandatory." if @o[:aln].nil?
280
- raise "-a/--alignment must exist." unless File.exist? @o[:aln]
281
- if @o[:table].nil?
282
- raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
283
- @o[:table] = "#{@o[:blast]}.table"
284
- end
285
- raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
286
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
287
-
288
- puts "Testing environment." unless @o[:q]
289
- bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
290
- bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."
291
-
292
- puts "Reading files." unless @o[:q]
293
- puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
294
- aln = Alignment.new
295
- aln.read_fasta @o[:aln]
296
-
297
- if File.exist? @o[:table]
298
- puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
299
- else
300
- puts " * generating table: #{@o[:table]}." unless @o[:q]
301
- blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
302
- end
303
-
304
- puts "Analyzing data." unless @o[:q]
305
- puts " * computing windows." unless @o[:q]
306
- data = ROCData.new(@o[:table], aln, @o[:win])
307
- data.nucl = @o[:nucl]
308
- if @o[:refine]
309
- puts " * refining windows." unless @o[:q]
310
- warn "Insufficient hits to refine results." unless data.refine! @o[:table]
311
- end
312
- puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
313
- data.save @o[:rocker]
314
- end # compile!
315
-
316
- #================================[ Filter ]
317
- def filter!
318
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
319
- raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
320
- raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
321
-
322
- puts "Reading ROCker file." unless @o[:q]
323
- data = ROCData.new @o[:rocker]
324
-
325
- puts "Filtering BLAST." unless @o[:q]
326
- ih = File.open(@o[:qblast], 'r')
327
- oh = File.open(@o[:oblast], 'w')
328
- while ln = ih.gets
329
- bh = BlastHit.new(ln, data.aln)
330
- oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
331
- end
332
- ih.close
333
- oh.close
334
- end # filter!
335
- #================================[ Search ]
336
- def search!
337
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
338
- raise "Code Under development..."
339
- # ToDo
340
- # [ ... ]
341
- end # search!
342
-
343
- #================================[ Plot ]
344
- def plot!
345
- raise "-k/--rocker is mandatory." if o[:rocker].nil?
346
- if @o[:table].nil?
347
- raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
348
- @o[:table] = "#{@o[:blast]}.table"
349
- end
350
- raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
351
-
352
- puts "Testing environment." unless @o[:q]
353
- bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
354
-
355
- puts "Reading files." unless @o[:q]
356
- puts " * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
357
- data = ROCData.new @o[:rocker]
358
- if File.exist? @o[:table]
359
- puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
360
- else
361
- puts " * generating table: #{@o[:table]}." unless @o[:q]
362
- blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
363
- end
364
-
365
- puts "Plotting matches." unless @o[:q]
366
- extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
367
- @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
368
- data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
369
- data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
370
- some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
371
- data.rrun "par(mar=c(0,4,0,0.5)+.1);"
372
- data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
373
- data.rrun "noise <- runif(ncol(x),-.2,.2)"
374
- data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
375
- data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"
376
-
377
- puts "Plotting windows." unless @o[:q]
378
- if some_thr
379
- data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
380
- data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
381
- end
382
- data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
383
- "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"
384
-
385
- puts "Plotting alignment." unless @o[:q]
386
- data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
387
- data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
388
- i = 0
389
- data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
390
- data.aln.seqs.values.each do |s|
391
- color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
392
- data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
393
- i += 1
394
- end
395
-
396
- puts "Plotting statistics." unless @o[:q]
397
- data.rrun "par(mar=c(5,4,0,0.5)+.1);"
398
- data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
399
- if some_thr
400
- sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
401
- sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
402
- ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
403
- unless @o[:q]
404
- puts " * sensitivity: #{sn}%"
405
- puts " * specificity: #{sp}%"
406
- puts " * accuracy: #{ac}%"
407
- end
408
- data.rrun "pos <- (w$V1+w$V2)/2"
409
- if @o[:impact]
410
- data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
411
- data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
412
- data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
413
- else
414
- data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
415
- data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
416
- data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
417
- end
418
- #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
419
- end
420
- data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
421
- data.rrun "dev.off();"
422
- end # plot!
423
-
424
36
  #================================[ Utilities ]
425
37
  def blast2table(blast_f, table_f, aln, minscore)
426
38
  ifh = File.open(blast_f, "r")
@@ -432,39 +44,6 @@ class ROCker
432
44
  ifh.close
433
45
  ofh.close
434
46
  end
435
- def genes2genomes(gene_ids)
436
- genomes = []
437
- ids = Array.new(gene_ids)
438
- while ids.size>0
439
- doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
440
- genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
441
- end
442
- genomes.uniq
443
- end
444
- def genome2taxid(genome_id)
445
- ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
446
- return ln if ln.nil?
447
- ln.sub(/.*"taxon:(\d+)".*/, "\\1")
448
- end
449
- def genome2taxon(genome_id, rank='species')
450
- xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
451
- xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
452
- end
453
- def restcall(url, outfile=nil)
454
- response = RestClient.get url
455
- raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
456
- unless outfile.nil?
457
- ohf = File.open(outfile, 'w')
458
- ohf.print response.to_s
459
- ohf.close
460
- end
461
- response.to_s
462
- end
463
- def ebiFetch(db, ids, format, outfile=nil)
464
- url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
465
- $stderr.puts " # Calling: #{url}" if @o[:debug]
466
- self.restcall url
467
- end
468
47
  def bash(cmd, err_msg=nil)
469
48
  o = `#{cmd} 2>&1 && echo '{'`
470
49
  raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
@@ -473,6 +52,14 @@ class ROCker
473
52
  end
474
53
 
475
54
  #================================[ Extensions ]
55
+ # To ROCker
56
+ require 'rocker/step/build'
57
+ require 'rocker/step/compile'
58
+ require 'rocker/step/search'
59
+ require 'rocker/step/filter'
60
+ require 'rocker/step/plot'
61
+
62
+ # To other
476
63
  class Numeric
477
64
  def ordinalize
478
65
  n= self.to_s
@@ -0,0 +1,389 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-05-2015
6
+ #
7
+
8
+ require 'json'
9
+
10
+ class ROCker
11
+ #================================[ Class ]
12
# Base URL of the EBI REST tools queried by the build step.
@@EBIREST = 'http://www.ebi.ac.uk/Tools'

# Defaults for the build step, merged into the class-wide defaults.
# The *cmd entries are sprintf templates filled in by build!:
#   simulatorcmd: binary, reference file, coverage, read length, output base name.
#   alignercmd:   binary, input FastA, output alignment, threads.
@@DEFAULTS.merge!({
  :positive => [],
  :negative => [],
  :genomefrx => 1.0,
  :seqdepth => 0.03,
  :readlen => 100,
  :minovl => 50,
  # Ext. Software
  :aligner => :clustalo,
  :simulator => :grinder,
  :simulatorbin => { :grinder => 'grinder' },
  :simulatorcmd => {
    :grinder => '%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'
  },
  :alignerbin => { :muscle => 'muscle', :clustalo => 'clustalo' },
  :alignercmd => {
    :muscle => '%1$s -in "%2$s" -out "%3$s" -quiet',
    :clustalo => '%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'
  }
})

# Memoized answer of ROCker.has_build_gems? (nil until first asked).
@@HAS_BUILD_GEMS = nil

# Returns the base URL of the EBI REST tools.
def self.ebirest
  @@EBIREST
end
23
# Reports whether the gems needed by the build step can be loaded.
#
# The answer is memoized in @@HAS_BUILD_GEMS, so the requires are attempted
# only on the first call.
#
# @return [Boolean] true when 'rubygems' and 'restclient' load, false when
#   either raises LoadError.
def self.has_build_gems?
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
  # Use the true/false literals: the TRUE/FALSE constants are deprecated and
  # are not defined by current Ruby releases.
  @@HAS_BUILD_GEMS =
    begin
      require 'rubygems'
      require 'restclient'
      true
    rescue LoadError
      false
    end
end
34
+
35
+ #================================[ Utilities ]
36
# Links gene (UniProtKB) IDs to the EMBL entries cross-referenced in their
# annotation.
#
# The IDs are requested through ebiFetch in batches of 200; every annotation
# line matching /^DR\s+EMBL;/ contributes one link. The input is not modified.
#
# @param gene_ids [Array<String>] gene IDs to look up.
# @return [Array<Hash>] unique hashes with :genome_id and :transl_id (second
#   and third '; '-separated fields of the DR line).
def genes2genomes(gene_ids)
  links = Array.new(gene_ids).each_slice(200).flat_map do |batch|
    annotation = ebiFetch(:uniprotkb, batch, :annot).split("\n")
    annotation.grep(/^DR\s+EMBL;/).map do |line|
      fields = line.split('; ')
      { :genome_id => fields[1], :transl_id => fields[2] }
    end
  end
  links.uniq
end
48
# Extracts the taxon ID from the EMBL annotation of a genome.
#
# @param genome_id [String] EMBL entry ID passed to ebiFetch.
# @return [String, nil] digits of the first `FT /db_xref="taxon:N"` line, or
#   nil when the annotation has no such line.
def genome2taxid(genome_id)
  annotation = ebiFetch('embl', [genome_id], 'annot')
  taxon_line = annotation.split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
  taxon_line && taxon_line.sub(/.*"taxon:(\d+)".*/, "\\1")
end
53
# Resolves the taxon of a given rank for a genome.
#
# Looks up the genome's taxon ID (genome2taxid), fetches its ENA taxonomy XML
# and returns the taxId attribute of the first <taxon ...> tag whose rank
# attribute matches.
#
# @param genome_id [String] EMBL entry ID.
# @param rank [String] taxonomic rank to look for (interpolated into a regex).
# @return [String] the taxId of the matching tag.
# @raise [NoMethodError] when no tag with that rank is present.
def genome2taxon(genome_id, rank='species')
  taxid = genome2taxid(genome_id)
  xml = ebiFetch('taxonomy', [taxid], 'enataxonomyxml').gsub(/\s*\n\s*/, '')
  taxon_tag = xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first
  taxon_tag.sub(/.* taxId="(\d+)".*/, "\\1")
end
57
# Performs a GET request and returns the response body.
#
# @param url [String] URL to request (timeout: 600).
# @param outfile [String, nil] when given, the body is also written to this path.
# @return [String] the response body.
# @raise [RuntimeError] when the response code is not 200.
def restcall(url, outfile=nil)
  $stderr.puts " # Calling: #{url}" if @o[:debug]
  response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
  raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
  body = response.to_s
  # Block form guarantees the handle is closed even if the write fails.
  File.open(outfile, 'w') { |fh| fh.print body } unless outfile.nil?
  body
end
68
# Fetches entries from the EBI dbfetch service.
#
# The URL is built as <ROCker.ebirest>/dbfetch/dbfetch/<db>/<id1,id2,...>/<format>
# and requested through restcall.
#
# @param db [#to_s] database name.
# @param ids [Array<#to_s>] entry IDs, joined with commas.
# @param format [#to_s] requested format.
# @param outfile [String, nil] when given, the result is also written to this path.
# @return [String] whatever restcall returns for the URL.
def ebiFetch(db, ids, format, outfile=nil)
  url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
  res = restcall(url)
  # Block form guarantees the handle is closed even if the write fails.
  File.open(outfile, 'w') { |fh| fh.print res } unless outfile.nil?
  res
end
78
# Scans the GFF3 of each genome for features of the given proteins or
# translations and saves their coordinates as JSON.
#
# For every genome the GFF3 is read from "<baseout>.src.<genome>.gff3" when
# @o[:reuse] is set and the file is non-empty; otherwise it is fetched with
# ebiFetch (and saved to that path only when @o[:noclean] is set). A feature is
# kept when one of its UniProtKB db_xref IDs is in protein_ids or one of its
# protein_id values is in transl_ids. When @o[:pertaxon] is set, genomes are
# also grouped by the taxon returned by genome2taxon.
#
# @param genome_ids [Array<String>] genomes to scan.
# @param protein_ids [Array<String>] UniProtKB IDs to look for.
# @param transl_ids [Array<String>] translation (protein_id) IDs to look for.
# @param thread_id [Integer] progress is printed only by thread 0.
# @param json_file [String] output path; receives {positive_coords:, genomes_org:}.
def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
  report = (thread_id == 0 && !@o[:q])
  positive_coords = {}
  genomes_org = {}
  scanned = 0
  genome_ids.each do |genome_id|
    if report
      scanned += 1
      print " * scanning #{scanned.ordinalize} genome out of #{genome_ids.size} in first thread. \r"
    end
    if !@o[:pertaxon].nil?
      taxon_key = genome2taxon(genome_id, @o[:pertaxon]).to_sym
      (genomes_org[taxon_key] ||= []) << genome_id
    end
    genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
    if @o[:reuse] && File.size?(genome_file)
      doc = File.readlines(genome_file).grep(/^[^#]/)
    else
      # Only keep a local copy of the GFF3 when intermediate files are preserved.
      genome_file = nil unless @o[:noclean]
      doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
    end
    # Comment lines were already dropped by the grep above.
    doc.each do |line|
      cols = line.chomp.split(/\t/)
      next if cols.size < 9
      attributes = cols[8].split(/;/)
      prot = attributes.grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/) { |xref| xref.split(/:/)[1] }.
        find { |id| protein_ids.include? id }
      tran = attributes.grep(/^protein_id=/) { |pid| pid.split(/=/)[1] }.
        find { |id| transl_ids.include? id }
      next if prot.nil? && tran.nil?
      contig = cols[0].to_sym
      (positive_coords[contig] ||= []) << {
        :prot_id => prot,
        :tran_id => tran,
        :from => cols[3].to_i,
        :to => cols[4].to_i,
        :strand => cols[6]
      }
    end
  end
  print "\n" if report
  ofh = File.open json_file, "w"
  ofh.print({ :positive_coords => positive_coords, :genomes_org => genomes_org }.to_json)
  ofh.close
end
122
+
123
+ #================================[ Build ]
124
# Runs the 'build' step, driven entirely by the options in @o.
#
# Stages, in order (each reuses its product when @o[:reuse] is set and the file
# is non-empty):
#   1. environment checks for the gems and external tools that will be used;
#   2. download of the positive set into <baseout>.ref.fasta;
#   3. linking of sequences to genomes and location of the positive genes
#      (<baseout>.src.coords), parsing GFF3 documents in forked workers;
#   4. download of all genomes into <baseout>.src.fasta;
#   5. simulation of the metagenome and tagging of positive reads
#      (<baseout>.mg.fasta), unless @o[:nosimulate];
#   6. alignment of the reference set (<baseout>.ref.aln), unless @o[:noaln];
#   7. similarity search (<baseout>.ref.blast), unless @o[:nosearch];
#   8. removal of intermediate files, unless @o[:noclean].
#
# @raise [RuntimeError] on missing mandatory options, unsatisfied requirements,
#   or when no genome / genomic location can be found.
def build!
  # Check requirements
  puts "Testing environment." unless @o[:q]
  # Collapse each per-tool option (a Hash keyed by tool) into the entry of the
  # selected tool. From here on these options are plain values and must not be
  # indexed by tool again.
  { :searchcmd => :search, :makedbcmd => :search, :alignercmd => :aligner,
    :simulatorcmd => :simulator, :alignerbin => :aligner, :simulatorbin => :simulator }.each do |opt, tool|
    @o[opt] = @o[opt][@o[tool]] if @o[opt].is_a? Hash
  end
  @o[:nosearch] = true if @o[:nosimulate]
  raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
  @o[:positive] += @o[:posori] unless @o[:posori].nil?
  @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
  @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
  unless @o[:aln].nil?
    aln = Alignment.new
    aln.read_fasta @o[:aln]
    @o[:positive] += aln.get_ids
  end
  raise "-p or -P are mandatory." if @o[:positive].size==0
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
  if @o[:positive].size == 1 && !@o[:noaln]
    warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
    @o[:noaln] = true
  end
  unless @o[:nosimulate]
    self.bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
  end
  unless @o[:noaln]
    self.bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
    self.bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
  end
  unless @o[:nosearch]
    self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
    self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
  end

  # Download genes
  puts "Downloading gene data." unless @o[:q]
  ref_file = @o[:baseout] + ".ref.fasta"
  if @o[:posori].nil? && @o[:posfile].nil? && !@o[:aln].nil?
    puts " * reusing aligned sequences as positive set." unless @o[:q]
    File.open(ref_file, "w") { |f| f.print aln.to_seq_s }
    @o[:noaln] = true
  elsif @o[:reuse] && File.size?(ref_file)
    puts " * reusing positive set: #{ref_file}." unless @o[:q]
  else
    puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
    $stderr.puts " # #{@o[:positive]}" if @o[:debug]
    File.open(ref_file, "w") do |f|
      # The service is queried in batches of 200 IDs.
      @o[:positive].each_slice(200) { |batch| f.print ebiFetch(:uniprotkb, batch, :fasta) }
    end
  end
  genome_ids = {:positive=>[], :negative=>[]}
  transl_ids = {:positive=>[], :negative=>[]}
  [:positive, :negative].each do |set|
    next if @o[set].size==0
    puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
    $stderr.puts " # #{@o[set]}" if @o[:debug]
    r = genes2genomes(@o[set])
    genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
    transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
  end
  raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
  genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
  raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?

  # Locate genes
  puts "Analyzing genome data." unless @o[:q]
  coords_file = @o[:baseout] + ".src.coords"
  if @o[:reuse] && File.size?(coords_file)
    puts " * reusing coordinates: #{coords_file}." unless @o[:q]
    c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
    positive_coords = c[:positive_coords]
    genome_org = c[:genome_org]
  else
    thrs = [@o[:thr], genome_ids[:positive].size].min
    puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
    $stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
    $stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
    $stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
    thr_obj = []
    (0 .. (thrs-1)).each do |thr_i|
      # Round-robin assignment of genomes to workers.
      ids_to_parse = genome_ids[:positive].each_with_index.select{ |_, i| (i % thrs)==thr_i }.map{ |id, _| id }
      json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
      thr_obj << json_file
      fork do
        get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
      end
    end
    Process.waitall
    # Combine results
    positive_coords = {}
    genomes_org = {}
    genome_org = {}
    thr_obj.each do |t|
      raise "Thread failed without error trace: #{t}" unless File.exist? t
      part = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
      part[:positive_coords].each_pair do |k,v|
        positive_coords[ k ] ||= []
        positive_coords[ k ] += v
      end
      part[:genomes_org].each_pair do |k,v|
        genomes_org[ k ] ||= []
        genomes_org[ k ] << v
      end
      File.unlink t
    end
    # Select one genome per taxon
    unless @o[:pertaxon].nil?
      genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
    end
    # Save coordinates
    File.open(coords_file, "w") do |ofh|
      ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
    end
  end
  unless @o[:pertaxon].nil?
    genome_ids[:positive] = genome_org.values
    puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
  end
  all_genome_ids = genome_ids.values.reduce(:+).uniq
  # flatten instead of reduce(:+): the latter yields nil when nothing was located,
  # which made the check below unreachable.
  located = positive_coords.values.flatten(1)
  found = located.map{ |b| b[:prot_id] }.compact.uniq
  unknown_pid = located.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil }.compact.uniq
  raise "Cannot find the genomic location of any provided sequence." if located.empty?
  missing = @o[:positive] - found
  warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 || missing.size==unknown_pid.size || @o[:genomefrx]<1.0

  # Download genomes
  genomes_file = @o[:baseout] + '.src.fasta'
  if @o[:reuse] && File.size?(genomes_file)
    puts " * reusing existing file: #{genomes_file}." unless @o[:q]
  else
    puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
    $stderr.puts " # #{all_genome_ids}" if @o[:debug]
    File.open(genomes_file, 'w') do |ofh|
      all_genome_ids.each_slice(200) { |batch| ofh.print ebiFetch('embl', batch, 'fasta') }
    end
  end

  # Generate metagenome
  tmp_files = [] # per-worker intermediate files, removed in the cleaning stage
  unless @o[:nosimulate]
    puts "Generating in silico metagenome" unless @o[:q]
    if @o[:reuse] && File.size?(@o[:baseout] + ".mg.fasta")
      puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
    else
      all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
      raise "No sequences found in #{@o[:baseout]}.src.fasta." if all_src==0
      thrs = [@o[:thr], all_src].min
      # Float division: integer division would round down and leave the last
      # sequences unassigned.
      seqs_per_thr = (all_src.to_f/thrs).ceil
      # Drop workers that would otherwise receive no sequences.
      thrs = (all_src.to_f/seqs_per_thr).ceil
      puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
      $stderr.puts " # #{positive_coords}" if @o[:debug]
      thr_obj = []
      (0 .. (thrs-1)).each do |thr_i|
        output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
        sub_fasta = "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}"
        sim_base = "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}"
        reads_file = "#{sim_base}-reads.fa"
        thr_obj << output
        tmp_files.concat [sub_fasta, reads_file, "#{sim_base}-ranks.txt"]
        fork do
          # 1-based, inclusive range of source sequences handled by this worker;
          # consecutive workers get adjacent, non-overlapping ranges.
          seqs_a = thr_i*seqs_per_thr + 1
          seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
          # Create sub-fasta
          File.open(sub_fasta, "w") do |ofh|
            File.open("#{@o[:baseout]}.src.fasta", "r") do |ifh|
              seq_i = 0
              while l = ifh.gets
                seq_i+=1 if l =~ /^>/
                break if seq_i > seqs_b
                ofh.print l if seq_i >= seqs_a
              end
            end
          end

          # Run simulator (except if the temporary file is already there and can be reused)
          unless @o[:reuse] && File.size?(reads_file)
            bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], sub_fasta, @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], sim_base)
          end

          # Tag positives
          puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
          File.open(reads_file, 'r') do |ifh|
            File.open(output, 'w') do |ofh|
              while l = ifh.gets
                if l =~ /^>/
                  rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
                  raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
                  # A read is positive when it overlaps a positive gene of its
                  # genome by at least @o[:minovl] positions.
                  positive = (positive_coords[rd[:genome_id].to_sym] || []).any? do |gn|
                    left = rd[:to].to_i - gn[:from]
                    right = gn[:to] - rd[:from].to_i
                    (left*right >= 0) && ([left, right].min >= @o[:minovl])
                  end
                  l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
                    "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
                end
                ofh.print l
              end
            end
          end
        end # fork
      end # (0 .. thrs-1).each
      Process.waitall
      # Concatenate results
      File.open(@o[:baseout] + ".mg.fasta", 'w') do |ofh|
        thr_obj.each do |t|
          raise "Thread failed without error trace: #{t}" unless File.exist? t
          File.open(t, "r") do |ifh|
            while l = ifh.gets
              ofh.print l
            end
          end
          File.unlink t
        end
      end
    end
  end # unless @o[:nosimulate]

  # Align references
  unless @o[:noaln]
    puts "Aligning reference set." unless @o[:q]
    if @o[:reuse] && File.size?("#{@o[:baseout]}.ref.aln")
      puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
    else
      bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
      puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
    end
  end

  # Run similarity search
  unless @o[:nosearch]
    puts "Running homology search." unless @o[:q]
    if @o[:reuse] && File.size?("#{@o[:baseout]}.ref.blast")
      puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
    else
      # @o[:makedbcmd] and @o[:searchcmd] were already resolved to the command
      # of the selected search tool at the top of this method.
      puts " * preparing database." unless @o[:q]
      bash sprintf(@o[:makedbcmd], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
      puts " * running similarity search." unless @o[:q]
      bash sprintf(@o[:searchcmd], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
    end
  end

  # Clean
  unless @o[:noclean]
    puts "Cleaning." unless @o[:q]
    sff = %w{.src.xml .src.fasta}
    sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
    sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
    (sff.map{ |sf| @o[:baseout] + sf } + tmp_files).each { |f| File.unlink f if File.exist? f }
  end
end # build!
388
+ end # ROCker
389
+
@@ -0,0 +1,53 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-05-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  @@DEFAULTS.merge!({:refine=>true, :win=>20, :minscore=>0})

  #================================[ Compile ]
  # Runs the 'compile' step: builds the windows from an alignment and a table of
  # hits, optionally refines them, and saves the result as a ROCker file.
  #
  # Options read from @o: :aln (mandatory, must exist), :table and/or :blast,
  # :rocker (mandatory output), :r (R binary), :minscore, :win, :nucl, :refine,
  # :reuse and :q. When no table is given it defaults to "<blast>.table"; when a
  # table is given, :reuse is turned on.
  #
  # @raise [RuntimeError] when a mandatory option is missing or a required
  #   input file does not exist.
  def compile!
    say = lambda { |msg| puts msg unless @o[:q] }

    raise "-a/--alignment is mandatory." if @o[:aln].nil?
    raise "-a/--alignment must exist." unless File.exist? @o[:aln]
    if @o[:table].nil?
      no_blast = @o[:blast].nil? || !File.exist?(@o[:blast])
      raise "-t/--table is mandatory unless -b is provided." if no_blast
      @o[:table] = "#{@o[:blast]}.table"
    else
      @o[:reuse] = true
    end
    raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? && !File.exist?(@o[:table])
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?

    say.call "Testing environment."
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
    bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

    say.call "Reading files."
    say.call " * loading alignment: #{@o[:aln]}."
    aln = Alignment.new
    aln.read_fasta @o[:aln]

    if @o[:reuse] && File.exist?(@o[:table])
      say.call " * reusing existing file: #{@o[:table]}."
    else
      say.call " * generating table: #{@o[:table]}."
      blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
    end

    say.call "Analyzing data."
    say.call " * computing windows."
    data = ROCData.new(@o[:table], aln, @o[:win])
    data.nucl = @o[:nucl]
    if @o[:refine]
      say.call " * refining windows."
      refined = data.refine! @o[:table]
      warn "Insufficient hits to refine results." unless refined
    end
    say.call " * saving ROCker file: #{@o[:rocker]}."
    data.save @o[:rocker]
  end # compile!
end # ROCker
53
+
@@ -0,0 +1,32 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Filter ]
  # Runs the 'filter' step: copies to @o[:oblast] every line of @o[:qblast]
  # whose hit has a subject position (sfrom) and a bit score at or above the
  # threshold of the window at the hit's midpoint, according to the ROCker file
  # @o[:rocker].
  #
  # @raise [RuntimeError] when :rocker, :qblast or :oblast is missing.
  def filter!
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
    raise "-o/--out-blast is mandatory." if @o[:oblast].nil?

    puts "Reading ROCker file." unless @o[:q]
    data = ROCData.new @o[:rocker]

    puts "Filtering BLAST." unless @o[:q]
    # Block form closes both handles even if parsing a hit raises.
    File.open(@o[:qblast], 'r') do |ih|
      File.open(@o[:oblast], 'w') do |oh|
        while ln = ih.gets
          bh = BlastHit.new(ln, data.aln)
          oh.print ln if !bh.sfrom.nil? && bh.bits >= data.win_at_col(bh.midpoint).thr
        end
      end
    end
  end # filter!
end # ROCker
32
+
@@ -0,0 +1,93 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  @@DEFAULTS.merge!({:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true, :sbj=>[]})

  #================================[ Plot ]
  # Runs the 'plot' step: draws, through the R session of the ROCker data, the
  # matches with the window thresholds, the alignment, and the per-window
  # statistics into a single graphic.
  #
  # Options read from @o: :rocker (mandatory), :table and/or :blast, :r,
  # :minscore, :sbj, :gformat, :gout (defaults to "<rocker>.<gformat>"), :width,
  # :height, :transparency, :color, :impact, :ylim and :q. When no table is
  # given it defaults to "<blast>.table" and is generated if it does not exist.
  #
  # @raise [RuntimeError] when a mandatory option is missing.
  def plot!
    # Read the options through @o, like the rest of this method does.
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    if @o[:table].nil?
      raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
      @o[:table] = "#{@o[:blast]}.table"
    end
    raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]

    puts "Testing environment." unless @o[:q]
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"

    puts "Reading files." unless @o[:q]
    puts " * loading ROCker file: #{@o[:rocker]}." unless @o[:q]
    data = ROCData.new @o[:rocker]
    if File.exist? @o[:table]
      puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
    else
      puts " * generating table: #{@o[:table]}." unless @o[:q]
      blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
    end

    puts "Plotting matches." unless @o[:q]
    # Raster devices take their size in inches plus a resolution; pdf does not.
    extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
    @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
    data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
    data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
    some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
    data.rrun "par(mar=c(0,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
    # NOTE(review): the jitter vector has ncol(x) values but is added to the
    # per-row column x$V4; nrow(x) may have been intended -- confirm.
    data.rrun "noise <- runif(ncol(x),-.2,.2)"
    data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
    data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"

    puts "Plotting windows." unless @o[:q]
    if some_thr
      data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
      data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
    end
    data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
      "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

    puts "Plotting alignment." unless @o[:q]
    data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
    i = 0
    data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
    data.aln.seqs.values.each do |s|
      # One colour per alignment column: grey for gaps, red for the sequences
      # listed in @o[:sbj], otherwise per-residue colours (with @o[:color]) or black.
      color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
      data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
      i += 1
    end

    puts "Plotting statistics." unless @o[:q]
    data.rrun "par(mar=c(5,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
    if some_thr
      # Overall statistics, pooled over all windows.
      sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
      sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
      ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
      unless @o[:q]
        puts " * sensitivity: #{sn}%"
        puts " * specificity: #{sp}%"
        puts " * accuracy: #{ac}%"
      end
      data.rrun "pos <- (w$V1+w$V2)/2"
      if @o[:impact]
        # Impact mode: per-window difference to the overall value, weighted by
        # the window's share of the counts.
        data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      else
        data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      end
      #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
    end
    data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
    data.rrun "dev.off();"
  end # plot!
end # ROCker
93
+
@@ -0,0 +1,20 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Search ]
  # Placeholder for the 'search' step, which is not implemented yet.
  #
  # @raise [RuntimeError] always: about the missing -k/--rocker option when
  #   @o[:rocker] is nil, otherwise stating that the step is under development.
  def search!
    if @o[:rocker].nil?
      raise "-k/--rocker is mandatory."
    end
    raise "Code Under development..."
    # ToDo
    # [ ... ]
  end # search!
end # ROCker
20
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-rocker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis (Coto) Orellana
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-05-07 00:00:00.000000000 Z
12
+ date: 2015-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rest-client
@@ -25,6 +25,20 @@ dependencies:
25
25
  - - ~>
26
26
  - !ruby/object:Gem::Version
27
27
  version: 1.7.3
28
+ - !ruby/object:Gem::Dependency
29
+ name: json
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ~>
33
+ - !ruby/object:Gem::Version
34
+ version: 1.8.1
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ version: 1.8.1
28
42
  description: Detecting and quantifying functional genes in short-read metagenomic
29
43
  datasets
30
44
  email: lhorellana@gatech.edu
@@ -40,6 +54,11 @@ files:
40
54
  - lib/rocker/rocwindow.rb
41
55
  - lib/rocker/rocdata.rb
42
56
  - lib/rocker/rinterface.rb
57
+ - lib/rocker/step/build.rb
58
+ - lib/rocker/step/compile.rb
59
+ - lib/rocker/step/search.rb
60
+ - lib/rocker/step/filter.rb
61
+ - lib/rocker/step/plot.rb
43
62
  - bin/ROCker
44
63
  homepage: http://enve-omics.ce.gatech.edu/rocker
45
64
  licenses:
@@ -51,9 +70,9 @@ require_paths:
51
70
  - lib
52
71
  required_ruby_version: !ruby/object:Gem::Requirement
53
72
  requirements:
54
- - - '>='
73
+ - - ~>
55
74
  - !ruby/object:Gem::Version
56
- version: '0'
75
+ version: '2.0'
57
76
  required_rubygems_version: !ruby/object:Gem::Requirement
58
77
  requirements:
59
78
  - - '>='