bio-rocker 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 14d82f0e8c6f1cf052b52c82de99c5616ebfd2a3
4
- data.tar.gz: 175ffb75e14ecfa7f12073ef8fddff2d8f8bda5d
3
+ metadata.gz: b8a10cdc85d8b7b54c21d26f12b90c0b3dff4f82
4
+ data.tar.gz: c837b3c6687f6705dbfc7c959824dd530e7ee932
5
5
  SHA512:
6
- metadata.gz: d8ce626b7731d7293339c74edbc8ac06005c6b4f988d7cb45e191b24ccc2194091d214d2352c5cfd9e34018abc4b94ed3ffa1b734a5438bc6246fc4f564f6abf
7
- data.tar.gz: 0d7187d853a4ac73b91808f52757c94f7d61bc4f19762a7ba4ec97363c55c1eeb32791f78b31ae5d91a7dd72b6779b3bc4cd34b9783e3509bae72be33e4e523a
6
+ metadata.gz: 869cdadfed2dad125fc11c03133e2f56df53074a54b0f35d8ea5c6674029e7069332e4c35c486b1f39a417aeff01932a7eee30da44e15de036ce1a2d878d15d4
7
+ data.tar.gz: 823b30e7923c243f8dc8bb122f50426898e2cdda634516cc53e0325b150946b0699e0a9257a55c06ed8868a0f843f0274ed1d23bdf8ef2de9629fafb66f33552
data/bin/ROCker CHANGED
@@ -16,8 +16,8 @@ require 'optparse'
16
16
  $t = {
17
17
  'build' => 'Creates in silico metagenomes and training sets from reference genomes.',
18
18
  'compile' => 'Identifies the most discriminant bit-score per alignment position in a set of sequence.',
19
- 'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
20
19
  'search' => 'Uses a ROCker compilation to identify reads putatively derived from a set of sequences.',
20
+ 'filter' => 'Uses a pre-compiled set of bit-score thresholds to filter a search result.',
21
21
  'plot' => 'Generates a graphical representation of the alignment, the thresholds, and the hits.',
22
22
  }
23
23
  task = (ARGV.size > 0 ? ARGV.shift : '').downcase
@@ -43,49 +43,51 @@ opts = OptionParser.new do |opt|
43
43
  opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
44
44
  opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
45
45
  opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
46
- #opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
47
46
  opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
48
47
  opt.separator ""
49
48
  opt.separator "+ ADVANCED BUILDING ARGUMENTS"
50
49
  opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
51
50
  opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
52
51
  opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
53
- opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
54
- opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
55
- opt.on( "--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
56
- opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum.",
57
- "This option replaces --per-genus and --per-species from v0.1.*."){ |v| o[:pertaxon]=v.downcase }
58
- opt.on( "--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
59
- opt.on( "--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
52
+ opt.on("-s", "--seqdepth NUMBER", "Sequencing depth (reads/bp) to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
53
+ opt.on("-l", "--readlen INTEGER", "Average read length of in silico metagenome (in bp). By default: '#{ROCker.default :readlen}'."){ |v| o[:readlen]=v.to_i }
54
+ opt.on("-v", "--overlap INTEGER", "Minimum overlap (in bp) with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_i }
55
+ opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum."){ |v| o[:pertaxon]=v.downcase }
56
+ opt.on( "--genome-frx NUMBER", "Fraction to subsample genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}."){ |v| o[:genomefrx]=v.to_f }
57
+ opt.on( "--nosimulate", "Do not simulate metagenome. Implies --nosearch. By default, metagenome is simulated."){ |v| o[:nosimulate]=v }
58
+ opt.on( "--nosearch", "Do not execute similarity search. By default, it is executed."){ |v| o[:nosearch]=v }
60
59
  opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
61
60
  opt.on( "--nocleanup", "Keep all intermediate files. By default, intermediate files are removed."){ |v| o[:noclean]=v }
62
61
  opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
63
62
  opt.separator ""
64
63
  opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
65
- opt.on("-G", "--grinder PATH", "Path to the grinder executable. By default: '#{ROCker.default :grinder}' (in the $PATH)."){ |v| o[:grinder]=v }
66
- opt.on("-M", "--muscle PATH", "Path to the muscle executable. By default: '#{ROCker.default :muscle}' (in the $PATH)."){ |v| o[:muscle]=v }
67
- opt.on("-B", "--blastbins PATH", "Path to the Blast+ executables. By default: '#{ROCker.default :blastbins}' (in the $PATH)."){ |v| o[:blastbins]=v }
68
- opt.on( "--grinder-cmd STR", "Command calling grinder, where %1$s: grinder bin, %2$s: input, %3$s: seq. depth, %4$s: output.",
69
- "By default: '#{ROCker.default :grindercmd}'."){ |v| o[:grindercmd]=v }
70
- opt.on("--muscle-cmd STR", "Command calling muscle, where %1$s: muscle bin, %2$s: input, %3$s: output.",
71
- "By default: '#{ROCker.default :musclecmd}'."){ |v| o[:musclecmd]=v }
72
- opt.on("--blast-cmd STR", "Command calling BLAST search, where %1$s: blast bins, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
73
- "By default: '#{ROCker.default :blastcmd}'."){ |v| o[:blastcmd]=v }
74
- opt.on("--makedb-cmd STR", "Command calling BLAST format, where %1$s: blast bins, %2$s: dbtype, %3$s: input, %4$s: database.",
75
- "By default: '#{ROCker.default :makedbcmd}'."){ |v| o[:makedbcmd]=v }
64
+ opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
65
+ opt.on( "--simulator STR", "In silico metagenome simulator to use. Supported: 'grinder'. By default: '#{ROCker.default :simulator}'.") { |v| o[:ssimulator]=v.to_sym }
66
+ opt.on( "--aligner STR", "Multiple alignment algorithm to use. Supported: 'clustalo' and 'muscle'. By default: '#{ROCker.default :aligner}'.") { |v| o[:aligner]=v.to_sym }
67
+ opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
68
+ opt.on( "--simulator-bin PATH", "Path to the simulator executable. By default in the $PATH: '#{ROCker.default(:simulatorbin).values.join("' or '")}'.") { |v| o[:simulatorbin]=v }
69
+ opt.on( "--aligner-bin PATH", "Path to the aligner executable. By default in the $PATH: '#{ROCker.default(:alignerbin).values.join("' or '")}'.") { |v| o[:alignerbin]=v }
70
+ opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
71
+ *ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
72
+ opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
73
+ *ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
74
+ opt.on( "--simulator-cmd STR", "Command calling simulator, where %1$s: binary, %2$s: input, %3$s: seq. depth (X), %4$d: read len., %5$s: output.",
75
+ *ROCker.default(:simulatorcmd).keys.map{|k| "By default if --simulator #{k}: '#{ROCker.default(:simulatorcmd)[k]}'."}){ |v| o[:simulatorcmd]=v }
76
+ opt.on("--aligner-cmd STR", "Command calling aligner, where %1$s: binary, %2$s: input, %3$s: output, %4$d: threads.",
77
+ *ROCker.default(:alignercmd).keys.map{|k| "By default if --aligner #{k}: '#{ROCker.default(:alignercmd)[k]}'."}){ |v| o[:alignercmd]=v }
76
78
  when 'compile'
77
79
  opt.separator "+ COMPILATION ARGUMENTS"
78
80
  opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. Required."){ |v| o[:aln]=v }
79
81
  opt.on("-b", "--ref-blast PATH",
80
82
  "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
81
83
  opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
82
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
83
84
  opt.separator ""
84
85
  opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
85
86
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
86
87
  opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
87
88
  opt.on( "--norefine", "Do not refine windows."){ o[:refine]=false }
88
89
  opt.on("-w", "--window INT", "Initial size of alignment windows (in number of AA columns). By default: #{ROCker.default :win}."){ |v| o[:win]=v.to_i }
90
+ opt.on( "--reuse-files", "Re-use existing result files. By default, existing files are ignored."){ |v| o[:reuse]=true }
89
91
  opt.separator ""
90
92
  opt.separator "+ INPUT/OUTPUT"
91
93
  opt.separator " o The input alignment (-a) MUST be in FastA format, and the IDs must"
@@ -107,15 +109,23 @@ opts = OptionParser.new do |opt|
107
109
  opt.separator " 5. Bit score threshold set for the window."
108
110
  opt.separator " The file also contains the alignment (commented with #:)."
109
111
  opt.separator ""
112
+ when 'search'
113
+ opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
114
+ opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
115
+ opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
116
+ opt.separator ""
117
+ opt.separator "+ EXTERNAL SOFTWARE OPTIONS"
118
+ opt.on( "--search STR", "Similarity search algorithm to use. Supported: 'blast' and 'diamond'. By default: '#{ROCker.default :search}'.") { |v| o[:search]=v.to_sym }
119
+ opt.on( "--search-bins PATH", "Path to the similarity search executables. By default in the $PATH: '#{ROCker.default :searchbins}'.") { |v| o[:searchbins]=v }
120
+ opt.on( "--search-cmd STR", "Command calling similarity search, where %1$s: binaries, %2$s: program, %3$s: input, %4$s: database, %5$s: output, %6$d: threads.",
121
+ *ROCker.default(:searchcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:searchcmd)[k]}'."}){ |v| o[:searchcmd]=v }
122
+ opt.on( "--makedb-cmd STR", "Command calling database format for similarity search, where %1$s: binaries, %2$s: dbtype, %3$s: input, %4$s: database.",
123
+ *ROCker.default(:makedbcmd).keys.map{|k| "By default if --search #{k}: '#{ROCker.default(:makedbcmd)[k]}'."}){ |v| o[:makedbcmd]=v }
110
124
  when 'filter'
111
125
  opt.separator "+ FILTERING ARGUMENTS"
112
126
  opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
113
127
  opt.on("-x", "--query-blast PATH", "Tabular BLAST (blastx) of the query reads vs. the reference dataset. Required."){ |v| o[:qblast]=v }
114
128
  opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
115
- when 'search'
116
- opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
117
- opt.on("-q", "--query PATH", "File containing the query sequences in FastA format. Required."){ |v| o[:query]=v }
118
- opt.on("-o", "--out-blast PATH", "Filtered tabular BLAST to be created. Required."){ |v| o[:oblast]=v }
119
129
  when 'plot'
120
130
  opt.separator "+ PLOTTING ARGUMENTS"
121
131
  opt.on("-k", "--rocker PATH", "ROCker file generated by the compile task (-k). Required."){ |v| o[:rocker]=v }
@@ -127,7 +137,7 @@ opts = OptionParser.new do |opt|
127
137
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
128
138
  opt.on( "--color", "Color alignment by amino acid."){ o[:color]=true }
129
139
  opt.on( "--no-transparency", "Do not use (semi-)transparencies."){ |v| o[:transparency] = v }
130
- opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}"){ |v| o[:minscore]=v.to_f }
140
+ opt.on( "--min-score NUMBER", "Minimum Bit-Score to consider a hit. By default: #{ROCker.default :minscore}."){ |v| o[:minscore]=v.to_f }
131
141
  opt.on( "--stats-impact", "Plot impact on statistics, instead of absolute values per window."){ o[:impact]=true }
132
142
  opt.on( "--stats-ylim STRING", "Limits of the Y-axis in the bottom panel. By default: '-2,.1' if --stats-impact is set, '50,100' otherwise."){ |v| o[:ylim]=v }
133
143
  opt.on("-s", "--subject SBJ1,SBJ2,...", Array,
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update May-14-2015
5
+ # @update Jun-05-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,40 +10,20 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
- @@EBIREST = 'http://www.ebi.ac.uk/Tools'
14
13
  @@DEFAULTS = {
15
14
  # General
16
- :q=>false, :r=>'R', :nucl=>false, :debug=>false,
17
- # Build
18
- :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0,
19
- # ext. software
20
- :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
21
- :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "uniform 0.1" -mr "95 5" -rd "100 uniform 5"',
22
- :musclecmd=>'%1$s -in "%2$s" -out "%3$s" -quiet',
23
- :blastcmd=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
24
- :makedbcmd=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
25
- # Compile
26
- :refine=>true, :win=>20, :minscore=>0,
27
- # Filter
28
- :sbj=>[],
29
- # Plot
30
- :color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
15
+ :q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
16
+ # External software
17
+ :searchbins=>'',
18
+ :searchcmd=>{
19
+ :blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
20
+ :diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
21
+ :makedbcmd=>{
22
+ :blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
23
+ :diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
31
24
  }
32
- @@HAS_BUILD_GEMS = nil
33
- def self.ebirest() @@EBIREST ; end
34
25
  def self.defaults() @@DEFAULTS ; end
35
26
  def self.default(k) @@DEFAULTS[k] ; end
36
- def self.has_build_gems?
37
- return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
38
- @@HAS_BUILD_GEMS = TRUE
39
- begin
40
- require 'rubygems'
41
- require 'restclient'
42
- rescue LoadError
43
- @@HAS_BUILD_GEMS = FALSE
44
- end
45
- @@HAS_BUILD_GEMS
46
- end
47
27
 
48
28
  #================================[ Instance ]
49
29
  attr_reader :o
@@ -53,374 +33,6 @@ class ROCker
53
33
  RInterface.R_BIN = opts[:r] unless opts[:r].nil?
54
34
  end
55
35
 
56
- #================================[ Build ]
57
- def build!
58
- # Check requirements
59
- puts "Testing environment." unless @o[:q]
60
- @o[:noblast]=true if @o[:nomg]
61
- raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
62
- @o[:positive] += @o[:posori] unless @o[:posori].nil?
63
- @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
64
- @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
65
- unless @o[:aln].nil?
66
- aln = Alignment.new
67
- aln.read_fasta @o[:aln]
68
- @o[:positive] += aln.get_ids
69
- end
70
- raise "-p or -P are mandatory." if @o[:positive].size==0
71
- raise "-o/--baseout is mandatory." if @o[:baseout].nil?
72
- if @o[:positive].size == 1 and not @o[:noaln]
73
- warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
74
- @o[:noaln] = true
75
- end
76
- self.bash "#{@o[:grinder]} --version", "-G/--grinder must be executable. Is Grinder installed?" unless @o[:nomg]
77
- self.bash "#{@o[:muscle]} -version", "-M/--muscle must be executable. Is Muscle installed?" unless @o[:noaln]
78
- self.bash "#{@o[:blastbins]}makeblastdb -version", "-B/--blastbins must contain executables. Is BLAST+ installed?" unless @o[:noblast]
79
- # Download genes
80
- puts "Downloading gene data." unless @o[:q]
81
- f = File.open(@o[:baseout] + '.ref.fasta', 'w')
82
- if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
83
- puts " * re-using aligned sequences as positive set." unless @o[:q]
84
- f.print aln.to_seq_s
85
- @o[:noaln] = true
86
- else
87
- puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
88
- $stderr.puts " # #{@o[:positive]}" if @o[:debug]
89
- ids = Array.new(@o[:positive])
90
- while ids.size>0
91
- f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
92
- end
93
- end
94
- f.close
95
- genome_ids = {:positive=>[], :negative=>[]}
96
- [:positive, :negative].each do |set|
97
- unless @o[set].size==0
98
- puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
99
- $stderr.puts " # #{@o[set]}" if @o[:debug]
100
- genome_ids[set] = genes2genomes(@o[set])
101
- end
102
- end
103
- raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
104
- genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
105
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
106
- all_genome_ids = genome_ids.values.reduce(:+).uniq
107
-
108
- # Locate genes
109
- puts "Analyzing genome data." unless @o[:q]
110
- puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
111
- $stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
112
- positive_coords = {}
113
- genome_org = {}
114
- i = 0
115
- genome_ids[:positive].each do |genome_id|
116
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
117
- unless @o[:pertaxon].nil?
118
- genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
119
- next unless genome_org[ genome_taxon ].nil?
120
- genome_org[ genome_taxon ] = genome_id
121
- end
122
- $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
123
- genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
124
- if @o[:reuse] and File.exist? genome_file
125
- puts " * reusing existing file: #{genome_file}." unless @o[:q]
126
- ifh = File.open(genome_file, 'r')
127
- doc = ifh.readlines.grep(/^[^#]/)
128
- ifh.close
129
- else
130
- genome_file=nil unless @o[:noclean]
131
- res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
132
- doc = res.split("\n").grep(/^[^#]/)
133
- end
134
- doc.each do |ln|
135
- next if ln =~ /^#/
136
- r = ln.chomp.split /\t/
137
- next if r.size < 9
138
- prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
139
- p = prots.select{ |p| @o[:positive].include? p }.first
140
- next if p.nil?
141
- positive_coords[ r[0] ] ||= []
142
- positive_coords[ r[0] ] << {
143
- #:strand => r[6],
144
- :prot_id => p,
145
- :from => r[3].to_i,
146
- :to => r[4].to_i
147
- }
148
- end
149
- end
150
- print "\n" unless @o[:q]
151
- unless @o[:pertaxon].nil?
152
- genome_ids[:positive] = genome_org.values
153
- puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
154
- end
155
- all_genome_ids = genome_ids.values.reduce(:+).uniq
156
- found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
157
- raise "Cannot find the genomic location of any provided sequence." if found.nil?
158
- missing = @o[:positive] - found
159
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
160
-
161
- # Download genomes
162
- genomes_file = @o[:baseout] + '.src.fasta'
163
- if @o[:reuse] and File.exist? genomes_file
164
- puts " * reusing existing file: #{genomes_file}." unless @o[:q]
165
- else
166
- puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
167
- $stderr.puts " # #{all_genome_ids}" if @o[:debug]
168
- ids = Array.new(all_genome_ids)
169
- ofh = File.open(genomes_file, 'w')
170
- while ids.size>0
171
- ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
172
- end
173
- ofh.close
174
- end
175
-
176
- # Generate metagenome
177
- unless @o[:nomg]
178
- puts "Generating in silico metagenome" unless @o[:q]
179
- if @o[:reuse] and File.exist? @o[:baseout] + ".mg.fasta"
180
- puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
181
- else
182
- all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
183
- thrs = [@o[:thr], all_src].min
184
- puts " * running grinder and tagging positive reads (#{thrs} threads)." unless @o[:q]
185
- $stderr.puts " # #{positive_coords}" if @o[:debug]
186
- thr_obj = []
187
- seqs_per_thr = (all_src/thrs).ceil
188
- (0 .. (thrs-1)).each do |thr_i|
189
- thr_obj << Thread.new do
190
- Thread.current[:seqs_a] = thr_i*seqs_per_thr + 1
191
- Thread.current[:seqs_b] = [Thread.current[:seqs_a] + seqs_per_thr, all_src].min
192
- # Create sub-fasta
193
- Thread.current[:ofh] = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", 'w')
194
- Thread.current[:ifh] = File.open("#{@o[:baseout]}.src.fasta", 'r')
195
- Thread.current[:seq_i] = 0
196
- while Thread.current[:l] = Thread.current[:ifh].gets
197
- Thread.current[:seq_i]+=1 if Thread.current[:l] =~ /^>/
198
- break if Thread.current[:seq_i] > Thread.current[:seqs_b]
199
- Thread.current[:ofh].print Thread.current[:l] if Thread.current[:seq_i] >= Thread.current[:seqs_a]
200
- end
201
- Thread.current[:ifh].close
202
- Thread.current[:ofh].close
203
- bash sprintf(@o[:grindercmd], @o[:grinder], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
204
- # Tag positives
205
- puts " * tagging positive reads." unless @o[:q]
206
- Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
207
- Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
208
- while Thread.current[:l]=Thread.current[:ifh].gets
209
- Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
210
- unless Thread.current[:rd].nil?
211
- Thread.current[:positive] = false
212
- positive_coords[Thread.current[:rd][:genome_id]] ||= []
213
- positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
214
- Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
215
- Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
216
- if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
217
- Thread.current[:positive] = true
218
- break
219
- end
220
- end
221
- Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
222
- end
223
- Thread.current[:ofh].print Thread.current[:l]
224
- end
225
- Thread.current[:ofh].close
226
- Thread.current[:ifh].close
227
- Thread.current[:output] = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
228
- end # Thread.new do
229
- end # (1 .. thrs).each
230
- # Concatenate results
231
- ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
232
- thr_obj.each do |t|
233
- t.join
234
- raise "Thread failed without error trace: #{t}" if t[:output].nil?
235
- ifh = File.open(t[:output], 'r')
236
- while l = ifh.gets
237
- ofh.print l
238
- end
239
- ifh.close
240
- File.unlink t[:output]
241
- end
242
- ofh.close
243
- end
244
- end # unless @o[:nomg]
245
- # Align references
246
- unless @o[:noaln]
247
- puts "Aligning reference set." unless @o[:q]
248
- if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.aln"
249
- puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
250
- else
251
- bash sprintf(@o[:musclecmd], @o[:muscle], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln")
252
- puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
253
- end
254
- end
255
- # Run BLAST
256
- unless @o[:noblast]
257
- puts "Running homology search." unless @o[:q]
258
- if @o[:reuse] and File.exist? "#{@o[:baseout]}.ref.blast"
259
- puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
260
- else
261
- puts " * preparing database." unless @o[:q]
262
- bash sprintf(@o[:makedbcmd], @o[:blastbins], (@o[:nucl]?'nucl':'prot'), "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
263
- puts " * running BLAST." unless @o[:q]
264
- bash sprintf(@o[:blastcmd], @o[:blastbins], (@o[:nucl]?'blastn':'blastx'), "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
265
- end
266
- end
267
- # Clean
268
- unless @o[:noclean]
269
- puts "Cleaning." unless @o[:q]
270
- sff = %w{.src.xml .src.fasta}
271
- sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nomg]
272
- sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:noblast]
273
- sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
274
- end
275
- end # build!
276
-
277
- #================================[ Compile ]
278
- def compile!
279
- raise "-a/--alignment is mandatory." if @o[:aln].nil?
280
- raise "-a/--alignment must exist." unless File.exist? @o[:aln]
281
- if @o[:table].nil?
282
- raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
283
- @o[:table] = "#{@o[:blast]}.table"
284
- end
285
- raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
286
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
287
-
288
- puts "Testing environment." unless @o[:q]
289
- bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
290
- bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."
291
-
292
- puts "Reading files." unless @o[:q]
293
- puts " * loading alignment: #{@o[:aln]}." unless @o[:q]
294
- aln = Alignment.new
295
- aln.read_fasta @o[:aln]
296
-
297
- if File.exist? @o[:table]
298
- puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
299
- else
300
- puts " * generating table: #{@o[:table]}." unless @o[:q]
301
- blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
302
- end
303
-
304
- puts "Analyzing data." unless @o[:q]
305
- puts " * computing windows." unless @o[:q]
306
- data = ROCData.new(@o[:table], aln, @o[:win])
307
- data.nucl = @o[:nucl]
308
- if @o[:refine]
309
- puts " * refining windows." unless @o[:q]
310
- warn "Insufficient hits to refine results." unless data.refine! @o[:table]
311
- end
312
- puts " * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
313
- data.save @o[:rocker]
314
- end # compile!
315
-
316
- #================================[ Filter ]
317
- def filter!
318
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
319
- raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
320
- raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
321
-
322
- puts "Reading ROCker file." unless @o[:q]
323
- data = ROCData.new @o[:rocker]
324
-
325
- puts "Filtering BLAST." unless @o[:q]
326
- ih = File.open(@o[:qblast], 'r')
327
- oh = File.open(@o[:oblast], 'w')
328
- while ln = ih.gets
329
- bh = BlastHit.new(ln, data.aln)
330
- oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
331
- end
332
- ih.close
333
- oh.close
334
- end # filter!
335
- #================================[ Search ]
336
- def search!
337
- raise "-k/--rocker is mandatory." if @o[:rocker].nil?
338
- raise "Code Under development..."
339
- # ToDo
340
- # [ ... ]
341
- end # search!
342
-
343
- #================================[ Plot ]
344
- def plot!
345
- raise "-k/--rocker is mandatory." if o[:rocker].nil?
346
- if @o[:table].nil?
347
- raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
348
- @o[:table] = "#{@o[:blast]}.table"
349
- end
350
- raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
351
-
352
- puts "Testing environment." unless @o[:q]
353
- bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
354
-
355
- puts "Reading files." unless @o[:q]
356
- puts " * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
357
- data = ROCData.new @o[:rocker]
358
- if File.exist? @o[:table]
359
- puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
360
- else
361
- puts " * generating table: #{@o[:table]}." unless @o[:q]
362
- blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
363
- end
364
-
365
- puts "Plotting matches." unless @o[:q]
366
- extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
367
- @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
368
- data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
369
- data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
370
- some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
371
- data.rrun "par(mar=c(0,4,0,0.5)+.1);"
372
- data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
373
- data.rrun "noise <- runif(ncol(x),-.2,.2)"
374
- data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
375
- data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"
376
-
377
- puts "Plotting windows." unless @o[:q]
378
- if some_thr
379
- data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
380
- data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
381
- end
382
- data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
383
- "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"
384
-
385
- puts "Plotting alignment." unless @o[:q]
386
- data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
387
- data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
388
- i = 0
389
- data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
390
- data.aln.seqs.values.each do |s|
391
- color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
392
- data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
393
- i += 1
394
- end
395
-
396
- puts "Plotting statistics." unless @o[:q]
397
- data.rrun "par(mar=c(5,4,0,0.5)+.1);"
398
- data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
399
- if some_thr
400
- sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
401
- sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
402
- ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
403
- unless @o[:q]
404
- puts " * sensitivity: #{sn}%"
405
- puts " * specificity: #{sp}%"
406
- puts " * accuracy: #{ac}%"
407
- end
408
- data.rrun "pos <- (w$V1+w$V2)/2"
409
- if @o[:impact]
410
- data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
411
- data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
412
- data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
413
- else
414
- data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
415
- data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
416
- data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
417
- end
418
- #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
419
- end
420
- data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
421
- data.rrun "dev.off();"
422
- end # plot!
423
-
424
36
  #================================[ Utilities ]
425
37
  def blast2table(blast_f, table_f, aln, minscore)
426
38
  ifh = File.open(blast_f, "r")
@@ -432,39 +44,6 @@ class ROCker
432
44
  ifh.close
433
45
  ofh.close
434
46
  end
435
- def genes2genomes(gene_ids)
436
- genomes = []
437
- ids = Array.new(gene_ids)
438
- while ids.size>0
439
- doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
440
- genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
441
- end
442
- genomes.uniq
443
- end
444
- def genome2taxid(genome_id)
445
- ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
446
- return ln if ln.nil?
447
- ln.sub(/.*"taxon:(\d+)".*/, "\\1")
448
- end
449
- def genome2taxon(genome_id, rank='species')
450
- xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
451
- xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
452
- end
453
- def restcall(url, outfile=nil)
454
- response = RestClient.get url
455
- raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
456
- unless outfile.nil?
457
- ohf = File.open(outfile, 'w')
458
- ohf.print response.to_s
459
- ohf.close
460
- end
461
- response.to_s
462
- end
463
- def ebiFetch(db, ids, format, outfile=nil)
464
- url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
465
- $stderr.puts " # Calling: #{url}" if @o[:debug]
466
- self.restcall url
467
- end
468
47
  def bash(cmd, err_msg=nil)
469
48
  o = `#{cmd} 2>&1 && echo '{'`
470
49
  raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
@@ -473,6 +52,14 @@ class ROCker
473
52
  end
474
53
 
475
54
  #================================[ Extensions ]
55
+ # To ROCker
56
+ require 'rocker/step/build'
57
+ require 'rocker/step/compile'
58
+ require 'rocker/step/search'
59
+ require 'rocker/step/filter'
60
+ require 'rocker/step/plot'
61
+
62
+ # To other
476
63
  class Numeric
477
64
  def ordinalize
478
65
  n= self.to_s
@@ -0,0 +1,389 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-05-2015
6
+ #
7
+
8
+ require 'json'
9
+
10
+ class ROCker
11
+ #================================[ Class ]
12
# Base URL of the EBI tools REST services.
@@EBIREST = 'http://www.ebi.ac.uk/Tools'

# Defaults contributed by the build step; merged into the defaults shared by
# every ROCker step. The *bin/*cmd entries are tables keyed by tool name and
# the *cmd values are sprintf templates.
build_defaults = {
  :positive     => [],
  :negative     => [],
  :genomefrx    => 1.0,
  :seqdepth     => 0.03,
  :readlen      => 100,
  :minovl       => 50,
  # Ext. Software
  :aligner      => :clustalo,
  :simulator    => :grinder,
  :simulatorbin => { :grinder => 'grinder' },
  :simulatorcmd => {
    :grinder => '%1$s -reference_file "%2$s" -cf "%3$f" -dc \'-~*NnKkMmRrYySsWwBbVvHhDdXx\' -md uniform 0.1 -mr 95 5 -rd %4$d uniform 5 -base_name "%5$s"'
  },
  :alignerbin   => { :muscle => 'muscle', :clustalo => 'clustalo' },
  :alignercmd   => {
    :muscle   => '%1$s -in "%2$s" -out "%3$s" -quiet',
    :clustalo => '%1$s -i "%2$s" -o "%3$s" --threads=%4$d --force'
  }
}
@@DEFAULTS.merge!(build_defaults)

# Cached answer of ROCker.has_build_gems?; nil until the first check.
@@HAS_BUILD_GEMS = nil

# Returns the base URL of the EBI REST services.
def self.ebirest
  @@EBIREST
end
23
# Reports whether the gems needed by the build step can be loaded.
#
# The result is computed once and cached in @@HAS_BUILD_GEMS. Uses the
# +true+/+false+ literals rather than the legacy TRUE/FALSE constants, which
# are not defined on current Ruby versions.
#
# Returns true if 'rubygems' and 'restclient' load, false on LoadError.
def self.has_build_gems?
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
  @@HAS_BUILD_GEMS =
    begin
      require 'rubygems'
      require 'restclient'
      true
    rescue LoadError
      false
    end
end
34
+
35
+ #================================[ Utilities ]
36
# Links UniProtKB protein IDs to the EMBL entries cross-referenced by them.
#
# IDs are requested in batches of up to 200 per call to ebiFetch. +gene_ids+
# is only read, never modified.
#
# gene_ids - Array of UniProtKB identifiers.
#
# Returns an Array of unique Hashes {:genome_id, :transl_id}, taken from the
# second and third '; '-separated fields of each "DR   EMBL;" line.
def genes2genomes(gene_ids)
  links = gene_ids.each_slice(200).flat_map do |batch|
    annot = ebiFetch(:uniprotkb, batch, :annot).split("\n")
    annot.grep(/^DR\s+EMBL;/).map do |ln|
      fields = ln.split('; ')
      { :genome_id => fields[1], :transl_id => fields[2] }
    end
  end
  links.uniq
end
48
# Looks up the taxon ID annotated in an EMBL entry.
#
# genome_id - EMBL identifier passed to ebiFetch.
#
# Returns the digits of the first /db_xref="taxon:N" feature line as a
# String, or nil when the annotation has no such line.
def genome2taxid(genome_id)
  annot_lines = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/)
  taxon_ln = annot_lines.find { |l| l =~ /^FT\s+\/db_xref="taxon:/ }
  return nil if taxon_ln.nil?
  taxon_ln.sub(/.*"taxon:(\d+)".*/, "\\1")
end
53
# Resolves the taxon of a given rank for an EMBL entry.
#
# genome_id - EMBL identifier.
# rank      - Value of the rank attribute to look for (default: 'species').
#
# Returns the taxId attribute (String) of the first <taxon ...> tag with the
# requested rank in the ENA taxonomy XML of the entry's taxon.
def genome2taxon(genome_id, rank='species')
  taxid = genome2taxid(genome_id)
  xml = ebiFetch('taxonomy', [taxid], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
  taxon_tags = xml.scan(/<taxon [^>]+>/)
  rank_tag = taxon_tags.grep(/rank="#{rank}"/).first
  rank_tag.sub(/.* taxId="(\d+)".*/,"\\1")
end
57
# Performs a GET request (600 s timeout) and returns the response body.
#
# url     - URL to request; echoed to stderr when @o[:debug] is set.
# outfile - Optional path; when given, the body is also written there. The
#           file is opened in block form so the handle is closed even if the
#           write fails.
#
# Returns the response body as a String.
# Raises RuntimeError if the response code is not 200.
def restcall(url, outfile=nil)
  $stderr.puts " # Calling: #{url}" if @o[:debug]
  response = RestClient::Request.execute(:method=>:get, :url=>url, :timeout=>600)
  raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
  body = response.to_s
  File.open(outfile, 'w') { |fh| fh.print body } unless outfile.nil?
  body
end
68
# Retrieves entries through the EBI dbfetch REST endpoint.
#
# db      - Database name (String or Symbol).
# ids     - Array of identifiers; joined with commas in the URL.
# format  - Output format name (String or Symbol).
# outfile - Optional path; when given, the result is also saved there. The
#           file is opened in block form so the handle is always closed.
#
# Returns the text returned by restcall.
def ebiFetch(db, ids, format, outfile=nil)
  url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
  res = restcall(url)
  File.open(outfile, 'w') { |fh| fh.print res } unless outfile.nil?
  res
end
78
# Scans the GFF3 documents of a set of genomes and records the coordinates of
# the features that match the requested proteins or translations.
#
# genome_ids  - Array of EMBL identifiers to scan.
# protein_ids - Array of UniProtKB IDs, matched against db_xref=UniProtKB...:ID.
# transl_ids  - Array of translation IDs, matched against protein_id=ID.
# thread_id   - Worker index; only worker 0 prints progress (unless @o[:q]).
# json_file   - Path where the result is written as JSON with two keys:
#               positive_coords (sequence ID => Array of
#               {:prot_id, :tran_id, :from, :to, :strand}) and genomes_org
#               (taxon => Array of genome IDs; filled only if @o[:pertaxon]).
#
# GFF3 files are read from "<baseout>.src.<id>.gff3" when @o[:reuse] is set
# and the file is non-empty; otherwise they are fetched, and kept on disk only
# when @o[:noclean] is set.
def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
  positive_coords = {}
  genomes_org = {}
  genome_ids.each_with_index do |genome_id, idx|
    print " * scanning #{(idx+1).ordinalize} genome out of #{genome_ids.size} in first thread. \r" if thread_id==0 and not @o[:q]
    unless @o[:pertaxon].nil?
      taxon = genome2taxon(genome_id, @o[:pertaxon]).to_sym
      (genomes_org[taxon] ||= []) << genome_id
    end
    genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
    if @o[:reuse] and File.size? genome_file
      doc = File.readlines(genome_file).grep(/^[^#]/)
    else
      genome_file = nil unless @o[:noclean]
      doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
    end
    # Comment lines were already dropped above by grep(/^[^#]/).
    doc.each do |ln|
      r = ln.chomp.split(/\t/)
      next if r.size < 9
      attrs = r[8].split(/;/)
      prots = attrs.grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
      trans = attrs.grep(/^protein_id=/){ |pid| pid.split(/=/)[1] }
      p = prots.find{ |id| protein_ids.include? id }
      t = trans.find{ |id| transl_ids.include? id }
      next if p.nil? and t.nil?
      (positive_coords[ r[0].to_sym ] ||= []) << {
        :prot_id => p,
        :tran_id => t,
        :from    => r[3].to_i,
        :to      => r[4].to_i,
        :strand  => r[6]
      }
    end
  end
  print "\n" if thread_id==0 and not @o[:q]
  File.open(json_file, "w") do |ofh|
    ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
  end
end
122
+
123
+ #================================[ Build ]
124
# Runs the 'build' step: downloads the reference proteins and their genomes,
# locates the reference genes, simulates an in silico metagenome with the
# positive reads tagged, aligns the references, and runs the similarity
# search. Intermediate and final files are all named after @o[:baseout].
#
# Changes with respect to the previous implementation:
# * The search and makedb command templates are used directly. They are
#   reduced to a single String at the top of the method, so indexing them
#   again by tool name raised a TypeError.
# * An empty set of coordinates now raises the intended error instead of
#   failing on nil.
# * Source sequences are split among simulation processes in contiguous,
#   non-overlapping chunks covering every sequence (integer division plus an
#   inclusive upper bound used to duplicate or drop sequences).
# * Per-taxon genome lists are concatenated, so one genome per taxon is
#   sampled uniformly.
# * Per-process temporary files are removed by the cleaning step.
# * Files are opened in block form so handles are always closed.
#
# Raises RuntimeError on missing mandatory options, unsatisfied requirements,
# or when no genome or genomic location can be linked to the positive set.
def build!
  # Check requirements
  puts "Testing environment." unless @o[:q]
  # Reduce each per-tool table to the entry of the selected tool.
  { :searchcmd=>:search, :makedbcmd=>:search, :alignercmd=>:aligner,
    :simulatorcmd=>:simulator, :alignerbin=>:aligner, :simulatorbin=>:simulator
  }.each do |opt, tool|
    @o[opt] = @o[opt][@o[tool]] if @o[opt].is_a? Hash
  end
  @o[:nosearch] = true if @o[:nosimulate]
  raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
  @o[:positive] += @o[:posori] unless @o[:posori].nil?
  @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
  @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
  unless @o[:aln].nil?
    aln = Alignment.new
    aln.read_fasta @o[:aln]
    @o[:positive] += aln.get_ids
  end
  raise "-p or -P are mandatory." if @o[:positive].size==0
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
  if @o[:positive].size == 1 and not @o[:noaln]
    warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
    @o[:noaln] = true
  end
  unless @o[:nosimulate]
    bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
  end
  unless @o[:noaln]
    bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
    bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
  end
  unless @o[:nosearch]
    bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
    bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
  end

  # Download genes
  puts "Downloading gene data." unless @o[:q]
  ref_file = @o[:baseout] + ".ref.fasta"
  if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
    puts " * reusing aligned sequences as positive set." unless @o[:q]
    File.open(ref_file, "w") { |f| f.print aln.to_seq_s }
    @o[:noaln] = true
  elsif @o[:reuse] and File.size? ref_file
    puts " * reusing positive set: #{ref_file}." unless @o[:q]
  else
    puts " * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
    $stderr.puts " # #{@o[:positive]}" if @o[:debug]
    File.open(ref_file, "w") do |f|
      @o[:positive].each_slice(200) { |batch| f.print ebiFetch(:uniprotkb, batch, :fasta) }
    end
  end
  genome_ids = {:positive=>[], :negative=>[]}
  transl_ids = {:positive=>[], :negative=>[]}
  [:positive, :negative].each do |set|
    next if @o[set].size==0
    puts " * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
    $stderr.puts " # #{@o[set]}" if @o[:debug]
    r = genes2genomes(@o[set])
    genome_ids[set] = r.map{ |i| i[:genome_id] }.uniq
    transl_ids[set] = r.map{ |i| i[:transl_id] }.uniq
  end
  raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
  genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
  raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?

  # Locate genes
  puts "Analyzing genome data." unless @o[:q]
  coords_file = @o[:baseout] + ".src.coords"
  if @o[:reuse] and File.size? coords_file
    puts " * reusing coordinates: #{coords_file}." unless @o[:q]
    c = JSON.parse(File.read(coords_file), {:symbolize_names=>true})
    positive_coords = c[:positive_coords]
    genome_org = c[:genome_org]
  else
    thrs = [@o[:thr], genome_ids[:positive].size].min
    puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
    $stderr.puts " # Looking for proteins: #{@o[:positive]}" if @o[:debug]
    $stderr.puts " # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
    $stderr.puts " # Looking into: #{genome_ids[:positive]}" if @o[:debug]
    thr_obj = []
    (0 .. (thrs-1)).each do |thr_i|
      # Round-robin assignment of genomes to worker processes.
      ids_to_parse = genome_ids[:positive].select.with_index{ |_, i| (i % thrs)==thr_i }
      json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
      thr_obj << json_file
      fork do
        get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
      end
    end
    Process.waitall
    # Combine results
    positive_coords = {}
    genomes_org = {}
    genome_org = {}
    thr_obj.each do |t|
      raise "Thread failed without error trace: #{t}" unless File.exist? t
      part = JSON.parse(File.read(t), {:symbolize_names=>true, :create_additions=>true})
      part[:positive_coords].each_pair{ |k,v| (positive_coords[ k ] ||= []).concat(v) }
      part[:genomes_org].each_pair{ |k,v| (genomes_org[ k ] ||= []).concat(v) }
      File.unlink t
    end
    # Select one genome per taxon
    unless @o[:pertaxon].nil?
      genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample }
    end
    # Save coordinates
    File.open(coords_file, "w") do |ofh|
      ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
    end
  end
  unless @o[:pertaxon].nil?
    genome_ids[:positive] = genome_org.values
    puts " Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
  end
  all_genome_ids = genome_ids.values.reduce(:+).uniq
  located = positive_coords.values.flatten(1)
  raise "Cannot find the genomic location of any provided sequence." if located.empty?
  found = located.map{ |b| b[:prot_id] }.compact.uniq
  unknown_pid = located.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil }.compact.uniq
  missing = @o[:positive] - found
  warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0

  # Download genomes
  genomes_file = @o[:baseout] + '.src.fasta'
  if @o[:reuse] and File.size? genomes_file
    puts " * reusing existing file: #{genomes_file}." unless @o[:q]
  else
    puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
    $stderr.puts " # #{all_genome_ids}" if @o[:debug]
    File.open(genomes_file, 'w') do |ofh|
      all_genome_ids.each_slice(200) { |batch| ofh.print ebiFetch('embl', batch, 'fasta') }
    end
  end

  # Generate metagenome
  unless @o[:nosimulate]
    puts "Generating in silico metagenome" unless @o[:q]
    if @o[:reuse] and File.size?(@o[:baseout] + ".mg.fasta")
      puts " * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
    else
      all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
      raise "No sequences found in #{@o[:baseout]}.src.fasta." if all_src == 0
      # Contiguous chunks of seqs_per_thr sequences; the number of processes
      # is reduced if fewer chunks are enough, so no chunk is ever empty.
      seqs_per_thr = (all_src.to_f / [@o[:thr], all_src].min).ceil
      thrs = (all_src.to_f / seqs_per_thr).ceil
      puts " * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
      $stderr.puts " # #{positive_coords}" if @o[:debug]
      thr_obj = []
      (0 .. (thrs-1)).each do |thr_i|
        output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
        thr_obj << output
        fork do
          seqs_a = thr_i*seqs_per_thr + 1
          seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
          # Create sub-fasta with sequences seqs_a..seqs_b (1-based, inclusive)
          File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w") do |ofh|
            File.open("#{@o[:baseout]}.src.fasta", "r") do |ifh|
              seq_i = 0
              while l = ifh.gets
                seq_i += 1 if l =~ /^>/
                break if seq_i > seqs_b
                ofh.print l if seq_i >= seqs_a
              end
            end
          end

          # Run simulator (except if the temporal file is already there and can be reused)
          unless @o[:reuse] and File.size?(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa")
            bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
          end

          # Tag positives
          puts " * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
          File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r') do |ifh|
            File.open(output, 'w') do |ofh|
              while l = ifh.gets
                if l =~ /^>/
                  rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
                  raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
                  # A read is positive if it overlaps any reference gene of
                  # its source sequence by at least @o[:minovl] positions.
                  genes = positive_coords[rd[:genome_id].to_sym] || []
                  positive = genes.any? do |gn|
                    left  = rd[:to].to_i - gn[:from]
                    right = gn[:to] - rd[:from].to_i
                    (left*right >= 0) and ([left, right].min >= @o[:minovl])
                  end
                  l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
                    "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
                end
                ofh.print l
              end
            end
          end
        end # fork
      end # (0 .. thrs-1).each
      Process.waitall
      # Concatenate results
      File.open(@o[:baseout] + ".mg.fasta", 'w') do |ofh|
        thr_obj.each do |t|
          raise "Thread failed without error trace: #{t}" unless File.exist? t
          File.foreach(t) { |l| ofh.print l }
          File.unlink t
        end
      end
    end
  end # unless @o[:nosimulate]

  # Align references
  unless @o[:noaln]
    puts "Aligning reference set." unless @o[:q]
    if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
      puts " * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
    else
      bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
      puts " +--\n | IMPORTANT NOTE: Manually checking the alignment before\n | the 'compile' step is *strongly* encouraged.\n +--\n" unless @o[:q]
    end
  end

  # Run similarity search
  unless @o[:nosearch]
    puts "Running homology search." unless @o[:q]
    if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
      puts " * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
    else
      # Both templates were already reduced to the selected tool's String.
      puts " * preparing database." unless @o[:q]
      bash sprintf(@o[:makedbcmd], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
      puts " * running similarity search." unless @o[:q]
      bash sprintf(@o[:searchcmd], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
    end
  end

  # Clean
  unless @o[:noclean]
    puts "Cleaning." unless @o[:q]
    sff = %w{.src.xml .src.fasta}
    unless @o[:nosimulate]
      sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt}
      # Per-process temporary files of the simulation.
      (0 ... @o[:thr]).each do |i|
        sff += [".src.fasta.#{i}", ".mg.tmp.#{i}-reads.fa", ".mg.tmp.#{i}-ranks.txt"]
      end
    end
    sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
    sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
  end
end # build!
388
+ end # ROCker
389
+
@@ -0,0 +1,53 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-05-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  @@DEFAULTS.merge!({:refine=>true, :win=>20, :minscore=>0})

  #================================[ Compile ]
  # Runs the 'compile' step: reads the alignment, builds (or reuses) the
  # table of hits, computes the windows with their thresholds and saves the
  # result as a ROCker file.
  #
  # When -t/--table is given, the table is reused if it exists; otherwise its
  # path is derived from the BLAST file ("<blast>.table").
  #
  # Raises RuntimeError when a mandatory option or input file is missing.
  def compile!
    # Progress messages are silenced by @o[:q].
    say = lambda { |msg| puts msg unless @o[:q] }

    raise "-a/--alignment is mandatory." if @o[:aln].nil?
    raise "-a/--alignment must exist." unless File.exist?(@o[:aln])
    if @o[:table].nil?
      blast_missing = (@o[:blast].nil? || !File.exist?(@o[:blast]))
      raise "-t/--table is mandatory unless -b is provided." if blast_missing
      @o[:table] = "#{@o[:blast]}.table"
    else
      @o[:reuse] = true
    end
    if @o[:blast].nil? && !File.exist?(@o[:table])
      raise "-b/--blast is mandatory unless -t exists."
    end
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?

    say.call "Testing environment."
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
    bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

    say.call "Reading files."
    say.call " * loading alignment: #{@o[:aln]}."
    aln = Alignment.new
    aln.read_fasta @o[:aln]

    if @o[:reuse] && File.exist?(@o[:table])
      say.call " * reusing existing file: #{@o[:table]}."
    else
      say.call " * generating table: #{@o[:table]}."
      blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
    end

    say.call "Analyzing data."
    say.call " * computing windows."
    data = ROCData.new(@o[:table], aln, @o[:win])
    data.nucl = @o[:nucl]
    if @o[:refine]
      say.call " * refining windows."
      refined = data.refine!(@o[:table])
      warn "Insufficient hits to refine results." unless refined
    end
    say.call " * saving ROCker file: #{@o[:rocker]}."
    data.save @o[:rocker]
  end # compile!
end # ROCker
53
+
@@ -0,0 +1,32 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Filter ]
  # Runs the 'filter' step: copies to @o[:oblast] the lines of @o[:qblast]
  # whose hit maps to the alignment (BlastHit#sfrom is not nil) and whose
  # bit-score reaches the threshold of the window at the hit's midpoint.
  #
  # Both files are opened in block form so the handles are closed even if a
  # line cannot be processed. The input is opened first, so a missing input
  # fails before the output is created.
  #
  # Raises RuntimeError when a mandatory option is missing.
  def filter!
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
    raise "-o/--out-blast is mandatory." if @o[:oblast].nil?

    puts "Reading ROCker file." unless @o[:q]
    data = ROCData.new @o[:rocker]

    puts "Filtering BLAST." unless @o[:q]
    File.open(@o[:qblast], 'r') do |ih|
      File.open(@o[:oblast], 'w') do |oh|
        ih.each_line do |ln|
          bh = BlastHit.new(ln, data.aln)
          next if bh.sfrom.nil?
          oh.print ln if bh.bits >= data.win_at_col(bh.midpoint).thr
        end
      end
    end
  end # filter!
end # ROCker
32
+
@@ -0,0 +1,93 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  @@DEFAULTS.merge!({:color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true, :sbj=>[]})

  #================================[ Plot ]
  # Runs the 'plot' step: draws, through the R session of a ROCData object,
  # a three-panel figure with the matches and window thresholds, the
  # alignment, and the per-window statistics. The figure is written to
  # @o[:gout], which defaults to "<rocker>.<gformat>".
  #
  # The table of hits is reused if it exists; otherwise it is generated from
  # the BLAST file. Options are read through @o, as in the rest of the class.
  #
  # Raises RuntimeError when a mandatory option is missing.
  def plot!
    raise "-k/--rocker is mandatory." if @o[:rocker].nil?
    if @o[:table].nil?
      raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
      @o[:table] = "#{@o[:blast]}.table"
    end
    raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]

    puts "Testing environment." unless @o[:q]
    bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"

    puts "Reading files." unless @o[:q]
    puts " * loading ROCker file: #{@o[:rocker]}." unless @o[:q]
    data = ROCData.new @o[:rocker]
    if File.exist? @o[:table]
      puts " * reusing existing file: #{@o[:table]}." unless @o[:q]
    else
      puts " * generating table: #{@o[:table]}." unless @o[:q]
      blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
    end

    puts "Plotting matches." unless @o[:q]
    # Non-PDF devices take the size in inches plus a resolution.
    extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
    @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
    data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
    data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
    some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
    data.rrun "par(mar=c(0,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
    # NOTE(review): the jitter vector has ncol(x) values and is recycled over
    # the rows of x; nrow(x) may have been intended -- confirm before changing.
    data.rrun "noise <- runif(ncol(x),-.2,.2)"
    data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
    data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"

    puts "Plotting windows." unless @o[:q]
    if some_thr
      data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
      data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
    end
    data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
      "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

    puts "Plotting alignment." unless @o[:q]
    data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
    i = 0
    data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
    data.aln.seqs.values.each do |s|
      # One color per alignment column: gaps in grey, selected subjects in
      # red, otherwise by residue letter (when @o[:color]) or black.
      color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
      data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
      i += 1
    end

    puts "Plotting statistics." unless @o[:q]
    data.rrun "par(mar=c(5,4,0,0.5)+.1);"
    data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
    if some_thr
      # Global statistics, accumulated over all windows.
      sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
      sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
      ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
      unless @o[:q]
        puts " * sensitivity: #{sn}%"
        puts " * specificity: #{sp}%"
        puts " * accuracy: #{ac}%"
      end
      data.rrun "pos <- (w$V1+w$V2)/2"
      if @o[:impact]
        data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      else
        data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
        data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      end
      #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
    end
    data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
    data.rrun "dev.off();"
  end # plot!
end # ROCker
93
+
@@ -0,0 +1,20 @@
1
+ #
2
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
+ # @author Luis (Coto) Orellana
4
+ # @license artistic license 2.0
5
+ # @update Jun-04-2015
6
+ #
7
+
8
class ROCker
  #================================[ Class ]
  #@@DEFAULTS.merge!({ })

  #================================[ Search ]
  # Placeholder for the 'search' step, which is not implemented yet: after
  # checking that a ROCker file was given, it always raises.
  #
  # Raises RuntimeError in every case.
  def search!
    if @o[:rocker].nil?
      raise "-k/--rocker is mandatory."
    end
    raise "Code Under development..."
    # ToDo
    # [ ... ]
  end # search!
end # ROCker
20
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-rocker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis (Coto) Orellana
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-05-07 00:00:00.000000000 Z
12
+ date: 2015-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rest-client
@@ -25,6 +25,20 @@ dependencies:
25
25
  - - ~>
26
26
  - !ruby/object:Gem::Version
27
27
  version: 1.7.3
28
+ - !ruby/object:Gem::Dependency
29
+ name: json
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ~>
33
+ - !ruby/object:Gem::Version
34
+ version: 1.8.1
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ version: 1.8.1
28
42
  description: Detecting and quantifying functional genes in short-read metagenomic
29
43
  datasets
30
44
  email: lhorellana@gatech.edu
@@ -40,6 +54,11 @@ files:
40
54
  - lib/rocker/rocwindow.rb
41
55
  - lib/rocker/rocdata.rb
42
56
  - lib/rocker/rinterface.rb
57
+ - lib/rocker/step/build.rb
58
+ - lib/rocker/step/compile.rb
59
+ - lib/rocker/step/search.rb
60
+ - lib/rocker/step/filter.rb
61
+ - lib/rocker/step/plot.rb
43
62
  - bin/ROCker
44
63
  homepage: http://enve-omics.ce.gatech.edu/rocker
45
64
  licenses:
@@ -51,9 +70,9 @@ require_paths:
51
70
  - lib
52
71
  required_ruby_version: !ruby/object:Gem::Requirement
53
72
  requirements:
54
- - - '>='
73
+ - - ~>
55
74
  - !ruby/object:Gem::Version
56
- version: '0'
75
+ version: '2.0'
57
76
  required_rubygems_version: !ruby/object:Gem::Requirement
58
77
  requirements:
59
78
  - - '>='