bio-rocker 0.1.04 → 0.2.0alpha

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/bin/ROCker +11 -11
  3. data/lib/rocker.rb +59 -105
  4. data/lib/rocker/alignment.rb +8 -8
  5. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: af49dee4fb6e1297372669742be4b7c0e7b00e9c
4
- data.tar.gz: f49816fa930e991ec8e2835fb8567c1f2850cde9
3
+ metadata.gz: 83c3e3b92659ce86b8bc1c2119b3d37db119b432
4
+ data.tar.gz: 0f27ba6ed5086baf64e99732dc86a4c7e31432d9
5
5
  SHA512:
6
- metadata.gz: 392fa058bbd87e2dfdaed707599df2439512a904af348e7ac1a057899191e20f3a4215316eb87671992fa07e2a6f211bb7c83e33073539156d5d63e2c8b175d7
7
- data.tar.gz: 02489a4c6597d42f9adfbf6f4586c8183523745ad5be98c99f31f578d30dec9f205f3bea4e23c5d52b1cd46744b58606bf8b70c2ddcbcf06f406a88183606ad1
6
+ metadata.gz: cc2045b7943455232a5d9e6a3d3f26f45c953bf31a6b480f6ce3a2c8ab37c1468d0d6ce511521c5afc52ee65dfecc66f9b6195ad17dce4db211c17cdc591ead7
7
+ data.tar.gz: c8bda317dda0da8eed23cd976eb6c848f866382e37f5c8315bbd365fe65f85f7319a60eb2c2d511461fca4e329af403c31ccd77a16c0a880ec720edfd9ac269f
data/bin/ROCker CHANGED
@@ -3,9 +3,10 @@
3
3
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
4
  # @author Luis (Coto) Orellana
5
5
  # @license artistic license 2.0
6
- # @update Mar-23-2015
6
+ # @update May-07-2015
7
7
  #
8
8
 
9
+ $:.push File.expand_path(File.dirname(__FILE__) + '/../lib')
9
10
  require 'rocker'
10
11
  require 'optparse'
11
12
 
@@ -35,25 +36,24 @@ opts = OptionParser.new do |opt|
35
36
  opt.separator "+ UNSATISFIED REQUIREMENTS"
36
37
  opt.separator " The building task requires uninstalled gems, please install them executing:"
37
38
  opt.separator " gem install rest_client"
38
- opt.separator " gem install nokogiri"
39
39
  opt.separator ""
40
40
  end
41
41
  opt.separator "+ BUILDING ARGUMENTS"
42
- opt.on("-p", "--positive GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
43
- opt.on("-n", "--negative GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
42
+ opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
43
+ opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
44
44
  opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
45
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
45
+ #opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
46
46
  opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
47
47
  opt.separator ""
48
48
  opt.separator "+ ADVANCED BUILDING ARGUMENTS"
49
- opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one GI per line. If used, -p is not required."){ |v| o[:posfile]=v }
50
- opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one GI per line."){ |v| o[:negfile]=v }
51
- opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain GI numbers. If used, -p is not required."){ |v| o[:aln]=v }
49
+ opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
50
+ opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
51
+ opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
52
52
  opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
53
53
  opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
54
54
  opt.on( "--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
55
- opt.on( "--per-genus", "If selected, only one genome per genus is used to build the metagenome."){ o[:pergenus]=true }
56
- opt.on( "--per-species", "If selected, only one genome per species is used to build the metagenome."){ o[:perspecies]=true }
55
+ opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum.",
56
+ "This option replaces --per-genus and --per-species, but is temporarily out of service."){ |v| o[:pertaxon]=v.downcase }
57
57
  opt.on( "--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
58
58
  opt.on( "--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
59
59
  opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
@@ -78,7 +78,7 @@ opts = OptionParser.new do |opt|
78
78
  opt.on("-b", "--ref-blast PATH",
79
79
  "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
80
80
  opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
81
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
81
+ opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
82
82
  opt.separator ""
83
83
  opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
84
84
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
data/lib/rocker.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Mar-23-2015
5
+ # @update May-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,12 +10,12 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
- @@EUTILS = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'
13
+ @@EBIREST = 'http://www.ebi.ac.uk/Tools'
14
14
  @@DEFAULTS = {
15
15
  # General
16
16
  :q=>false, :r=>'R', :nucl=>false, :debug=>false,
17
17
  # Build
18
- :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0, :pergenus=>false, :perspecies=>false,
18
+ :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0,
19
19
  # ext. software
20
20
  :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
21
21
  :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "poly4 3e-3 3.3e-8" -mr "95 5" -rd "100 uniform 5"',
@@ -30,16 +30,15 @@ class ROCker
30
30
  :color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
31
31
  }
32
32
  @@HAS_BUILD_GEMS = nil
33
- def self.eutils() @@EUTILS end
34
- def self.defaults() @@DEFAULTS end
35
- def self.default(k) @@DEFAULTS[k] end
33
+ def self.ebirest() @@EBIREST ; end
34
+ def self.defaults() @@DEFAULTS ; end
35
+ def self.default(k) @@DEFAULTS[k] ; end
36
36
  def self.has_build_gems?
37
37
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
38
38
  @@HAS_BUILD_GEMS = TRUE
39
39
  begin
40
40
  require 'rubygems'
41
41
  require 'restclient'
42
- require 'nokogiri'
43
42
  rescue LoadError
44
43
  @@HAS_BUILD_GEMS = FALSE
45
44
  end
@@ -66,7 +65,7 @@ class ROCker
66
65
  unless @o[:aln].nil?
67
66
  aln = Alignment.new
68
67
  aln.read_fasta @o[:aln]
69
- @o[:positive] += aln.get_gis
68
+ @o[:positive] += aln.get_ids
70
69
  end
71
70
  raise "-p or -P are mandatory." if @o[:positive].size==0
72
71
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
@@ -89,124 +88,76 @@ class ROCker
89
88
  $stderr.puts " # #{@o[:positive]}" if @o[:debug]
90
89
  ids = Array.new(@o[:positive])
91
90
  while ids.size>0
92
- f.print efetch({:db=>(@o[:nucl] ? 'nuccore' : 'protein'), :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
91
+ f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
93
92
  end
94
93
  end
95
94
  f.close
96
- genome_gis = {:positive=>[], :negative=>[]}
95
+ genome_ids = {:positive=>[], :negative=>[]}
97
96
  [:positive, :negative].each do |set|
98
97
  unless @o[set].size==0
99
98
  puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
100
99
  $stderr.puts " # #{@o[set]}" if @o[:debug]
101
- genome_gis[set] = genes2genomes(@o[set], @o[:nucl])
100
+ genome_ids[set] = genes2genomes(@o[set])
102
101
  end
103
102
  end
104
- raise "No genomes associated with the positive set." if genome_gis[:positive].size==0
105
- genome_gis[:positive] = genome_gis[:positive].sample( (genome_gis[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
106
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_gis[:positive].empty?
107
- all_gis = genome_gis.values.reduce(:+).uniq
103
+ raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
104
+ genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
105
+ raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
106
+ all_genome_ids = genome_ids.values.reduce(:+).uniq
108
107
 
109
108
  # Locate genes
110
109
  puts "Analyzing genome data." unless @o[:q]
111
- puts " * downloading and parsing #{genome_gis[:positive].size} XML file(s)." unless @o[:q]
112
- $stderr.puts " # #{genome_gis[:positive]}" if @o[:debug]
110
+ puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
111
+ $stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
113
112
  positive_coords = {}
114
113
  genome_org = {}
115
114
  i = 0
116
- genome_gis[:positive].each do |gi|
117
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_gis[:positive].size}. \r" unless @o[:q]
115
+ genome_ids[:positive].each do |genome_id|
116
+ print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
117
+ # ToDo check organism name using genome_org unless @o[:pertaxon].nil?
118
118
  $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
119
- genome_file = @o[:baseout] + '.src.' + i.to_s + '.xml'
119
+ genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
120
120
  if @o[:reuse] and File.exist? genome_file
121
121
  puts " * reusing existing file: #{genome_file}." unless @o[:q]
122
122
  ifh = File.open(genome_file, 'r')
123
- doc = Nokogiri::XML( ifh )
123
+ doc = ifh.readlines.grep(/^[^#]/)
124
124
  ifh.close
125
125
  else
126
126
  genome_file=nil unless @o[:noclean]
127
- res = efetch({:db=>'nuccore', :id=>gi, :rettype=>'xml', :retmode=>'text'}, genome_file)
128
- doc = Nokogiri::XML( res )
127
+ res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
128
+ doc = res.split("\n").grep(/^[^#]/)
129
129
  end
130
- incomplete = true
131
- doc.xpath('//Bioseq-set/Bioseq-set_seq-set/Seq-entry').each do |genome|
132
- genome_gi = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_seq-set/Seq-entry/Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_gi')
133
- if !genome_gi.nil? and gi==genome_gi.content
134
- incomplete = false
135
- positive_coords[gi] ||= []
136
- $stderr.puts "\n # got #{gi}, scanning" if @o[:debug]
137
- if @o[:pergenus] or @o[:perspecies]
138
- name = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/OrgName_name_binomial/BinomialOrgName')
139
- unless name.nil?
140
- name_g = name.at_xpath('./BinomialOrgName_genus')
141
- name_s = name.at_xpath('./BinomialOrgName_species')
142
- if name_g.nil? or (name_s.nil? and @o[:perspecies])
143
- name = nil
144
- else
145
- name = @o[:perspecies] ? name_g.content + " " + name_s.content : name_g.content
146
- end
147
- end
148
- if name.nil?
149
- warn "WARNING: Cannot find binomial name of #{gi}, using genome regardless of taxonomy."
150
- name = rand(36**100).to_s(36)
151
- end
152
- break unless genome_org[ name ].nil?
153
- genome_org[ name ] = gi
154
- end
155
- $stderr.puts " # traversing #{gi}" if @o[:debug]
156
- genome.xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_annot/Seq-annot/Seq-annot_data/Seq-annot_data_ftable/Seq-feat').each do |pr|
157
- pr_gi = pr.at_xpath('./Seq-feat_product/Seq-loc/Seq-loc_whole/Seq-id/Seq-id_gi')
158
- next if pr_gi.nil?
159
- if @o[:positive].include? pr_gi.content
160
- $stderr.puts " # found #{pr_gi.content}" if @o[:debug]
161
- pr_loc = pr.at_xpath('./Seq-feat_location/Seq-loc/Seq-loc_int/Seq-interval')
162
- if pr_loc.nil?
163
- pr_loc = pr.xpath('./Seq-feat_location/Seq-loc/Seq-loc_mix//Seq-loc/Seq-loc_int/Seq-interval')
164
- if pr_loc.nil?
165
- warn "WARNING: Impossible to find location of '#{pr_gi.content}' in '#{gi}'."
166
- incomplete = true
167
- else
168
- pr_loc.each do |loc_int|
169
- positive_coords[gi] << {
170
- :gi => pr_gi.content,
171
- :from => loc_int.at_xpath('./Seq-interval_from').content.to_i,
172
- :to => loc_int.at_xpath('./Seq-interval_to').content.to_i
173
- #, :strand => loc_int.at_xpath('./Seq-interval_strand/Na-strand/@value').content
174
- }
175
- end
176
- end
177
- else
178
- positive_coords[gi] << {
179
- :gi => pr_gi.content,
180
- :from => pr_loc.at_xpath('./Seq-interval_from').content.to_i,
181
- :to => pr_loc.at_xpath('./Seq-interval_to').content.to_i
182
- #, :strand => pr_loc.at_xpath('./Seq-interval_strand/Na-strand/@value').content
183
- }
184
- end
185
- end
186
- end
187
- break
188
- end
130
+ doc.each do |ln|
131
+ r = ln.chomp.split /\t/
132
+ prots = r[8].split(/;/).grep(/^db_xref=UniProtKB\/TrEMBL:/){ |xref| xref.split(/:/)[1] }
133
+ p = prots.select{ |p| @o[:positive].include? p }.first
134
+ next if p.nil?
135
+ positive_coords[ r[0] ] ||= []
136
+ positive_coords[ r[0] ] << {
137
+ #:strand => r[6],
138
+ :prot_id => p,
139
+ :from => r[3].to_i,
140
+ :to => r[4].to_i
141
+ }
189
142
  end
190
- doc = nil
191
- warn "WARNING: Cannot find GI '#{gi}'." if incomplete
192
143
  end
193
- genome_gis[:positive] = genome_org.values if @o[:pergenus] or @o[:perspecies]
194
- all_gis = genome_gis.values.reduce(:+).uniq
195
144
  print "\n" unless @o[:q]
196
- missing = @o[:positive] - positive_coords.values.map{ |a| a.map{ |b| b[:gi] } }.reduce(:+)
197
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or @o[:pergenus] or @o[:perspecies]
145
+ genome_ids[:positive] = genome_org.values unless @o[:pertaxon].nil?
146
+ all_genome_ids = genome_ids.values.reduce(:+).uniq
147
+ missing = @o[:positive] - positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
148
+ warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
198
149
 
199
150
  # Download genomes
200
151
  genomes_file = @o[:baseout] + '.src.fasta'
201
152
  if @o[:reuse] and File.exist? genomes_file
202
153
  puts " * reusing existing file: #{genomes_file}." unless @o[:q]
203
154
  else
204
- puts " * downloading #{all_gis.size} genome(s) in FastA." unless @o[:q]
205
- $stderr.puts " # #{all_gis}" if @o[:debug]
206
- ids = Array.new(all_gis)
155
+ puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
156
+ $stderr.puts " # #{all_genome_ids}" if @o[:debug]
157
+ ids = Array.new(all_genome_ids)
207
158
  ofh = File.open(genomes_file, 'w')
208
159
  while ids.size>0
209
- ofh.print efetch({:db=>'nuccore', :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
160
+ ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
210
161
  end
211
162
  ofh.close
212
163
  end
@@ -244,11 +195,11 @@ class ROCker
244
195
  Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
245
196
  Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
246
197
  while Thread.current[:l]=Thread.current[:ifh].gets
247
- Thread.current[:rd] = /^>(?<id>\d+) reference=gi\|(?<gi>\d+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
198
+ Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
248
199
  unless Thread.current[:rd].nil?
249
200
  Thread.current[:positive] = false
250
- positive_coords[Thread.current[:rd][:gi]] ||= []
251
- positive_coords[Thread.current[:rd][:gi]].each do |gn|
201
+ positive_coords[Thread.current[:rd][:genome_id]] ||= []
202
+ positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
252
203
  Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
253
204
  Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
254
205
  if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
@@ -256,7 +207,7 @@ class ROCker
256
207
  break
257
208
  end
258
209
  end
259
- Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:gi]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
210
+ Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
260
211
  end
261
212
  Thread.current[:ofh].print Thread.current[:l]
262
213
  end
@@ -470,18 +421,18 @@ class ROCker
470
421
  ifh.close
471
422
  ofh.close
472
423
  end
473
- def genes2genomes(gis, nucl=false)
424
+ def genes2genomes(gene_ids)
474
425
  genomes = []
475
- ids = Array.new(gis)
426
+ ids = Array.new(gene_ids)
476
427
  while ids.size>0
477
- doc = Nokogiri::XML( elink({:dbfrom=>(nucl ? 'nuccore' : 'protein'), :db=>'nuccore', :id=>ids.shift(200).join(',')}) )
478
- genomes += doc.xpath('/eLinkResult/LinkSet/LinkSetDb/Link/Id').map{ |id| id.content }
428
+ doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
429
+ genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
479
430
  end
480
431
  genomes.uniq
481
432
  end
482
- def eutils(script, params={}, outfile=nil)
483
- response = RestClient.get "#{ROCker.eutils}/#{script}", {:params=>params}
484
- raise "Unable to reach NCBI EUtils, error code #{response.code}." unless response.code == 200
433
+ def restcall(url, outfile=nil)
434
+ response = RestClient.get url
435
+ raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
485
436
  unless outfile.nil?
486
437
  ohf = File.open(outfile, 'w')
487
438
  ohf.print response.to_s
@@ -489,8 +440,11 @@ class ROCker
489
440
  end
490
441
  response.to_s
491
442
  end
492
- def efetch(*etc) self.eutils 'efetch.fcgi', *etc end
493
- def elink(*etc) self.eutils 'elink.fcgi', *etc end
443
+ def ebiFetch(db, ids, format, outfile=nil)
444
+ url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
445
+ $stderr.puts " # Calling: #{url}" if @o[:debug]
446
+ self.restcall url
447
+ end
494
448
  def bash(cmd, err_msg=nil)
495
449
  o = `#{cmd} 2>&1 && echo '{'`
496
450
  raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update May-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/sequence'
@@ -40,20 +40,20 @@ class Alignment
40
40
  @cols = seq.cols if self.cols.nil?
41
41
  raise "Aligned sequence #{seq.id} has a different length (#{seq.cols} vs #{self.cols})" unless seq.cols == self.cols
42
42
  end
43
- def get_gis
44
- regexps = [/^gi\|(\d+)\|/, /^(\d+)\|/, /^(\d+)$/, /^gi\|(\d+)$/, /\|gi\|(\d+)\|/, /\|gi\|(\d+)$/]
45
- gis = []
43
+ def get_ids
44
+ regexps = [/^[A-Za-z]+\|([A-Za-z0-9_]+)\|/, /^([A-Za-z0-9_]+)$/, /^([A-Za-z0-9_]+) /]
45
+ prot_ids = []
46
46
  self.seqs.keys.each do |id|
47
- gi = nil
47
+ prot_id = nil
48
48
  regexps.each do |regexp|
49
49
  unless regexp.match(id).nil?
50
- gi = $1
50
+ prot_id = $1
51
51
  break
52
52
  end
53
53
  end
54
- gis << gi unless gi.nil?
54
+ prot_ids << prot_id unless prot_id.nil?
55
55
  end
56
- gis
56
+ prot_ids
57
57
  end
58
58
  def seq(id) @seqs[id] end
59
59
  def size() self.seqs.size end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-rocker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.04
4
+ version: 0.2.0alpha
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis (Coto) Orellana
@@ -9,8 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-01-20 00:00:00.000000000 Z
13
- dependencies: []
12
+ date: 2015-05-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rest-client
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ~>
19
+ - !ruby/object:Gem::Version
20
+ version: 1.7.3
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ~>
26
+ - !ruby/object:Gem::Version
27
+ version: 1.7.3
14
28
  description: Detecting and quantifying functional genes in short-read metagenomic
15
29
  datasets
16
30
  email: lhorellana@gatech.edu
@@ -42,9 +56,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
42
56
  version: '0'
43
57
  required_rubygems_version: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - '>='
59
+ - - '>'
46
60
  - !ruby/object:Gem::Version
47
- version: '0'
61
+ version: 1.3.1
48
62
  requirements: []
49
63
  rubyforge_project:
50
64
  rubygems_version: 2.0.14