bio-rocker 0.1.04 → 0.2.0alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/bin/ROCker +11 -11
  3. data/lib/rocker.rb +59 -105
  4. data/lib/rocker/alignment.rb +8 -8
  5. metadata +19 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: af49dee4fb6e1297372669742be4b7c0e7b00e9c
4
- data.tar.gz: f49816fa930e991ec8e2835fb8567c1f2850cde9
3
+ metadata.gz: 83c3e3b92659ce86b8bc1c2119b3d37db119b432
4
+ data.tar.gz: 0f27ba6ed5086baf64e99732dc86a4c7e31432d9
5
5
  SHA512:
6
- metadata.gz: 392fa058bbd87e2dfdaed707599df2439512a904af348e7ac1a057899191e20f3a4215316eb87671992fa07e2a6f211bb7c83e33073539156d5d63e2c8b175d7
7
- data.tar.gz: 02489a4c6597d42f9adfbf6f4586c8183523745ad5be98c99f31f578d30dec9f205f3bea4e23c5d52b1cd46744b58606bf8b70c2ddcbcf06f406a88183606ad1
6
+ metadata.gz: cc2045b7943455232a5d9e6a3d3f26f45c953bf31a6b480f6ce3a2c8ab37c1468d0d6ce511521c5afc52ee65dfecc66f9b6195ad17dce4db211c17cdc591ead7
7
+ data.tar.gz: c8bda317dda0da8eed23cd976eb6c848f866382e37f5c8315bbd365fe65f85f7319a60eb2c2d511461fca4e329af403c31ccd77a16c0a880ec720edfd9ac269f
data/bin/ROCker CHANGED
@@ -3,9 +3,10 @@
3
3
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
4
  # @author Luis (Coto) Orellana
5
5
  # @license artistic license 2.0
6
- # @update Mar-23-2015
6
+ # @update May-07-2015
7
7
  #
8
8
 
9
+ $:.push File.expand_path(File.dirname(__FILE__) + '/../lib')
9
10
  require 'rocker'
10
11
  require 'optparse'
11
12
 
@@ -35,25 +36,24 @@ opts = OptionParser.new do |opt|
35
36
  opt.separator "+ UNSATISFIED REQUIREMENTS"
36
37
  opt.separator " The building task requires uninstalled gems, please install them executing:"
37
38
  opt.separator " gem install rest_client"
38
- opt.separator " gem install nokogiri"
39
39
  opt.separator ""
40
40
  end
41
41
  opt.separator "+ BUILDING ARGUMENTS"
42
- opt.on("-p", "--positive GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
43
- opt.on("-n", "--negative GI1,GI2,GI3", Array, "Comma-separated list of NCBI GIs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
42
+ opt.on("-p", "--positive ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'positive' training set. Required unless -P or -a are used."){ |v| o[:posori]=v }
43
+ opt.on("-n", "--negative ID1,ID2,ID3", Array, "Comma-separated list of UniProtKB IDs corresponding to the 'negative' training set. See also -N."){ |v| o[:negative]=v }
44
44
  opt.on("-o", "--baseout PATH", "Prefix for the output files to be generated. Required."){ |v| o[:baseout]=v }
45
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
45
+ #opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides (currently not implemented)."){ raise "--nucleotides: This option is currently not implemented." }
46
46
  opt.on("-t", "--threads INT", "Number of threads to use. By default: #{ROCker.default :thr}."){ |v| o[:thr]=v.to_i }
47
47
  opt.separator ""
48
48
  opt.separator "+ ADVANCED BUILDING ARGUMENTS"
49
- opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one GI per line. If used, -p is not required."){ |v| o[:posfile]=v }
50
- opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one GI per line."){ |v| o[:negfile]=v }
51
- opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain GI numbers. If used, -p is not required."){ |v| o[:aln]=v }
49
+ opt.on("-P", "--positive-file PATH", "File containing the positive set (see -p), one UniProtKB ID per line. If used, -p is not required."){ |v| o[:posfile]=v }
50
+ opt.on("-N", "--negative-file PATH", "File containing the negative set (see -n), one UniProtKB ID per line."){ |v| o[:negfile]=v }
51
+ opt.on("-a", "--alignment PATH", "Protein alignment of the reference sequences. The defline must contain UniProtKB ID. If used, -p is not required."){ |v| o[:aln]=v }
52
52
  opt.on("-s", "--seqdepth NUMBER", "Sequencing depth to be used in building the in silico metagenome. By default: '#{ROCker.default :seqdepth}'."){ |v| o[:seqdepth]=v.to_f }
53
53
  opt.on("-v", "--overlap NUMBER", "Minimum overlap with reference gene to tag a read as positive. By default: '#{ROCker.default :minovl}'."){ |v| o[:minovl]=v.to_f }
54
54
  opt.on( "--genome-frx NUMBER", "Fraction to subsample the positive set genomes to generate the metagenome. By default: #{ROCker.default :genomefrx}"){ |v| o[:genomefrx]=v.to_f }
55
- opt.on( "--per-genus", "If selected, only one genome per genus is used to build the metagenome."){ o[:pergenus]=true }
56
- opt.on( "--per-species", "If selected, only one genome per species is used to build the metagenome."){ o[:perspecies]=true }
55
+ opt.on( "--per-taxon RANK", "If selected, only one genome per taxon is used to build the metagenome. Valid ranks include: species, genus, family, order, class, phylum.",
56
+ "This option replaces --per-genus and --per-species, but is temporarily out of service."){ |v| o[:pertaxon]=v.downcase }
57
57
  opt.on( "--nometagenome", "Do not create metagenome. Implies --noblast. By default, metagenome is created."){ |v| o[:nomg]=v }
58
58
  opt.on( "--noblast", "Do not execute BLAST. By default, BLAST is executed."){ |v| o[:noblast]=v }
59
59
  opt.on( "--noalignment", "Do not align reference set. By default, references are aligned."){ |v| o[:noaln]=v }
@@ -78,7 +78,7 @@ opts = OptionParser.new do |opt|
78
78
  opt.on("-b", "--ref-blast PATH",
79
79
  "Tabular BLAST (blastx) of the test reads vs. the reference dataset. Required unless -t exists."){ |v| o[:blast]=v }
80
80
  opt.on("-k", "--rocker PATH", "ROCker file to be created. Required."){ |v| o[:rocker]=v }
81
- opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ o[:nucl]=true }
81
+ opt.on( "--nucleotides", "If set, it assumes that the input sequences are in nucleotides. By default, proteins are assumed."){ raise "--nucleotides: This option is currently not implemented." }
82
82
  opt.separator ""
83
83
  opt.separator "+ ADVANCED COMPILATION ARGUMENTS"
84
84
  opt.on("-t", "--table PATH", "Formated tabular file to be created (or reused). Required unless -b is provided."){ |v| o[:table]=v }
data/lib/rocker.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Mar-23-2015
5
+ # @update May-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/blasthit'
@@ -10,12 +10,12 @@ require 'rocker/rocdata'
10
10
 
11
11
  class ROCker
12
12
  #================================[ Class ]
13
- @@EUTILS = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'
13
+ @@EBIREST = 'http://www.ebi.ac.uk/Tools'
14
14
  @@DEFAULTS = {
15
15
  # General
16
16
  :q=>false, :r=>'R', :nucl=>false, :debug=>false,
17
17
  # Build
18
- :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0, :pergenus=>false, :perspecies=>false,
18
+ :positive=>[], :negative=>[], :thr=>2,:genomefrx=>1.0,
19
19
  # ext. software
20
20
  :grinder=>'grinder', :muscle=>'muscle', :blastbins=>'', :seqdepth=>3, :minovl=>0.75,
21
21
  :grindercmd=>'%1$s -reference_file "%2$s" -cf "%3$f" -base_name "%4$s" -dc \'-~*Nn\' -md "poly4 3e-3 3.3e-8" -mr "95 5" -rd "100 uniform 5"',
@@ -30,16 +30,15 @@ class ROCker
30
30
  :color=>false, :gformat=>'pdf', :width=>9, :height=>9, :impact=>false, :transparency=>true,
31
31
  }
32
32
  @@HAS_BUILD_GEMS = nil
33
- def self.eutils() @@EUTILS end
34
- def self.defaults() @@DEFAULTS end
35
- def self.default(k) @@DEFAULTS[k] end
33
+ def self.ebirest() @@EBIREST ; end
34
+ def self.defaults() @@DEFAULTS ; end
35
+ def self.default(k) @@DEFAULTS[k] ; end
36
36
  def self.has_build_gems?
37
37
  return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
38
38
  @@HAS_BUILD_GEMS = TRUE
39
39
  begin
40
40
  require 'rubygems'
41
41
  require 'restclient'
42
- require 'nokogiri'
43
42
  rescue LoadError
44
43
  @@HAS_BUILD_GEMS = FALSE
45
44
  end
@@ -66,7 +65,7 @@ class ROCker
66
65
  unless @o[:aln].nil?
67
66
  aln = Alignment.new
68
67
  aln.read_fasta @o[:aln]
69
- @o[:positive] += aln.get_gis
68
+ @o[:positive] += aln.get_ids
70
69
  end
71
70
  raise "-p or -P are mandatory." if @o[:positive].size==0
72
71
  raise "-o/--baseout is mandatory." if @o[:baseout].nil?
@@ -89,124 +88,76 @@ class ROCker
89
88
  $stderr.puts " # #{@o[:positive]}" if @o[:debug]
90
89
  ids = Array.new(@o[:positive])
91
90
  while ids.size>0
92
- f.print efetch({:db=>(@o[:nucl] ? 'nuccore' : 'protein'), :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
91
+ f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
93
92
  end
94
93
  end
95
94
  f.close
96
- genome_gis = {:positive=>[], :negative=>[]}
95
+ genome_ids = {:positive=>[], :negative=>[]}
97
96
  [:positive, :negative].each do |set|
98
97
  unless @o[set].size==0
99
98
  puts " * gathering genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
100
99
  $stderr.puts " # #{@o[set]}" if @o[:debug]
101
- genome_gis[set] = genes2genomes(@o[set], @o[:nucl])
100
+ genome_ids[set] = genes2genomes(@o[set])
102
101
  end
103
102
  end
104
- raise "No genomes associated with the positive set." if genome_gis[:positive].size==0
105
- genome_gis[:positive] = genome_gis[:positive].sample( (genome_gis[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
106
- raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_gis[:positive].empty?
107
- all_gis = genome_gis.values.reduce(:+).uniq
103
+ raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
104
+ genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
105
+ raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
106
+ all_genome_ids = genome_ids.values.reduce(:+).uniq
108
107
 
109
108
  # Locate genes
110
109
  puts "Analyzing genome data." unless @o[:q]
111
- puts " * downloading and parsing #{genome_gis[:positive].size} XML file(s)." unless @o[:q]
112
- $stderr.puts " # #{genome_gis[:positive]}" if @o[:debug]
110
+ puts " * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s)." unless @o[:q]
111
+ $stderr.puts " # #{genome_ids[:positive]}" if @o[:debug]
113
112
  positive_coords = {}
114
113
  genome_org = {}
115
114
  i = 0
116
- genome_gis[:positive].each do |gi|
117
- print " * scanning #{(i+=1).ordinalize} genome out of #{genome_gis[:positive].size}. \r" unless @o[:q]
115
+ genome_ids[:positive].each do |genome_id|
116
+ print " * scanning #{(i+=1).ordinalize} genome out of #{genome_ids[:positive].size}. \r" unless @o[:q]
117
+ # ToDo check organism name using genome_org unless @o[:pertaxon].nil?
118
118
  $stderr.puts " # Looking for any of #{@o[:positive]}" if @o[:debug]
119
- genome_file = @o[:baseout] + '.src.' + i.to_s + '.xml'
119
+ genome_file = @o[:baseout] + '.src.' + i.to_s + '.gff3'
120
120
  if @o[:reuse] and File.exist? genome_file
121
121
  puts " * reusing existing file: #{genome_file}." unless @o[:q]
122
122
  ifh = File.open(genome_file, 'r')
123
- doc = Nokogiri::XML( ifh )
123
+ doc = ifh.readlines.grep(/^[^#]/)
124
124
  ifh.close
125
125
  else
126
126
  genome_file=nil unless @o[:noclean]
127
- res = efetch({:db=>'nuccore', :id=>gi, :rettype=>'xml', :retmode=>'text'}, genome_file)
128
- doc = Nokogiri::XML( res )
127
+ res = ebiFetch(:embl, [genome_id], :gff3, genome_file)
128
+ doc = res.split("\n").grep(/^[^#]/)
129
129
  end
130
- incomplete = true
131
- doc.xpath('//Bioseq-set/Bioseq-set_seq-set/Seq-entry').each do |genome|
132
- genome_gi = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_seq-set/Seq-entry/Seq-entry_seq/Bioseq/Bioseq_id/Seq-id/Seq-id_gi')
133
- if !genome_gi.nil? and gi==genome_gi.content
134
- incomplete = false
135
- positive_coords[gi] ||= []
136
- $stderr.puts "\n # got #{gi}, scanning" if @o[:debug]
137
- if @o[:pergenus] or @o[:perspecies]
138
- name = genome.at_xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_descr/Seq-descr/Seqdesc/Seqdesc_source/BioSource/BioSource_org/Org-ref/Org-ref_orgname/OrgName/OrgName_name/OrgName_name_binomial/BinomialOrgName')
139
- unless name.nil?
140
- name_g = name.at_xpath('./BinomialOrgName_genus')
141
- name_s = name.at_xpath('./BinomialOrgName_species')
142
- if name_g.nil? or (name_s.nil? and @o[:perspecies])
143
- name = nil
144
- else
145
- name = @o[:perspecies] ? name_g.content + " " + name_s.content : name_g.content
146
- end
147
- end
148
- if name.nil?
149
- warn "WARNING: Cannot find binomial name of #{gi}, using genome regardless of taxonomy."
150
- name = rand(36**100).to_s(36)
151
- end
152
- break unless genome_org[ name ].nil?
153
- genome_org[ name ] = gi
154
- end
155
- $stderr.puts " # traversing #{gi}" if @o[:debug]
156
- genome.xpath('./Seq-entry_set/Bioseq-set/Bioseq-set_annot/Seq-annot/Seq-annot_data/Seq-annot_data_ftable/Seq-feat').each do |pr|
157
- pr_gi = pr.at_xpath('./Seq-feat_product/Seq-loc/Seq-loc_whole/Seq-id/Seq-id_gi')
158
- next if pr_gi.nil?
159
- if @o[:positive].include? pr_gi.content
160
- $stderr.puts " # found #{pr_gi.content}" if @o[:debug]
161
- pr_loc = pr.at_xpath('./Seq-feat_location/Seq-loc/Seq-loc_int/Seq-interval')
162
- if pr_loc.nil?
163
- pr_loc = pr.xpath('./Seq-feat_location/Seq-loc/Seq-loc_mix//Seq-loc/Seq-loc_int/Seq-interval')
164
- if pr_loc.nil?
165
- warn "WARNING: Impossible to find location of '#{pr_gi.content}' in '#{gi}'."
166
- incomplete = true
167
- else
168
- pr_loc.each do |loc_int|
169
- positive_coords[gi] << {
170
- :gi => pr_gi.content,
171
- :from => loc_int.at_xpath('./Seq-interval_from').content.to_i,
172
- :to => loc_int.at_xpath('./Seq-interval_to').content.to_i
173
- #, :strand => loc_int.at_xpath('./Seq-interval_strand/Na-strand/@value').content
174
- }
175
- end
176
- end
177
- else
178
- positive_coords[gi] << {
179
- :gi => pr_gi.content,
180
- :from => pr_loc.at_xpath('./Seq-interval_from').content.to_i,
181
- :to => pr_loc.at_xpath('./Seq-interval_to').content.to_i
182
- #, :strand => pr_loc.at_xpath('./Seq-interval_strand/Na-strand/@value').content
183
- }
184
- end
185
- end
186
- end
187
- break
188
- end
130
+ doc.each do |ln|
131
+ r = ln.chomp.split /\t/
132
+ prots = r[8].split(/;/).grep(/^db_xref=UniProtKB\/TrEMBL:/){ |xref| xref.split(/:/)[1] }
133
+ p = prots.select{ |p| @o[:positive].include? p }.first
134
+ next if p.nil?
135
+ positive_coords[ r[0] ] ||= []
136
+ positive_coords[ r[0] ] << {
137
+ #:strand => r[6],
138
+ :prot_id => p,
139
+ :from => r[3].to_i,
140
+ :to => r[4].to_i
141
+ }
189
142
  end
190
- doc = nil
191
- warn "WARNING: Cannot find GI '#{gi}'." if incomplete
192
143
  end
193
- genome_gis[:positive] = genome_org.values if @o[:pergenus] or @o[:perspecies]
194
- all_gis = genome_gis.values.reduce(:+).uniq
195
144
  print "\n" unless @o[:q]
196
- missing = @o[:positive] - positive_coords.values.map{ |a| a.map{ |b| b[:gi] } }.reduce(:+)
197
- warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or @o[:pergenus] or @o[:perspecies]
145
+ genome_ids[:positive] = genome_org.values unless @o[:pertaxon].nil?
146
+ all_genome_ids = genome_ids.values.reduce(:+).uniq
147
+ missing = @o[:positive] - positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+)
148
+ warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\n\n" unless missing.size==0 or @o[:genomefrx]<1.0 or not @o[:pertaxon].nil?
198
149
 
199
150
  # Download genomes
200
151
  genomes_file = @o[:baseout] + '.src.fasta'
201
152
  if @o[:reuse] and File.exist? genomes_file
202
153
  puts " * reusing existing file: #{genomes_file}." unless @o[:q]
203
154
  else
204
- puts " * downloading #{all_gis.size} genome(s) in FastA." unless @o[:q]
205
- $stderr.puts " # #{all_gis}" if @o[:debug]
206
- ids = Array.new(all_gis)
155
+ puts " * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
156
+ $stderr.puts " # #{all_genome_ids}" if @o[:debug]
157
+ ids = Array.new(all_genome_ids)
207
158
  ofh = File.open(genomes_file, 'w')
208
159
  while ids.size>0
209
- ofh.print efetch({:db=>'nuccore', :id=>ids.shift(200).join(','), :rettype=>'fasta', :retmode=>'text'})
160
+ ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
210
161
  end
211
162
  ofh.close
212
163
  end
@@ -244,11 +195,11 @@ class ROCker
244
195
  Thread.current[:ifh] = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
245
196
  Thread.current[:ofh] = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
246
197
  while Thread.current[:l]=Thread.current[:ifh].gets
247
- Thread.current[:rd] = /^>(?<id>\d+) reference=gi\|(?<gi>\d+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
198
+ Thread.current[:rd] = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(Thread.current[:l])
248
199
  unless Thread.current[:rd].nil?
249
200
  Thread.current[:positive] = false
250
- positive_coords[Thread.current[:rd][:gi]] ||= []
251
- positive_coords[Thread.current[:rd][:gi]].each do |gn|
201
+ positive_coords[Thread.current[:rd][:genome_id]] ||= []
202
+ positive_coords[Thread.current[:rd][:genome_id]].each do |gn|
252
203
  Thread.current[:left] = Thread.current[:rd][:to].to_i - gn[:from]
253
204
  Thread.current[:right] = gn[:to] - Thread.current[:rd][:from].to_i
254
205
  if (Thread.current[:left]*Thread.current[:right] >= 0) and ([Thread.current[:left], Thread.current[:right]].min/(Thread.current[:rd][:to].to_i-Thread.current[:rd][:from].to_i) >= @o[:minovl])
@@ -256,7 +207,7 @@ class ROCker
256
207
  break
257
208
  end
258
209
  end
259
- Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:gi]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
210
+ Thread.current[:l] = ">#{Thread.current[:rd][:id]}#{Thread.current[:positive] ? "@%" : ""} ref=#{Thread.current[:rd][:genome_id]}:#{Thread.current[:rd][:from]}..#{Thread.current[:rd][:to]}#{(Thread.current[:rd][:comp]=='complement(')?'-':'+'}\n"
260
211
  end
261
212
  Thread.current[:ofh].print Thread.current[:l]
262
213
  end
@@ -470,18 +421,18 @@ class ROCker
470
421
  ifh.close
471
422
  ofh.close
472
423
  end
473
- def genes2genomes(gis, nucl=false)
424
+ def genes2genomes(gene_ids)
474
425
  genomes = []
475
- ids = Array.new(gis)
426
+ ids = Array.new(gene_ids)
476
427
  while ids.size>0
477
- doc = Nokogiri::XML( elink({:dbfrom=>(nucl ? 'nuccore' : 'protein'), :db=>'nuccore', :id=>ids.shift(200).join(',')}) )
478
- genomes += doc.xpath('/eLinkResult/LinkSet/LinkSetDb/Link/Id').map{ |id| id.content }
428
+ doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
429
+ genomes += doc.grep( /^DR\s+EMBL;/ ).map{ |ln| ln.split('; ')[1] }
479
430
  end
480
431
  genomes.uniq
481
432
  end
482
- def eutils(script, params={}, outfile=nil)
483
- response = RestClient.get "#{ROCker.eutils}/#{script}", {:params=>params}
484
- raise "Unable to reach NCBI EUtils, error code #{response.code}." unless response.code == 200
433
+ def restcall(url, outfile=nil)
434
+ response = RestClient.get url
435
+ raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
485
436
  unless outfile.nil?
486
437
  ohf = File.open(outfile, 'w')
487
438
  ohf.print response.to_s
@@ -489,8 +440,11 @@ class ROCker
489
440
  end
490
441
  response.to_s
491
442
  end
492
- def efetch(*etc) self.eutils 'efetch.fcgi', *etc end
493
- def elink(*etc) self.eutils 'elink.fcgi', *etc end
443
+ def ebiFetch(db, ids, format, outfile=nil)
444
+ url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
445
+ $stderr.puts " # Calling: #{url}" if @o[:debug]
446
+ self.restcall url
447
+ end
494
448
  def bash(cmd, err_msg=nil)
495
449
  o = `#{cmd} 2>&1 && echo '{'`
496
450
  raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
data/lib/rocker/alignment.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
3
3
  # @author Luis (Coto) Orellana
4
4
  # @license artistic license 2.0
5
- # @update Jan-22-2015
5
+ # @update May-07-2015
6
6
  #
7
7
 
8
8
  require 'rocker/sequence'
@@ -40,20 +40,20 @@ class Alignment
40
40
  @cols = seq.cols if self.cols.nil?
41
41
  raise "Aligned sequence #{seq.id} has a different length (#{seq.cols} vs #{self.cols})" unless seq.cols == self.cols
42
42
  end
43
- def get_gis
44
- regexps = [/^gi\|(\d+)\|/, /^(\d+)\|/, /^(\d+)$/, /^gi\|(\d+)$/, /\|gi\|(\d+)\|/, /\|gi\|(\d+)$/]
45
- gis = []
43
+ def get_ids
44
+ regexps = [/^[A-Za-z]+\|([A-Za-z0-9_]+)\|/, /^([A-Za-z0-9_]+)$/, /^([A-Za-z0-9_]+) /]
45
+ prot_ids = []
46
46
  self.seqs.keys.each do |id|
47
- gi = nil
47
+ prot_id = nil
48
48
  regexps.each do |regexp|
49
49
  unless regexp.match(id).nil?
50
- gi = $1
50
+ prot_id = $1
51
51
  break
52
52
  end
53
53
  end
54
- gis << gi unless gi.nil?
54
+ prot_ids << prot_id unless prot_id.nil?
55
55
  end
56
- gis
56
+ prot_ids
57
57
  end
58
58
  def seq(id) @seqs[id] end
59
59
  def size() self.seqs.size end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-rocker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.04
4
+ version: 0.2.0alpha
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis (Coto) Orellana
@@ -9,8 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-01-20 00:00:00.000000000 Z
13
- dependencies: []
12
+ date: 2015-05-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rest-client
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ~>
19
+ - !ruby/object:Gem::Version
20
+ version: 1.7.3
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ~>
26
+ - !ruby/object:Gem::Version
27
+ version: 1.7.3
14
28
  description: Detecting and quantifying functional genes in short-read metagenomic
15
29
  datasets
16
30
  email: lhorellana@gatech.edu
@@ -42,9 +56,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
42
56
  version: '0'
43
57
  required_rubygems_version: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - '>='
59
+ - - '>'
46
60
  - !ruby/object:Gem::Version
47
- version: '0'
61
+ version: 1.3.1
48
62
  requirements: []
49
63
  rubyforge_project:
50
64
  rubygems_version: 2.0.14