RubyGems - rbbt-sources - Versions diffs - 0.3.1 → 0.4.0 - Mend

rbbt-sources 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/lib/rbbt/sources/biomart.rb +42 -16
data/lib/rbbt/sources/organism.rb +2 -1
data/lib/rbbt/sources/organism/sequence.rb +49 -8
data/share/install/Organism/Rno/Rakefile +44 -0
data/share/install/Organism/organism_helpers.rb +18 -17
data/test/rbbt/sources/test_biomart.rb +15 -2
metadata +6 -5

data/lib/rbbt/sources/biomart.rb CHANGED

@@ -1,3 +1,4 @@
+require 'rbbt'
 require 'rbbt/util/tsv'
 require 'rbbt/util/log'
 require 'cgi'
@@ -13,6 +14,8 @@ module BioMart
   BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
+  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
   private
   @@biomart_query_xml = <<-EOT
@@ -28,12 +31,14 @@ module BioMart
   EOT
   def self.set_archive(date)
+    @archive = date
     @archive_url = BIOMART_URL.sub(/http:\/\/biomart\./, 'http://' + date + '.archive.ensembl.')
     Log.debug "Using Archive URL #{ @archive_url }"
   end
   def self.unset_archive
     Log.debug "Restoring current version URL #{BIOMART_URL}"
+    @archive = nil
     @archive_url = nil
   end
@@ -61,15 +66,16 @@ module BioMart
     result_file = TmpFile.tmp_file
     Open.write(result_file, response)
+    new_datafile = TmpFile.tmp_file
     if data.nil?
-      data = result_file
+      TSV.merge_rows Open.open(result_file), new_datafile
+      data = new_datafile
     else
-      new_datafile = TmpFile.tmp_file
       TSV.paste_merge data, result_file, new_datafile
       FileUtils.rm data
       data = new_datafile
-      FileUtils.rm result_file
     end
+    FileUtils.rm result_file
     data
   end
@@ -91,7 +97,8 @@ module BioMart
   # cause an error if the BioMart WS does not allow filtering with that
   # attribute.
   def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
-    open_options = Misc.add_defaults open_options, :nocache => false
+    open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil
+    filename, field_names = Misc.process_options open_options, :filename, :field_names
     attrs   ||= []
     open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
@@ -118,22 +125,41 @@ module BioMart
       data = get(database, main, chunk, filters, data, open_options)
     }
-    result = TSV.new(data, open_options)
-    result.key_field = main
-    result.fields = attrs
-    result.filename = "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
-    FileUtils.rm data
-    result
+    open_options[:filename] ||= "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
+    if filename.nil?
+      results = TSV.new data, open_options
+      results.key_field = main
+      results.fields = attrs
+      results
+    else
+      Open.write(filename) do |f|
+        f.puts "#: " << Misc.hash2string(TSV::EXTRA_ACCESSORS.collect{|key| [key, open_options[key]]})
+        if field_names.nil?
+          f.puts "#" << [main, attrs].flatten * "\t"
+        else
+          f.puts "#" << field_names * "\t"
+        end
+        f.write Open.read(data)
+      end
+      FileUtils.rm data
+      filename
+    end
   end
   def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
-    codes = attrs.collect{|attr| attr[1]}
-    tsv = query(database, main.last, codes, filters, data, open_options)
+    if @archive_url
+      attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
+    end
-    tsv.key_field = main.first
-    tsv.fields    = attrs.collect{|attr| attr.first}
-    tsv
+    codes = attrs.collect{|attr| attr[1]}
+    if open_options[:filename].nil?
+      tsv = query(database, main.last, codes, filters, data, open_options)
+      tsv.key_field = main.first
+      tsv.fields    = attrs.collect{|attr| attr.first}
+      tsv
+    else
+      query(database, main.last, codes, filters, data, open_options.merge(:field_names => [main.first, attrs.collect{|attr| attr.first}].flatten))
+    end
   end
 end

data/lib/rbbt/sources/organism.rb CHANGED

@@ -29,6 +29,7 @@ module Organism
     options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
     double = Misc.process_options options, :double
     options.merge! :target => target unless target.nil?
     options.merge! :fields => fields unless fields.nil?
@@ -69,7 +70,7 @@ module Organism
     }.first
   end
-  ["Hsa", "Sce"].each do |organism|
+  ["Hsa", "Rno", "Sce"].each do |organism|
     rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
     rakefile.lib_dir = Resource.caller_lib_dir __FILE__
     rakefile.pkgdir = 'phgx'

data/lib/rbbt/sources/organism/sequence.rb CHANGED

@@ -4,6 +4,8 @@ require 'bio'
 # Sequence analyses
 module Organism
   extend WorkFlow
+  relative_to Rbbt, "share/organisms"
+  self.jobdir = Rbbt.var.organism.find
   def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
     exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
@@ -201,16 +203,59 @@ module Organism
     position_offsets
   end
-  task_option :org, "Organism", :string
+  task_option :organism, "Organism", :string, "Hsa"
   task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
-  task :genomic_mutation_to_protein_mutation => :tsv do |org, genomic_mutations|
+  task_dependencies nil
+  task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
+    genomic_mutations = case
+                        when TSV === genomic_mutations
+                          genomic_mutations
+                        else
+                          TSV.new StringIO.new(genomic_mutations), :list
+                        end
+    genomic_mutations.key_field = "Position"
+    genomic_mutations.fields = ["Mutation"]
+    positions = genomic_mutations.keys.collect{|l| l.split(":")}
+    step(:resources, "Load Resources")
+    genes_at_positions = Hash[*genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions)).flatten]
+    genomic_mutations.add_field "#{org.sub(/\/.*/,'')}:Ensembl Gene ID" do |position, values|
+      genes_at_positions[position]
+    end
+    genomic_mutations
+  end
+  task_description <<-EOF
+Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
+protein products of transcripts including those positions.
+  EOF
+  task_option :organism, "Organism", :string, "Hsa"
+  task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
+  task_dependencies nil
+  task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
+    genomic_mutations = case
+                        when TSV === genomic_mutations
+                          genomic_mutations
+                        else
+                          TSV.new StringIO.new(genomic_mutations), :list
+                        end
+    genomic_mutations.key_field = "Position"
+    genomic_mutations.fields = ["Mutation"]
     positions = genomic_mutations.keys.collect{|l| l.split(":")}
     step(:prepare, "Prepare Results")
     results = TSV.new({})
     results.key_field = "Position"
-    results.fields = ["Ensembl Transcript ID", "Mutation"]
+    results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
     results.type = :double
+    results.filename = path
     step(:resources, "Load Resources")
     transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
@@ -229,7 +274,6 @@ module Organism
       transcripts.each do |transcript, offset_info|
         offset, strand = offset_info
-        ddd strand
         begin
           codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
         rescue
@@ -237,12 +281,9 @@ module Organism
           next
         end
-        ddd codon
         if not codon.nil?
           alleles.each do |allele|
-            ddd allele
             allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
-            ddd allele
             change = Organism.codon_change(allele, *codon.values_at(0,1))
             pos_code = position * ":"
             mutation = [change.first, codon.last + 1, change.last] * ""
@@ -323,7 +364,7 @@ X	10085674	C	T
   #positions =  positions.select ["10:98099540"]
   Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
-  job =  Organism.job :genomic_mutation_to_protein_mutation, "Metastasis", org, positions.slice("Tumor")
+  job =  Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
   job.run
   while not job.done?

data/share/install/Organism/Rno/Rakefile ADDED

@@ -0,0 +1,44 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../../lib/helpers')
+$taxs = [10116]
+$scientific_name = "Rattus norvegicus"
+$biomart_db = 'rnorvegicus_gene_ensembl'
+$biomart_lexicon = [
+  [ 'Associated Gene Name' , "external_gene_id"],
+  [ 'HGNC symbol', "hgnc_symbol"  ],
+  [ 'HGNC automatic gene name', "hgnc_automatic_gene_name"  ],
+  [ 'HGNC curated gene name ', "hgnc_curated_gene_name"  ],
+]
+$biomart_identifiers = [
+  ['Associated Gene Name' , "external_gene_id"],
+  ['Protein ID' , "protein_id"] ,
+  ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
+  ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
+  ['RefSeq Protein ID' , "refseq_peptide"] ,
+  ['RefSeq DNA ID' , "refseq_dna"] ,
+  ['EMBL (Genbank) ID' , "embl"] ,
+  ['RGD ID' , "rgd"] ,
+  ['RGD Symbol' , "rgd_symbol"] ,
+  ['Affy rae230a', "affy_rae230a"],
+  ['Affy rae230b', "affy_rae230b"],
+  ['Affy RaGene', "affy_ragene_1_0_st_v1"],
+  ['Affy rat230 2', "affy_rat230_2"],
+  ['Affy RaEx', "affy_raex_1_0_st_v1"],
+  ['Affy rg u34a', "affy_rg_u34a"],
+  ['Affy rg u34b', "affy_rg_u34b"],
+  ['Affy rg u34c', "affy_rg_u34c"],
+  ['Affy rn u34', "affy_rn_u34"],
+  ['Affy rt u34', "affy_rt_u34"],
+  ['Agilent WholeGenome',"agilent_wholegenome" ],
+  ['Codelink ID ', "codelink"],
+]
+$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
+load File.join(File.dirname(__FILE__), '../organism_helpers.rb')

data/share/install/Organism/organism_helpers.rb CHANGED

@@ -103,7 +103,7 @@ file 'scientific_name' do |t|
 end
 file 'identifiers' do |t|
-  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [])
+  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
   $biomart_identifiers.each do |name, key, prefix|
     if prefix
       identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
@@ -114,20 +114,20 @@ file 'identifiers' do |t|
 end
 file 'gene_transcripts' do |t|
-  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat)
+  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts transcripts end
 end
 file 'transcripts' => 'gene_positions' do |t|
-  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list)
+  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
   transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
   File.open(t.name, 'w') do |f| f.puts transcripts end
 end
 file 'transcript_3utr' do |t|
-  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :merge => true)
+  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f|
     f.puts "#: :type=:single#cast=to_i"
@@ -142,7 +142,7 @@ end
 file 'transcript_5utr' do |t|
-  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :merge => true)
+  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f|
     f.puts "#: :type=:single#cast=to_i"
@@ -162,7 +162,7 @@ file 'gene_positions' do |t|
 end
 file 'gene_sequence' do |t|
-  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :merge => true)
+  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f|
     f.puts "#: :type=:single"
@@ -179,7 +179,7 @@ file 'gene_sequence' do |t|
 end
 file 'protein_sequence' do |t|
-  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :merge => true)
+  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f|
     f.puts "#: :type=:single"
@@ -197,20 +197,20 @@ file 'protein_sequence' do |t|
 end
 file 'exons' => 'gene_positions' do |t|
-  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list)
+  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
   exons.attach TSV.new('gene_positions'), "Chromosome Name"
   File.open(t.name, 'w') do |f| f.puts exons end
 end
 file 'transcript_exons' do |t|
-  exons = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true)
+  exons = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts exons end
 end
 file 'transcript_sequence' do |t|
-  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :merge => true)
+  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
   File.open(t.name, 'w') do |f|
     f.puts "#: :type=:single"
@@ -232,28 +232,28 @@ $biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD
 $biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
 file 'germline_variations' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
-  File.open(t.name, 'w') do |f| f.puts variations.to_s end
+  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
 end
 file 'germline_variation_positions' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
+  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts variations.to_s end
 end
 file 'somatic_variations' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
+  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts variations.to_s end
 end
 file 'somatic_variation_positions' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
+  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts variations.to_s end
 end
 file 'gene_pmids' do |t|
   tsv =  Entrez.entrez2pubmed($taxs)
-  text = "#Entrez Gene ID\tPMID"
+  text = "#: :namespace=#{$namespace}"
+  text += "#Entrez Gene ID\tPMID"
   tsv.each do |gene, pmids|
     text << "\n" << gene << "\t" << pmids * "|"
   end
@@ -270,7 +270,8 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
   transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
-  string = "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
+  string = "#: :namespace=#{$namespace}"
+  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
   exons.each do |exon, info|
     gene, start, finish, strand, chr = info

data/test/rbbt/sources/test_biomart.rb CHANGED

@@ -1,5 +1,6 @@
 require File.dirname(__FILE__) + '/../../test_helper'
 require 'rbbt/sources/biomart'
+require 'rbbt/util/tmpfile'
 require 'test/unit'
 class TestBioMart < Test::Unit::TestCase
@@ -20,16 +21,28 @@ class TestBioMart < Test::Unit::TestCase
   def test_query
     data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['external_gene_id'].include? 'YBL044W')
+    TmpFile.with_file do |f|
+      filename = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
+      data = TSV.new Open.open(filename)
+      assert(data['852236']['external_gene_id'].include? 'YBL044W')
+    end
   end
   def test_tsv
     data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['Protein ID'].include? 'CAA84864')
     assert_equal 'Entrez Gene', data.key_field
     assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
+    TmpFile.with_file do |f|
+      filename = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
+      data = TSV.new Open.open(filename, :merge => true)
+      assert(data['852236']['Protein ID'].include? 'CAA84864')
+      assert_equal 'Entrez Gene', data.key_field
+      assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
+    end
   end
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-sources
 version: !ruby/object:Gem::Version
-  hash: 17
+  hash: 15
   prerelease:
   segments:
   - 0
-  - 3
-  - 1
-  version: 0.3.1
+  - 4
+  - 0
+  version: 0.4.0
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-03-19 00:00:00 +01:00
+date: 2011-03-23 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -95,6 +95,7 @@ files:
 - lib/rbbt/sources/polysearch.rb
 - lib/rbbt/sources/pubmed.rb
 - share/install/Organism/Hsa/Rakefile
+- share/install/Organism/Rno/Rakefile
 - share/install/Organism/Sce/Rakefile
 - share/install/Organism/organism_helpers.rb
 - share/install/lib/helpers.rb