RubyGems - rbbt-sources - Versions diffs - 0.2.1 → 0.2.2 - Mend

rbbt-sources 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/lib/rbbt/sources/COSTART.rb +16 -0
data/lib/rbbt/sources/CTCAE.rb +6 -0
data/lib/rbbt/sources/Reactome.rb +16 -0
data/lib/rbbt/sources/biomart.rb +16 -3
data/lib/rbbt/sources/entrez.rb +6 -6
data/lib/rbbt/sources/go.rb +2 -3
data/lib/rbbt/sources/organism.rb +41 -0
data/lib/rbbt/sources/polysearch.rb +10 -0
data/share/install/Organism/Hsa/Rakefile +36 -2
data/share/install/Organism/Sce/Rakefile +2 -2
data/test/rbbt/sources/test_biomart.rb +2 -2
data/test/rbbt/sources/test_entrez.rb +1 -1
data/test/rbbt/sources/test_organism.rb +13 -1
metadata +10 -6

data/lib/rbbt/sources/COSTART.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require 'rbbt-util'
+# Registers the COSTART term list as an Rbbt resource.
+module COSTART
+  # The producer is written as a brace block inside an explicit argument list:
+  # in a paren-less call, `Proc.new do ... end` would attach the `do` block to
+  # `Rbbt.claim` rather than to `Proc.new`.
+  Rbbt.claim("COSTART",
+    Proc.new {
+      # First line of the produced text is the header.
+      terms = ["#COSTART Terms"]
+      Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
+        # Keep only lines shaped like 'TERM', ...; all other lines are skipped.
+        next unless line =~ /^'(.*)',/
+        terms << $1
+      end
+      terms * "\n"
+    }, 'COSTART')
+end

data/lib/rbbt/sources/CTCAE.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'rbbt-util'
+require 'rbbt/util/excel2tsv'
+# Registers the CTCAE 4.03 spreadsheet, converted with TSV.excel2tsv, as an
+# Rbbt resource.
+module CTCAE
+  Rbbt.claim "CTCAE", TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls'), 'CTCAE'
+end

data/lib/rbbt/sources/Reactome.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require 'rbbt-util'
+# Registers the Reactome human interaction table as an Rbbt resource.
+module Reactome
+  # Brace block inside an explicit argument list so the block belongs to
+  # `Proc.new` and not to `Rbbt.claim`.
+  Rbbt.claim("Reactome",
+    Proc.new {
+      # Column names for the two interactors (#1 and #2) followed by the
+      # interaction annotations; the first one becomes the key field.
+      headers = ["Uniprot ID#1", "Ensembl Gene ID#1", "Entrez Gene ID#1", "Uniprot ID#2", "Ensembl Gene ID#2", "Entrez Gene ID#2", "Type", "Reaction", "PMID"]
+      # The :fix proc strips every "<words>:" prefix from each line before parsing.
+      tsv = TSV.new(Open.open("http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz"), :fix => Proc.new {|l| l.gsub(/[\w ]+:/, "")})
+      tsv.key_field = headers.shift
+      tsv.fields    = headers
+      tsv.to_s
+    }, 'Reactome')
+end

data/lib/rbbt/sources/biomart.rb CHANGED Viewed

@@ -10,6 +10,8 @@ module BioMart
   class BioMart::QueryError < StandardError; end
+  BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
   private
   @@biomart_query_xml = <<-EOT
@@ -23,8 +25,14 @@ module BioMart
 </Dataset>
 </Query>
   EOT
+  # Redirects subsequent BioMart.get queries to a dated Ensembl archive host.
+  #
+  # date - archive label used as the host prefix, e.g. 'may2009', which yields
+  #        http://may2009.archive.ensembl.org/biomart/martservice?query=
+  #
+  # The pattern accepts the host with or without a leading "www." so it keeps
+  # working whichever form BIOMART_URL uses.
+  def self.set_archive(date)
+    @archive_url = BIOMART_URL.sub(%r{//(?:www\.)?biomart\.org}, "//#{date}.archive.ensembl.org")
+  end
+  # Clears the archive URL so that BioMart.get goes back to querying BIOMART_URL.
+  def self.unset_archive
+    @archive_url = nil
+  end
   def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
     attrs   ||= []
@@ -37,8 +45,13 @@ module BioMart
     query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
     query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
-    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '), open_options)
-    if response =~ /Query ERROR:/
+    if @archive_url
+      response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
+    else
+      response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
+    end
+    if response.empty? or response =~ /Query ERROR:/
       raise BioMart::QueryError, response
     end

data/lib/rbbt/sources/entrez.rb CHANGED Viewed

@@ -5,29 +5,29 @@ require 'set'
 module Entrez
-  Rbbt.add_datafiles "gene_info" => ['databases/entrez', 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'],
-    "gene2pubmed" => ["databases/entrez", "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz" ]
+  Rbbt.claim "gene_info", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz', 'databases/entrez'
+  Rbbt.claim "gene2pubmed", 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz', 'databases/entrez'
   def self.entrez2native(taxs, options = {})
-    options = Misc.add_defaults options, :native => 1, :extra => 5, :flatten => true, :persistence => true
+    options = Misc.add_defaults options, :key => 1, :others => 5, :persistence => true, :merge => true
     taxs = [taxs] unless Array === taxs
     options.merge! :grep => taxs
-    tsv = TSV.new(Rbbt.find_datafile('gene_info'), options)
+    tsv = TSV.new(Rbbt.files.databases.entrez.gene_info, :flat, options)
     tsv.key_field = "Entrez Gene ID"
     tsv.fields    = ["Native ID"]
     tsv
   end
   def self.entrez2pubmed(taxs)
-    options = {:native => 1, :extra => 2, :flatten => true, :persistence => true}
+    options = {:key => 1, :others => 2, :persistence => true, :merge => true}
     taxs = [taxs] unless taxs.is_a?(Array)
     taxs = taxs.collect{|t| t.to_s}
     options.merge! :grep => taxs
-    TSV.new(Rbbt.find_datafile('gene2pubmed'), options)
+    TSV.new(Rbbt.files.databases.entrez.gene2pubmed, :flat, options)
   end
   class Gene

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -4,9 +4,8 @@ require 'rbbt-util'
 # now all it does is provide a translation form id to the actual names.
 module GO
-  Rbbt.add_datafiles :gene_ontology => ['databases/GO', 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'],
-    :goslim_generic => ['databases/GO', 'http://www.geneontology.org/GO_slims/goslim_generic.obo']
+  Rbbt.claim :gene_ontology, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo', 'databases/GO'
+  Rbbt.claim :goslim_generic, 'http://www.geneontology.org/GO_slims/goslim_generic.obo', 'databases/GO'
   MULTIPLE_VALUE_FIELDS = %w(is_a)
   TSV_GENE_ONTOLOGY = File.join(TSV.cachedir, 'gene_ontology')

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -1,12 +1,53 @@
 require 'rbbt-util'
 require 'rbbt/util/data_module'
 module Organism
   class OrganismNotProcessedError < StandardError; end
   def self.datadir(org)
     File.join(Rbbt.datadir, 'organisms', org)
   end
+  # Translates identifiers of organism +org+ through an index built from that
+  # organism's identifiers file.
+  #
+  # org     - organism code, e.g. 'Hsa'
+  # list    - a single identifier or an Array of identifiers
+  # field   - passed to the index as :field when given
+  # others  - passed to the index as :others when given
+  # options - further index options; :double => true returns the full index
+  #           entry for each identifier instead of only its first element
+  #
+  # Returns [] when +list+ is nil or empty.
+  def self.normalize(org, list, field = nil, others = nil, options = {})
+    return [] if list.nil? or list.empty?
+    options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
+    double = Misc.process_options options, :double
+    # NOTE(review): the :field / :others option names are assumed to be what
+    # TSV.index expects in the rbbt-util release this gem depends on - confirm.
+    options = options.merge(:field => field) unless field.nil?
+    options = options.merge(:others => others) unless others.nil?
+    index = TSV.index(Organism.identifiers(org), options)
+    if Array === list
+      entries = index.values_at(*list)
+      double ? entries : entries.collect{|e| Misc.first e}
+    else
+      double ? index[list] : index[list].first
+    end
+  end
+  # Ranks the fields of the identifiers file of +org+ by how many distinct
+  # matches they have among +values+ and returns the top-ranked entry.
+  def self.guess_id(org, values)
+    matches_by_field = TSV.new(Organism.identifiers(org), :persistence => true).field_matches(values)
+    ranked = matches_by_field.sort_by do |field, matches|
+      matches.uniq.length
+    end
+    ranked.last
+  end
+  # Lists the organism codes that ship an install Rakefile: the name of each
+  # directory matching install/Organism/*/Rakefile under the share directory.
+  def self.organisms
+    pattern = File.join(PKGData.sharedir_for_file(__FILE__), 'install/Organism/*/Rakefile')
+    Dir.glob(pattern).collect do |rakefile|
+      File.basename(File.dirname(rakefile))
+    end
+  end
+  # Returns the stripped contents of the scientific_name file of +organism+.
+  #
+  # This method replaces Module#name on Organism, so when it is called without
+  # an argument it defers to the inherited behaviour instead of raising
+  # ArgumentError.
+  def self.name(organism = nil)
+    return super() if organism.nil?
+    Open.read(Organism.scientific_name(organism)).strip
+  end
+  # Returns the first organism code that equals +name+ or whose scientific
+  # name matches +name+ case-insensitively (+name+ is interpolated into a
+  # regular expression as-is). Returns nil when nothing matches.
+  #
+  # `find` stops at the first hit, so the name files of the remaining
+  # organisms are not read.
+  def self.organism(name)
+    organisms.find{|code|
+      code == name or Organism.name(code) =~ /#{ name }/i
+    }
+  end
   extend DataModule

data/lib/rbbt/sources/polysearch.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'rbbt'
+# Registers the Polysearch term lists as Rbbt resources under 'Polysearch'.
+module Polysearch
+  # Resource name paired with the URL it is downloaded from, claimed in order.
+  [
+    ["organ",    'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'],
+    ["tissue",   'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'],
+    ["location", 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'],
+    ["disease",  'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'],
+    ["drug",     'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'],
+  ].each do |resource, url|
+    Rbbt.claim resource, url, 'Polysearch'
+  end
+end

data/share/install/Organism/Hsa/Rakefile CHANGED Viewed

@@ -52,9 +52,18 @@ $biomart_identifiers = [
   [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
 ]
+# Attribute list handed to BioMart.tsv by the 'gene_positions' task.
+# NOTE(review): each pair presumably reads [column label, BioMart attribute
+# name] - confirm against BioMart.tsv.
+$biomart_positions = [
+  ['Chromosome Name','chromosome_name'],
+  ['Strand','strand'],
+  ['Gene Start','start_position'],
+  ['Gene End','end_position'],
+  ['Transcript Start','transcript_start'],
+  ['Transcript End','transcript_end'],
+]
-file 'name' do |t|
+file 'scientific_name' do |t|
   File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
 end
@@ -77,7 +86,32 @@ file 'identifiers' do |t|
   File.open(t.name, 'w') do |f| f.puts identifiers end
 end
+# Builds the 'gene_go' file: reads the human GO association table, re-keys
+# each row through the Hsa identifiers index, and writes the result with
+# "Associated Gene Name" as key field and "GO Term" as the only field.
+file 'gene_go' do |t|
+  url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
+  # Uses the :key / :others option names that replace :native / :extra in the
+  # TSV.new calls of entrez.rb in this same release.
+  tsv = TSV.new(Open.open(url, :gzip => true), :key => 2, :others => 4)
+  index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
+  translated = TSV.new({})
+  tsv.through do |key, values|
+    names = index[key]
+    # Rows whose key is not in the identifiers index are dropped.
+    next if names.nil?
+    translated[names.first] = values
+  end
+  translated.key_field = "Associated Gene Name"
+  translated.fields = ["GO Term"]
+  Open.write(t.name, translated.to_s)
+end
+# Builds the 'gene_positions' file from the 'may2009' BioMart archive.
+file 'gene_positions' do |t|
+  BioMart.set_archive('may2009')
+  begin
+    positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
+  ensure
+    # Always restore the default BioMart URL, even if the query raises, so
+    # later tasks in the same rake run do not silently query the archive.
+    BioMart.unset_archive
+  end
+  Open.write(t.name, positions.to_s)
+end
-task :default => ['name', 'lexicon', 'identifiers']
+# The file task formerly called 'name' is defined as 'scientific_name' in this Rakefile.
+task :default => ['scientific_name', 'lexicon', 'identifiers', 'gene_positions']

data/share/install/Organism/Sce/Rakefile CHANGED Viewed

@@ -10,7 +10,7 @@ $biomart_db = 'scerevisiae_gene_ensembl'
 $biomart_main = ['Entrez Gene ID', 'entrezgene']
-file 'name' do |t|
+file 'scientific_name' do |t|
   File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
 end
@@ -27,7 +27,7 @@ file 'lexicon' do |t|
 end
 file 'identifiers' do |t|
-  identifiers = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
+  identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
   merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})

data/test/rbbt/sources/test_biomart.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'test/unit'
 class TestBioMart < Test::Unit::TestCase
-  def test_get
+  def _test_get
     assert_raise BioMart::QueryError do
       BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
     end
@@ -23,7 +23,7 @@ class TestBioMart < Test::Unit::TestCase
     assert(data['852236']['external_gene_id'].include? 'YBL044W')
   end
-  def test_tsv
+  def _test_tsv
     data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['Protein ID'].include? 'CAA84864')

data/test/rbbt/sources/test_entrez.rb CHANGED Viewed

@@ -18,7 +18,7 @@ class TestEntrez < Test::Unit::TestCase
     tax   = $yeast_tax
     data = Entrez.entrez2pubmed(tax)
-    assert(data['850320'].include? '15102838')
+    assert(data['850320'].include? '1574125')
   end
   def test_getonline

data/test/rbbt/sources/test_organism.rb CHANGED Viewed

@@ -6,12 +6,24 @@ class TestEntrez < Test::Unit::TestCase
   def test_identifiers
     assert TSV.new(Organism.identifiers('Sce'))['S000006120']["Ensembl Gene ID"].include?('YPL199C')
     assert TSV.new(Organism::Sce.identifiers)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
-    #assert Organism.identifiers('Hsa')['1020']["Associated Gene Name"].include?('CDK5')
+    assert TSV.new(Organism.identifiers('Hsa'))['1020']["Associated Gene Name"].include?('CDK5')
   end
   def test_lexicon
     assert TSV.new(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
   end
+  def test_guess_id
+    ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
+    gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
+    assert_equal "Ensembl Gene ID", Organism::Sce.guess_id(ensembl).first
+    assert_equal "Associated Gene Name", Organism::Sce.guess_id(gene_name).first
+  end
+  def test_organisms
+    assert Organism.organisms.include? "Hsa"
+    assert_equal "Hsa", Organism.organism("Homo sapiens")
+  end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-sources
 version: !ruby/object:Gem::Version
-  hash: 21
-  prerelease: false
+  hash: 19
+  prerelease:
   segments:
   - 0
   - 2
-  - 1
-  version: 0.2.1
+  - 2
+  version: 0.2.2
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-12-14 00:00:00 +01:00
+date: 2011-01-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -83,12 +83,16 @@ extensions: []
 extra_rdoc_files: []
 files:
+- lib/rbbt/sources/COSTART.rb
+- lib/rbbt/sources/CTCAE.rb
+- lib/rbbt/sources/Reactome.rb
 - lib/rbbt/sources/bibtex.rb
 - lib/rbbt/sources/biomart.rb
 - lib/rbbt/sources/entrez.rb
 - lib/rbbt/sources/go.rb
 - lib/rbbt/sources/gscholar.rb
 - lib/rbbt/sources/organism.rb
+- lib/rbbt/sources/polysearch.rb
 - lib/rbbt/sources/pubmed.rb
 - share/install/Organism/Hsa/Rakefile
 - share/install/Organism/Sce/Rakefile
@@ -129,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.7
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)