RubyGems - rbbt-sources - Versions diffs - 1.2.0 → 2.0.0 - Mend

rbbt-sources 1.2.0 → 2.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (33)

data/etc/biomart/missing_in_archive +11 -0
data/lib/rbbt/sources/COSMIC.rb +47 -4
data/lib/rbbt/sources/HPRD.rb +23 -0
data/lib/rbbt/sources/InterPro.rb +98 -8
data/lib/rbbt/sources/NCI.rb +7 -5
data/lib/rbbt/sources/PSI_MI.rb +41 -0
data/lib/rbbt/sources/STITCH.rb +92 -0
data/lib/rbbt/sources/barcode.rb +0 -3
data/lib/rbbt/sources/biomart.rb +3 -3
data/lib/rbbt/sources/dbSNP.rb +100 -0
data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
data/lib/rbbt/sources/entrez.rb +2 -2
data/lib/rbbt/sources/genomes1000.rb +45 -0
data/lib/rbbt/sources/go.rb +16 -4
data/lib/rbbt/sources/organism.rb +80 -12
data/lib/rbbt/sources/pfam.rb +63 -3
data/lib/rbbt/sources/pubmed.rb +10 -3
data/lib/rbbt/sources/reactome.rb +82 -0
data/lib/rbbt/sources/tfacts.rb +37 -36
data/lib/rbbt/sources/uniprot.rb +25 -23
data/share/Ensembl/release_dates +18 -0
data/share/install/Genomes1000/Rakefile +15 -0
data/share/install/JoChem/Rakefile +11 -3
data/share/install/NCI/Rakefile +54 -16
data/share/install/Organism/Hsa/Rakefile +3 -2
data/share/install/Organism/Rno/Rakefile +1 -2
data/share/install/Organism/Sce/Rakefile +43 -45
data/share/install/Organism/organism_helpers.rb +360 -96
data/share/install/STITCH/Rakefile +0 -0
data/test/rbbt/sources/test_organism.rb +26 -7
data/test/rbbt/sources/test_pubmed.rb +5 -0
metadata +94 -97
data/share/install/InterPro/Rakefile +0 -29

data/lib/rbbt/sources/tfacts.rb CHANGED Viewed

@@ -1,64 +1,65 @@
 require 'rbbt'
 require 'rbbt/resource'
-require 'nokogiri'
+require 'spreadsheet'
 module TFacts
   extend Resource
   self.subdir = "share/databases/TF"
-  def self.targets_for_gene_unsigned(gene_name)
-    doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
+  TFacts.claim TFacts.source["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
-    doc.css("td a").collect{|link| link.content.strip}
-  end
+  TFacts.claim TFacts.targets, :proc do
+    book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
+    sheet = book.worksheet 0
-  def self.targets_for_gene_signed(gene_name)
-    doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
+    tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name"], :namespace => "Hsa", :type => :flat)
+    sheet.each do |row|
+      target, tf = row.values_at 0, 1
+      tsv[target] ||= []
+      tsv[target] << tf
+    end
-    rows = doc.css("tr")
-    rows.shift
-    targets = {}
-    rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
-    targets
+    tsv.to_s
   end
-  def self.known_transcription_factors_signed
-    Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
-  end
+  TFacts.claim TFacts.targets_signed, :proc do
+    book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
+    sheet = book.worksheet 0
-  def self.known_transcription_factors_unsigned
-    Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
-  end
+    tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name", "Sign"], :namespace => "Hsa", :type => :double)
+    sheet.each do |row|
+      target, tf, sign = row.values_at 0, 1, 2
+      tsv[target] ||= [[],[]]
+      tsv[target][0] << tf
+      tsv[target][1] << sign
+    end
-  TFacts.claim TFacts.targets, :proc do
-    tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
-    TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
     tsv.to_s
   end
-  TFacts.claim TFacts.targets_signed, :proc do
-    tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
-    Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
-      tsv[tf] = [targets.keys, targets.values]
-    end
-    tsv.to_s
+  TFacts.claim TFacts.regulators, :proc do
+    TFacts.targets.tsv.reorder("Transcription Factor Associated Gene Name").to_s
   end
 end
 if defined? Entity and defined? Gene and Entity === Gene
   module Gene
     property :is_transcription_factor? => :array2single do
-      @is_trasncription_factor ||= begin
-                                     tfs = TFacts.targets.keys
-                                     self.name.collect{|gene| tfs.include? gene}
-                                   end
+      tfs = TFacts.targets.keys
+      self.name.collect{|gene| tfs.include? gene}
+    end
+    persist :_ary_is_transcription_factor?
+    property :transcription_regulators => :array2single do
+      Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
     end
+    persist :_ary_transcription_regulators
-   property :transcription_targets => :array2single do
-     @transcription_targets ||= begin
-                                  Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
-                                end
-   end
+    property :transcription_targets => :array2single do
+      Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
+    end
+    persist :_ary_transcription_targets
   end
 end

data/lib/rbbt/sources/uniprot.rb CHANGED Viewed

@@ -3,34 +3,32 @@ require 'rbbt/resource'
 require 'rbbt/sources/cath'
 require 'rbbt/sources/uniprot'
-module Uniprot
+module UniProt
   extend Resource
-  self.subdir = "share/databases/Uniprot"
+  self.subdir = "share/databases/UniProt"
-  Uniprot.claim Uniprot.annotated_variants, :proc do
+  UniProt.claim UniProt.annotated_variants, :proc do
     url = "http://www.uniprot.org/docs/humsavar.txt"
     tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
-                   :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
-                   :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
+                   :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[1..5] + [(parts[6..-1] || []) * " "]) * "\t"},
+                   :type => :double,
+                   :merge => true,
+                   :key_field => "UniProt/SwissProt Accession",
+                   :fields => ["UniProt Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
     tsv.unnamed = true
-    tsv.process "Amino Acid Mutation" do |mutation|
-      if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
-        wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
-        mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
-        [wt, $2, mut] * ""
-      else
-        mutation
+    tsv.process "Amino Acid Mutation" do |mutations|
+      mutations.collect do |mutation|
+        if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
+          wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
+          mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
+          [wt, $2, mut] * ""
+        else
+          mutation
+        end
       end
     end
-    uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
-    mutation_pos = tsv.identify_field "Amino Acid Mutation"
-    tsv.add_field "Mutated Isoform" do |key, values|
-      [values[uniprot_pos], values[mutation_pos]] * ":"
-    end
-    tsv.reorder("Mutated Isoform").to_s
+    tsv.to_s
   end
@@ -44,7 +42,12 @@ module Uniprot
     text.split(/\n/).each{|l|
       next unless l =~ /^DR\s+PDB; (.*)\./
       id, method, resolution, region = $1.split(";").collect{|v| v.strip}
-      chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
+      begin
+        chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
+      rescue
+        Log.warn("Error process Uniprot PDB line: #{line}")
+        next
+      end
       pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
     }
     pdb
@@ -96,7 +99,6 @@ module Uniprot
     variants
   end
   def self.cath(protein)
     url = UNIPROT_TEXT.sub "[PROTEIN]", protein
     text = Open.read(url)
@@ -118,7 +120,7 @@ module Uniprot
   end
   def self.pdbs_covering_aa_position(protein, aa_position)
-    Uniprot.pdbs(protein).select do |pdb, info|
+    UniProt.pdbs(protein).select do |pdb, info|
       info[:region].include? aa_position
     end
   end

data/share/Ensembl/release_dates ADDED Viewed

@@ -0,0 +1,18 @@
+#: :type=:single
+#Release	build
+current	jul2012
+release-68	jul2012
+release-67	may2012
+release-66	feb2012
+release-65	dec2011
+release-64	sep2011
+release-63	jun2011
+release-62	apr2011
+release-61	feb2011
+release-60	nov2010
+release-59	aug2010
+release-58	may2010
+release-57	mar2010
+release-56	sep2009
+release-55	jul2009
+release-54	may2009

data/share/install/Genomes1000/Rakefile ADDED Viewed

@@ -0,0 +1,15 @@
+rule /(.+)/ do |t|
+  require 'net/ftp'
+  chromosome = File.basename(t.name)
+  ftp = Genomes1000::URL
+  ftp = Net::FTP.new(Genomes1000::FTP_SERVER)
+  ftp.login
+  ftp.chdir(Genomes1000::FTP_PATH)
+  file = ftp.list("*.chr" + chromosome + ".*").collect{|l| l.split(" ").last}.last
+  ddd file
+  exit
+end

data/share/install/JoChem/Rakefile CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rbbt/util/misc'
 def read_chunk(jochem)
   chunk = ""
-  while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
+  while (not jochem.eof? and not (line = Misc.fixutf8(jochem.gets)).match(/^--/))
     chunk << line
   end
   return nil if chunk.empty?
@@ -20,7 +20,7 @@ def process_jochem
   identifiers = File.open('identifiers', 'w')
   identifiers.puts("#: :namespace=JoChem")
-  identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
+  identifiers.puts("#ID\tCompound Name\tPubChem:Substance ID\tPubChem:Coumpound ID\tDrugBank:ID\tChemIDplus:ID\tCAS:ID\tMeSH:Term\tChEBI:ID\tHMDB:ID\tKEGG:Coumpound ID\tKEGG:Drug ID")
   lexicon = File.open('lexicon', 'w')
   lexicon.puts("#: :namespace=JoChem")
@@ -49,12 +49,20 @@ def process_jochem
     tm = info["TM"] || []
     db = info["DB"] || []
+    cheb = db.collect{|code| code.match(/CHEB_(.*)/) ? $1 : nil}.compact
+    chid = db.collect{|code| code.match(/CHID_(.*)/) ? $1 : nil}.compact
     pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
+    pubs = db.collect{|code| code.match(/PUBS_(.*)/) ? $1 : nil}.compact
     drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
+    cas  = db.collect{|code| code.match(/CAS_(.*)/) ? $1 : nil}.compact
+    mesh = db.collect{|code| code.match(/MESH_(.*)/) ? $1 : nil}.compact
+    hmdb = db.collect{|code| code.match(/HMDB_(.*)/) ? $1 : nil}.compact
+    kegg = db.collect{|code| code.match(/KEGG_(.*)/) ? $1 : nil}.compact
+    kegd = db.collect{|code| code.match(/KEGD_(.*)/) ? $1 : nil}.compact
     inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
     lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
-    identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
+    identifiers.puts [id, na, pubs * "|", pubc * "|", drug * "|", chid * "|", cas * "|", mesh * "|", cheb * "|", hmdb * "|", kegg * "|", kegd * "|" ] * "\t"
     inchi.puts [id, inch * "|"] * "\t" if inch.any?
     definitions.puts [id, df] * "\t" unless df.nil?
   end

data/share/install/NCI/Rakefile CHANGED Viewed

@@ -1,7 +1,8 @@
 require 'nokogiri'
+require 'rbbt-util'
 module NCI
-  def self.get_pathways(xml, format = "UP")
+  def self.get_pathways(xml, format = "UP", get_short_name = false)
     doc = Nokogiri::XML(xml)
     pathways = {}
@@ -9,36 +10,60 @@ module NCI
     doc.xpath("//Molecule").each do |molecule|
       id = molecule.attribute('id').value
       type = molecule.attribute('molecule_type').value
-      next unless type == "protein"
       names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
-      next if names.empty?
-      molecules[id] = {:xml => molecule, :uniprot => names.first}
+      molecules[id] = {:xml => molecule, :proteins => names}
     end
     interactions = {}
     doc.xpath("//Interaction").each do |interaction|
       id = interaction.attribute('id').value
-      molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}
+      type = interaction.attribute('interaction_type').value
+      molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}.flatten.compact
+      pathway_ids = interaction.xpath('Abstraction').collect{|c| c.attribute('pathway_idref').value}.flatten.compact
-      interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
+      interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
     end
     doc.xpath("//Pathway").each do |pathway|
       id = pathway.attribute('id').value
       subnet = pathway.attribute('subnet').value
       name = pathway.xpath('LongName').first.content
+      short_name = pathway.xpath('ShortName').first.content if get_short_name
-      interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
+      interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
-      pathway_interactions = interaction_ids.collect{|id| interactions[id]}
-      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
+      pathway_interactions = interaction_ids.collect{|i| interactions[i]}
+      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.compact.flatten
-      pathway_uniprot_ids = pathway_molecule_ids.collect do |id|
-        next unless molecules.include? id
-        molecules[id][:uniprot]
+      pathway_uniprot_ids = pathway_molecule_ids.collect do |i|
+        next unless molecules.include? i
+        molecules[i][:proteins]
+      end
+      if get_short_name
+        pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq], [short_name]]
+      else
+        pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq]]
       end
-      pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
     end
+    doc.xpath("//Pathway").each do |pathway|
+      id = pathway.attribute('id').value
+      subnet = pathway.attribute('subnet').value
+      name = pathway.xpath('LongName').first.content
+      interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
+      pathway_interactions = interaction_ids.collect{|i| interactions[i]}
+      pathway_subnet_ids = pathway_interactions.collect{|info| info[:pathway_ids]}.compact.flatten
+      pathway_subnet_ids.collect do |nid|
+        next unless pathways.include? nid
+        new_genes = pathways[id].last
+        pathways[nid][1] = (pathways[nid][1] + new_genes).uniq
+      end
+    end
     pathways
   end
 end
@@ -60,11 +85,25 @@ file 'biocarta_pathways' do |t|
   xml = Open.read(url)
-  pathways = NCI.get_pathways(xml, "LL")
+  pathways = NCI.get_pathways(xml, "LL", true)
-  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
+  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
 end
+file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
+  orig = TSV.open(Open.open(t.prerequisites.first))
+  tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])
+  orig.through do |key, values|
+    name, genes, short = values
+    code = "h_" + short.first
+    tsv[code] = [name, genes]
+  end
+  Open.write(t.name, tsv.to_s)
+end
 file 'reactome_pathways' do |t|
   url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
@@ -76,4 +115,3 @@ file 'reactome_pathways' do |t|
   Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
 end

data/share/install/Organism/Hsa/Rakefile CHANGED Viewed

@@ -43,7 +43,7 @@ $biomart_probe_identifiers = [
   [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
   [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
   [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
-  [ 'Agilent WholeGenome',"agilent_wholegenome" ],
+  #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
   [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
   [ 'Codelink ID', 'codelink' ],
   [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
@@ -62,6 +62,7 @@ $biomart_identifiers = [
   [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
   [ 'HGNC ID', "hgnc_id", 'HGNC'],
   [ 'EMBL (Genbank) ID' , "embl"] ,
+  [ 'RefSeq mRNA' , "refseq_mrna"] ,
   # Probes
   [ 'AFFY HC G110', 'affy_hc_g110' ],
@@ -80,7 +81,7 @@ $biomart_identifiers = [
   [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
   [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
   [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
-  [ 'Agilent WholeGenome',"agilent_wholegenome" ],
+  #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
   [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
   [ 'Codelink ID', 'codelink' ],
   [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],

data/share/install/Organism/Rno/Rakefile CHANGED Viewed

@@ -19,12 +19,12 @@ $biomart_lexicon = [
 ]
 $biomart_identifiers = [
+  ['Entrez Gene ID', "entrezgene"],
   ['Associated Gene Name' , "external_gene_id"],
   ['Protein ID' , "protein_id"] ,
   ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
   ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
   ['RefSeq Protein ID' , "refseq_peptide"] ,
-  ['RefSeq DNA ID' , "refseq_dna"] ,
   ['EMBL (Genbank) ID' , "embl"] ,
   ['RGD ID' , "rgd"] ,
   ['RGD Symbol' , "rgd_symbol"] ,
@@ -39,7 +39,6 @@ $biomart_identifiers = [
   ['Affy rg u34c', "affy_rg_u34c"],
   ['Affy rn u34', "affy_rn_u34"],
   ['Affy rt u34', "affy_rt_u34"],
-  ['Agilent WholeGenome',"agilent_wholegenome" ],
   ['Codelink ID ', "codelink"],
 ]

data/share/install/Organism/Sce/Rakefile CHANGED Viewed

@@ -4,50 +4,48 @@ require 'rbbt/sources/entrez'
 require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [559292,4932]
-$native = "SGD ID"
-$url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
-$biomart_db = 'scerevisiae_gene_ensembl'
-$biomart_main = ['Entrez Gene ID', 'entrezgene']
-$ortholog_key = "yeast_ensembl_gene"
-file 'scientific_name' do |t|
-  File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
-end
-file 'lexicon' do |t|
-  lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
-  lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
-  lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
-  lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
+$scientific_name = "Saccharomyces cerevisiae"
+#$ortholog_key = "yeast_ensembl_gene"
-  File.open(t.name, 'w') do |f| f.puts lexicon end
-end
-file 'identifiers' do |t|
-  identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
-  identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
-  identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
-                [['Associated Gene Name' , "external_gene_id"],
-                  ['Ensembl Gene ID', "ensembl_gene_id"  ],
-                  ['Ensembl Protein ID', "ensembl_peptide_id"  ],
-                  ['RefSeq Protein ID' , "refseq_peptide"] ,
-                  ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
-                  ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
-                  ['Protein ID' , "protein_id"] ,
-                  ['EMBL (Genbank) ID' , "embl"] ,
-                  # Affymetrix
-                  ['Affy yeast 2',"affy_yeast_2"],
-                  ['Affy yg s98', "affy_yg_s98"]])
-  File.open(t.name, 'w') do |f| f.puts identifiers end
-end
-task :default => ['name', 'lexicon', 'identifiers']
+$biomart_db = 'scerevisiae_gene_ensembl'
+$biomart_lexicon = [
+  [ 'Associated Gene Name' , "external_gene_id"],
+]
+$biomart_protein_identifiers = [
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+]
+$biomart_probe_identifiers = [
+]
+$biomart_identifiers = [
+  [ 'Entrez Gene ID', "entrezgene"],
+  [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
+  [ 'Associated Gene Name', "external_gene_id"  ],
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+  [ 'EMBL (Genbank) ID' , "embl"] ,
+  [ 'RefSeq mRNA' , "refseq_mrna"] ,
+]
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+$biomart_go_2009= [
+  ["GO BP ID", 'go_biological_process_id'],
+  ["GO MF ID", 'go_molecular_function_id'],
+  ["GO CC ID", 'go_cellular_component_id'],
+]
+$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
+load File.join(File.dirname(__FILE__), '../organism_helpers.rb')