RubyGems - rbbt-sources - Versions diffs - 3.3.0 → 3.4.1 - Mend

rbbt-sources 3.3.0 → 3.4.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/etc/allowed_biomart_archives +2 -4
data/etc/biomart/missing_in_archive +2 -0
data/etc/build_organism +4 -4
data/etc/organisms +1 -0
data/lib/rbbt/sources/biomart.rb +48 -13
data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
data/lib/rbbt/sources/entrez.rb +13 -0
data/lib/rbbt/sources/go.rb +2 -2
data/lib/rbbt/sources/mesh.rb +26 -0
data/lib/rbbt/sources/organism.rb +45 -24
data/lib/rbbt/sources/pubmed.rb +13 -2
data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
data/share/install/Organism/Sce.rake +38 -0
data/share/install/Organism/organism_helpers.rb +126 -53
data/share/install/lib/rake_helper.rb +2 -2
data/test/rbbt/sources/test_biomart.rb +44 -6
data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
data/test/rbbt/sources/test_entrez.rb +5 -0
data/test/rbbt/sources/test_mesh.rb +10 -0
data/test/rbbt/sources/test_organism.rb +15 -15
data/test/rbbt/sources/test_pubmed.rb +18 -8
metadata +12 -7
data/share/install/Organism/Sce/Rakefile +0 -52

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b10dbe140b4c0733476823e5f5d94e57a3d9a755fc370f6b9640d1e7b8efc368
-  data.tar.gz: 38aaf56670a07537ad0ef0c025d17e655fc5d7fb87d97ee1c08d0af82c44fbbd
+  metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
+  data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
 SHA512:
-  metadata.gz: a8ac9df1da30fc7aec3c54a5a200a0c7a9629807b9238089a1e8064e78b0ecd5bad36c4b6a77fac7e7cfdf332ad56be06149b12d0e0fd7f6506b0b82d2e03bcf
-  data.tar.gz: acff50e8bdb0d4443c3e1dbd237539953206b7d5dcb886db64ec0677f7bba43cf3a9782e4147985a9b3fc1b34df692e89fb2b7185f2aa0f93ccd196a4d19d54a
+  metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
+  data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0

data/etc/allowed_biomart_archives CHANGED Viewed

@@ -1,8 +1,6 @@
 may2009
 feb2014
-may2017
-oct2018
-apr2019
+may2015
+sep2019
 feb2021
 feb2023
-oct2016

data/etc/biomart/missing_in_archive CHANGED Viewed

@@ -19,6 +19,8 @@
     - refseq_ncrna_predicted
 ">jun2015":
     - uniprot_swissprot_accession~uniprot_swissprot
+">jan2023":
+    - external_transcript_id~external_transcript_name
 <aug2014:
     - external_gene_name~external_gene_id
 may2010:

data/etc/build_organism CHANGED Viewed

@@ -2,7 +2,7 @@
 hg18	Hsa/may2008
 hg19	Hsa/feb2014
 b37	Hsa/feb2014
-hg38	Hsa/may2017
-GRCh38	Hsa/may2017
-mm10	Mmu/may2017
-GRCm38	Mmu/may2017
+hg38	Hsa/feb2023
+GRCh38	Hsa/feb2023
+mm10	Mmu/feb2023
+GRCm38	Mmu/feb2023

data/etc/organisms CHANGED Viewed

@@ -1,2 +1,3 @@
 Hsa
 Mmu
+Sce

data/lib/rbbt/sources/biomart.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'rbbt/tsv'
 require 'rbbt/tsv/attach'
 require 'rbbt/util/log'
 require 'cgi'
+require 'rbbt/sources/organism'
 # This module interacts with BioMart. It performs queries to BioMart and
 # synthesises a hash with the results. Note that this module connects to the
@@ -13,7 +14,7 @@ module BioMart
   class BioMart::QueryError < StandardError; end
-  BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='
+  BIOMART_URL = 'ensembl.org/biomart/martservice'
   MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
@@ -22,7 +23,7 @@ module BioMart
   @@biomart_query_xml = <<-EOT
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
-<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
+<Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
 <Dataset name = "<!--DATABASE-->" interface = "default" >
 <!--FILTERS-->
 <!--MAIN-->
@@ -36,14 +37,10 @@ module BioMart
       raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
     end
     Thread.current['archive'] = date
-    Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
-    Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
   end
   def self.unset_archive
-    Log.debug "Restoring current version URL #{BIOMART_URL}"
     Thread.current['archive'] = nil
-    Thread.current['archive_url'] = nil
   end
   def self.with_archive(data)
@@ -55,6 +52,21 @@ module BioMart
     end
   end
+  def self.final_url(query, archive = nil, ensembl_domain = nil)
+    url_domain = if archive.nil?
+      if ensembl_domain.nil?
+        'www'
+      else
+        ensembl_domain
+      end
+    elsif ensembl_domain
+      [archive, ensembl_domain] * "-"
+    else
+      [archive, 'archive'] * "."
+    end
+    "http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
+  end
   def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
     open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
     repeats = true
@@ -75,11 +87,17 @@ module BioMart
     query = @@biomart_query_xml.dup
     query.sub!(/<!--DATABASE-->/,database)
+    if Thread.current["ensembl_domain"]
+      query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
+    else
+      query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
+    end
     query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
     query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
     query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
-    url = Thread.current['archive_url'] ? Thread.current['archive_url'] + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
+    url = final_url(query,  Thread.current["archive"], Thread.current["ensembl_domain"])
     begin
       response = Open.read(url, open_options.dup)
@@ -105,10 +123,17 @@ module BioMart
     new_datafile = TmpFile.tmp_file
     if data.nil?
-      TSV.merge_row_fields Open.open(result_file), new_datafile
+      Open.open(result_file) do |file|
+        Open.write(new_datafile, Open.collapse_stream(file))
+      end
       data = new_datafile
     else
-      TSV.merge_different_fields data, result_file, new_datafile
+      Open.open(result_file) do |stream_result|
+        Open.open(data) do |stream_data|
+          Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
+        end
+      end
+      #TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
       FileUtils.rm data
       data = new_datafile
     end
@@ -142,9 +167,9 @@ module BioMart
     IndiferentHash.setup(open_options)
-    Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
+    Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
-    max_items = 2
+    max_items = 1
     chunks = []
     chunk = []
     attrs.each{|a|
@@ -178,7 +203,7 @@ module BioMart
       results
     else
       Open.write(filename) do |f|
-        f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
+        f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
         if field_names.nil?
           f.puts "#" << [main, attrs].flatten * "\t"
         else
@@ -211,7 +236,17 @@ module BioMart
     changes = {}
     missing.select{|m| m.include? "~" }.each do |str|
       orig,_sep, new = str.partition "~"
-      changes[orig] = new
+      if orig.include?(":")
+        target_db, _sep, orig = orig.partition(":")
+        if target_db[0] == "-"
+          next if database == target_db[1..-1]
+        else
+          next unless database == target_db
+        end
+        changes[orig] = new
+      else
+        changes[orig] = new
+      end
     end
     changed = true
     while changed

data/lib/rbbt/sources/ensembl_ftp.rb CHANGED Viewed

@@ -9,11 +9,29 @@ module Ensembl
   module FTP
     SERVER = "ftp.ensembl.org"
+    DOMAIN_SERVER = "ftp.ensemblgenomes.org"
-    def self.mysql_path(release)
+    def self.ftp_name_for_domain(domain, organism, subdir='mysql')
+      code, build = organism.split "/"
+      build ||= "current"
+      release = build == "current" ? 'current' : Ensembl.releases[build]
+      name = Organism.scientific_name(organism)
+      ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
+      ftp.passive = true
+      ftp.login
+      dir = File.join('pub', domain,  'current', subdir)
+      ftp.chdir(dir)
+      file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f|  f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
+      ftp.close
+      [release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
     end
-    def self.ftp_name_for(organism)
+    def self.ftp_name_for(organism, subdir='mysql')
+      if domain = Thread.current["ensembl_domain"]
+        return ftp_name_for_domain(domain, organism,subdir)
+      end
       code, build = organism.split "/"
       build ||= "current"
@@ -23,8 +41,9 @@ module Ensembl
         ftp = Net::FTP.new(Ensembl::FTP::SERVER)
         ftp.passive = true
         ftp.login
-        ftp.chdir(File.join('pub', 'current_mysql'))
-        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
+        dir = File.join('pub', "current_#{subdir}")
+        ftp.chdir(dir)
+        file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
         ftp.close
       else
         release = Ensembl.releases[build]
@@ -32,24 +51,21 @@ module Ensembl
         ftp = Net::FTP.new(Ensembl::FTP::SERVER)
         ftp.passive = true
         ftp.login
-        ftp.chdir(File.join('pub', release, 'mysql'))
-        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
+        dir = File.join('pub', release, subdir)
+        ftp.chdir(dir)
+        file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
         ftp.close
       end
-      [release, file]
+      [release, File.join(Ensembl::FTP::SERVER, dir, file)]
     end
-    def self.ftp_directory_for(organism)
-      release, ftp_name = ftp_name_for(organism)
-      if release == 'current'
-        File.join('/pub/', 'current_mysql', ftp_name)
-      else
-        File.join('/pub/', release, 'mysql', ftp_name)
-      end
+    def self.ftp_url_for(organism)
+      release, ftp_url = ftp_name_for(organism)
+      ftp_url
     end
     def self.base_url(organism)
-      File.join("ftp://" + SERVER, ftp_directory_for(organism) )
+      File.join("ftp://", ftp_url_for(organism) )
     end
     def self.url_for(organism, table, extension)

data/lib/rbbt/sources/entrez.rb CHANGED Viewed

@@ -8,6 +8,19 @@ module Entrez
   Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
   Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
+  Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
+    TmpFile.with_dir do |dir|
+      Misc.in_dir dir do
+        CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
+        CMD.cmd("tar xvfz taxdump.tar.gz")
+        CMD.cmd("grep 'scientific name' names.dmp  |cut -f 1,3 > tmp.tsv")
+        tsv = TSV.open('tmp.tsv', type: :single)
+        tsv.key_field = "Entrez Tax ID"
+        tsv.fields = ["Scientific Name"]
+        Open.write(filename, tsv.to_s)
+      end
+    end
+  end
   def self.entrez2native(taxs, options = {})
     options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -25,8 +25,8 @@ module GO
   # the gene_ontology.obo file and extracts all the fields, although right now,
   # only the name field is used.
   def self.init
-    Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
-      info.serializer = :marshal if info.respond_to? :serializer
+    Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
+      #info.serializer = :marshal if info.respond_to? :serializer
       Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
         term_info = {}

data/lib/rbbt/sources/mesh.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'rbbt-util'
+require 'rbbt/resource'
+module MeSH
+  extend Resource
+  self.subdir = "share/databases/MeSH"
+  MeSH.claim MeSH["data.gz"], :url, "https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/mesh.nt.gz"
+  MeSH.claim MeSH.vocabulary, :proc do
+    dumper = TSV::Dumper.new :key_field => "MeSH ID", :fields => ["Label"], :type => :single
+    dumper.init
+    TSV.traverse MeSH.data, :type => :array, :into => dumper, :bar => "Processing MeSH vocab" do |line|
+      sub, verb, obj = line.split("\t")
+      next unless verb && verb.include?("rdf-schema#label")
+      id = sub.split("/").last[0..-2]
+      label = obj.split('"')[1]
+      [id, label]
+    end
+  end
+end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -9,6 +9,10 @@ module Organism
   ARCHIVE_MONTH_INDEX = {}
   %w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
+  def self.rake_organism_helper
+    Rbbt.share.install.Organism["organism_helpers.rb"].find
+  end
   def self.compare_archives(a1, a2)
     a1 = a1.partition("/").last if a1 and a1.include? "/"
     a2 = a2.partition("/").last if a2 and a2.include? "/"
@@ -29,7 +33,8 @@ module Organism
   end
   def self.default_code(organism = "Hsa")
-    organism.split("/").first << "/feb2014"
+    latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
+    organism.split("/").first << "/" << latest
   end
   def self.organism_codes(organism = nil)
@@ -43,7 +48,8 @@ module Organism
   end
   def self.installed_organisms
-    Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
+    Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
+    Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
   end
   def self.prepared_organisms
@@ -62,25 +68,6 @@ module Organism
       nil
   end
-  Organism.installable_organisms.each do |organism|
-    claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
-    module_eval "#{ organism } = with_key '#{organism}'"
-  end
-  Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
-    Open.mkdir File.dirname(file) unless File.directory?(file)
-    url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
-    CMD.cmd_log("wget '#{url}' -O '#{file}'")
-    CMD.cmd("chmod 0755 '#{file}'")
-    Rbbt.set_software_env
-    nil
-  end
-  CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
-  Rbbt.set_software_env
   def self.hg_build(organism)
     require 'rbbt/sources/ensembl_ftp'
     organism = organism.strip
@@ -257,7 +244,16 @@ module Organism
   end
   def self.scientific_name(organism)
-    Organism[organism]["scientific_name"].produce.read.strip
+    Organism[organism].scientific_name.read.strip
+  end
+  def self.make_organism(name, long = false)
+    first, _, second = name.partition(/[ _]/)
+    if long
+      first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
+    else
+      first[0].upcase + second[0..1].downcase
+    end
   end
   def self.organism(name)
@@ -295,7 +291,7 @@ module Organism
     organism ||= "Hsa"
     @@gene_start_end ||= {}
-    gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unmamed => true)
+    gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
     ranges = genes.collect{|gene|
       start, eend = gene_start_end[gene]
@@ -339,7 +335,8 @@ module Organism
   def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
     chromosome_sizes = {}
-    Organism[organism].glob_all("chromosome_*").each do |file|
+    Organism.chromosomes(organism).produce.tsv.each do |chr|
+      file = Organism[organism]["chromosome_#{chr}"].produce.find
       chromosome = file.split("_").last.split(".").first
       size = if Open.gzip?(file) || Open.bgzip?(file)
                CMD.cmd("zcat '#{ file }' | wc -c ").read
@@ -352,4 +349,28 @@ module Organism
     chromosome_sizes
   end
+  Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
+    Open.mkdir File.dirname(file) unless File.directory?(file)
+    url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
+    CMD.cmd_log("wget '#{url}' -O '#{file}'")
+    CMD.cmd("chmod 0755 '#{file}'")
+    Rbbt.set_software_env
+    nil
+  end
+  CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
+  Rbbt.set_software_env
+  Organism.installable_organisms.each do |organism|
+    if Rbbt.share.install.Organism[organism].Rakefile.exists?
+      rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
+    else
+      rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
+    end
+    claim Organism[organism], :rake, rakefile
+    module_eval "#{ organism } = with_key '#{organism}'"
+  end
 end

data/lib/rbbt/sources/pubmed.rb CHANGED Viewed

@@ -51,6 +51,7 @@ module PubMed
       end
       [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
     end
     def self.parse_xml(xml)
       require 'nokogiri'
@@ -91,6 +92,16 @@ module PubMed
         [lastname, forename] * ", "
       end * " and "
+      info[:mesh] = parser.search("MeshHeadingList/MeshHeading").collect do |mesh|
+        descriptor = mesh.search("DescriptorName").first.attr('UI')
+        qualifiers = mesh.search("QualifierName").collect{|q| q.attr('UI')}
+        [descriptor] + qualifiers.collect{|q| descriptor + q }
+      end.compact.flatten
+      info[:substance] = parser.search("NameOfSubstance").collect do |substance|
+        substance.attr('UI')
+      end
       info[:bibentry] = bibentry.downcase if bibentry
       info[:pmc_pdf] = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
@@ -102,7 +113,7 @@ module PubMed
       info
     end
-    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
+    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url, :mesh, :substance
     attr_accessor *XML_KEYS.collect{|p| p.first }
     def initialize(xml)
@@ -141,7 +152,7 @@ module PubMed
                  `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
                  TmpFile.with_file do |txt|
                    `pdftotext #{ pdf } #{ txt }`
-                   text = Open.read(txt) if File.exists? txt
+                   text = Open.read(txt) if File.exist?(txt)
                  end
                end
                text

data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [9606]
 $scientific_name = "Homo sapiens"
 $ortholog_key = "hsapiens_homolog_ensembl_gene"
@@ -95,17 +90,30 @@ $biomart_identifiers = [
   [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
 ]
-$biomart_go= [
-  ["GO ID", 'go_id'],
-  ["GO Namespace", 'namespace_1003'],
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper
+file 'regulators' do |t|
+  regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
+  regulatory_fields = [
+    ['Chromosome Name','chromosome_name'],
+    ['Region Start', 'chromosome_start'],
+    ['Region End', 'chromosome_end'],
+    ['Feature type', 'feature_type_name'],
 ]
+  regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
+  Misc.sensiblewrite(t.name, regulators.to_s)
+end
-$biomart_go_2009= [
-  ["GO BP ID", 'go_biological_process_id'],
-  ["GO MF ID", 'go_molecular_function_id'],
-  ["GO CC ID", 'go_cellular_component_id'],
+file 'regulator_activity' do |t|
+  regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
+  regulatory_fields = [
+    ['Epigenome name','epigenome_name'],
+    ['Activity', 'activity'],
 ]
+  regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
-#$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+  Misc.sensiblewrite(t.name, regulators.to_s)
+end

data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [10090]
 $scientific_name = "Mus musculus"
 $ortholog_key = "mmusculus_homolog_ensembl_gene"
@@ -43,18 +38,6 @@ $biomart_identifiers = [
   [ 'EMBL (Genbank) ID' , "embl"] ,
 ]
-$biomart_go= [
-  ["GO ID", 'go_id'],
-  ["GO Namespace", 'namespace_1003'],
-]
-$biomart_go_2009= [
-  ["GO BP ID", 'go_biological_process_id'],
-  ["GO MF ID", 'go_molecular_function_id'],
-  ["GO CC ID", 'go_cellular_component_id'],
-]
-$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper

data/share/install/Organism/{Rno/Rakefile → Rno.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [10116]
 $scientific_name = "Rattus norvegicus"
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
   [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
 ]
-$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper

data/share/install/Organism/Sce.rake ADDED Viewed

@@ -0,0 +1,38 @@
+$taxs = [559292,4932]
+$scientific_name = "Saccharomyces cerevisiae"
+$ensembl_domain = 'fungi'
+#$ortholog_key = "yeast_ensembl_gene"
+$biomart_db = 'scerevisiae_eg_gene'
+$biomart_lexicon = [
+  [ 'Associated Gene Name' , "external_gene_name"],
+]
+$biomart_protein_identifiers = [
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+]
+$biomart_probe_identifiers = [
+]
+$biomart_identifiers = [
+  [ 'Entrez Gene ID', "entrezgene"],
+  [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
+  [ 'Associated Gene Name', "external_gene_name"  ],
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+  [ 'EMBL (Genbank) ID' , "embl"] ,
+  [ 'RefSeq DNA' , "refseq_dna"] ,
+]
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+Thread.current["ensembl_domain"] = $ensembl_domain
+load Organism.rake_organism_helper