RubyGems - rbbt-sources - Versions diffs - 3.4.0 → 3.4.1 - Mend

rbbt-sources 3.4.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +4 -4
data/etc/allowed_biomart_archives +2 -4
data/etc/biomart/missing_in_archive +2 -0
data/etc/build_organism +4 -4
data/etc/organisms +1 -0
data/lib/rbbt/sources/biomart.rb +48 -13
data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
data/lib/rbbt/sources/entrez.rb +13 -0
data/lib/rbbt/sources/go.rb +2 -2
data/lib/rbbt/sources/organism.rb +45 -24
data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
data/share/install/Organism/Sce.rake +38 -0
data/share/install/Organism/organism_helpers.rb +123 -50
data/test/rbbt/sources/test_biomart.rb +44 -6
data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
data/test/rbbt/sources/test_entrez.rb +5 -0
data/test/rbbt/sources/test_organism.rb +15 -15
metadata +9 -7
data/share/install/Organism/Sce/Rakefile +0 -52

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 87f97c6af3dab4a1b39cb258acdf9bf4d105df5703a04d6264f960ff79e81faa
-  data.tar.gz: ff91f67bc0775e0a20678ede8eeb312fa1e7a42d18095c2d9bcb1a5c0e4fc000
+  metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
+  data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
 SHA512:
-  metadata.gz: 0b23136a81511a1ad55d5bb2af5784fd74512b9355bf40023a5197180bf25b69aefa966a3dafc8347f1864da174637fa0a2f95bb687a8973a4b23f5e6778398d
-  data.tar.gz: 4fca8a03899b980a18da56d9cdd56bc0136ce126c718ca61836fa3cf55313f77664b7ddecadba2ad45491c2e309604da3ce7288c5de70c660f498bfcc2849aec
+  metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
+  data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0

data/etc/allowed_biomart_archives CHANGED Viewed

@@ -1,8 +1,6 @@
 may2009
 feb2014
-may2017
-oct2018
-apr2019
+may2015
+sep2019
 feb2021
 feb2023
-oct2016

data/etc/biomart/missing_in_archive CHANGED Viewed

@@ -19,6 +19,8 @@
     - refseq_ncrna_predicted
 ">jun2015":
     - uniprot_swissprot_accession~uniprot_swissprot
+">jan2023":
+    - external_transcript_id~external_transcript_name
 <aug2014:
     - external_gene_name~external_gene_id
 may2010:

data/etc/build_organism CHANGED Viewed

@@ -2,7 +2,7 @@
 hg18	Hsa/may2008
 hg19	Hsa/feb2014
 b37	Hsa/feb2014
-hg38	Hsa/may2017
-GRCh38	Hsa/may2017
-mm10	Mmu/may2017
-GRCm38	Mmu/may2017
+hg38	Hsa/feb2023
+GRCh38	Hsa/feb2023
+mm10	Mmu/feb2023
+GRCm38	Mmu/feb2023

data/etc/organisms CHANGED Viewed

@@ -1,2 +1,3 @@
 Hsa
 Mmu
+Sce

data/lib/rbbt/sources/biomart.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'rbbt/tsv'
 require 'rbbt/tsv/attach'
 require 'rbbt/util/log'
 require 'cgi'
+require 'rbbt/sources/organism'
 # This module interacts with BioMart. It performs queries to BioMart and
 # synthesises a hash with the results. Note that this module connects to the
@@ -13,7 +14,7 @@ module BioMart
   class BioMart::QueryError < StandardError; end
-  BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='
+  BIOMART_URL = 'ensembl.org/biomart/martservice'
   MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
@@ -22,7 +23,7 @@ module BioMart
   @@biomart_query_xml = <<-EOT
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
-<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
+<Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
 <Dataset name = "<!--DATABASE-->" interface = "default" >
 <!--FILTERS-->
 <!--MAIN-->
@@ -36,14 +37,10 @@ module BioMart
       raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
     end
     Thread.current['archive'] = date
-    Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
-    Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
   end
   def self.unset_archive
-    Log.debug "Restoring current version URL #{BIOMART_URL}"
     Thread.current['archive'] = nil
-    Thread.current['archive_url'] = nil
   end
   def self.with_archive(data)
@@ -55,6 +52,21 @@ module BioMart
     end
   end
+  def self.final_url(query, archive = nil, ensembl_domain = nil)
+    url_domain = if archive.nil?
+      if ensembl_domain.nil?
+        'www'
+      else
+        ensembl_domain
+      end
+    elsif ensembl_domain
+      [archive, ensembl_domain] * "-"
+    else
+      [archive, 'archive'] * "."
+    end
+    "http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
+  end
   def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
     open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
     repeats = true
@@ -75,11 +87,17 @@ module BioMart
     query = @@biomart_query_xml.dup
     query.sub!(/<!--DATABASE-->/,database)
+    if Thread.current["ensembl_domain"]
+      query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
+    else
+      query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
+    end
     query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
     query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
     query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
-    url = Thread.current['archive_url'] ? Thread.current['archive_url'] + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
+    url = final_url(query,  Thread.current["archive"], Thread.current["ensembl_domain"])
     begin
       response = Open.read(url, open_options.dup)
@@ -105,10 +123,17 @@ module BioMart
     new_datafile = TmpFile.tmp_file
     if data.nil?
-      TSV.merge_row_fields Open.open(result_file), new_datafile
+      Open.open(result_file) do |file|
+        Open.write(new_datafile, Open.collapse_stream(file))
+      end
       data = new_datafile
     else
-      TSV.merge_different_fields data, result_file, new_datafile
+      Open.open(result_file) do |stream_result|
+        Open.open(data) do |stream_data|
+          Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
+        end
+      end
+      #TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
       FileUtils.rm data
       data = new_datafile
     end
@@ -142,9 +167,9 @@ module BioMart
     IndiferentHash.setup(open_options)
-    Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
+    Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
-    max_items = 2
+    max_items = 1
     chunks = []
     chunk = []
     attrs.each{|a|
@@ -178,7 +203,7 @@ module BioMart
       results
     else
       Open.write(filename) do |f|
-        f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
+        f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
         if field_names.nil?
           f.puts "#" << [main, attrs].flatten * "\t"
         else
@@ -211,7 +236,17 @@ module BioMart
     changes = {}
     missing.select{|m| m.include? "~" }.each do |str|
       orig,_sep, new = str.partition "~"
-      changes[orig] = new
+      if orig.include?(":")
+        target_db, _sep, orig = orig.partition(":")
+        if target_db[0] == "-"
+          next if database == target_db[1..-1]
+        else
+          next unless database == target_db
+        end
+        changes[orig] = new
+      else
+        changes[orig] = new
+      end
     end
     changed = true
     while changed

data/lib/rbbt/sources/ensembl_ftp.rb CHANGED Viewed

@@ -9,11 +9,29 @@ module Ensembl
   module FTP
     SERVER = "ftp.ensembl.org"
+    DOMAIN_SERVER = "ftp.ensemblgenomes.org"
-    def self.mysql_path(release)
+    def self.ftp_name_for_domain(domain, organism, subdir='mysql')
+      code, build = organism.split "/"
+      build ||= "current"
+      release = build == "current" ? 'current' : Ensembl.releases[build]
+      name = Organism.scientific_name(organism)
+      ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
+      ftp.passive = true
+      ftp.login
+      dir = File.join('pub', domain,  'current', subdir)
+      ftp.chdir(dir)
+      file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f|  f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
+      ftp.close
+      [release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
     end
-    def self.ftp_name_for(organism)
+    def self.ftp_name_for(organism, subdir='mysql')
+      if domain = Thread.current["ensembl_domain"]
+        return ftp_name_for_domain(domain, organism,subdir)
+      end
       code, build = organism.split "/"
       build ||= "current"
@@ -23,8 +41,9 @@ module Ensembl
         ftp = Net::FTP.new(Ensembl::FTP::SERVER)
         ftp.passive = true
         ftp.login
-        ftp.chdir(File.join('pub', 'current_mysql'))
-        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
+        dir = File.join('pub', "current_#{subdir}")
+        ftp.chdir(dir)
+        file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
         ftp.close
       else
         release = Ensembl.releases[build]
@@ -32,24 +51,21 @@ module Ensembl
         ftp = Net::FTP.new(Ensembl::FTP::SERVER)
         ftp.passive = true
         ftp.login
-        ftp.chdir(File.join('pub', release, 'mysql'))
-        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
+        dir = File.join('pub', release, subdir)
+        ftp.chdir(dir)
+        file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
         ftp.close
       end
-      [release, file]
+      [release, File.join(Ensembl::FTP::SERVER, dir, file)]
     end
-    def self.ftp_directory_for(organism)
-      release, ftp_name = ftp_name_for(organism)
-      if release == 'current'
-        File.join('/pub/', 'current_mysql', ftp_name)
-      else
-        File.join('/pub/', release, 'mysql', ftp_name)
-      end
+    def self.ftp_url_for(organism)
+      release, ftp_url = ftp_name_for(organism)
+      ftp_url
     end
     def self.base_url(organism)
-      File.join("ftp://" + SERVER, ftp_directory_for(organism) )
+      File.join("ftp://", ftp_url_for(organism) )
     end
     def self.url_for(organism, table, extension)

data/lib/rbbt/sources/entrez.rb CHANGED Viewed

@@ -8,6 +8,19 @@ module Entrez
   Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
   Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
+  Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
+    TmpFile.with_dir do |dir|
+      Misc.in_dir dir do
+        CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
+        CMD.cmd("tar xvfz taxdump.tar.gz")
+        CMD.cmd("grep 'scientific name' names.dmp  |cut -f 1,3 > tmp.tsv")
+        tsv = TSV.open('tmp.tsv', type: :single)
+        tsv.key_field = "Entrez Tax ID"
+        tsv.fields = ["Scientific Name"]
+        Open.write(filename, tsv.to_s)
+      end
+    end
+  end
   def self.entrez2native(taxs, options = {})
     options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -25,8 +25,8 @@ module GO
   # the gene_ontology.obo file and extracts all the fields, although right now,
   # only the name field is used.
   def self.init
-    Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
-      info.serializer = :marshal if info.respond_to? :serializer
+    Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
+      #info.serializer = :marshal if info.respond_to? :serializer
       Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
         term_info = {}

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -9,6 +9,10 @@ module Organism
   ARCHIVE_MONTH_INDEX = {}
   %w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
+  def self.rake_organism_helper
+    Rbbt.share.install.Organism["organism_helpers.rb"].find
+  end
   def self.compare_archives(a1, a2)
     a1 = a1.partition("/").last if a1 and a1.include? "/"
     a2 = a2.partition("/").last if a2 and a2.include? "/"
@@ -29,7 +33,8 @@ module Organism
   end
   def self.default_code(organism = "Hsa")
-    organism.split("/").first << "/feb2014"
+    latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
+    organism.split("/").first << "/" << latest
   end
   def self.organism_codes(organism = nil)
@@ -43,7 +48,8 @@ module Organism
   end
   def self.installed_organisms
-    Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
+    Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
+    Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
   end
   def self.prepared_organisms
@@ -62,25 +68,6 @@ module Organism
       nil
   end
-  Organism.installable_organisms.each do |organism|
-    claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
-    module_eval "#{ organism } = with_key '#{organism}'"
-  end
-  Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
-    Open.mkdir File.dirname(file) unless File.directory?(file)
-    url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
-    CMD.cmd_log("wget '#{url}' -O '#{file}'")
-    CMD.cmd("chmod 0755 '#{file}'")
-    Rbbt.set_software_env
-    nil
-  end
-  CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
-  Rbbt.set_software_env
   def self.hg_build(organism)
     require 'rbbt/sources/ensembl_ftp'
     organism = organism.strip
@@ -257,7 +244,16 @@ module Organism
   end
   def self.scientific_name(organism)
-    Organism[organism]["scientific_name"].produce.read.strip
+    Organism[organism].scientific_name.read.strip
+  end
+  def self.make_organism(name, long = false)
+    first, _, second = name.partition(/[ _]/)
+    if long
+      first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
+    else
+      first[0].upcase + second[0..1].downcase
+    end
   end
   def self.organism(name)
@@ -295,7 +291,7 @@ module Organism
     organism ||= "Hsa"
     @@gene_start_end ||= {}
-    gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unmamed => true)
+    gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
     ranges = genes.collect{|gene|
       start, eend = gene_start_end[gene]
@@ -339,7 +335,8 @@ module Organism
   def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
     chromosome_sizes = {}
-    Organism[organism].glob_all("chromosome_*").each do |file|
+    Organism.chromosomes(organism).produce.tsv.each do |chr|
+      file = Organism[organism]["chromosome_#{chr}"].produce.find
       chromosome = file.split("_").last.split(".").first
       size = if Open.gzip?(file) || Open.bgzip?(file)
                CMD.cmd("zcat '#{ file }' | wc -c ").read
@@ -352,4 +349,28 @@ module Organism
     chromosome_sizes
   end
+  Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
+    Open.mkdir File.dirname(file) unless File.directory?(file)
+    url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
+    CMD.cmd_log("wget '#{url}' -O '#{file}'")
+    CMD.cmd("chmod 0755 '#{file}'")
+    Rbbt.set_software_env
+    nil
+  end
+  CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
+  Rbbt.set_software_env
+  Organism.installable_organisms.each do |organism|
+    if Rbbt.share.install.Organism[organism].Rakefile.exists?
+      rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
+    else
+      rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
+    end
+    claim Organism[organism], :rake, rakefile
+    module_eval "#{ organism } = with_key '#{organism}'"
+  end
 end

data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [9606]
 $scientific_name = "Homo sapiens"
 $ortholog_key = "hsapiens_homolog_ensembl_gene"
@@ -95,17 +90,30 @@ $biomart_identifiers = [
   [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
 ]
-$biomart_go= [
-  ["GO ID", 'go_id'],
-  ["GO Namespace", 'namespace_1003'],
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper
+file 'regulators' do |t|
+  regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
+  regulatory_fields = [
+    ['Chromosome Name','chromosome_name'],
+    ['Region Start', 'chromosome_start'],
+    ['Region End', 'chromosome_end'],
+    ['Feature type', 'feature_type_name'],
 ]
+  regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
+  Misc.sensiblewrite(t.name, regulators.to_s)
+end
-$biomart_go_2009= [
-  ["GO BP ID", 'go_biological_process_id'],
-  ["GO MF ID", 'go_molecular_function_id'],
-  ["GO CC ID", 'go_cellular_component_id'],
+file 'regulator_activity' do |t|
+  regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
+  regulatory_fields = [
+    ['Epigenome name','epigenome_name'],
+    ['Activity', 'activity'],
 ]
+  regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
-#$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+  Misc.sensiblewrite(t.name, regulators.to_s)
+end

data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [10090]
 $scientific_name = "Mus musculus"
 $ortholog_key = "mmusculus_homolog_ensembl_gene"
@@ -43,18 +38,6 @@ $biomart_identifiers = [
   [ 'EMBL (Genbank) ID' , "embl"] ,
 ]
-$biomart_go= [
-  ["GO ID", 'go_id'],
-  ["GO Namespace", 'namespace_1003'],
-]
-$biomart_go_2009= [
-  ["GO BP ID", 'go_biological_process_id'],
-  ["GO MF ID", 'go_molecular_function_id'],
-  ["GO CC ID", 'go_cellular_component_id'],
-]
-$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper

data/share/install/Organism/{Rno/Rakefile → Rno.rake} RENAMED Viewed

@@ -1,8 +1,3 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [10116]
 $scientific_name = "Rattus norvegicus"
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
   [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
 ]
-$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+load Organism.rake_organism_helper

data/share/install/Organism/Sce.rake ADDED Viewed

@@ -0,0 +1,38 @@
+$taxs = [559292,4932]
+$scientific_name = "Saccharomyces cerevisiae"
+$ensembl_domain = 'fungi'
+#$ortholog_key = "yeast_ensembl_gene"
+$biomart_db = 'scerevisiae_eg_gene'
+$biomart_lexicon = [
+  [ 'Associated Gene Name' , "external_gene_name"],
+]
+$biomart_protein_identifiers = [
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+]
+$biomart_probe_identifiers = [
+]
+$biomart_identifiers = [
+  [ 'Entrez Gene ID', "entrezgene"],
+  [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
+  [ 'Associated Gene Name', "external_gene_name"  ],
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+  [ 'EMBL (Genbank) ID' , "embl"] ,
+  [ 'RefSeq DNA' , "refseq_dna"] ,
+]
+$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
+Thread.current["namespace"] = $namespace
+Thread.current["ensembl_domain"] = $ensembl_domain
+load Organism.rake_organism_helper

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -1,8 +1,11 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
 require 'net/ftp'
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../lib/helpers')
 require 'rbbt/sources/ensembl_ftp'
-#Thread.current['namespace'] = $namespace
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -77,6 +80,17 @@ $biomart_pfam= [
   ["Pfam Domain", 'pfam'],
 ]
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+$biomart_go_2009= [
+  ["GO BP ID", 'go_biological_process_id'],
+  ["GO MF ID", 'go_molecular_function_id'],
+  ["GO CC ID", 'go_cellular_component_id'],
+]
 $biomart_gene_biotype= [
   ["Biotype", 'gene_biotype'],
 ]
@@ -91,7 +105,13 @@ $biomart_exons = [
 #{{{ Rules
 file 'entrez_taxids' do |t|
-  Misc.sensiblewrite(t.name, $taxs * "\n")
+  if $tax && $tax.any?
+    Misc.sensiblewrite(t.name, $taxs * "\n")
+  else
+    tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
+    taxs = tsv[$scientific_name] || []
+    Misc.sensiblewrite(t.name, taxs * "\n")
+  end
 end
 file 'scientific_name' do |t|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
   Misc.sensiblewrite(t.name, $ortholog_key)
 end
-file 'identifiers' do |t|
+file 'identifiers' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
   identifiers.unnamed =  true
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
   end
   name_pos = identifiers.identify_field "Associated Gene Name"
-  entrez2name = Entrez.entrez2name($taxs)
-  identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
-    names = values[name_pos]
+  if tax_codes and tax_codes.any?
+    entrez2name = Entrez.entrez2name(tax_codes)
+    identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
+      names = values[name_pos] || []
-    matches = entrez.select do |e|
-      entrez2name.include?(e) && (names & entrez2name[e]).any?
-    end
+      matches = entrez.select do |e|
+        entrez2name.include?(e) && (names & entrez2name[e]).any?
+      end
-    if matches.any?
-      matches
-    else
-      entrez
+      if matches.any?
+        matches
+      else
+        entrez
+      end
     end
   end
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
     identifiers = identifiers.reorder(:key, ordered_fields)
   end
-  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
-  entrez_synonyms.key_field = "Entrez Gene ID"
-  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
+  if tax_codes and tax_codes.any?
+    entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
+    entrez_synonyms.key_field = "Entrez Gene ID"
+    entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
-  identifiers.attach entrez_synonyms
+    identifiers.attach entrez_synonyms
+  end
   identifiers.with_unnamed do
     identifiers.each do |key, values|
       values.each do |list|
+        list ||= []
         list.reject!{|v| v.nil? or v.empty?}
         list.uniq!
       end
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
   Misc.sensiblewrite(t.name, identifiers.to_s)
 end
-file 'lexicon' => 'identifiers' do |t|
+file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
   tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
+  tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
-  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
+  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
   entrez_description.key_field = "Entrez Gene ID"
   entrez_description.fields = ["Entrez Gene Description"]
@@ -308,8 +335,9 @@ end
 # {{{ Other info
-file 'gene_pmids' do |t|
-  tsv =  Entrez.entrez2pubmed($taxs)
+file 'gene_pmids' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
+  tsv =  Entrez.entrez2pubmed(tax_codes)
   text = "#: :namespace=#{Thread.current['namespace']}\n"
   text += "#Entrez Gene ID\tPMID"
   tsv.each do |gene, pmids|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
   gene_go.monitor = true
   gene_go.process "GO ID" do |key, go_id, values|
-    clean = values.zip_fields.select do |id, type|
+    clean = NamedArray.zip_fields(values).select do |id, type|
       type == "biological_process"
     end
     clean.collect{|id, type| id}
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
 end
 file 'chromosomes' do |t|
-  goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
+  tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
-  Misc.sensiblewrite(t.name, goterms.to_s)
+  Misc.sensiblewrite(t.name, tsv.keys * "\n")
 end
 file 'blacklist_chromosomes' => 'chromosomes' do |t|
@@ -511,6 +539,15 @@ end
 rule /^chromosome_.*/ do |t|
   chr = t.name.match(/chromosome_(.*)/)[1]
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
   # HACK: Skip LRG chromosomes
   raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
   release = Ensembl.releases[archive]
-  ftp = Net::FTP.new("ftp.ensembl.org")
+  fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
+  server, _, path = fasta_url.partition("/")
+  path = "/" + path
+  ftp = Net::FTP.new(server)
   ftp.passive = true
   ftp.login
-  if release.nil? or release == 'current'
-    ftp.chdir("pub/current_fasta/")
-  else
-    ftp.chdir("pub/#{ release }/fasta/")
-  end
-  ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+  ftp.chdir(path)
   ftp.chdir('dna')
-  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
-  raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
-  Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
+  file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
+  if file
+    Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
-  Misc.lock t.name + '.rake' do
-    TmpFile.with_file do |tmpfile|
-      ftp.getbinaryfile(file, tmpfile)
-      Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
-      ftp.close
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+        ftp.close
+      end
     end
+  else
+    file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        txt = Open.read(tmpfile, :gzip => true)
+        chr_txt = []
+        in_chr = false
+        txt.split("\n").each do |line|
+          if line.start_with?(">#{chr}")
+            in_chr = true
+          elsif line.start_with?(">")
+            in_chr = false
+          else
+            chr_txt << line if in_chr
+          end
+        end
+        Misc.sensiblewrite(t.name, chr_txt * "" )
+        ftp.close
+      end
+    end
+    raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
   end
 end
@@ -584,6 +644,16 @@ end
 require 'bio'
 file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
   exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
   chr_transcript_ranges ||= {}
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
   chr_transcript_ranges.each do |chr, transcript_ranges|
     begin
       raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
-      p = File.expand_path("./chromosome_#{chr}")
-      Organism.root.annotate p
-      p.sub!(%r{.*/organisms/},'share/organisms/')
-      chr_str = p.produce.read
+      pkgdir = Thread.current["resource"]
+      p = pkgdir[organism]["chromosome_#{chr}"]
+      p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
+      chr_str = p.read
     rescue Exception
       Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
       raise $! unless $!.message =~ /not supported/
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
     organism = File.join(organism, archive)
   end
-  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
+  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
   if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
     exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
     transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
   end
-  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unmamed => true)
-  transcript_exons   = TSV.open("./transcript_exons", :unmamed => true)
-  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
+  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unnamed => true)
+  transcript_exons   = TSV.open("./transcript_exons", :unnamed => true)
+  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
   transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
   transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
@@ -719,12 +789,13 @@ end
 file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
   transcript_5utr     = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
   transcript_3utr     = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
-  transcript_phase     = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
+  transcript_phase    = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
   transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
   transcript_protein  = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
   protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
+  transcript_sequence.monitor = true
   transcript_sequence.through do |transcript, sequence|
     protein = transcript_protein[transcript]
     next if protein.nil? or protein.empty?
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
     uni_seq = UniProt.get_uniprot_sequence(uni)
     ensps = uni2ensps[uni]
     next if ensps.nil? or ensps.empty?
     best_ensp = ensps.sort_by do |ensp|
       ensp_seq = ensp2seq[ensp]
       if ensp_seq
@@ -829,3 +901,4 @@ file 'cdna_fasta' do |t|
   Open.download(url, "#{t.name}.gz")
   nil
 end

data/test/rbbt/sources/test_biomart.rb CHANGED Viewed

@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
 require 'rbbt/sources/pubmed'
 require 'test/unit'
 require 'rbbt/sources/biomart'
+require 'rbbt/sources/organism'
 require 'rbbt/util/tmpfile'
 require 'test/unit'
 class TestBioMart < Test::Unit::TestCase
   def setup
-    BioMart.set_archive Organism.default_code("Hsa")
+    BioMart.set_archive "feb2014"
   end
   def teardown
     BioMart.unset_archive
   end
-  def _test_get
+  def test_get_Sce
     assert_raise BioMart::QueryError do
       BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
     end
-    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :merge => true, :wget_options => {:quiet => false})
+    BioMart.set_archive "feb2023-fungi"
+    data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
     tsv = TSV.open data, :double, :merge => true
-    assert(tsv['852236'][0].include? 'CAA84864')
+    assert(tsv['852236'][0].include? 'CAA84864.1')
-    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
+    data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
     tsv = TSV.open data, :double, :merge => true
     assert(tsv['852236'][1].include? 'YBL044W')
   end
-  def _test_query
+  def test_get_Hsa
+    Log.severity = 0
+    data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
+    tsv = TSV.open data, :double, :merge => true
+    assert(tsv['852236'][0].include? 'CAA84864.1')
+  end
+  def test_query
     data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['external_gene_id'].include? 'YBL044W')
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
     end
   end
+  def __test_transcrip_exons
+    Log.with_severity 1 do
+      TmpFile.with_file do |f|
+        fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
+        main = fields[0]
+        attrs = fields.values_at(1, 2)
+        attrs_first = [attrs.first]
+        attrs_last = [attrs.last]
+        database = 'hsapiens_gene_ensembl'
+        filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        data = TSV.open Open.open(filename)
+        assert(data['852236']['external_gene_id'].include? 'YBL044W')
+      end
+    end
+  end
   def test_tsv
     data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['Protein ID'].include? 'CAA84864')

data/test/rbbt/sources/test_ensembl_ftp.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
+require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
+class TestEnsemblFTP < Test::Unit::TestCase
+  def test_ftp_for
+    assert_nothing_raised do
+      Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
+    end
+  end
+end

data/test/rbbt/sources/test_entrez.rb CHANGED Viewed

@@ -14,6 +14,11 @@ class TestEntrez < Test::Unit::TestCase
     assert(lexicon['855611'].include? 'S000005056')
   end
+  def test_entrez2name
+    tax    = $yeast_tax
+    Entrez.entrez2name(tax)
+  end
   def test_entrez2pubmed
     tax   = $yeast_tax

data/test/rbbt/sources/test_organism.rb CHANGED Viewed

@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
 class TestOrganism < Test::Unit::TestCase
-  def test_known_ids
+  def _test_known_ids
     assert Organism.known_ids("Hsa").include?("Associated Gene Name")
   end
-  def test_location
+  def _test_location
     assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
   end
-  def test_identifiers
+  def _test_identifiers
     assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
     assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
     assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
   end
-  def test_lexicon
+  def _test_lexicon
     assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
   end
-  def test_guess_id
+  def _test_guess_id
     ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
     gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
     assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
     assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
   end
-  def test_organisms
+  def _test_organisms
     assert Organism.organisms.include? "Hsa"
     assert_equal "Hsa", Organism.organism("Homo sapiens")
   end
-  def test_attach_translations
+  def _test_attach_translations
     tsv = TSV.setup({"1020" => []}, :type => :list)
     tsv.key_field = "Entrez Gene ID"
     tsv.fields = []
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
     assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
   end
-  def test_entrez_taxids
+  def _test_entrez_taxids
     assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
   end
-  def test_lift_over
+  def _test_lift_over
     mutation_19 = "19:21131664:T"
     mutation_18 = "19:20923504:T"
-    source_build = Organism.default_code("Hsa")
+    source_build = "Hsa/feb2014"
     target_build = "Hsa/may2009"
     assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
     assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
   end
-  def test_orhtolog
+  def _test_orhtolog
     require 'rbbt/entity/gene'
     assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
   end
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
     assert Organism.chromosome_sizes["2"].to_i > 10_000_000
   end
-  def test_build_organism
+  def _test_build_organism
     assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
     assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
     assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
   end
-  #def test_genes_at_chromosome
+  #def _test_genes_at_chromosome
   #  pos = [12, 117799500]
   #  assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
   #end
-  #def test_genes_at_chromosome_array
+  #def _test_genes_at_chromosome_array
   #  pos = [12, [117799500, 106903900]]
   #  assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
   #end
-  #def test_genes_at_genomic_positions
+  #def _test_genes_at_genomic_positions
   #  pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
   #  assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
   #end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-sources
 version: !ruby/object:Gem::Version
-  version: 3.4.0
+  version: 3.4.1
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-05-08 00:00:00.000000000 Z
+date: 2025-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -120,10 +120,10 @@ files:
 - share/install/KEGG/Rakefile
 - share/install/Matador/Rakefile
 - share/install/NCI/Rakefile
-- share/install/Organism/Hsa/Rakefile
-- share/install/Organism/Mmu/Rakefile
-- share/install/Organism/Rno/Rakefile
-- share/install/Organism/Sce/Rakefile
+- share/install/Organism/Hsa.rake
+- share/install/Organism/Mmu.rake
+- share/install/Organism/Rno.rake
+- share/install/Organism/Sce.rake
 - share/install/Organism/organism_helpers.rb
 - share/install/PharmaGKB/Rakefile
 - share/install/Pina/Rakefile
@@ -133,6 +133,7 @@ files:
 - share/install/lib/rake_helper.rb
 - test/rbbt/sources/test_HPRD.rb
 - test/rbbt/sources/test_biomart.rb
+- test/rbbt/sources/test_ensembl_ftp.rb
 - test/rbbt/sources/test_entrez.rb
 - test/rbbt/sources/test_go.rb
 - test/rbbt/sources/test_gscholar.rb
@@ -166,13 +167,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.9
+rubygems_version: 3.5.23
 signing_key:
 specification_version: 4
 summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
 test_files:
 - test/rbbt/sources/test_HPRD.rb
 - test/rbbt/sources/test_biomart.rb
+- test/rbbt/sources/test_ensembl_ftp.rb
 - test/rbbt/sources/test_entrez.rb
 - test/rbbt/sources/test_go.rb
 - test/rbbt/sources/test_gscholar.rb

data/share/install/Organism/Sce/Rakefile DELETED Viewed

@@ -1,52 +0,0 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
-require 'rbbt/sources/biomart'
-require 'rbbt/sources/entrez'
-require File.join(File.dirname(__FILE__), '../../lib/helpers')
-$taxs = [559292,4932]
-$scientific_name = "Saccharomyces cerevisiae"
-#$ortholog_key = "yeast_ensembl_gene"
-$biomart_db = 'scerevisiae_gene_ensembl'
-$biomart_lexicon = [
-  [ 'Associated Gene Name' , "external_gene_id"],
-]
-$biomart_protein_identifiers = [
-  [ 'Protein ID', "protein_id"  ],
-  [ 'RefSeq Protein ID', "refseq_peptide"  ],
-  [ 'Unigene ID', "unigene"  ],
-  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
-  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
-]
-$biomart_probe_identifiers = [
-]
-$biomart_identifiers = [
-  [ 'Entrez Gene ID', "entrezgene"],
-  [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
-  [ 'Associated Gene Name', "external_gene_id"  ],
-  [ 'Protein ID', "protein_id"  ],
-  [ 'RefSeq Protein ID', "refseq_peptide"  ],
-  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
-  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
-  [ 'EMBL (Genbank) ID' , "embl"] ,
-  [ 'RefSeq mRNA' , "refseq_mrna"] ,
-]
-$biomart_go= [
-  ["GO ID", 'go_id'],
-  ["GO Namespace", 'namespace_1003'],
-]
-$biomart_go_2009= [
-  ["GO BP ID", 'go_biological_process_id'],
-  ["GO MF ID", 'go_molecular_function_id'],
-  ["GO CC ID", 'go_cellular_component_id'],
-]
-$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
-Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
-load File.join(File.dirname(__FILE__), '../organism_helpers.rb')