RubyGems - rbbt-sources - Versions diffs - 3.3.0 → 3.4.1 - Mend

rbbt-sources 3.3.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/etc/allowed_biomart_archives +2 -4
data/etc/biomart/missing_in_archive +2 -0
data/etc/build_organism +4 -4
data/etc/organisms +1 -0
data/lib/rbbt/sources/biomart.rb +48 -13
data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
data/lib/rbbt/sources/entrez.rb +13 -0
data/lib/rbbt/sources/go.rb +2 -2
data/lib/rbbt/sources/mesh.rb +26 -0
data/lib/rbbt/sources/organism.rb +45 -24
data/lib/rbbt/sources/pubmed.rb +13 -2
data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
data/share/install/Organism/Sce.rake +38 -0
data/share/install/Organism/organism_helpers.rb +126 -53
data/share/install/lib/rake_helper.rb +2 -2
data/test/rbbt/sources/test_biomart.rb +44 -6
data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
data/test/rbbt/sources/test_entrez.rb +5 -0
data/test/rbbt/sources/test_mesh.rb +10 -0
data/test/rbbt/sources/test_organism.rb +15 -15
data/test/rbbt/sources/test_pubmed.rb +18 -8
metadata +12 -7
data/share/install/Organism/Sce/Rakefile +0 -52

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -1,8 +1,11 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
 require 'net/ftp'
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../lib/helpers')
 require 'rbbt/sources/ensembl_ftp'
-#Thread.current['namespace'] = $namespace
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -77,6 +80,17 @@ $biomart_pfam= [
   ["Pfam Domain", 'pfam'],
 ]
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+$biomart_go_2009= [
+  ["GO BP ID", 'go_biological_process_id'],
+  ["GO MF ID", 'go_molecular_function_id'],
+  ["GO CC ID", 'go_cellular_component_id'],
+]
 $biomart_gene_biotype= [
   ["Biotype", 'gene_biotype'],
 ]
@@ -91,7 +105,13 @@ $biomart_exons = [
 #{{{ Rules
 file 'entrez_taxids' do |t|
-  Misc.sensiblewrite(t.name, $taxs * "\n")
+  if $tax && $tax.any?
+    Misc.sensiblewrite(t.name, $taxs * "\n")
+  else
+    tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
+    taxs = tsv[$scientific_name] || []
+    Misc.sensiblewrite(t.name, taxs * "\n")
+  end
 end
 file 'scientific_name' do |t|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
   Misc.sensiblewrite(t.name, $ortholog_key)
 end
-file 'identifiers' do |t|
+file 'identifiers' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
   identifiers.unnamed =  true
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
   end
   name_pos = identifiers.identify_field "Associated Gene Name"
-  entrez2name = Entrez.entrez2name($taxs)
-  identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
-    names = values[name_pos]
+  if tax_codes and tax_codes.any?
+    entrez2name = Entrez.entrez2name(tax_codes)
+    identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
+      names = values[name_pos] || []
-    matches = entrez.select do |e|
-      entrez2name.include?(e) && (names & entrez2name[e]).any?
-    end
+      matches = entrez.select do |e|
+        entrez2name.include?(e) && (names & entrez2name[e]).any?
+      end
-    if matches.any?
-      matches
-    else
-      entrez
+      if matches.any?
+        matches
+      else
+        entrez
+      end
     end
   end
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
     identifiers = identifiers.reorder(:key, ordered_fields)
   end
-  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
-  entrez_synonyms.key_field = "Entrez Gene ID"
-  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
+  if tax_codes and tax_codes.any?
+    entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
+    entrez_synonyms.key_field = "Entrez Gene ID"
+    entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
-  identifiers.attach entrez_synonyms
+    identifiers.attach entrez_synonyms
+  end
   identifiers.with_unnamed do
     identifiers.each do |key, values|
       values.each do |list|
+        list ||= []
         list.reject!{|v| v.nil? or v.empty?}
         list.uniq!
       end
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
   Misc.sensiblewrite(t.name, identifiers.to_s)
 end
-file 'lexicon' => 'identifiers' do |t|
+file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
   tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
+  tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
-  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
+  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
   entrez_description.key_field = "Entrez Gene ID"
   entrez_description.fields = ["Entrez Gene Description"]
@@ -308,8 +335,9 @@ end
 # {{{ Other info
-file 'gene_pmids' do |t|
-  tsv =  Entrez.entrez2pubmed($taxs)
+file 'gene_pmids' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
+  tsv =  Entrez.entrez2pubmed(tax_codes)
   text = "#: :namespace=#{Thread.current['namespace']}\n"
   text += "#Entrez Gene ID\tPMID"
   tsv.each do |gene, pmids|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
   gene_go.monitor = true
   gene_go.process "GO ID" do |key, go_id, values|
-    clean = values.zip_fields.select do |id, type|
+    clean = NamedArray.zip_fields(values).select do |id, type|
       type == "biological_process"
     end
     clean.collect{|id, type| id}
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
 end
 file 'chromosomes' do |t|
-  goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
+  tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
-  Misc.sensiblewrite(t.name, goterms.to_s)
+  Misc.sensiblewrite(t.name, tsv.keys * "\n")
 end
 file 'blacklist_chromosomes' => 'chromosomes' do |t|
@@ -511,6 +539,15 @@ end
 rule /^chromosome_.*/ do |t|
   chr = t.name.match(/chromosome_(.*)/)[1]
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
   # HACK: Skip LRG chromosomes
   raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
   release = Ensembl.releases[archive]
-  ftp = Net::FTP.new("ftp.ensembl.org")
+  fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
+  server, _, path = fasta_url.partition("/")
+  path = "/" + path
+  ftp = Net::FTP.new(server)
   ftp.passive = true
   ftp.login
-  if release.nil? or release == 'current'
-    ftp.chdir("pub/current_fasta/")
-  else
-    ftp.chdir("pub/#{ release }/fasta/")
-  end
-  ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+  ftp.chdir(path)
   ftp.chdir('dna')
-  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
-  raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
-  Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
+  file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
+  if file
+    Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
-  Misc.lock t.name + '.rake' do
-    TmpFile.with_file do |tmpfile|
-      ftp.getbinaryfile(file, tmpfile)
-      Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
-      ftp.close
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+        ftp.close
+      end
     end
+  else
+    file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        txt = Open.read(tmpfile, :gzip => true)
+        chr_txt = []
+        in_chr = false
+        txt.split("\n").each do |line|
+          if line.start_with?(">#{chr}")
+            in_chr = true
+          elsif line.start_with?(">")
+            in_chr = false
+          else
+            chr_txt << line if in_chr
+          end
+        end
+        Misc.sensiblewrite(t.name, chr_txt * "" )
+        ftp.close
+      end
+    end
+    raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
   end
 end
@@ -584,6 +644,16 @@ end
 require 'bio'
 file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
   exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
   chr_transcript_ranges ||= {}
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
   chr_transcript_ranges.each do |chr, transcript_ranges|
     begin
       raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
-      p = File.expand_path("./chromosome_#{chr}")
-      Organism.root.annotate p
-      p.sub!(%r{.*/organisms/},'share/organisms/')
-      chr_str = p.produce.read
+      pkgdir = Thread.current["resource"]
+      p = pkgdir[organism]["chromosome_#{chr}"]
+      p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
+      chr_str = p.read
     rescue Exception
       Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
       raise $! unless $!.message =~ /not supported/
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
     organism = File.join(organism, archive)
   end
-  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
+  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
   if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
     exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
     transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
   end
-  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unmamed => true)
-  transcript_exons   = TSV.open("./transcript_exons", :unmamed => true)
-  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
+  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unnamed => true)
+  transcript_exons   = TSV.open("./transcript_exons", :unnamed => true)
+  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
   transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
   transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
@@ -719,12 +789,13 @@ end
 file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
   transcript_5utr     = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
   transcript_3utr     = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
-  transcript_phase     = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
+  transcript_phase    = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
   transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
   transcript_protein  = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
   protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
+  transcript_sequence.monitor = true
   transcript_sequence.through do |transcript, sequence|
     protein = transcript_protein[transcript]
     next if protein.nil? or protein.empty?
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
     uni_seq = UniProt.get_uniprot_sequence(uni)
     ensps = uni2ensps[uni]
     next if ensps.nil? or ensps.empty?
     best_ensp = ensps.sort_by do |ensp|
       ensp_seq = ensp2seq[ensp]
       if ensp_seq
@@ -806,7 +878,7 @@ file 'gene_set' do |t|
   build_code = Organism.GRC_build(organism)
   scientific_name = $scientific_name
   url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
-  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+  Open.download(url, "#{t.name}.gz")
   nil
 end
@@ -825,7 +897,8 @@ file 'cdna_fasta' do |t|
   num = release.split("-").last
   build_code = Organism.GRC_build(organism)
   scientific_name = Organism.scientific_name(organism)
-  url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
-  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+  url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz"
+  Open.download(url, "#{t.name}.gz")
   nil
 end

data/share/install/lib/rake_helper.rb CHANGED Viewed

@@ -9,7 +9,7 @@ SOURCE_DIR = 'source'
 def define_source_tasks(sources)
   sources.each do |name, url|
     file File.join(SOURCE_DIR, name) do |t|
-      FileUtils.mkdir SOURCE_DIR unless File.exists? SOURCE_DIR
+      FileUtils.mkdir SOURCE_DIR unless File.exist? SOURCE_DIR
       Log.log "Retrieving file '#{name}' into '#{t.name}': '#{url}'", Log::LOW
       Open.write(t.name, Open.open(url, :cache => false, :wget_options => {"--no-check-certificate" => true, "--quiet" => false, :pipe => true}))
     end
@@ -87,5 +87,5 @@ end
 task :all => :default
 task :clean do
-  ($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.exists?(file.to_s) end
+  ($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.exist?(file.to_s) end
 end

data/test/rbbt/sources/test_biomart.rb CHANGED Viewed

@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
 require 'rbbt/sources/pubmed'
 require 'test/unit'
 require 'rbbt/sources/biomart'
+require 'rbbt/sources/organism'
 require 'rbbt/util/tmpfile'
 require 'test/unit'
 class TestBioMart < Test::Unit::TestCase
   def setup
-    BioMart.set_archive Organism.default_code("Hsa")
+    BioMart.set_archive "feb2014"
   end
   def teardown
     BioMart.unset_archive
   end
-  def _test_get
+  def test_get_Sce
     assert_raise BioMart::QueryError do
       BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
     end
-    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :merge => true, :wget_options => {:quiet => false})
+    BioMart.set_archive "feb2023-fungi"
+    data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
     tsv = TSV.open data, :double, :merge => true
-    assert(tsv['852236'][0].include? 'CAA84864')
+    assert(tsv['852236'][0].include? 'CAA84864.1')
-    data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
+    data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
     tsv = TSV.open data, :double, :merge => true
     assert(tsv['852236'][1].include? 'YBL044W')
   end
-  def _test_query
+  def test_get_Hsa
+    Log.severity = 0
+    data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
+    tsv = TSV.open data, :double, :merge => true
+    assert(tsv['852236'][0].include? 'CAA84864.1')
+  end
+  def test_query
     data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['external_gene_id'].include? 'YBL044W')
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
     end
   end
+  def __test_transcrip_exons
+    Log.with_severity 1 do
+      TmpFile.with_file do |f|
+        fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
+        main = fields[0]
+        attrs = fields.values_at(1, 2)
+        attrs_first = [attrs.first]
+        attrs_last = [attrs.last]
+        database = 'hsapiens_gene_ensembl'
+        filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
+        ppp Open.read(filename)
+        data = TSV.open Open.open(filename)
+        assert(data['852236']['external_gene_id'].include? 'YBL044W')
+      end
+    end
+  end
   def test_tsv
     data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
     assert(data['852236']['Protein ID'].include? 'CAA84864')

data/test/rbbt/sources/test_ensembl_ftp.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
+require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
+class TestEnsemblFTP < Test::Unit::TestCase
+  def test_ftp_for
+    assert_nothing_raised do
+      Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
+    end
+  end
+end

data/test/rbbt/sources/test_entrez.rb CHANGED Viewed

@@ -14,6 +14,11 @@ class TestEntrez < Test::Unit::TestCase
     assert(lexicon['855611'].include? 'S000005056')
   end
+  def test_entrez2name
+    tax    = $yeast_tax
+    Entrez.entrez2name(tax)
+  end
   def test_entrez2pubmed
     tax   = $yeast_tax

data/test/rbbt/sources/test_mesh.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
+require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
+class TestMESH < Test::Unit::TestCase
+  def test_vocab
+    tsv = MeSH.vocabulary.tsv
+    assert_equal "3T3 Cells", tsv["D016475"]
+  end
+end

data/test/rbbt/sources/test_organism.rb CHANGED Viewed

@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
 class TestOrganism < Test::Unit::TestCase
-  def test_known_ids
+  def _test_known_ids
     assert Organism.known_ids("Hsa").include?("Associated Gene Name")
   end
-  def test_location
+  def _test_location
     assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
   end
-  def test_identifiers
+  def _test_identifiers
     assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
     assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
     assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
   end
-  def test_lexicon
+  def _test_lexicon
     assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
   end
-  def test_guess_id
+  def _test_guess_id
     ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
     gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
     assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
     assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
   end
-  def test_organisms
+  def _test_organisms
     assert Organism.organisms.include? "Hsa"
     assert_equal "Hsa", Organism.organism("Homo sapiens")
   end
-  def test_attach_translations
+  def _test_attach_translations
     tsv = TSV.setup({"1020" => []}, :type => :list)
     tsv.key_field = "Entrez Gene ID"
     tsv.fields = []
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
     assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
   end
-  def test_entrez_taxids
+  def _test_entrez_taxids
     assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
   end
-  def test_lift_over
+  def _test_lift_over
     mutation_19 = "19:21131664:T"
     mutation_18 = "19:20923504:T"
-    source_build = Organism.default_code("Hsa")
+    source_build = "Hsa/feb2014"
     target_build = "Hsa/may2009"
     assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
     assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
   end
-  def test_orhtolog
+  def _test_orhtolog
     require 'rbbt/entity/gene'
     assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
   end
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
     assert Organism.chromosome_sizes["2"].to_i > 10_000_000
   end
-  def test_build_organism
+  def _test_build_organism
     assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
     assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
     assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
   end
-  #def test_genes_at_chromosome
+  #def _test_genes_at_chromosome
   #  pos = [12, 117799500]
   #  assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
   #end
-  #def test_genes_at_chromosome_array
+  #def _test_genes_at_chromosome_array
   #  pos = [12, [117799500, 106903900]]
   #  assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
   #end
-  #def test_genes_at_genomic_positions
+  #def _test_genes_at_genomic_positions
   #  pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
   #  assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
   #end

data/test/rbbt/sources/test_pubmed.rb CHANGED Viewed

@@ -5,7 +5,17 @@ require 'test/unit'
 class TestPubMed < Test::Unit::TestCase
-  def test_get_article
+  def test_mesh
+    pmid = '10866666'
+    assert_include PubMed.get_article(pmid).mesh, "D016475"
+  end
+  def _test_substance
+    pmid = '10866666'
+    assert_include PubMed.get_article(pmid).substance, "C000717247"
+  end
+  def _test_get_article
     pmid = '16438716'
     assert(PubMed.get_article(pmid).title == "Discovering semantic features in the literature: a foundation for building functional associations.")
@@ -13,38 +23,38 @@ class TestPubMed < Test::Unit::TestCase
     assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
   end
-  def test_get_multi_abstract
+  def _test_get_multi_abstract
     pmid = "32141403"
     assert PubMed.get_article(pmid).abstract.include?("This study shows PCOS patients are at increased risk of incident schizophrenia, and the metformin treatment has a protective effect against incident schizophrenia.")
   end
-  def test_full_text
+  def _test_full_text
     pmid = '16438716'
     assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
   end
-  def test_pmc_full_xml
+  def _test_pmc_full_xml
     pmid = '4304705'
     assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
   end
-  def test_query
+  def _test_query
     assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
   end
-  def test_year
+  def _test_year
     pmid = '16438716'
     assert_equal "2006", PubMed.get_article(pmid).year
   end
-  def test_bibentry
+  def _test_bibentry
     assert("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
     assert("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
   end
-  def test_missing
+  def _test_missing
     pmids = '18627426,014966295'.split(",")
     Log.severity = 0
     assert PubMed.get_article(pmids).include? "014966295"