RubyGems - rbbt-sources - Versions diffs - 2.0.2 → 2.1.0 - Mend

rbbt-sources 2.0.2 → 2.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +15 -0
data/lib/rbbt/sources/COSMIC.rb +100 -4
data/lib/rbbt/sources/NCI.rb +1 -1
data/lib/rbbt/sources/STITCH.rb +5 -5
data/lib/rbbt/sources/dbSNP.rb +141 -48
data/lib/rbbt/sources/ensembl.rb +13 -0
data/lib/rbbt/sources/ensembl_ftp.rb +17 -6
data/lib/rbbt/sources/entrez.rb +23 -21
data/lib/rbbt/sources/genomes1000.rb +57 -0
data/lib/rbbt/sources/go.rb +8 -8
data/lib/rbbt/sources/organism.rb +5 -1
data/lib/rbbt/sources/pfam.rb +24 -23
data/lib/rbbt/sources/pubmed.rb +5 -2
data/lib/rbbt/sources/tfacts.rb +0 -3
data/lib/rbbt/sources/uniprot.rb +58 -1
data/share/Ensembl/release_dates +2 -1
data/share/install/Organism/organism_helpers.rb +33 -6
data/test/rbbt/sources/test_gscholar.rb +14 -0
data/test/rbbt/sources/test_organism.rb +5 -0
metadata +8 -17

checksums.yaml ADDED Viewed

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    NjczYWU0NDMyM2IwZDBlYWFjNGVlNWU4NTg5ODFhMGEzYmEwZGJiYw==
+  data.tar.gz: !binary |-
+    MjUzNGFjZDJjYzk1ZGJiMjIwNzllMjA4ZDMyODI2YTQzYzhhNzU0Yg==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    NGZiMjgxYzQ0OGY2MzgxYmUzMzEzN2E1NzBjNDc4MjU3YjRmZjM0OTMwMTcz
+    YzFmMTU4Y2FkMzI4OTljZTA2MTJhNmVhZDQzNzA2NDAwNGM4ODc0ZTAwYzEx
+    MDZjYzAzODEyZjc1OTlmODJhYWE5YjE3ZjI3ODNlYWZlODZmYzc=
+  data.tar.gz: !binary |-
+    NWExMTU0MGMyZWExY2U5NWI2YWJhODYzZDcxMDFkYTc0NWZjN2M3ZDAzZTRh
+    Njk4NTgwMDgwZWJkNjhiNWM3OTA0MDE5Y2IwZjI1OTFhYzU3YmJkZWFhN2M4
+    ZGY2ZTA3NGNjOTM4MDBmZWY4NmQ0ZTMzODc3NmIwMzE1MTM1YjY=

data/lib/rbbt/sources/COSMIC.rb CHANGED Viewed

@@ -1,21 +1,31 @@
 require 'rbbt'
 require 'rbbt/resource'
 module COSMIC
   extend Resource
   self.subdir = "share/databases/COSMIC"
-  COSMIC.claim COSMIC.Mutations, :proc do
-    url = "ftp://ftp.sanger.ac.uk/pub/CGP/wgs/data_export/CosmicWGS_MutantExport_v61_260912.tsv.gz"
+  COSMIC.claim COSMIC.mutations, :proc do
+    url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicCompleteExport_v64_260313.tsv.gz"
-    tsv = TSV.open(Open.open(url), :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
+    stream = CMD.cmd('awk \'BEGIN{FS="\t"} { if ($12 != "" && $12 != "Mutation ID") { sub($12, "COSM" $12 ":" $4)}; print}\'', :in => Open.open(url), :pipe => true)
+    tsv = TSV.open(stream, :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
     tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
     tsv.add_field "Genomic Mutation" do |mid, values|
       position = values["Mutation GRCh37 genome position"]
       cds = values["Mutation CDS"]
       if position.nil? or position.empty?
         nil
       else
         position = position.split("-").first
+        chr, pos = position.split(":")
+        chr = "X" if chr == "23"
+        chr = "Y" if chr == "24"
+        chr = "M" if chr == "25"
+        position = [chr, pos ] * ":"
         if cds.nil?
           position
         else
@@ -52,6 +62,92 @@ module COSMIC
         end
       end
     end
-    tsv.to_s.gsub(/^(\d)/m,'COSM\1').gsub(/(\d)-(\d)/,'\1:\2')
+    tsv.to_s.gsub(/(\d)-(\d)/,'\1:\2')
+  end
+  COSMIC.claim COSMIC.mutations_hg18, :proc do |filename|
+    require 'rbbt/sources/organism'
+    file = COSMIC.mutations.open
+    begin
+      while (line = file.gets) !~ /Genomic Mutation/; end
+      fields = line[1..-2].split("\t")
+      mutation_pos = fields.index "Genomic Mutation"
+      mutations = CMD.cmd("grep -v '^#'|cut -f #{mutation_pos + 1}|sort -u", :in => COSMIC.mutations.open).read.split("\n").select{|m| m.include? ":" }
+      translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
+      File.open(filename, 'w') do |f|
+        f.puts "#: :type=:list#:namespace=Hsa/may2009"
+        f.puts "#" + fields * "\t"
+        while line = file.gets do
+          next if line[0] == "#"[0]
+          line.strip!
+          parts = line.split("\t")
+          parts[mutation_pos] = translations[parts[mutation_pos]]
+          f.puts parts * "\t"
+        end
+      end
+    rescue Exception
+      FileUtils.rm filename if File.exists? filename
+      raise $!
+    ensure
+      file.close
+    end
+    nil
+  end
+  def self.rsid_index(organism, chromosome = nil)
+    build = Organism.hg_build(organism)
+    tag = [build, chromosome] * ":"
+    fwt = nil
+    Persist.persist("StaticPosIndex for COSMIC [#{ tag }]", :fwt, :persist => true) do
+      value_size = 0
+      file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
+      chr_positions = []
+      begin
+        Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
+          next if line[0] == "#"[0]
+          rsid, mutation = line.split("\t").values_at 0, 25
+          next if mutation.nil? or mutation.empty?
+          chr, pos = mutation.split(":")
+          next if chr != chromosome or pos.nil? or pos.empty?
+          chr_positions << [rsid, pos.to_i]
+          value_size = rsid.length if rsid.length > value_size
+        end
+      rescue
+      end
+      fwt = FixWidthTable.new :memory, value_size
+      fwt.add_point(chr_positions)
+      fwt
+    end
+  end
+  def self.mutation_index(organism)
+    build = Organism.hg_build(organism)
+    file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
+    @mutation_index ||= {}
+    @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
+  end
+end
+if defined? Entity
+  if defined? Gene and Entity === Gene
+    module Gene
+      property :COSMIC_rsids => :single2array do
+        COSMIC.rsid_index(organism, chromosome)[self.chr_range]
+      end
+      property :COSMIC_mutations => :single2array do
+        GenomicMutation.setup(COSMIC.mutation_index(organism).values_at(*self.COSMIC_rsids).uniq, "COSMIC mutations over #{self.name || self}", organism, false)
+      end
+    end
   end
 end

data/lib/rbbt/sources/NCI.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module NCI
   extend Resource
   self.subdir = "share/databases/NCI"
-  NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
+  NCI.claim NCI.root, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
 end
 if defined? Entity

data/lib/rbbt/sources/STITCH.rb CHANGED Viewed

@@ -5,11 +5,11 @@ module STITCH
   extend Resource
   self.subdir = "share/databases/STITCH"
-  STITCH.claim STITCH.source.chemical_chemical.find, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
-  STITCH.claim STITCH.source.protein_chemical.find, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
-  STITCH.claim STITCH.source.actions.find, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
-  STITCH.claim STITCH.source.aliases.find, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
-  STITCH.claim STITCH.source.sources.find, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"
+  STITCH.claim STITCH.source.chemical_chemical, :url, "http://stitch.embl.de/download/chemical_chemical.links.detailed.v3.1.tsv.gz"
+  STITCH.claim STITCH.source.protein_chemical, :url, "http://stitch.embl.de/download/protein_chemical.links.detailed.v3.1.tsv.gz"
+  STITCH.claim STITCH.source.actions, :url, "http://stitch.embl.de/download/actions.v3.1.tsv.gz"
+  STITCH.claim STITCH.source.aliases, :url, "http://stitch.embl.de/download/chemical.aliases.v3.1.tsv.gz"
+  STITCH.claim STITCH.source.sources, :url, "http://stitch.embl.de/download/chemical.sources.v3.1.tsv.gz"
   Organism.installable_organisms.each do |organism|
     STITCH.claim STITCH.chemical_protein(organism), :proc do

data/lib/rbbt/sources/dbSNP.rb CHANGED Viewed

@@ -10,25 +10,27 @@ module DbSNP
   URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
   DbSNP.claim DbSNP.mutations_ncbi, :proc do
-    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :single)
+    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :flat)
     file = Open.open(URL, :nocache => true)
     while line = file.gets do
       next if line[0] == "#"[0]
       chr, position, id, ref, alt = line.split "\t"
-      alt = alt.split(",").first
-      if alt[0] == ref[0]
-        alt[0] = '+'[0]
+      mutations = alt.split(",").collect do |a|
+        if alt[0] == ref[0]
+          alt[0] = '+'[0]
+        end
+        [chr, position, alt] * ":"
       end
-      mutation = [chr, position, alt] * ":"
       tsv.namespace = "Hsa/may2012"
-      tsv[id] = mutation
+      tsv[id] = mutations
     end
     tsv.to_s
   end
-  DbSNP.claim DbSNP.mutations, :proc do
+  DbSNP.claim DbSNP.rsids, :proc do |filename|
     ftp = Net::FTP.new('ftp.broadinstitute.org')
     ftp.passive = true
     ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
@@ -37,65 +39,156 @@ module DbSNP
     tmpfile = TmpFile.tmp_file + '.gz'
     ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
-    tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation", "GMAF", "G5", "G5A", "dbSNP Build ID"], :type => :list)
     file = Open.open(tmpfile, :nocache => true)
-    while line = file.gets do
-      next if line[0] == "#"[0]
+    begin
+      File.open(filename, 'w') do |f|
+        f.puts "#: :type=:list#:namespace=Hsa/may2012"
+        f.puts "#" + ["RS ID", "GMAF", "G5", "G5A", "dbSNP Build ID"] * "\t"
+        while line = file.gets do
+          next if line[0] == "#"[0]
+          chr, position, id, ref, muts, qual, filter, info = line.split "\t"
+          g5 = g5a = dbsnp_build_id = gmaf = nil
+          gmaf = $1 if info =~ /GMAF=([0-9.]+)/
+          g5 = true if info =~ /\bG5\b/
+          g5a = true if info =~ /\bG5A\b/
+          dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
-      chr, position, id, ref, mut, qual, filter, info = line.split "\t"
-      chr.sub!('chr', '')
-      mut = mut.split(",").first
-      case
-      when ref == '-'
-        mut = "+" << mut
-      when mut == '-'
-        mut = "-" * ref.length
-      when (mut.length > 1 and ref.length > 1)
-        mut = '-' * ref.length << mut
-      when (mut.length > 1 and ref.length == 1 and mut.index(ref) == 0)
-        mut = '+' << mut[1..-1]
-      when (mut.length == 1 and ref.length > 1 and ref.index(mut) == 0)
-        mut = '-' * (ref.length - 1)
-      else
-        mut = mut
+          f.puts [id, gmaf, g5, g5a, dbsnp_build_id] * "\t"
+        end
       end
+    rescue Exception
+      FileUtils.rm filename if File.exists? filename
+      raise $!
+    ensure
+      file.close
+      FileUtils.rm tmpfile
+    end
-      g5 = g5a = dbsnp_build_id = gmaf = nil
-      gmaf = $1 if info =~ /GMAF=([0-9.]+)/
-      g5 = true if info =~ /\bG5\b/
-      g5a = true if info =~ /\bG5A\b/
-      dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
+    nil
+  end
-      mutation = [chr, position, mut] * ":"
+  DbSNP.claim DbSNP.mutations, :proc do |filename|
+    ftp = Net::FTP.new('ftp.broadinstitute.org')
+    ftp.passive = true
+    ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
+    ftp.chdir('/bundle/2.3/hg19')
-      tsv.namespace = "Hsa/may2012"
+    tmpfile = TmpFile.tmp_file + '.gz'
+    ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
-      tsv[id] = [mutation, gmaf, g5, g5a, dbsnp_build_id]
-    end
+    file = Open.open(tmpfile, :nocache => true)
+    begin
+      File.open(filename, 'w') do |f|
+        f.puts "#: :type=:flat#:namespace=Hsa/may2012"
+        f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
+        while line = file.gets do
+          next if line[0] == "#"[0]
-    FileUtils.rm tmpfile
+          chr, position, id, ref, muts, qual, filter, info = line.split "\t"
-    tsv.to_s
+          chr.sub!('chr', '')
+          position, muts = Misc.correct_vcf_mutation(position.to_i, ref, muts)
+          mutations = muts.collect{|mut| [chr, position, mut] * ":" }
+          f.puts ([id] + mutations) * "\t"
+        end
+      end
+    rescue Exception
+      FileUtils.rm filename if File.exists? filename
+      raise $!
+    ensure
+      file.close
+      FileUtils.rm tmpfile
+    end
+    nil
   end
-  DbSNP.claim DbSNP.mutations_hg18, :proc do
+  DbSNP.claim DbSNP.mutations_hg18, :proc do |filename|
     require 'rbbt/sources/organism'
-    hg19_tsv = DbSNP.mutations.tsv :unnamed => true
-    mutations = hg19_tsv.values
+    mutations = CMD.cmd("grep -v '^#'|cut -f 2|sort -u", :in => DbSNP.mutations.open).read.split("\n").collect{|l| l.split("|")}.flatten
     translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
+    begin
+      file = Open.open(DbSNP.mutations.find, :nocache => true)
+      File.open(filename, 'w') do |f|
+        f.puts "#: :type=:flat#:namespace=Hsa/may2009"
+        f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
+        while line = file.gets do
+          next if line[0] == "#"[0]
+          parts = line.split("\t")
+          parts[1..-1] = parts[1..-1].collect{|p| translations[p]} * "|"
+          f.puts parts * "\t"
+        end
+      end
+    rescue Exception
+      FileUtils.rm filename if File.exists? filename
+      raise $!
+    ensure
+      file.close
+    end
+    nil
+  end
-    tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
-      translations[mutation]
+  def self.rsid_index(organism, chromosome = nil)
+    build = Organism.hg_build(organism)
+    tag = [build, chromosome] * ":"
+    Persist.persist("StaticPosIndex for dbSNP [#{ tag }]", :fwt, :persist => true) do
+      value_size = 0
+      file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
+      chr_positions = []
+      Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
+        next if line[0] == "#"[0]
+        rsid, mutation = line.split("\t")
+        next if mutation.nil? or mutation.empty?
+        chr, pos = mutation.split(":")
+        next if chr != chromosome or pos.nil? or pos.empty?
+        chr_positions << [rsid, pos.to_i]
+        value_size = rsid.length if rsid.length > value_size
+      end
+      fwt = FixWidthTable.new :memory, value_size
+      fwt.add_point(chr_positions)
+      fwt
     end
+  end
-    tsv.namespace = "Hsa/may2009"
+  def self.mutation_index(organism)
+    build = Organism.hg_build(organism)
+    file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
+    @mutation_index ||= {}
+    @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
+  end
+end
+if defined? Entity
+  if defined? Gene and Entity === Gene
+    module Gene
+      property :dbSNP_rsids => :single2array do
+        DbSNP.rsid_index(organism, chromosome)[self.chr_range]
+      end
+      property :dbSNP_mutations => :single2array do
+        GenomicMutation.setup(DbSNP.mutation_index(organism).values_at(*self.dbSNP_rsids).compact.flatten.uniq, "dbSNP mutations over #{self.name || self}", organism, true)
+      end
+    end
+  end
+  if defined? GenomicMutation and Entity === GenomicMutation
+    module GenomicMutation
+      property :dbSNP => :array2single do
+        dbSNP.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["RS ID"], :type => :single).values_at *self
+      end
+    end
-    tsv.to_s
   end
 end

data/lib/rbbt/sources/ensembl.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'rbbt'
+module Ensembl
+  def self.releases
+    @releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
+  end
+  def self.org2release(organism)
+    releases[organism.split("/").last || "current"]
+  end
+end

data/lib/rbbt/sources/ensembl_ftp.rb CHANGED Viewed

@@ -1,24 +1,31 @@
 require 'rbbt/util/open'
 require 'rbbt/sources/organism'
 require 'rbbt/tsv'
+require 'rbbt/sources/ensembl'
 require 'net/ftp'
 module Ensembl
-  def self.releases
-    @releases ||= Rbbt.share.Ensembl.release_dates.find.tsv :key_field => "build"
-  end
   module FTP
     SERVER = "ftp.ensembl.org"
+    def self.mysql_path(release)
+    end
     def self.ftp_name_for(organism)
       code, build = organism.split "/"
       build ||= "current"
       if build.to_s == "current"
+        release = 'current'
+        name = Organism.scientific_name(organism)
+        ftp = Net::FTP.new(Ensembl::FTP::SERVER)
+        ftp.passive = true
+        ftp.login
+        ftp.chdir(File.join('pub', 'current_mysql'))
+        file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
+        ftp.close
       else
         release = Ensembl.releases[build]
         name = Organism.scientific_name(organism)
@@ -34,7 +41,11 @@ module Ensembl
     def self.ftp_directory_for(organism)
       release, ftp_name = ftp_name_for(organism)
-      File.join('/pub/', release, 'mysql', ftp_name)
+      if release == 'current'
+        File.join('/pub/', 'current_mysql', ftp_name)
+      else
+        File.join('/pub/', release, 'mysql', ftp_name)
+      end
     end
     def self.base_url(organism)

data/lib/rbbt/sources/entrez.rb CHANGED Viewed

@@ -139,10 +139,12 @@ module Entrez
     else
       filename = gene_filename geneid
       if FileCache.found(filename)
         return Gene.new(Open.read(FileCache.path(filename)))
       else
         xml = get_online(geneid)
         FileCache.add(filename, xml) unless FileCache.found(filename)
         return Gene.new(xml)
@@ -150,30 +152,30 @@ module Entrez
     end
   end
-  # Counts the words in common between a chunk of text and the text
-  # found in Entrez Gene for that particular gene. The +gene+ may be a
-  # gene identifier or a Gene class instance.
-  def self.gene_text_similarity(gene, text)
-    case
-    when Entrez::Gene === gene
-      gene_text = gene.text
-    when String === gene || Fixnum === gene
-      begin
-        gene_text =  get_gene(gene).text
-      rescue CMD::CMDError
-        return 0
-      end
-    else
+# Counts the words in common between a chunk of text and the text
+# found in Entrez Gene for that particular gene. The +gene+ may be a
+# gene identifier or a Gene class instance.
+def self.gene_text_similarity(gene, text)
+  case
+  when Entrez::Gene === gene
+    gene_text = gene.text
+  when String === gene || Fixnum === gene
+    begin
+      gene_text =  get_gene(gene).text
+    rescue CMD::CMDError
       return 0
     end
+  else
+    return 0
+  end
-    gene_words = gene_text.words.to_set
-    text_words = text.words.to_set
+  gene_words = gene_text.words.to_set
+  text_words = text.words.to_set
-    return 0 if gene_words.empty? || text_words.empty?
+  return 0 if gene_words.empty? || text_words.empty?
-    common = gene_words.intersection(text_words)
-    common.length / (gene_words.length + text_words.length).to_f
-  end
+  common = gene_words.intersection(text_words)
+  common.length / (gene_words.length + text_words.length).to_f
+end
 end

data/lib/rbbt/sources/genomes1000.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'rbbt'
 require 'rbbt/util/open'
 require 'rbbt/resource'
+require 'rbbt/entity/gene'
 module Genomes1000
   extend Resource
@@ -49,4 +50,60 @@ module Genomes1000
     tsv.to_s
   end
+  def self.rsid_index(organism, chromosome = nil)
+    build = Organism.hg_build(organism)
+    tag = [build, chromosome] * ":"
+    Persist.persist("StaticPosIndex for Genomes1000 [#{ tag }]", :fwt, :persist => true) do
+      value_size = 0
+      file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
+      chr_positions = []
+      Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
+        next if line[0] == "#"[0]
+        rsid, mutation = line.split("\t")
+        next if mutation.nil? or mutation.empty?
+        chr, pos = mutation.split(":")
+        next if chr != chromosome or pos.nil? or pos.empty?
+        chr_positions << [rsid, pos.to_i]
+        value_size = rsid.length if rsid.length > value_size
+      end
+      fwt = FixWidthTable.new :memory, value_size
+      fwt.add_point(chr_positions)
+      fwt
+    end
+  end
+  def self.mutation_index(organism)
+    build = Organism.hg_build(organism)
+    file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
+    @mutation_index ||= {}
+    @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
+  end
 end
+if defined? Entity
+  if defined? Gene and Entity === Gene
+    module Gene
+      property :genomes_1000_rsids => :single2array do
+        Genomes1000.rsid_index(organism, chromosome)[self.chr_range]
+      end
+      property :genomes_1000_mutations => :single2array do
+        GenomicMutation.setup(Genomes1000.mutation_index(organism).values_at(*self.genomes_1000_rsids).uniq, "1000 Genomes mutations over #{self.name || self}", organism, true)
+      end
+    end
+  end
+  if defined? GenomicMutation and Entity === GenomicMutation
+    module GenomicMutation
+      property :genomes_1000 => :array2single do
+        Genomes1000.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["Variant ID"], :type => :single).values_at *self
+      end
+    end
+  end
+end

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module GO
   # only the name field is used.
   def self.init
     Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
-      info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
+      info.serializer = :marshal if info.respond_to? :serializer
       Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
         term_info = {}
@@ -37,11 +37,11 @@ module GO
       }
       info
-    end
+    end.tap{|o| o.unnamed = true}
   end
   def self.info
-    @info ||= self.init
+    @@info ||= self.init
   end
   def self.goterms
@@ -94,7 +94,7 @@ if defined? Entity
     self.annotation :organism
     property :name => :array2single do
-      @name ||= GO.id2name(self)
+      GO.id2name(self)
     end
     property :genes => :array2single do |*args|
@@ -117,19 +117,19 @@ if defined? Entity
   if defined? Gene and Entity === Gene
     module Gene
       property :go_terms => :array2single do
-        @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
+        @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
       end
       property :go_bp_terms => :array2single do
-        @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
+        @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
       end
       property :go_cc_terms => :array2single do
-        @go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
+        @go_cc_terms ||= Organism.gene_go_cc(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
       end
       property :go_mf_terms => :array2single do
-        @go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).values_at *self.ensembl
+        @go_mf_terms ||= Organism.gene_go_mf(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true, :namespace => organism).chunked_values_at self.ensembl
       end
     end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -46,7 +46,11 @@ module Organism
       return positions
     end
-    positions_bed = positions.collect{|position| chr, pos = position.split(":").values_at(0,1); ["chr" << chr, pos.to_i-1, pos, position] * "\t"} * "\n" + "\n"
+    positions_bed = positions.collect{|position|
+      chr, pos = position.split(":").values_at(0,1)
+      ["chr" << chr, pos.to_i-1, pos, position] * "\t"
+    } * "\n" + "\n"
     new_positions = {}
     TmpFile.with_file(positions_bed) do |source_bed|

data/lib/rbbt/sources/pfam.rb CHANGED Viewed

@@ -4,6 +4,29 @@ require 'rbbt/resource'
 require 'rbbt/entity'
 require 'rbbt/sources/InterPro'
+InterPro.claim InterPro.pfam_names, :proc do
+  pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
+  tsv = nil
+  TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
+    tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
+  end
+  tsv.key_field = "InterPro ID"
+  tsv.fields = ["Domain Name"]
+  tsv.to_s
+end
+InterPro.claim InterPro.pfam_equivalences, :proc do
+  pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
+  tsv = nil
+  TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
+    tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
+  end
+  tsv.key_field = "InterPro ID"
+  tsv.fields = ["Pfam Domain"]
+  tsv.to_s
+end
 module Pfam
   extend Resource
   self.subdir = "share/databases/Pfam"
@@ -14,7 +37,7 @@ module Pfam
     tsv.to_s
   end
-  NAMES_FILE = InterPro.pfam_names.find
+  NAMES_FILE = InterPro.pfam_names.produce
   def self.name_index
     @name_index ||= TSV.open NAMES_FILE, :single, :unnamed => true
@@ -31,28 +54,6 @@ module InterPro
   end
 end
-InterPro.claim InterPro.pfam_names, :proc do
-  pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
-  tsv = nil
-  TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
-    tsv = TSV.open(CMD.cmd("cut -f 4,3 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 1, :fields => [0], :type => :single)
-  end
-  tsv.key_field = "InterPro ID"
-  tsv.fields = ["Domain Name"]
-  tsv.to_s
-end
-InterPro.claim InterPro.pfam_equivalences, :proc do
-  pfam_domains = Pfam.domains.read.split("\n").collect{|l| l.split("\t").first}.compact.flatten
-  tsv = nil
-  TmpFile.with_file(pfam_domains * "\n") do |tmpfile|
-    tsv = TSV.open(CMD.cmd("cut -f 2,4 | sort -u |grep -w -f #{ tmpfile }", :in => InterPro.source.protein2ipr.open, :pipe => true), :key_field => 0, :fields => [1], :type => :single)
-  end
-  tsv.key_field = "InterPro ID"
-  tsv.fields = ["Pfam Domain"]
-  tsv.to_s
-end
 if defined? Entity
   module PfamDomain

data/lib/rbbt/sources/pubmed.rb CHANGED Viewed

@@ -54,6 +54,7 @@ module PubMed
       [:year     , "Journal/JournalIssue/PubDate/Year"],
       [:month    , "Journal/JournalIssue/PubDate/Month"],
       [:pages    , "Pagination/MedlinePgn"],
+      [:author    , "AuthorList/Author"],
       [:abstract , "Abstract/AbstractText"],
     ]
@@ -154,7 +155,7 @@ module PubMed
         end
       end
-      text
+      Misc.fixutf8(text)
     end
     def bibtex
@@ -187,7 +188,9 @@ module PubMed
     # Join the text from title and abstract
     def text
-      [title, abstract].join("\n")
+      text = [title, abstract].join("\n")
+      Misc.fixutf8(text)
     end
   end

data/lib/rbbt/sources/tfacts.rb CHANGED Viewed

@@ -50,16 +50,13 @@ if defined? Entity and defined? Gene and Entity === Gene
       tfs = TFacts.targets.keys
       self.name.collect{|gene| tfs.include? gene}
     end
-    persist :_ary_is_transcription_factor?
     property :transcription_regulators => :array2single do
       Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
     end
-    persist :_ary_transcription_regulators
     property :transcription_targets => :array2single do
       Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
     end
-    persist :_ary_transcription_targets
   end
 end

data/lib/rbbt/sources/uniprot.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'rbbt'
 require 'rbbt/util/open'
 require 'rbbt/resource'
 require 'rbbt/sources/cath'
@@ -33,6 +34,7 @@ module UniProt
   UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
+  UNIPROT_FASTA="http://www.uniprot.org/uniprot/[PROTEIN].fasta"
   def self.pdbs(protein)
     url = UNIPROT_TEXT.sub "[PROTEIN]", protein
     text = Open.read(url)
@@ -44,15 +46,70 @@ module UniProt
       id, method, resolution, region = $1.split(";").collect{|v| v.strip}
       begin
         chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
+        start = start.to_i
+        eend = eend.to_i
+        start, eend = eend, start if start > eend
       rescue
         Log.warn("Error process Uniprot PDB line: #{line}")
         next
       end
-      pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
+      pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start..eend), :chains => chains}
     }
     pdb
   end
+  def self.sequence(protein)
+    url = UNIPROT_FASTA.sub "[PROTEIN]", protein
+    text = Open.read(url)
+    text.split(/\n/).select{|line| line !~ /^>/} * ""
+  end
+  def self.features(protein)
+    url = UNIPROT_TEXT.sub "[PROTEIN]", protein
+    text = Open.read(url)
+    text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
+    parts = text.split(/^(FT   \w+)/)
+    parts.shift
+    features = []
+    type = nil
+    parts.each do |part|
+      parts
+      if part[0..1] == "FT"
+        type = part.gsub(/FT\s+/,'')
+        next
+      end
+      value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
+      case
+      when value.match(/(\d+) (\d+) (.*)/)
+        start, eend, description = $1, $2, $3
+        description.gsub(/^FT\s+/m, '')
+      when value.match(/(\d+) (\d+)/)
+        start, eend = $1, $2
+        description = nil
+      else
+        Log.debug "Value not understood: #{ value }"
+      end
+      feature = {
+        :type => type,
+        :start => start.to_i,
+        :end => eend.to_i,
+        :description => description,
+      }
+      features << feature
+    end
+    features
+  end
   def self.variants(protein)
     url = UNIPROT_TEXT.sub "[PROTEIN]", protein
     text = Open.read(url)

data/share/Ensembl/release_dates CHANGED Viewed

@@ -1,6 +1,7 @@
 #: :type=:single
 #Release	build
-current	jul2012
+release-70	jan2013
+release-69	oct2012
 release-68	jul2012
 release-67	may2012
 release-66	feb2012

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -290,7 +290,8 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
                   []
                 end
-  transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
+  #transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
+  transcripts
 end
 def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
@@ -440,6 +441,23 @@ file 'chromosomes' do |t|
   File.open(t.name, 'w') do |f| f.puts goterms end
 end
+file 'blacklist_chromosomes' => 'chromosomes' do |t|
+  list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')}
+  File.open(t.name, 'w') do |f| f.puts list * "\n" end
+end
+file 'blacklist_genes' => ['blacklist_chromosomes', 'gene_positions'] do |t|
+  Open.read(t.prerequisites.first)
+  genes = CMD.cmd("grep -f '#{t.prerequisites.first}' | cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq
+  File.open(t.name, 'w') do |f| f.puts genes * "\n" end
+end
+file 'sanctioned_genes' => ['blacklist_genes', 'gene_positions'] do |t|
+  genes = CMD.cmd("cut -f 1", :in => Open.open(t.prerequisites.last)).read.split("\n").uniq - Open.read(t.prerequisites.first).split("\n")
+  File.open(t.name, 'w') do |f| f.puts genes * "\n" end
+end
 rule /^chromosome_.*/ do |t|
   chr = t.name.match(/chromosome_(.*)/)[1]
@@ -450,7 +468,11 @@ rule /^chromosome_.*/ do |t|
   ftp = Net::FTP.new("ftp.ensembl.org")
   ftp.passive = true
   ftp.login
-  ftp.chdir("pub/#{ release }/fasta/")
+  if release.nil? or release == 'current'
+    ftp.chdir("pub/current_fasta/")
+  else
+    ftp.chdir("pub/#{ release }/fasta/")
+  end
   ftp.chdir($scientific_name.downcase.sub(" ",'_'))
   ftp.chdir('dna')
   file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
@@ -530,7 +552,7 @@ file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
     begin
       p = Organism.root
       p.replace File.expand_path("./chromosome_#{chr}")
-      p.sub!(/.*\/.rbbt\//,'')
+      p.sub!(%r{.*/organisms/},'share/organisms/')
       p = Path.setup(p, 'rbbt', Organism)
       chr_str = p.produce.read
     rescue Exception
@@ -624,10 +646,10 @@ end
 file 'transcript_3utr' => ["transcript_5utr"] do |t|
 end
-file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
+file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
   transcript_5utr     = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
   transcript_3utr     = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
+  transcript_phase     = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
   transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
   transcript_protein  = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
@@ -638,7 +660,12 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
     next if protein.nil? or protein.empty?
     utr5 = transcript_5utr[transcript]
     utr3 = transcript_3utr[transcript]
-    psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate
+    phase = transcript_phase[transcript] || 0
+    if phase < 0
+      utr5 = - phase if utr5 == 0
+      phase = 0
+    end
+    psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
     protein_sequence[protein]=psequence
   end

data/test/rbbt/sources/test_gscholar.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+require 'rbbt/sources/gscholar'
+require 'test/unit'
+class TestGScholar < Test::Unit::TestCase
+  def test_citation
+    assert_match GoogleScholar.citation_link("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /cites/
+    assert_match GoogleScholar.number_cites("Ten Years of Pathway Analysis: Current Approaches and Outstanding Challenges").to_s, /\d+/
+  end
+end

data/test/rbbt/sources/test_organism.rb CHANGED Viewed

@@ -61,6 +61,11 @@ class TestOrganism < Test::Unit::TestCase
     assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
   end
+  def test_orhtolog
+    require 'rbbt/entity/gene'
+    assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog("Hsa/jun2011")
+  end
   #def test_genes_at_chromosome
   #  pos = [12, 117799500]
   #  assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)

metadata CHANGED Viewed

@@ -1,20 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-sources
 version: !ruby/object:Gem::Version
-  version: 2.0.2
-  prerelease:
+  version: 2.1.0
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-09 00:00:00.000000000 Z
+date: 2013-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-text
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: libxml-ruby
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: bio
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -111,6 +100,7 @@ files:
 - lib/rbbt/sources/biomart.rb
 - lib/rbbt/sources/cath.rb
 - lib/rbbt/sources/dbSNP.rb
+- lib/rbbt/sources/ensembl.rb
 - lib/rbbt/sources/ensembl_ftp.rb
 - lib/rbbt/sources/entrez.rb
 - lib/rbbt/sources/genomes1000.rb
@@ -139,37 +129,38 @@ files:
 - test/rbbt/sources/test_go.rb
 - test/rbbt/sources/test_entrez.rb
 - test/rbbt/sources/test_biomart.rb
+- test/rbbt/sources/test_gscholar.rb
 - test/rbbt/sources/test_organism.rb
 - test/rbbt/sources/test_pubmed.rb
 - test/test_helper.rb
 homepage: http://github.com/mikisvaz/rbbt-sources
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 2.0.3
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
 test_files:
 - test/rbbt/sources/test_go.rb
 - test/rbbt/sources/test_entrez.rb
 - test/rbbt/sources/test_biomart.rb
+- test/rbbt/sources/test_gscholar.rb
 - test/rbbt/sources/test_organism.rb
 - test/rbbt/sources/test_pubmed.rb
 - test/test_helper.rb