RubyGems - rbbt-sources - Versions diffs - 1.1.0 → 1.2.0 - Mend

rbbt-sources 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/lib/rbbt/sources/NCI.rb +70 -12
data/lib/rbbt/sources/cath.rb +142 -0
data/lib/rbbt/sources/go.rb +14 -3
data/lib/rbbt/sources/organism.rb +1 -1
data/lib/rbbt/sources/pfam.rb +35 -0
data/lib/rbbt/sources/pubmed.rb +7 -11
data/lib/rbbt/sources/tfacts.rb +0 -1
data/lib/rbbt/sources/uniprot.rb +125 -0
data/share/install/Organism/Hsa/Rakefile +1 -4
data/share/install/Organism/Mmu/Rakefile +57 -0
data/share/install/Organism/Rno/Rakefile +1 -0
data/share/install/Organism/Sce/Rakefile +1 -0
data/share/install/Organism/organism_helpers.rb +54 -1
metadata +8 -5
data/lib/rbbt/sources/organism/sequence.rb +0 -612

data/lib/rbbt/sources/NCI.rb CHANGED Viewed

@@ -8,57 +8,115 @@ end
 if defined? Entity
-  module NCINaturePathways
+  module NCINaturePathway
     extend Entity
     self.format = "NCI Nature Pathway ID"
+    self.annotation :organism
+    def self.name_index
+      @name_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :single)
+    end
+    def self.gene_index
+      @gene_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
+    end
+    def self.filter(query, field = nil, options = nil, entity = nil)
+      return true if query == entity
+      return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
+      false
+    end
     property :name => :array2single do
-      @name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
+      @name ||= NCINaturePathway.name_index.values_at *self
     end
     property :genes => :array2single do
-      @genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
+      @genes ||= NCINaturePathway.gene_index.values_at *self
     end
   end
-  module NCIReactomePathways
+  module NCIReactomePathway
     extend Entity
     self.format = "NCI Reactome Pathway ID"
+    self.annotation :organism
+    def self.name_index
+      @name_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
+    end
+    def self.gene_index
+      @gene_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
+    end
+    def self.filter(query, field = nil, options = nil, entity = nil)
+      return true if query == entity
+      return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
+      false
+    end
     property :name => :array2single do
-      @name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
+      @name ||= NCIReactomePathway.name_index.values_at *self
     end
     property :genes => :array2single do
-      @genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
+      @genes ||= NCIReactomePathway.gene_index.values_at *self
     end
   end
-  module NCIBioCartaPathways
+  module NCIBioCartaPathway
     extend Entity
     self.format = "NCI BioCarta Pathway ID"
+    self.annotation :organism
+    def self.name_index
+      @name_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :single)
+    end
+    def self.gene_index
+      @gene_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true)
+    end
+    def self.filter(query, field = nil, options = nil, entity = nil)
+      return true if query == entity
+      return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
+      false
+    end
     property :name => :array2single do
-      @name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
+      @name ||= NCIBioCartaPathway.name_index.values_at *self
     end
     property :genes => :array2single do
-      @genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
+      @genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
+        each{|pth| pth.organism = organism if pth.respond_to? :organism }
     end
   end
   if defined? Gene and Entity === Gene
     module Gene
       property :nature_pathways => :array2single do
-        @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
+        @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
+          values_at(*self.to("UniProt/SwissProt Accession")).
+          each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
       end
       property :reactome_pathways => :array2single do
-        @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
+        @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
+          each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
       end
       property :biocarta_pathways => :array2single do
-        @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
+        @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at(*self.entrez).
+          each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIBioCartaPathway.setup(o, organism)}
       end
     end
   end

data/lib/rbbt/sources/cath.rb ADDED Viewed

@@ -0,0 +1,142 @@
+require 'rbbt'
+require 'rbbt/resource'
+module Cath
+  extend Resource
+  Rbbt.claim Rbbt.share.databases.CATH.CathNames, :proc do
+    tsv = TSV.setup({}, :key_field => "CATH Code", :type => :list, :fields => ["PDB ID", "CATH Domain", "CATH Description"])
+    Open.read("http://release.cathdb.info/v3.4.0/CathNames").split(/\n/).each do |line|
+      next if line =~ /^#/
+      code, pdb, domain, name = line.match(/([\d\.]+)\s+(\w\w\w\w)(\w\w\w)\s+:(.*)/).values_at 1,2,3,4
+      tsv[code] = [pdb.downcase, domain, name]
+    end
+    tsv.to_s
+  end
+  Rbbt.claim Rbbt.share.databases.CATH.CathUnclassifiedList , :proc do
+    Open.read("http://release.cathdb.info/v3.4.0/CathUnclassifiedList").split(/\n/).collect do |line|
+      next if line =~ /^#/
+      line.split(/\s/).first
+    end * "\n"
+  end
+  Rbbt.claim Rbbt.share.databases.CATH.CathDomainSeqs, :proc do
+    tsv = TSV.setup({}, :key_field => "CATH Domain", :type => :single, :fields => ["Cath Domain Sequence"])
+    Open.read("http://release.cathdb.info/v3.4.0/CathDomainSeqs.ATOM").split(/>pdb\|/).each do |chunk|
+      next if chunk.empty?
+      domain, sequence = chunk.strip.match(/(.*)\n(.*)/).values_at 1, 2
+      tsv[domain] = sequence
+    end
+    tsv.to_s
+  end
+  # Producer for the CathRegions resource: parses the CATH v3.4.0 CathDomall
+  # file into a TSV keyed by "Cath Domain" with the segment boundaries of
+  # each domain in the "Start" and "End" fields.
+  Rbbt.claim Rbbt.share.databases.CATH.CathRegions, :proc do
+    domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["Start", "End"])
+    Open.read("http://release.cathdb.info/v3.4.0/CathDomall").split(/\n/).each do |line|
+      next if line =~ /^#/
+      # Groups: five-character chain id, number after 'D', number after 'F',
+      # and the remainder of the line, which is consumed piece by piece below
+      chain, ndomains, nfragments, rest = line.match(/(\w\w\w\w\w)\s+D(\d+)\s+F(\d+)\s+(.*)/).values_at 1,2,3,4
+      ndomains.to_i.times do |dn|
+        # Each domain starts with its segment count
+        nsegments, rest = rest.match(/^\s*(\d+)\s+(.*)/).values_at 1, 2
+        segments = []
+        nsegments.to_i.times do |sn|
+          # Take one start/end pair and keep what is left in `rest`
+          start, eend, rest = rest.match(/\w\s+(-?\d+)\s+.\s+\w\s+(-?\d+)\s+.(.*)/).values_at 1, 2, 3
+          segments << [start, eend]
+        end
+        # dn is zero-based, so the keys end in 00, 01, ...
+        # NOTE(review): confirm this matches the domain numbering used by the other CATH files
+        domain = chain + "%02d" % dn.to_i
+        # Transpose [[start, end], ...] into [[starts...], [ends...]], one list per field
+        segments = segments[0].zip(*segments[1..-1])
+        domains[domain] = segments
+      end
+    end
+    domains.to_s
+  end
+  Rbbt.claim Rbbt.share.databases.CATH.CathDomainList, :proc do
+    domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["CATH domain name (seven characters)",
+                        "Class number", "Architecture number", "Topology number", "Homologous superfamily number", "S35 sequence cluster number",
+                        "S60 sequence cluster number", "S95 sequence cluster number", "S100 sequence cluster number", "S100 sequence count number",
+                        "Domain length", "Structure resolution (Angstroms)"], :type => :list)
+    Open.read("http://release.cathdb.info/v3.4.0/CathDomainList").split(/\n/).each do |line|
+      next if line =~ /^#/
+      parts = line.chomp.split /\s+/
+      domain = parts.shift
+      domains[domain] = parts
+    end
+    domains.to_s
+  end
+  def self.cath_index
+    @@cath ||= Rbbt.share.databases.CATH.CathNames.tsv :persist => true, :case_insensitive => true
+  end
+  def self.pdb_index
+    if not defined? @@pdb or @@pdb.nil?
+      @@pdb = {}
+      Rbbt.share.databases.CATH.CathDomainSeqs.read.split("\n").each do |line|
+        domain = line.split(/\t/).first
+        pdb = domain[0..3]
+        @@pdb[pdb] ||= []
+        @@pdb[pdb] << domain
+      end
+    end
+    @@pdb
+  end
+  def self.unclassified
+    @@unclassified = {}
+    Rbbt.share.databases.CATH.CathUnclassifiedList.read.split("\n").each do |domain|
+      pdb = domain[0..3]
+      @@unclassified[pdb] ||= []
+      @@unclassified[pdb] << domain
+    end
+    @@unclassified
+  end
+  def self.domain_sequences
+    @@domain_sequences ||= Rbbt.share.databases.CATH.CathDomainSeqs.tsv(:persist => true)
+  end
+  def self.pdbs(cath_code)
+    cath = cath_index
+    if cath.include? cath_code
+      cath[cath_code]["PDB ID"]
+    else
+      nil
+    end
+  end
+  def self.domains_for_pdb(pdb)
+    pdb2cath = pdb_index
+    (pdb2cath[pdb] || []) + (unclassified[pdb] || [])
+  end
+  def self.align(domain, sequence)
+    require 'bio'
+    return nil if not domain_sequences.include? domain
+    TmpFile.with_file(">target\n" << sequence) do |target|
+      TmpFile.with_file(">domain\n" << domain_sequences[domain]) do |domain|
+        result = CMD.cmd("fasta35 #{ target } #{ domain }").read
+        if result.match(/([\d\.]+)% identity.*overlap \((\d+)-(\d+):/s)
+          {:identity => $1.to_f, :range => ($2.to_i..$3.to_i)}
+        else
+          false
+        end
+      end
+    end
+  end
+end

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -91,22 +91,33 @@ if defined? Entity
     extend Entity
     self.format = "GO ID"
+    self.annotation :organism
     property :name => :array2single do
       @name ||= GO.id2name(self)
     end
     property :genes => :array2single do |organism|
+      organism ||= self.organism
       @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
     end
+    property :description => :single2array do
+      description = GO.info[self]['def']
+      description.gsub!(/"|\[.*\]/,'') if description
+      description
+    end
   end
   if defined? Gene and Entity === Gene
     module Gene
-      property :go_terms => :array2single do |organism|
+      property :go_terms => :array2single do
         @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
       end
-      property :go_bp_terms => :array2single do |organism|
+      property :go_bp_terms => :array2single do
         @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
       end
     end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module Organism
   self.pkgdir = "rbbt"
   self.subdir = "share/organisms"
-  ["Hsa", "Rno", "Sce"].each do |organism|
+  ["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
     claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
     module_eval "#{ organism } = with_key '#{organism}'"

data/lib/rbbt/sources/pfam.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/resource'
+module Pfam
+  extend Resource
+  self.subdir = "share/databases/Pfam"
+  Pfam.claim Pfam.domains, :proc  do
+    url = "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
+    tsv = TSV.open(Open.open(url), :key_field => "Pfam Domain ID", :fields => ["Pfam Clan ID", "Code Name", "Name", "Description"])
+    tsv.to_s
+  end
+  NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
+  def self.name_index
+    @name_index ||= TSV.open NAMES_FILE, :single
+  end
+  def self.name(id)
+    name_index[id]
+  end
+end
+if defined? Entity
+  module PfamDomain
+    extend Entity
+    self.format = "Pfam Domain"
+    property :name => :array2single do
+      self.collect{|id| Pfam.name(id)}
+    end
+  end
+end

data/lib/rbbt/sources/pubmed.rb CHANGED Viewed

@@ -13,11 +13,13 @@ module PubMed
     pmids_complete =  pmids.is_a?(Array) ? pmids : [pmids]
+    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     articles = []
-    Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
-      url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
-      xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
+    Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
+      postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
+      xml = TmpFile.with_file(postdata) do |postfile|
+        Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
+      end
       articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
     end
@@ -202,14 +204,8 @@ module PubMed
       }
       return list unless missing.any?
-      chunk_size = [100, missing.length].min
-      chunks = (missing.length.to_f / chunk_size).ceil
-      articles = {}
-      chunks.times do |chunk|
-        pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
-        articles.merge!(get_online(pmids))
-      end
+      articles = get_online(missing)
       articles.each{|p, xml|
         filename = p + '.xml'

data/lib/rbbt/sources/tfacts.rb CHANGED Viewed

@@ -45,7 +45,6 @@ module TFacts
   end
 end
 if defined? Entity and defined? Gene and Entity === Gene
   module Gene

data/lib/rbbt/sources/uniprot.rb ADDED Viewed

@@ -0,0 +1,125 @@
+require 'rbbt/util/open'
+require 'rbbt/resource'
+require 'rbbt/sources/cath'
+require 'rbbt/sources/uniprot'
+module Uniprot
+  extend Resource
+  self.subdir = "share/databases/Uniprot"
+  # Producer for Uniprot.annotated_variants: turns UniProt's humsavar.txt
+  # listing into a TSV that is finally re-keyed by "Mutated Isoform".
+  Uniprot.claim Uniprot.annotated_variants, :proc do
+    url = "http://www.uniprot.org/docs/humsavar.txt"
+    # Shell pipeline: start at line 31, drop the last 4 lines and keep only
+    # lines containing a letter. The :fix proc then splits each line on
+    # whitespace, keeps the first six columns and joins whatever follows
+    # with spaces into a seventh, tab-separated column.
+    tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
+                   :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
+                   :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
+    tsv.unnamed = true
+    # Rewrite "p.Xxx123Yyy" using the Misc::THREE_TO_ONE_AA_CODE table for
+    # both residues; values that do not match the pattern are kept as they are
+    tsv.process "Amino Acid Mutation" do |mutation|
+      if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
+        wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
+        mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
+        [wt, $2, mut] * ""
+      else
+        mutation
+      end
+    end
+    uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
+    mutation_pos = tsv.identify_field "Amino Acid Mutation"
+    # New field holding "<accession>:<mutation>"
+    tsv.add_field "Mutated Isoform" do |key, values|
+      [values[uniprot_pos], values[mutation_pos]] * ":"
+    end
+    tsv.reorder("Mutated Isoform").to_s
+  end
+  UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
+  def self.pdbs(protein)
+    url = UNIPROT_TEXT.sub "[PROTEIN]", protein
+    text = Open.read(url)
+    pdb = {}
+    text.split(/\n/).each{|l|
+      next unless l =~ /^DR\s+PDB; (.*)\./
+      id, method, resolution, region = $1.split(";").collect{|v| v.strip}
+      chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
+      pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
+    }
+    pdb
+  end
+  def self.variants(protein)
+    url = UNIPROT_TEXT.sub "[PROTEIN]", protein
+    text = Open.read(url)
+    text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
+    parts = text.split(/^(FT   \w+)/)
+    parts.shift
+    variants = []
+    type = nil
+    parts.each do |part|
+      if type.nil?
+        type = part
+      else
+        if type !~ /VARIANT/
+          type = nil
+          next
+        end
+        type = nil
+        value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
+        # 291 291 K -> E (in sporadic cancers; somatic mutation). /FTId=VAR_045413.
+        case
+        when value.match(/(\d+) (\d+) ([A-Z])\s*\-\>\s*([A-Z]) (.*)\. \/FTId=(.*)/)
+          start, eend, ref, mut, desc, id = $1, $2, $3, $4, $5, $6
+        when value.match(/(\d+) (\d+) (.*)\. \/FTId=(.*)/)
+          start, eend, ref, mut, desc, id = $1, $2, nil, nil, $3, $4
+        else
+          Log.debug "Value not understood: #{ value }"
+        end
+        variants << {
+          :start => start,
+          :end => eend,
+          :ref => ref,
+          :mut => mut,
+          :desc => desc,
+          :id => id,
+        }
+      end
+    end
+    variants
+  end
+  def self.cath(protein)
+    url = UNIPROT_TEXT.sub "[PROTEIN]", protein
+    text = Open.read(url)
+    cath = {}
+    text.split(/\n/).each{|l|
+      next unless l =~ /^DR\s+Gene3D; G3DSA:(.*)\./
+      id, description, cuantity = $1.split(";").collect{|v| v.strip}
+      cath[id] = {:description => description, :cuantity => cuantity}
+    }
+    cath
+  end
+  def self.cath_domains(protein)
+    pdbs = pdbs(protein).keys.uniq
+    pdbs.collect do |pdb|
+      Cath.domains_for_pdb(pdb)
+    end.flatten.compact
+  end
+  def self.pdbs_covering_aa_position(protein, aa_position)
+    Uniprot.pdbs(protein).select do |pdb, info|
+      info[:region].include? aa_position
+    end
+  end
+end

data/share/install/Organism/Hsa/Rakefile CHANGED Viewed

@@ -5,6 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
 $taxs = [9606]
 $scientific_name = "Homo sapiens"
+$ortholog_key = "human_ensembl_gene"
 $biomart_db = 'hsapiens_gene_ensembl'
 $biomart_db_germline_variation = 'hsapiens_snp'
@@ -97,9 +98,5 @@ $biomart_go_2009= [
   ["GO CC ID", 'go_cellular_component_id'],
 ]
-$biomart_pfam= [
-  ["Pfam Domain", 'pfam'],
-]
 $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
 load File.join(File.dirname(__FILE__), '../organism_helpers.rb')

data/share/install/Organism/Mmu/Rakefile ADDED Viewed

@@ -0,0 +1,57 @@
+# Organism-specific settings for Mus musculus. The globals defined here are
+# read by organism_helpers.rb, which is loaded on the last line.
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../../lib/helpers')
+$taxs = [10090]
+$scientific_name = "Mus musculus"
+$ortholog_key = "mouse_ensembl_gene"
+$biomart_db = 'mmusculus_gene_ensembl'
+$biomart_db_germline_variation = 'mmusculus_snp'
+$biomart_db_somatic_variation = 'mmusculus_snp_som'
+# Each entry below pairs a field label with a BioMart attribute name.
+# NOTE(review): the HGNC entries look inherited from the human settings -
+# confirm that these attributes exist in the mouse dataset.
+$biomart_lexicon = [
+  [ 'Associated Gene Name' , "external_gene_id"],
+  [ 'HGNC symbol', "hgnc_symbol"  ],
+  [ 'HGNC automatic gene name', "hgnc_automatic_gene_name"  ],
+  [ 'HGNC curated gene name ', "hgnc_curated_gene_name"  ],
+]
+$biomart_protein_identifiers = [
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+]
+# No probe identifiers are configured for this organism
+$biomart_probe_identifiers = [
+]
+$biomart_identifiers = [
+  [ 'Entrez Gene ID', "entrezgene"],
+  [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
+  [ 'Associated Gene Name', "external_gene_id"  ],
+  [ 'CCDS ID', "ccds"  ],
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+  [ 'EMBL (Genbank) ID' , "embl"] ,
+]
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+$biomart_go_2009= [
+  ["GO BP ID", 'go_biological_process_id'],
+  ["GO MF ID", 'go_molecular_function_id'],
+  ["GO CC ID", 'go_cellular_component_id'],
+]
+# The namespace is the name of the directory holding this Rakefile
+$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
+load File.join(File.dirname(__FILE__), '../organism_helpers.rb')

data/share/install/Organism/Rno/Rakefile CHANGED Viewed

@@ -9,6 +9,7 @@ $scientific_name = "Rattus norvegicus"
 $biomart_db = 'rnorvegicus_gene_ensembl'
 $biomart_db_germline_variation = 'rnorvegicus_snp'
 $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
+$ortholog_key = "rat_ensembl_gene"
 $biomart_lexicon = [
   [ 'Associated Gene Name' , "external_gene_id"],

data/share/install/Organism/Sce/Rakefile CHANGED Viewed

@@ -8,6 +8,7 @@ $native = "SGD ID"
 $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
 $biomart_db = 'scerevisiae_gene_ensembl'
 $biomart_main = ['Entrez Gene ID', 'entrezgene']
+$ortholog_key = "yeast_ensembl_gene"
 file 'scientific_name' do |t|

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'net/ftp'
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -56,7 +58,9 @@ $biomart_exon_phase = [
   ['Phase','phase'],
 ]
+$biomart_pfam= [
+  ["Pfam Domain", 'pfam'],
+]
 $biomart_exons = [
   $biomart_ensembl_gene,
@@ -71,6 +75,12 @@ file 'scientific_name' do |t|
   File.open(t.name, 'w') do |f| f.write $scientific_name end
 end
+file 'ortholog_key' do |t|
+  raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
+  File.open(t.name, 'w') do |f| f.write $ortholog_key end
+end
 file 'identifiers' do |t|
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
   identifiers.unnamed =  true
@@ -456,6 +466,49 @@ file 'chromosomes' do |t|
   File.open(t.name, 'w') do |f| f.puts goterms end
 end
+rule /^chromosome_.*/ do |t|
+  chr = t.name.match(/chromosome_(.*)/)[1]
+  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
+  release = case archive
+            when "may2009"
+              "release-54"
+            when "jun2011"
+              "release-64"
+            when nil
+              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
+            end
+  ftp = Net::FTP.new("ftp.ensembl.org")
+  ftp.login
+  ftp.chdir("pub/#{ release }/fasta/")
+  ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+  ftp.chdir('dna')
+  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
+  raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
+  Log.debug("Downloading chromosome sequence: #{ file }")
+  TmpFile.with_file do |tmpfile|
+    ftp.getbinaryfile(file, tmpfile)
+    Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+    ftp.close
+  end
+end
+rule /^possible_ortholog_(.*)/ do |t|
+  other = t.name.match(/ortholog_(.*)/)[1]
+  other_key = Organism.ortholog_key(other).produce.read
+  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
+rule /^ortholog_(.*)/ do |t|
+  other = t.name.match(/ortholog_(.*)/)[1]
+  other_key = Organism.ortholog_key(other).produce.read
+  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
 rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
   t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-sources
 version: !ruby/object:Gem::Version
-  hash: 19
+  hash: 31
   prerelease:
   segments:
   - 1
-  - 1
+  - 2
   - 0
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-17 00:00:00 +01:00
+date: 2012-01-13 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -108,20 +108,23 @@ files:
 - lib/rbbt/sources/barcode.rb
 - lib/rbbt/sources/bibtex.rb
 - lib/rbbt/sources/biomart.rb
+- lib/rbbt/sources/cath.rb
 - lib/rbbt/sources/entrez.rb
 - lib/rbbt/sources/go.rb
 - lib/rbbt/sources/gscholar.rb
 - lib/rbbt/sources/jochem.rb
 - lib/rbbt/sources/organism.rb
-- lib/rbbt/sources/organism/sequence.rb
+- lib/rbbt/sources/pfam.rb
 - lib/rbbt/sources/polysearch.rb
 - lib/rbbt/sources/pubmed.rb
 - lib/rbbt/sources/tfacts.rb
+- lib/rbbt/sources/uniprot.rb
 - lib/rbbt/sources/wgEncodeBroadHmm.rb
 - share/install/InterPro/Rakefile
 - share/install/JoChem/Rakefile
 - share/install/NCI/Rakefile
 - share/install/Organism/Hsa/Rakefile
+- share/install/Organism/Mmu/Rakefile
 - share/install/Organism/Rno/Rakefile
 - share/install/Organism/Sce/Rakefile
 - share/install/Organism/organism_helpers.rb

data/lib/rbbt/sources/organism/sequence.rb DELETED Viewed

@@ -1,612 +0,0 @@
-require 'rbbt/sources/organism'
-require 'rbbt/util/workflow'
-require 'bio'
-# Sequence analyses
-module Organism
-  extend WorkFlow
-  relative_to Rbbt, "share/organisms"
-  self.jobdir = Rbbt.var.organism.find
-  def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
-    exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
-    transcript_info  ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
-    transcripts = begin
-                    exon_transcripts[exon].first
-                  rescue
-                    []
-                  end
-    transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
-  end
-  def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
-    transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
-    transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
-    utr5 = transcript_5utr[transcript]
-    raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
-    return nil if utr5 > offset
-    sequence = transcript_sequence[transcript]
-    raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
-    ccds_offset = offset - utr5
-    return nil if ccds_offset > sequence.length
-    range = (utr5..-1)
-    sequence = sequence[range]
-    codon = ccds_offset / 3
-    codon_offset =  ccds_offset % 3
-    [sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
-  end
-  def self.codon_change(allele, codon, offset)
-    original = Bio::Sequence::NA .new(codon).translate
-    codon = codon.dup
-    codon[offset] = allele
-    new = Bio::Sequence::NA .new(codon).translate
-    [original, new]
-  end
-  def self.genes_at_chromosome_positions(org, chromosome, positions)
-    chromosome = chromosome.to_s
-    chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
-      tsv = file.tsv(:persistence => false, :type => :list)
-      tsv.select("Chromosome Name" => chromosome).collect do |gene, values|
-        [gene, values.values_at("Gene Start", "Gene End").collect{|p| p.to_i}]
-      end
-    end
-    if Array === positions
-      positions.collect{|position| pos = chromosome_bed[position]; pos.nil? ? nil : pos.first}
-    else
-      pos = chromosome_bed[positions];
-      pos.nil? ? nil : pos.first
-    end
-  end
-  def self.genes_at_genomic_positions(org, positions)
-    positions = [positions] unless Array === positions.first
-    genes = []
-    chromosomes = {}
-    indices     = {}
-    positions.each_with_index do |info,i|
-      chr, pos = info
-      chromosomes[chr] ||= []
-      indices[chr] ||= []
-      chromosomes[chr] << pos
-      indices[chr] << i
-    end
-    chromosomes.each do |chr, pos_list|
-      chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
-      chr_genes.zip(indices[chr]).each do |gene, index| genes[index] = gene end
-    end
-    genes
-  end
-  def self.exons_at_chromosome_positions(org, chromosome, positions)
-    chromosome = chromosome.to_s
-    chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
-      tsv = file.tsv(:persistence => true, :type => :list)
-      tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
-        [exon, values.values_at("Exon Chr Start", "Exon Chr End").collect{|p| p.to_i}]
-      end
-    end
-    if Array === positions
-      positions.collect{|position|
-        chromosome_bed[position];
-      }
-    else
-      chromosome_bed[positions];
-    end
-  end
-  def self.exons_at_genomic_positions(org, positions)
-    positions = [positions] unless Array === positions.first
-    exons = []
-    chromosomes = {}
-    indices     = {}
-    positions.each_with_index do |info,i|
-      chr, pos = info
-      chromosomes[chr] ||= []
-      indices[chr] ||= []
-      chromosomes[chr] << pos
-      indices[chr] << i
-    end
-    chromosomes.each do |chr, pos_list|
-      chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
-      chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
-    end
-    exons
-  end
-  def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
-    exons            ||= Organism.exons(org).tsv(:persistence => true)
-    transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)
-    sizes = [0]
-    rank = nil
-    transcript_exons[transcript].zip_fields.each do |_exon, _rank|
-      _rank = _rank.to_i
-      s, e = exons[_exon].values_at("Start", "End")
-      size = e.to_i - s.to_i + 1
-      sizes[_rank] =  size
-      rank = _rank if _exon == exon
-    end
-    if not rank.nil?
-      sizes[0..rank - 1].inject(0){|e,acc| acc += e}
-    else
-      nil
-    end
-  end
-  def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
-    exon_info       ||= Organism.exons(org).tsv(:persistence => true)
-    exon_offsets    ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
-    exons = [exons] unless Array === exons
-    transcript_offsets = {}
-    exons.each do |exon|
-      transcript_offsets[exon] ||= {}
-      offsets = nil
-      next unless exon_offsets.include? exon
-      offsets = exon_offsets[exon].zip_fields
-      offsets.collect do |transcript, offset|
-        next if transcript.empty?
-        transcript_offsets[exon][transcript] = offset.to_i
-      end
-    end
-    transcript_offsets
-  end
-  def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
-    exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
-    exon_start   ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
-    exon_end     ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
-    exon_strand  ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
-    exons          = exons_at_genomic_positions(org, positions)
-    offsets        = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
-    position_exons = {}
-    positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end
-    position_offsets = {}
-    position_exons.each do |position,pos_exons|
-      chr, pos = position
-      next if pos_exons.nil? or pos_exons.empty?
-      pos_exons.each do |exon|
-        if offsets.include? exon
-          if exon_strand[exon] == 1
-            offset_in_exon = (pos.to_i - exon_start[exon].to_i)
-          else
-            offset_in_exon = (exon_end[exon] - pos.to_i)
-          end
-          position_offsets[position] ||= {}
-          offsets[exon].each do |transcript, offset|
-            if not offset.nil?
-              position_offsets[position][transcript] = [offset  + offset_in_exon, exon_strand[exon]]
-            end
-          end
-        end
-      end
-    end
-    position_offsets
-  end
-  def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
-    chromosome = chromosome.to_s
-    chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
-      tsv = file.tsv(:persistence => true, :type => :list)
-      tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
-        [exon, values["Exon Chr Start"].to_i]
-      end
-    end
-    chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
-      tsv = file.tsv(:persistence => true, :type => :list)
-      tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
-        [exon, values["Exon Chr End"].to_i]
-      end
-    end
-    if Array === positions
-      positions.collect{|position|
-        position = position.to_i
-        chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
-      }
-    else
-      position = positions.to_i
-      chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
-    end
-  end
-  def self.exon_junctures_at_genomic_positions(org, positions)
-    positions = [positions] unless Array === positions.first
-    exons = []
-    chromosomes = {}
-    indices     = {}
-    positions.each_with_index do |info,i|
-      chr, pos = info
-      chromosomes[chr] ||= []
-      indices[chr] ||= []
-      chromosomes[chr] << pos
-      indices[chr] << i
-    end
-    chromosomes.each do |chr, pos_list|
-      chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
-      chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
-    end
-    exons
-  end
-  def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
-    chromosome = chromosome.to_s
-    chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
-      rows = []
-      chromosome = options[:chromosome]
-      f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
-      while not f.eof?
-        line = f.gets.chomp
-        id, chr, pos = line.split "\t"
-        rows << [id, pos.to_i]
-      end
-      rows
-    end
-    if Array === positions
-      positions.collect{|position|
-        chromosome_bed[position];
-      }
-    else
-      chromosome_bed[positions];
-    end
-  end
-  def self.identify_variations_at_genomic_positions(org, positions, variations_file)
-    positions = [positions] unless Array === positions.first
-    variations = []
-    chromosomes = {}
-    indices     = {}
-    positions.each_with_index do |info,i|
-      chr, pos = info
-      chromosomes[chr] ||= []
-      indices[chr] ||= []
-      chromosomes[chr] << pos
-      indices[chr] << i
-    end
-    chromosomes.each do |chr, pos_list|
-      chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
-      chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
-    end
-    variations
-  end
-  task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
-  task_dependencies nil
-  task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
-    genomic_mutations = case
-                        when TSV === genomic_mutations
-                          genomic_mutations
-                        else
-                          TSV.new StringIO.new(genomic_mutations), :list
-                        end
-    genomic_mutations.key_field ||= "Position"
-    genomic_mutations.fields    ||= ["Mutation"]
-    positions = genomic_mutations.keys.collect{|l| l.split(":")}
-    step(:resources, "Load Resources")
-    exon_junctures = {}
-    genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
-      exon_junctures[position] = exons
-    end
-    genomic_mutations.add_field "Exon Junctions" do |position, values|
-      exon_junctures[position] * "|"
-    end
-    genomic_mutations.to_s :sort, true
-  end
-  task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
-  task_dependencies nil
-  task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
-    genomic_mutations = case
-                        when TSV === genomic_mutations
-                          genomic_mutations
-                        else
-                          TSV.new StringIO.new(genomic_mutations), :list
-                        end
-    genomic_mutations.key_field ||= "Position"
-    genomic_mutations.fields    ||= ["Mutation"]
-    positions = genomic_mutations.keys.collect{|l| l.split(":")}
-    step(:resources, "Load Resources")
-    genes_at_positions = Hash[*genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions)).flatten]
-    genomic_mutations.add_field "#{org.sub(/\/.*/,'')}:Ensembl Gene ID" do |position, values|
-      genes_at_positions[position]
-    end
-    genomic_mutations
-  end
-  task_description <<-EOF
-Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
-protein products of transcripts including those positions.
-  EOF
-  task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
-  task_dependencies nil
-  task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
-    genomic_mutations = case
-                        when TSV === genomic_mutations
-                          genomic_mutations
-                        else
-                          TSV.new StringIO.new(genomic_mutations), :list
-                        end
-    genomic_mutations.key_field ||= "Position"
-    genomic_mutations.fields    ||= ["Mutation"]
-    positions = genomic_mutations.keys.collect{|l| l.split(":")}
-    step(:prepare, "Prepare Results")
-    results = TSV.new({})
-    results.key_field = "Position"
-    results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
-    results.type = :double
-    results.filename = path
-    step(:resources, "Load Resources")
-    transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
-    transcript_5utr     = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
-    exon_offsets        = Organism.exon_offsets(org).tsv(:double, :persistence => true)
-    exon_start          = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
-    exon_end            = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
-    exon_strand         = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
-    transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
-    step(:offsets, "Find transcripts and offsets for mutations")
-    offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
-    step(:aminoacid, "Translate mutation to amino acid substitutions")
-    offsets.each do |position, transcripts|
-      if genomic_mutations.type === :double
-        alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
-      else
-        alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
-      end
-      transcripts.each do |transcript, offset_info|
-        offset, strand = offset_info
-        codon = begin
-                  Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
-                rescue
-                  Log.medium $!.message
-                  next
-                end
-        if not codon.nil? and not codon.empty?
-          alleles.each do |allele|
-            allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
-            change = Organism.codon_change(allele, *codon.values_at(0,1))
-            pos_code = position * ":"
-            mutation = [change.first, codon.last + 1, change.last] * ""
-            if results.include? pos_code
-              results[pos_code] = results[pos_code].merge [transcript, mutation]
-            else
-              results[pos_code] = [[transcript], [mutation]]
-            end
-          end
-        end
-      end
-    end
-    step(:identify_proteins, "Identify Proteins for Transcripts")
-    transcript_field = results.identify_field "Ensembl Transcript ID"
-    results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
-      values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
-    end
-    results
-  end
-  task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
-  task_dependencies nil
-  task :identify_germline_variations => :tsv do |org,genomic_mutations|
-    genomic_mutations = case
-                        when TSV === genomic_mutations
-                          genomic_mutations
-                        else
-                          TSV.new StringIO.new(genomic_mutations), :list
-                        end
-    genomic_mutations.key_field ||= "Position"
-    genomic_mutations.fields    ||= ["Mutation"]
-    positions = genomic_mutations.keys.collect{|l| l.split(":")}
-    step(:prepare, "Prepare Results")
-    results = TSV.new({})
-    results.key_field = "Position"
-    results.fields = ["SNP Id"]
-    results.type = :double
-    results.filename = path
-    step(:resources, "Load Resources")
-    snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
-    snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
-    genomic_mutations.add_field "Germline SNP Id" do |position, values|
-      snps_for_positions[position]
-    end
-    genomic_mutations
-  end
-  task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
-  task_dependencies nil
-  task :identify_somatic_variations => :tsv do |org,genomic_mutations|
-    genomic_mutations = case
-                        when TSV === genomic_mutations
-                          genomic_mutations
-                        else
-                          TSV.new StringIO.new(genomic_mutations), :list
-                        end
-    genomic_mutations.key_field ||= "Position"
-    genomic_mutations.fields    ||= ["Mutation"]
-    positions = genomic_mutations.keys.collect{|l| l.split(":")}
-    step(:prepare, "Prepare Results")
-    results = TSV.new({})
-    results.key_field = "Position"
-    results.fields = ["SNP Id"]
-    results.type = :double
-    results.filename = path
-    step(:resources, "Load Resources")
-    snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
-    snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
-    genomic_mutations.add_field "Germline SNP Id" do |position, values|
-      snps_for_positions[position]
-    end
-    genomic_mutations
-  end
-end
-# Manual smoke test, run only when this file is executed directly: it runs
-# the exon junction task on an inline fixture, then the protein mutation task
-# on a local Metastasis.tsv file.
-if __FILE__ == $0
-  require 'rbbt/util/log'
-  require 'benchmark'
-  select = <<-EOF
-3:64581875
-  EOF
-  select = select.split("\n").collect{|l| l.split(":")}
-  picmi_test = <<-EOF
-#Chromosome	Name	Position	Reference	Tumor
-1	100382265	C	G
-1	100380997	A	G
-22	30163533	A	C
-X	10094215	G	A
-X	10085674	C	T
-20	50071099	G	T
-21	19638426	G	T
-2	230633386	C	T
-2	230312220	C	T
-1	100624830	T	A
-4	30723053	G	T
-  EOF
-  # Build 37
-  picmi_test = <<-EOF
-#Chromosome	Name	Position	Reference	Tumor
-1	100624830	T	A
-21 19638426 G T
-  EOF
-  exon_juncture_test = <<-EOF
-#Position Mutation
-7:150753996 T
-  EOF
-  # The task is declared as :genomic_mutations_in_exon_junctions; the name
-  # requested here used to end in "junctures" and matched no task.
-  job =  Organism.job :genomic_mutations_in_exon_junctions, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
-  job.run
-  job.clean if job.error?
-  puts job.messages
-  puts job.read
-#  # Build 36
-#  picmi_test = <<-EOF
-##Chromosome	Name	Position	Reference	Tumor
-#3 81780820 T C
-#2 43881517 A T
-#2 43857514 T C
-#6 88375602 G A
-#16 69875502 G T
-#16 69876078 T C
-#16 69877147 G A
-#17 8101874 C T
-#  EOF
-  Log.severity = 2
-  org = 'Hsa/may2009'
-  file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')
-  #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
-  # The :fix proc turns the first tab of each line into ":" so the key
-  # becomes "chr:position".
-  positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
-  positions.key_field = "Position"
-  positions.fields = %w(Reference Control Tumor)
-  #positions.fields = %w(Reference Tumor)
-  #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"
-  #positions =  positions.select ["10:98099540"]
-  Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
-  job =  Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
-  job.run
-  # Poll every 2 seconds, printing the current step, until the job is done.
-  while not job.done?
-    puts job.step
-    sleep 2
-  end
-  raise job.messages.last if job.error?
-  mutations = job.load
-end