RubyGems - rbbt-sources - Versions diffs - 0.4.0 → 1.0.0 - Mend

rbbt-sources 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/etc/biomart/missing_in_archive +15 -0
data/lib/rbbt/sources/COSMIC.rb +14 -0
data/lib/rbbt/sources/COSTART.rb +1 -1
data/lib/rbbt/sources/CTCAE.rb +1 -1
data/lib/rbbt/sources/InterPro.rb +17 -0
data/lib/rbbt/sources/NCI.rb +7 -0
data/lib/rbbt/sources/biomart.rb +9 -9
data/lib/rbbt/sources/entrez.rb +44 -17
data/lib/rbbt/sources/go.rb +10 -7
data/lib/rbbt/sources/jochem.rb +4 -0
data/lib/rbbt/sources/organism.rb +24 -25
data/lib/rbbt/sources/organism/sequence.rb +253 -19
data/lib/rbbt/sources/polysearch.rb +5 -5
data/lib/rbbt/sources/pubmed.rb +10 -5
data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
data/share/install/InterPro/Rakefile +29 -0
data/share/install/JoChem/Rakefile +67 -0
data/share/install/NCI/Rakefile +79 -0
data/share/install/Organism/Hsa/Rakefile +20 -1
data/share/install/Organism/Rno/Rakefile +2 -0
data/share/install/Organism/organism_helpers.rb +134 -77
data/share/install/lib/helpers.rb +6 -5
data/test/rbbt/sources/test_biomart.rb +8 -5
data/test/rbbt/sources/test_organism.rb +23 -19
metadata +39 -14

data/etc/biomart/missing_in_archive ADDED Viewed

@@ -0,0 +1,15 @@
+may2009:
+    - agilent_wholegenome
+    - agilent_cgh_44b
+    - illumina_humanwg_6_v2
+    - illumina_humanwg_6_v3
+dec2007:
+    - protein_id
+    - affy_hc_g110
+    - affy_hg_u133a_2
+    - affy_huex_1_0_st_v2
+    - affy_hugene_1_0_st_v1
+    - agilent_wholegenome
+    - agilent_cgh_44b
+    - illumina_humanwg_6_v2
+    - illumina_humanwg_6_v3

data/lib/rbbt/sources/COSMIC.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'rbbt'
+require 'rbbt/resource'
+module COSMIC
+  extend Resource
+  self.subdir = "share/databases/COSMIC"
+  COSMIC.claim COSMIC.Mutations, :proc do
+    url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicMutantExport_v54_120711.tsv"
+    TSV.open(Open.open(url), :header_hash => "", :key_field => "Mutation GRCh37 genome position", :merge => true).to_s
+  end
+end
+puts COSMIC.Mutations.produce

data/lib/rbbt/sources/COSTART.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require 'rbbt-util'
 module COSTART
-  Rbbt.share.databases.COSTART.COSTART.define_as_proc do
+  Rbbt.claim Rbbt.share.databases.COSTART.COSTART, :proc do
       terms = ["#COSTART Terms"]
       Open.open('http://hedwig.mgh.harvard.edu/biostatistics/files/costart.html').lines.each do |line|
         puts line

data/lib/rbbt/sources/CTCAE.rb CHANGED Viewed

@@ -2,5 +2,5 @@ require 'rbbt-util'
 require 'rbbt/util/excel2tsv'
 module CTCAE
-  Rbbt.share.CTCAE.CTCAE.define_as_url TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls')
+  Rbbt.claim Rbbt.share.databases.CTCAE.CTCAE, :proc do TSV.excel2tsv('http://evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03_2010-06-14.xls').to_s end
 end

data/lib/rbbt/sources/InterPro.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require 'rbbt-util'
+module InterPro
+  extend Resource
+  self.subdir = "share/databases/InterPro"
+  InterPro.claim InterPro.root.find, :rake, Rbbt.share.install.InterPro.Rakefile.find(:lib)
+  def self.tsv(*args)
+    old_url = BioMart::BIOMART_URL
+    begin
+      BioMart::BIOMART_URL.replace "http://www.ebi.ac.uk/interpro/biomart/martservice?query="
+      BioMart.tsv(*args)
+    ensure
+      BioMart::BIOMART_URL.replace old_url
+    end
+  end
+end

data/lib/rbbt/sources/NCI.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require 'rbbt-util'
+module NCI
+  extend Resource
+  self.subdir = "share/databases/NCI"
+  NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
+end

data/lib/rbbt/sources/biomart.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'rbbt'
-require 'rbbt/util/tsv'
+require 'rbbt/tsv'
+require 'rbbt/tsv/attach'
 require 'rbbt/util/log'
 require 'cgi'
@@ -14,7 +15,7 @@ module BioMart
   BIOMART_URL = 'http://biomart.org/biomart/martservice?query='
-  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.yaml
+  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}
   private
@@ -68,10 +69,10 @@ module BioMart
     new_datafile = TmpFile.tmp_file
     if data.nil?
-      TSV.merge_rows Open.open(result_file), new_datafile
+      TSV.merge_row_fields Open.open(result_file), new_datafile
       data = new_datafile
     else
-      TSV.paste_merge data, result_file, new_datafile
+      TSV.merge_different_fields data, result_file, new_datafile
       FileUtils.rm data
       data = new_datafile
     end
@@ -117,7 +118,6 @@ module BioMart
     }
     chunks << chunk if chunk.any?
     Log.low "Chunks: #{chunks.length}"
     chunks.each_with_index{|chunk,i|
@@ -125,15 +125,15 @@ module BioMart
       data = get(database, main, chunk, filters, data, open_options)
     }
-    open_options[:filename] ||= "BioMart: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}"
+    open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
     if filename.nil?
-      results = TSV.new data, open_options
+      results = TSV.open data, open_options
       results.key_field = main
       results.fields = attrs
       results
     else
       Open.write(filename) do |f|
-        f.puts "#: " << Misc.hash2string(TSV::EXTRA_ACCESSORS.collect{|key| [key, open_options[key]]})
+        f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
         if field_names.nil?
           f.puts "#" << [main, attrs].flatten * "\t"
         else
@@ -148,7 +148,7 @@ module BioMart
   def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
     if @archive_url
-      attrs = attrs.reject{|attr| MISSING_IN_ARCHIVE[@archive].include? attr[1]}
+      attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
     end
     codes = attrs.collect{|attr| attr[1]}

data/lib/rbbt/sources/entrez.rb CHANGED Viewed

@@ -1,18 +1,19 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/resource'
 require 'rbbt/bow/bow'
 require 'set'
 module Entrez
-  Rbbt.share.databases.entrez.gene_info.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
-  Rbbt.share.databases.entrez.gene2pubmed.define_as_url 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
+  Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
+  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
   def self.entrez2native(taxs, options = {})
-    options = Misc.add_defaults options, :key => 1, :fields => 5, :persistence => true, :merge => true
+    options = Misc.add_defaults options, :key_field => 1, :fields => 5, :persist => true, :merge => true
     taxs = [taxs] unless Array === taxs
-    options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
+    options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
     tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
     tsv.key_field = "Entrez Gene ID"
@@ -20,12 +21,24 @@ module Entrez
     tsv
   end
+  def self.entrez2name(taxs, options = {})
+    options = Misc.add_defaults options, :key_field => 1, :fields => 2, :persist => true, :merge => true
+    taxs = [taxs] unless Array === taxs
+    options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
+    tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
+    tsv.key_field = "Entrez Gene ID"
+    tsv.fields    = ["Associated Gene Name"]
+    tsv
+  end
   def self.entrez2pubmed(taxs)
-    options = {:key => 1, :fields => 2, :persistence => true, :merge => true}
+    options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
     taxs = [taxs] unless taxs.is_a?(Array)
-    taxs = taxs.collect{|t| t.to_s}
-    options.merge! :grep => taxs.collect{|t| "^#{ t }\t"}
+    options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
     Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
   end
@@ -58,18 +71,31 @@ module Entrez
   private
   def self.get_online(geneids)
-    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
-    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
-    xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
+    genes_complete =  geneids.is_a?(Array) ? geneids : [geneids]
-    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
+    genes = []
+    Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
+      begin
+        Misc.try3times do
+          url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
+          xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
+          genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
+        end
+      rescue
+        puts $!.message
+        genes += geneids_list.collect{|g| nil}
+      end
+    end
     if geneids.is_a? Array
-      list = {}
-      genes.each_with_index{|gene,i|
-        geneid = geneids[i]
-        list[geneid ] = gene
+      list = Hash[*genes_complete.zip([nil]).flatten]
+      genes.each{|gene|
+        geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
+        geneid = geneid.to_i unless list.include? geneid
+        list[geneid] = gene
       }
       return list
     else
@@ -99,6 +125,7 @@ module Entrez
         end
       }
       return list unless missing.any?
       genes = get_online(missing)

data/lib/rbbt/sources/go.rb CHANGED Viewed

@@ -1,22 +1,24 @@
-require 'rbbt-util'
+require 'rbbt'
+require 'rbbt/resource'
+require 'rbbt/persist/tsv'
 # This module holds helper methods to deal with the Gene Ontology files. Right
 # now all it does is provide a translation from id to the actual names.
 module GO
-  Rbbt.share.databases.GO.gene_ontology.define_as_url 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
-  Rbbt.share.databases.GO.gslim_generic.define_as_url 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
+  Rbbt.claim Rbbt.share.databases.GO.gene_ontology, :url, 'ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo'
+  Rbbt.claim Rbbt.share.databases.GO.gslim_generic, :url, 'http://www.geneontology.org/GO_slims/goslim_generic.obo'
   MULTIPLE_VALUE_FIELDS = %w(is_a)
-  TSV_GENE_ONTOLOGY = File.join(Persistence.cachedir, 'gene_ontology')
+  TSV_GENE_ONTOLOGY = File.join(Persist.cachedir, 'gene_ontology')
   # This method needs to be called before any translations can be made; it is
   # called automatically the first time the id2name method is called. It loads
   # the gene_ontology.obo file and extracts all the fields, although right now,
   # only the name field is used.
   def self.init
-    init = Persistence.persist_tsv('gene_ontology', :Misc) do
-      info = {}
+    Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
+      info.serializer = :marshal if info.respond_to? :serializer and info.serializer == :type
       Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
         term_info = {}
@@ -33,12 +35,13 @@ module GO
         next if term_info["id"].nil?
         info[term_info["id"]] = term_info
       }
       info
     end
   end
   def self.info
-    self.init
+    @info ||= self.init
   end
   def self.goterms

data/lib/rbbt/sources/jochem.rb ADDED Viewed

@@ -0,0 +1,4 @@
+require 'rbbt-util'
+module JoChem
+  Rbbt.claim Rbbt.share.databases.JoChem, :rake, Rbbt.share.install.JoChem.Rakefile.find
+end

data/lib/rbbt/sources/organism.rb CHANGED Viewed

@@ -1,32 +1,35 @@
-require 'rbbt-util'
-require 'rbbt/util/resource'
+require 'rbbt'
+require 'rbbt/resource'
+require 'rbbt/resource/with_key'
 module Organism
   extend Resource
-  relative_to Rbbt, "share/organisms"
+  self.pkgdir = "rbbt"
+  self.subdir = "share/organisms"
-  class OrganismNotProcessedError < StandardError; end
+  ["Hsa", "Rno", "Sce"].each do |organism|
+    claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
-  def self.datadir(org)
-    File.join(Rbbt.datadir, 'organisms', org)
-  end
+    module_eval "#{ organism } = with_key '#{organism}'"
+  end
+  class OrganismNotProcessedError < StandardError; end
   def self.attach_translations(org, tsv, target = nil, fields = nil, options = {})
     Log.high "Attaching Translations for #{ org.inspect }, target #{target.inspect}, fields #{fields.inspect}"
-    options = Misc.add_defaults options, :persistence => true, :case_insensitive => false
+    options = Misc.add_defaults options, :persist => true, :case_insensitive => false
-    options.merge! :key    => target unless target.nil?
+    options.merge! :key_field    => target unless target.nil?
     options.merge! :fields => fields unless fields.nil?
     index = identifiers(org).tsv options
-    tsv.attach index, [:key]
+    tsv.attach index, :fields => [:key], :persist_input => true
   end
   def self.normalize(org, list, target = nil, fields = nil, options = {})
     return [] if list.nil? or list.empty?
-    options = Misc.add_defaults options, :persistence => true, :case_insensitive => true, :double => false
+    options = Misc.add_defaults options, :persist => true, :case_insensitive => true, :double => false
     double = Misc.process_options options, :double
@@ -50,14 +53,20 @@ module Organism
     end
   end
-  def self.guess_id(org, values)
-    identifiers = TSV.new(Organism.identifiers(org), :persistence => true)
+  def self.guess_id(org, values, identifiers = nil)
+    identifiers ||= TSV.setup(Organism.identifiers(org), :persist => true)
     field_matches = identifiers.field_matches(values)
     field_matches.sort_by{|field, matches| matches.uniq.length}.last
   end
+  def self.guess_id(org, values)
+    field_matches = TSV.field_match_counts(Organism.identifiers(org).find, values)
+    field_matches.sort_by{|field, count| count.to_i}.last
+  end
   def self.organisms
-    Dir.glob(File.join(Rbbt.share.organisms.find, '*')).collect{|f| File.basename(f)}
+    Dir.glob(File.join(Organism.root.find, '*')).collect{|f| File.basename(f)}
   end
   def self.name(organism)
@@ -70,14 +79,4 @@ module Organism
     }.first
   end
-  ["Hsa", "Rno", "Sce"].each do |organism|
-    rakefile = Rbbt["share/install/Organism/#{ organism }/Rakefile"]
-    rakefile.lib_dir = Resource.caller_lib_dir __FILE__
-    rakefile.pkgdir = 'phgx'
-    Organism[organism].define_as_rake rakefile
-    module_eval "#{ organism } = with_key '#{organism}'"
-  end
 end

data/lib/rbbt/sources/organism/sequence.rb CHANGED Viewed

@@ -11,7 +11,12 @@ module Organism
     exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
     transcript_info  ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
-    transcripts = exon_transcripts[exon].first
+    transcripts = begin
+                    exon_transcripts[exon].first
+                  rescue
+                    []
+                  end
     transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
   end
@@ -156,6 +161,8 @@ module Organism
     transcript_offsets = {}
     exons.each do |exon|
       transcript_offsets[exon] ||= {}
+      offsets = nil
+      next unless exon_offsets.include? exon
       offsets = exon_offsets[exon].zip_fields
       offsets.collect do |transcript, offset|
@@ -173,7 +180,7 @@ module Organism
     exon_end     ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
     exon_strand  ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
-    exons = exons_at_genomic_positions(org, positions)
+    exons          = exons_at_genomic_positions(org, positions)
     offsets        = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
     position_exons = {}
@@ -203,8 +210,135 @@ module Organism
     position_offsets
   end
+  def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
+    chromosome = chromosome.to_s
+    chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
+      tsv = file.tsv(:persistence => true, :type => :list)
+      tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
+        [exon, values["Exon Chr Start"].to_i]
+      end
+    end
+    chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
+      tsv = file.tsv(:persistence => true, :type => :list)
+      tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
+        [exon, values["Exon Chr End"].to_i]
+      end
+    end
+    if Array === positions
+      positions.collect{|position|
+        position = position.to_i
+        chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
+      }
+    else
+      position = positions.to_i
+      chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
+    end
+  end
+  def self.exon_junctures_at_genomic_positions(org, positions)
+    positions = [positions] unless Array === positions.first
+    exons = []
+    chromosomes = {}
+    indices     = {}
+    positions.each_with_index do |info,i|
+      chr, pos = info
+      chromosomes[chr] ||= []
+      indices[chr] ||= []
+      chromosomes[chr] << pos
+      indices[chr] << i
+    end
+    chromosomes.each do |chr, pos_list|
+      chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
+      chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
+    end
+    exons
+  end
+  def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
+    chromosome = chromosome.to_s
+    chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
+      rows = []
+      chromosome = options[:chromosome]
+      f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
+      while not f.eof?
+        line = f.gets.chomp
+        id, chr, pos = line.split "\t"
+        rows << [id, pos.to_i]
+      end
+      rows
+    end
+    if Array === positions
+      positions.collect{|position|
+        chromosome_bed[position];
+      }
+    else
+      chromosome_bed[positions];
+    end
+  end
+  def self.identify_variations_at_genomic_positions(org, positions, variations_file)
+    positions = [positions] unless Array === positions.first
+    variations = []
+    chromosomes = {}
+    indices     = {}
+    positions.each_with_index do |info,i|
+      chr, pos = info
+      chromosomes[chr] ||= []
+      indices[chr] ||= []
+      chromosomes[chr] << pos
+      indices[chr] << i
+    end
+    chromosomes.each do |chr, pos_list|
+      chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
+      chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
+    end
+    variations
+  end
+  task_option :organism, "Organism", :string, "Hsa"
+  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
+  task_dependencies nil
+  task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
+    genomic_mutations = case
+                        when TSV === genomic_mutations
+                          genomic_mutations
+                        else
+                          TSV.new StringIO.new(genomic_mutations), :list
+                        end
+    genomic_mutations.key_field ||= "Position"
+    genomic_mutations.fields    ||= ["Mutation"]
+    positions = genomic_mutations.keys.collect{|l| l.split(":")}
+    step(:resources, "Load Resources")
+    exon_junctures = {}
+    genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
+      exon_junctures[position] = exons
+    end
+    genomic_mutations.add_field "Exon Junctions" do |position, values|
+      exon_junctures[position] * "|"
+    end
+    genomic_mutations.to_s :sort, true
+  end
   task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
+  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
   task_dependencies nil
   task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
     genomic_mutations = case
@@ -213,8 +347,8 @@ module Organism
                         else
                           TSV.new StringIO.new(genomic_mutations), :list
                         end
-    genomic_mutations.key_field = "Position"
-    genomic_mutations.fields = ["Mutation"]
+    genomic_mutations.key_field ||= "Position"
+    genomic_mutations.fields    ||= ["Mutation"]
     positions = genomic_mutations.keys.collect{|l| l.split(":")}
@@ -234,7 +368,7 @@ Translates a collection of mutations in genomic coordinates into mutations in am
 protein products of transcripts including those positions.
   EOF
   task_option :organism, "Organism", :string, "Hsa"
-  task_option :genomic_mutations, "Position (chr:position), Allele", :tsv
+  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
   task_dependencies nil
   task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
     genomic_mutations = case
@@ -244,8 +378,8 @@ protein products of transcripts including those positions.
                           TSV.new StringIO.new(genomic_mutations), :list
                         end
-    genomic_mutations.key_field = "Position"
-    genomic_mutations.fields = ["Mutation"]
+    genomic_mutations.key_field ||= "Position"
+    genomic_mutations.fields    ||= ["Mutation"]
     positions = genomic_mutations.keys.collect{|l| l.split(":")}
@@ -256,7 +390,6 @@ protein products of transcripts including those positions.
     results.type = :double
     results.filename = path
     step(:resources, "Load Resources")
     transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
     transcript_5utr     = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
@@ -264,26 +397,31 @@ protein products of transcripts including those positions.
     exon_start          = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
     exon_end            = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
     exon_strand         = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
+    transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
     step(:offsets, "Find transcripts and offsets for mutations")
     offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
     step(:aminoacid, "Translate mutation to amino acid substitutions")
     offsets.each do |position, transcripts|
-      alleles = genomic_mutations[position * ":"].collect{|allele| Misc.IUPAC_to_base(allele)}.flatten
+      if genomic_mutations.type === :double
+        alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
+      else
+        alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
+      end
       transcripts.each do |transcript, offset_info|
         offset, strand = offset_info
-        begin
-          codon = Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
-        rescue
-          Log.medium $!.message
-          next
-        end
-        if not codon.nil?
+        codon = begin
+                  Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
+                rescue
+                  Log.medium $!.message
+                  next
+                end
+        if not codon.nil? and not codon.empty?
           alleles.each do |allele|
-            allele = Misc::BASE2COMPLEMENT[allele] if strand == -1
+            allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
             change = Organism.codon_change(allele, *codon.values_at(0,1))
             pos_code = position * ":"
             mutation = [change.first, codon.last + 1, change.last] * ""
@@ -298,8 +436,93 @@ protein products of transcripts including those positions.
     end
+    step(:identify_proteins, "Identify Proteins for Transcripts")
+    transcript_field = results.identify_field "Ensembl Transcript ID"
+    results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
+      values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
+    end
     results
   end
+  task_option :organism, "Organism", :string, "Hsa"
+  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
+  task_dependencies nil
+  task :identify_germline_variations => :tsv do |org,genomic_mutations|
+    genomic_mutations = case
+                        when TSV === genomic_mutations
+                          genomic_mutations
+                        else
+                          TSV.new StringIO.new(genomic_mutations), :list
+                        end
+    genomic_mutations.key_field ||= "Position"
+    genomic_mutations.fields    ||= ["Mutation"]
+    positions = genomic_mutations.keys.collect{|l| l.split(":")}
+    step(:prepare, "Prepare Results")
+    results = TSV.new({})
+    results.key_field = "Position"
+    results.fields = ["SNP Id"]
+    results.type = :double
+    results.filename = path
+    step(:resources, "Load Resources")
+    snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
+    snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
+    genomic_mutations.add_field "Germline SNP Id" do |position, values|
+      snps_for_positions[position]
+    end
+    genomic_mutations
+  end
+  task_option :organism, "Organism", :string, "Hsa"
+  task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
+  task_dependencies nil
+  task :identify_somatic_variations => :tsv do |org,genomic_mutations|
+    genomic_mutations = case
+                        when TSV === genomic_mutations
+                          genomic_mutations
+                        else
+                          TSV.new StringIO.new(genomic_mutations), :list
+                        end
+    genomic_mutations.key_field ||= "Position"
+    genomic_mutations.fields    ||= ["Mutation"]
+    positions = genomic_mutations.keys.collect{|l| l.split(":")}
+    step(:prepare, "Prepare Results")
+    results = TSV.new({})
+    results.key_field = "Position"
+    results.fields = ["SNP Id"]
+    results.type = :double
+    results.filename = path
+    step(:resources, "Load Resources")
+    snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
+    snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
+    genomic_mutations.add_field "Germline SNP Id" do |position, values|
+      snps_for_positions[position]
+    end
+    genomic_mutations
+  end
 end
 if __FILE__ == $0
@@ -333,6 +556,17 @@ X	10085674	C	T
 21 19638426 G T
   EOF
+  exon_juncture_test = <<-EOF
+#Position Mutation
+7:150753996 T
+  EOF
+  job =  Organism.job :genomic_mutations_in_exon_junctures, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
+  job.run
+  job.clean if job.error?
+  puts job.messages
+  puts job.read
 #  # Build 36
 #  picmi_test = <<-EOF