RubyGems - rbbt-phgx - Versions diffs - 1.0.0 → 2.0.0 - Mend

rbbt-phgx 1.0.0 → 2.0.0

Files changed (24) hide show

data/lib/rbbt/mutation/mutation_assessor.rb +32 -24
data/lib/rbbt/mutation/oncodriveFM.rb +146 -0
data/lib/rbbt/mutation/polyphen.rb +59 -30
data/lib/rbbt/mutation/sift.rb +16 -4
data/lib/rbbt/mutation/snps_and_go.rb +1 -1
data/lib/rbbt/mutation/transFIC.rb +97 -0
data/lib/rbbt/sources/kegg.rb +38 -39
data/lib/rbbt/sources/pharmagkb.rb +1 -1
data/lib/rbbt/sources/pina.rb +26 -0
data/lib/rbbt/sources/string.rb +19 -0
data/share/install/Pina/Rakefile +2 -2
data/share/install/STRING/Rakefile +1 -1
data/share/install/software/OncodriveFM +13 -0
data/test/rbbt/mutation/test_mutation_assessor.rb +1 -5
data/test/rbbt/mutation/test_oncodriveFM.rb +13 -0
data/test/rbbt/mutation/test_polyphen.rb +5 -3
data/test/rbbt/mutation/test_transFIC.rb +14 -0
data/test/rbbt/sources/test_kegg.rb +20 -0
data/test/test_helper.rb +0 -3
metadata +42 -56
data/lib/rbbt/sources/hprd.rb +0 -6
data/lib/rbbt/sources/reactome.rb +0 -6
data/share/install/HPRD/Rakefile +0 -15
data/share/install/Reactome/Rakefile +0 -36

data/lib/rbbt/mutation/mutation_assessor.rb CHANGED

@@ -5,12 +5,13 @@ require 'digest/md5'
 module MutationAssessor
   class NotDone < StandardError; end
-  URL="http://mutationassessor.org/"
+  URL="http://mutationassessor.org"
   ASTERISK = "*"[0]
   # mutations is a hash of genes in Uniprot protein accession pointing to lists
   # of aminoacid substitutions
   def self.predict(mutations)
+    return TSV.setup({}, :header_hash => "", :type => :list) if mutations.empty? or mutations.nil?
     vars = mutations.collect{|gene, list|
       list = [list] unless Array === list
       list.collect do |mut|
@@ -36,12 +37,11 @@ module MutationAssessor
         doc = Nokogiri::HTML(Open.read(URL, :wget_options => {"--post-file" => post_file }, :nocache => nocache))
       end
-      textareas = doc.css('textarea')
+      textareas = doc.css('p')
       if textareas.empty?
-        puts "No text area"
-        puts doc
-        puts
+        Log.debug "No text area"
+        Log.debug doc.to_s
         raise NotDone, "No text aread found in response HTML"
       end
@@ -70,11 +70,11 @@ module MutationAssessor
       end
     end
-    if result.empty?
+    if result.empty? and mutations.any?
       tmp = TmpFile.tmp_file
       html = tmp + ".html"
       variants = tmp + ".list"
-      Open.write(tmp, doc.content)
+      Open.write(html, doc.content)
       Open.write(variants, post_data )
       raise "Result empty. Possible error. html in #{ html }, variants in #{variants}"
     end
@@ -82,22 +82,34 @@ module MutationAssessor
     result.sub! /^\t/, ''
     result.gsub! /\n\s*\d+\s*\t/s, "\n"
+    Log.medium "Mutation Assessor DONE."
     if result.empty?
       TSV.setup({}, :header_hash => "", :type => :list)
     else
-      TSV.open(StringIO.new(result), :header_hash => "", :type => :list)
+      res = TSV.open(StringIO.new(result), :header_hash => "", :type => :list)
+      res = res.slice((res.fields - ["Mapping issue"]))
+      res
     end
   end
-  def self.chunked_predict(mutations)
-    chunks = mutations.length.to_f / 1000
+  def self.chunked_predict(mutations, max = 1000)
+    flattened_mutations = mutations.collect{|g,list| list = [list] unless Array === list; list.collect{|m| [g,m] } }.flatten(1)
+    chunks = flattened_mutations.length.to_f / max
     chunks = chunks.ceil
-    Misc.divide(mutations.sort_by{|m| m * ":"}, chunks).inject(nil) do |acc, list|
+    Log.debug("Mutation Assessor ran with #{chunks} chunks of #{ max } mutations") if chunks > 1
+    num = 1
+    Misc.divide(flattened_mutations, chunks).inject(nil) do |acc, list|
+      Log.debug("Mutation Assessor ran with #{chunks} chunks: chunk #{num}") if chunks > 1
+      unflattened_mutations = {}
+      list.each{|g,m| next if g.nil?; unflattened_mutations[g] ||= []; unflattened_mutations[g] << m}
       if acc.nil?
-        acc = predict(list)
+        acc = predict(unflattened_mutations)
       else
-        acc = TSV.setup(acc.merge(predict(list)))
+        acc = TSV.setup(acc.merge(predict(unflattened_mutations)))
       end
+      num += 1
       acc
     end
   end
@@ -131,19 +143,15 @@ module MutationAssessor
     data.sort!
     predictions = {}
     predict(data).each{|uni_acc, values|
       protein, mutation = uni_acc.split(/\s+/)
-      values = values.zip_fields
-      values.each do |v|
-        pred     = v["Func. Impact"]
-        predictions[protein] ||= {}
-        predictions[protein][mutation] = pred
-      end
+      pred     = values["Func. Impact"]
+      predictions[protein] ||= {}
+      predictions[protein][mutation] = pred
     }
     uni_acc_pos = tsv.identify_field "UniProt/SwissProt ID"
     protein_field = tsv.identify_field "Protein Mutation"
@@ -169,11 +177,11 @@ module MutationAssessor
                   "No Prediction"
                 else
                   list = []
-                  list = predictions[uni_acc][mutation] if predictions.include? uni_acc
-                  if list.nil?
+                  pred = predictions[uni_acc][mutation] if predictions.include? uni_acc
+                  if pred.nil?
                     "No Prediction"
                   else
-                    list.first
+                    pred
                   end
                 end
           res

data/lib/rbbt/mutation/oncodriveFM.rb ADDED

@@ -0,0 +1,146 @@
+require 'rbbt-util'
+require 'rbbt/util/open'
+require 'rbbt/tsv'
+require 'digest/md5'
+require 'rbbt/sources/organism'
+module OncodriveFM
+  Rbbt.claim Rbbt.software.opt.OncodriveFM, :install, Rbbt.share.install.software.OncodriveFM.find
+  def self.process_cohort(cohort)
+    all_mutated_isoforms = cohort.metagenotype.mutated_isoforms.compact.flatten.uniq
+    nonsense = all_mutated_isoforms.select{|mi| mi.consequence == "MISS-SENSE"}
+    mutation_assessor = MutEval.job(:mutation_assessor, "OncodriveFM", :mutations => all_mutated_isoforms.subset(nonsense)).run
+    sift              = MutEval.job(:sift, "OncodriveFM", :mutations => all_mutated_isoforms.subset(nonsense)).run
+    polyphen          = MutEval.job(:polyphen, "OncodriveFM", :mutations => all_mutated_isoforms.subset(nonsense)).run
+    mutation_assessor_max = mutation_assessor.slice("Mutation Assessor Score").values.flatten.collect{|v| (v.nil? or v.empty?) ? nil : v.to_f}.compact.max
+    sift_max              = sift.slice("SIFT Score").values.flatten.collect{|v| (v.nil? or v.empty?) ? nil : v.to_f}.compact.max
+    polyphen_max          = polyphen.slice("Polyphen Score").values.flatten.collect{|v| (v.nil? or v.empty?) ? nil : v.to_f}.compact.max
+    mutation_file = []
+    cohort.each do |genotype|
+      sample = genotype.jobname
+      genotype.each do |mutation|
+        genes = mutation.genes
+        next if genes.empty?
+        mut_mis = mutation.mutated_isoforms
+        next if mut_mis.nil? or mut_mis.empty?
+        genes.each do |gene|
+          mis = mut_mis.select{|mi| mi.protein and mi.protein.gene == gene}
+          mutation_assessor.values_at(*mis)
+          ma_score       = mutation_assessor.values_at(*mis).compact.collect{|v| v["Mutation Assessor Score"]}.first
+          sift_score     = sift.values_at(*mis).compact.collect{|v| v["SIFT Score"]}.first
+          polyphen_score = polyphen.values_at(*mis).compact.collect{|v| v["Polyphen Score"]}.first
+          ma_score       = mutation_assessor_max if mis.select{|mi| mi.truncated}.any?
+          sift_score     = sift_max            if mis.select{|mi| mi.truncated}.any?
+          polyphen_score = polyphen_max    if mis.select{|mi| mi.truncated}.any?
+          mutation_file << [gene, sift_score || "NA", polyphen_score || "NA", ma_score || "NA", sample] * "\t"
+        end
+      end
+    end
+    TmpFile.with_file(mutation_file * "\n") do |fmuts|
+      TmpFile.with_file do |outdir|
+        FileUtils.mkdir_p outdir unless File.exists? outdir
+        name = "Tumor"
+        TmpFile.with_file(config(fmuts, outdir, "[TUMOR]" => name)) do |fconf|
+          puts Open.read(fconf)
+          CMD.cmd("cd #{Rbbt.software.opt.OncodriveFM.bin.find}; ./pipeline_launcher.pl '#{fconf}'").read
+        end
+        outfile = File.join(outdir, name + '.fimp')
+        text = Open.read(outfile).gsub(/WARNING.*?\n/m,'').gsub(/\t-\t/,"\t\t").gsub(/\t-$/,"\t")
+        tsv = TSV.open(StringIO.new(text), :type => :list)
+        tsv.key_field = "Ensembl Gene ID"
+        tsv.fields = ["Associated Gene Name", "Sample count", "p-value", "unknown"]
+        tsv
+      end
+    end
+  end
+  CONFIG_TEMPLATE=<<-EOF
+###########################################################################################
+# Input data specific for the tumor under analysis
+#tumor: This name will be used as prefix to name all intermediate and final pipeline files
+tumor='[TUMOR]'
+#mutfile: File that contains the mutations data of the tumor you want to analyze. Each row corresponds to the mutation of one gene in one sample. Its format should be:
+#
+####Ensembl_Gene_ID MA_Zscore CHASM_Zscore  Sample_ID
+mutfile='[MUTFILE]'
+####numFIS: number of functional scores included in the mutations file and used to compute the functional impact bias
+numFIS='[NUMFIS]'
+###########################################################################################
+###########################################################################################
+# Common input data (change these only if you have downloaded different info files)
+#genes2gos: File that contains the genes2gos mapping
+genes2gos='[DATA_DIR]/common/slimgos_distrib/genes2gos'
+#gosdistribs: Directory with the files that contain the distributions of SIFT, PPH2 and MA scores for each slimGOA obtained from 1000genomes.
+gosdistribs='[DATA_DIR]/common/slimgos_distrib/'
+#genes2symbols: File that contains the genes2symbols mapping obtained from BioMart. Its format should be:
+#
+####Ensembl_Gene_ID Gene_Symbol
+genes2symbols='[DATA_DIR]/common/genes2symbols.txt'
+extrec='NONE'
+#genes2probes: File that contains the genes2probes mapping obtained from BioMart. Its format should be:
+#
+####Ensembl_Gene_ID Probe_ID
+cp='[DATA_DIR]/common/cp.format'
+#genesattr: File that contains genes' longest CDS' lengths obtained from BioMart and genes' basal nsSNVs rates computed from 1000genomes. This are used to assess the statistical significance of genes' mutations recurrence and genes' overmutation rates. Its format should be:
+#
+####Ensembl_Gene_ID Longest_CDS_length  Basal_nsSNVs_rate
+genesattr='[DATA_DIR]/common/ensgenes_cds.recurrence'
+#outdir: Directory to write output files
+outdir='[OUTDIR]'
+#tmpdir: Directory to write intermediate files
+tmpdir='[TMPDIR]'
+#internal: whether the null distribution will be taken from variants observed in the tumor
+internal='[INTERNAL]'
+###########################################################################################
+  EOF
+  def self.config(mutfile, outdir, options = {})
+    options = Misc.add_defaults options,
+      "[TUMOR]" => "Tumor",
+      "[MUTFILE]" => mutfile,
+      "[NUMFIS]" => 3,
+      "[DATA_DIR]" => Rbbt.software.opt.OncodriveFM.data.find,
+      "[OUTDIR]" => outdir,
+      "[TMPDIR]" => Rbbt.tmp.OncodriveFM.find,
+      "[INTERNAL]" => 1
+    FileUtils.mkdir_p options["[TMPDIR]"] unless File.exists? options["[TMPDIR]"]
+    txt = CONFIG_TEMPLATE.dup
+    options.each do |key,value|
+      txt.gsub!(key, value.to_s)
+    end
+    txt
+  end
+end

data/lib/rbbt/mutation/polyphen.rb CHANGED

@@ -31,52 +31,81 @@ module Polyphen2
       "_ggi_target_manage" => "Refresh",
     }
-  def self.predict(query)
-    options = OPTIONS.merge "_ggi_batch" => query
+    def self.predict(query)
+      options = OPTIONS.merge "_ggi_batch" => query
-    desc =  Digest::MD5.hexdigest(options.inspect)
-    options["description"] = desc
+      desc =  Digest::MD5.hexdigest(options.inspect)
+      options["description"] = desc
-    doc = Nokogiri::HTML(Open.read(Polyphen2::URL, :wget_options => {"--post-data" => "'#{options.collect{|k,v| [k,v] * "="} * "&"}'"}, :nocache => true))
+      doc = Nokogiri::HTML(Open.read(Polyphen2::URL, :wget_options => {"--post-data" => "'#{options.collect{|k,v| [k,v] * "="} * "&"}'"}, :nocache => true))
-    sid = doc.css('input[name=sid]').attr('value')
+      sid = doc.css('input[name=sid]').attr('value')
-    options = REFRESH_OPTIONS.merge "sid" => sid
-    finished = false
+      options = REFRESH_OPTIONS.merge "sid" => sid
+      finished = false
-    view_link = nil
-    while not finished do
-      doc = Nokogiri::HTML(Open.read(Polyphen2::URL, :wget_options => {"--post-data" => "'#{options.collect{|k,v| [k,v] * "="} * "&"}'"}, :nocache => true))
+      view_link = nil
+      while not finished do
+        doc = Nokogiri::HTML(Open.read(Polyphen2::URL, :wget_options => {"--post-data" => "'#{options.collect{|k,v| [k,v] * "="} * "&"}'"}, :nocache => true))
-      result_table =  doc.css('body > table')[1].css('table')[2]
+        result_table =  doc.css('body > table')[1].css('table')[2]
-      rows = result_table.css('tr')
+        rows = result_table.css('tr')
-      row = rows.select{|row| row.css('td').length == 6}.select{|row| row.css('td').last.content.strip == desc}.first
+        row = rows.select{|row| row.css('td').length == 6}.select{|row| row.css('td').last.content.strip == desc}.first
-      cells = row.css('td')
-      if cells[2].content =~ /Error/
-        view_link = nil
-        break
-      end
+        cells = row.css('td')
+        if cells[2].content =~ /Error/
+          view_link = nil
+          break
+        end
-      if cells[1].content =~ /Short/
-        view_link =  cells[1].css('a').attr('href')
-        break
+        if cells[1].content =~ /Short/
+          view_link =  cells[1].css('a').attr('href')
+          break
+        end
+        sleep 5
       end
-      sleep 3
-    end
+      return nil if view_link.nil?
-    return nil if view_link.nil?
+      tsv = TSV.open Open.open(Polyphen2::URL_BASE + view_link, :nocache => true), :double, :merge => true, :fix => Proc.new{|l| l.gsub(/ *\t */, "\t")}
+      tsv.fields = tsv.fields.collect{|f| f.strip}
+      tsv.key_field = tsv.key_field.strip
-    tsv = TSV.open Open.open(Polyphen2::URL_BASE + view_link, :nocache => true), :double, :merge => true, :fix => Proc.new{|l| l.gsub(/ *\t */, "\t")}
-    tsv.fields = tsv.fields.collect{|f| f.strip}
-    tsv.key_field = tsv.key_field.strip
+      new_tsv = TSV.setup({}, :key_field => "Protein Mutation", :fields => tsv.fields)
-    return tsv
-  end
+      tsv.through do |acc, values|
+        values.zip_fields.each do |v|
+          pos, wt, mt = v.values_at "o_pos", "o_aa1", "o_aa2"
+          key = [acc, [wt,pos,mt] * "" ] * ":"
+          new_tsv[key] = v
+        end
+      end
+      return new_tsv
+    end
+    def self.chunked_predict(query, max = 1000)
+      mutations = query.split("\n")
+      chunks = mutations.length.to_f / max
+      chunks = chunks.ceil
+      num = 0
+      Log.debug("Polyphen2 ran with #{chunks} chunks of #{ max } mutations") if chunks > 1
+      Misc.divide(mutations, chunks).inject(nil) do |acc, list|
+        num += 1
+        Log.debug("Polyphen2 ran with #{chunks} chunks: chunk #{num}") if chunks > 1
+        list = list * "\n"
+        if acc.nil?
+          acc = predict(list)
+        else
+          acc = TSV.setup(acc.merge(predict(list)))
+        end
+        acc
+      end
+    end
   end

data/lib/rbbt/mutation/sift.rb CHANGED

@@ -9,9 +9,14 @@ module SIFT
     data_str = mutations.collect{|mut| mut.sub(':', ',')}.uniq * "\n"
     doc = Nokogiri::HTML(Open.read(URL_ENSP, :wget_options => {"--post-data=" => "'ENSP=#{data_str}'"}))
+    if doc.to_s.match(/Your computer has exceeded its daily limit/)
+      Open.clean_cache(URL_ENSP, :wget_options => {"--post-data=" => "'ENSP=#{data_str}'"})
+      raise "Daily limit reached"
+    end
     rows = []
     doc.css('tr').each do |row|
-      rows << row.css('td').collect{|cell| cell.content.strip.sub "\302\240\302\240&nbsp", ""}
+      rows << row.css('td').collect{|cell| content = cell.content.strip; content.sub(/\s*&nbsp.*/, "").sub(/[^\w,]*$/,'')}
     end
     rows.shift
@@ -24,12 +29,19 @@ module SIFT
     end
   end
-  def self.chunked_predict(mutations)
-    chunks = mutations.length.to_f / 100
+  def self.chunked_predict(mutations, max = 500)
+    chunks = mutations.length.to_f / max
     chunks = chunks.ceil
+    Log.debug("SIFT ran with #{chunks} chunks of #{ max } mutations") if chunks > 1
     tsv = TSV.setup({}, :type => :list, :key_field => "Mutated Isoform", :fields =>["Ensembl Protein ID", "Amino Acid Position", "Wildtype Amino Acid", "Mutant Amino Acid", "Prediction", "Score 1", "Score 2", "Score 3"])
+    num = 1
     Misc.divide(mutations.uniq.sort, chunks).inject(tsv) do |acc, list|
-        acc = TSV.setup(acc.merge(predict(list)))
+      Log.debug("SIFT ran with #{chunks} chunks: chunk #{num}") if chunks > 1
+      acc = TSV.setup(acc.merge(predict(list)))
+      num + 1
+      acc
     end
   end

data/lib/rbbt/mutation/snps_and_go.rb CHANGED

@@ -14,7 +14,7 @@ module SNPSandGO
     res = Open.read(url)
-    raise "Error in prediction" unless res =~ /RESULTS/
+    raise "Error in prediction: #{$1}" if res =~ /ERROR: (.*)/
     res.match(/Position\s+WT\s+NEW\s+Effect\s+RI\n\s+\d+\s+[A-Z]\s+[A-Z]\s+(\w+)\s+(\d+)/).values_at 1,2
   end

data/lib/rbbt/mutation/transFIC.rb ADDED

@@ -0,0 +1,97 @@
+require 'rbbt-util'
+require 'rbbt/util/open'
+require 'rbbt/tsv'
+require 'nokogiri'
+require 'digest/md5'
+require 'rest_client'
+require 'rbbt/sources/organism'
+module TransFIC
+  class NotDone < StandardError; end
+  URL="http://bg.upf.edu/transfic/taskService"
+  ASTERISK = "*"[0]
+  # mutations is a hash of genes in Uniprot protein accession pointing to lists
+  # of aminoacid substitutions
+  def self.predict(mutations)
+    options = {}
+    ensp2uni = Organism.identifiers("Hsa").index :target => "UniProt/SwissProt ID", :fields => "Ensembl Protein ID", :persist => true
+    searchText = mutations.collect{|mutation| protein, change = mutation.split(":"); next if ensp2uni[protein].nil?; [ensp2uni[protein], change] * "\t"}.compact.uniq * "\n"
+    Log.debug "Querying TransFIC for: #{mutations.length} mutations"
+    TmpFile.with_file(searchText) do |file|
+      test_url = CMD.cmd("curl -X PUT -T '#{ file }' '#{ URL }'").read
+      result = nil
+      begin
+        Misc.insist(5) do
+          result = CMD.cmd("curl -X GET '#{ test_url }'").read
+          raise result.split("\n").select{|line| line =~ /Error/}.first if result =~ /Error/
+          while result =~ /executing/
+            sleep 10
+            result = CMD.cmd("curl -X GET '#{ test_url }'").read
+          end
+          raise result.split("\n").select{|line| line =~ /Error/}.first if result =~ /Error/
+        end
+      rescue
+        if $!.message =~ /validating/
+          Log.debug(Open.read(file))
+        end
+        raise $!
+      end
+      Log.medium("TransFIC DONE")
+      tsv = TSV.setup({}, :key_field => "Protein Mutation", :fields => %w(siftTransfic siftTransficLabel pph2Transfic pph2TransficLabel maTransfic maTransficLabel), :type => :list)
+      result.split("\n").each do |line|
+        next if line[0] == "#"[0]
+        id, hgnc, hgncdesc, transcript, ensp, sw, protein_position, amino_acids, sift, polyphen, mass,
+          siftTransfic, siftTransficLabel, pph2Transfic, pph2TransficLabel, maTransfic, maTransficLabel = line.split("\t")
+        change = [amino_acids.split("/").first, protein_position, amino_acids.split("/").last] * ""
+        mutation = [ensp,change] * ":"
+        tsv[mutation] = [siftTransfic, siftTransficLabel, pph2Transfic, pph2TransficLabel, maTransfic, maTransficLabel]
+      end
+      tsv.select(mutations)
+    end
+  end
+  def self.chunked_predict(mutations, max = 1000)
+    chunks = mutations.length.to_f / max
+    chunks = chunks.ceil
+    Log.debug("TransFIC ran with #{chunks} chunks of #{ max } mutations") if chunks > 1
+    num = 1
+    Misc.divide(mutations, chunks).inject(nil) do |acc, list|
+      Log.debug("TransFIC ran with #{chunks} chunks: chunk #{num}") if chunks > 1
+      begin
+        result = predict(list)
+      rescue
+        if list.length > 2
+          Log.debug("Error predicting in transFIC. Divinding list of size #{list.length}")
+          result = chunked_predict(list, list.length / 2)
+        else
+          Log.debug("Error predicting in transFIC. Single error detected")
+          next
+        end
+      end
+      if acc.nil?
+        acc = result
+      else
+        acc = TSV.setup(acc.merge(result))
+      end
+      num += 1
+      acc
+    end
+  end
+end

data/lib/rbbt/sources/kegg.rb CHANGED

@@ -10,30 +10,35 @@ module KEGG
   KEGG.claim KEGG.root.find, :rake, Rbbt.share.install.KEGG.Rakefile.find(:lib)
   def self.names
-    @@names ||= KEGG.pathways.tsv :fields => ["Pathway Name"], :persist => true, :type => :single
+    @@names ||= KEGG.pathways.tsv :fields => ["Pathway Name"], :persist => true, :type => :single, :unnamed => true
   end
   def self.descriptions
-    @@descriptions ||= KEGG.pathways.tsv(:fields => ["Pathway Description"], :persist => true, :type => :single).tap{|o| o.unnamed = true}
+    @@descriptions ||= KEGG.pathways.tsv(:fields => ["Pathway Description"], :persist => true, :type => :single, :unnamed => true)
   end
   def self.index2genes
-    @@index2genes ||= KEGG.gene_pathway.tsv(:key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :persist => true, :type => :flat, :merge => true).tap{|o| o.unnamed = true}
+    @@index2genes ||= KEGG.gene_pathway.tsv(:key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :persist => true, :type => :flat, :merge => true)
   end
   def self.index2ens
-    @@index2ens ||= KEGG.identifiers.index(:persist => true).tap{|o| o.unnamed = true}
+    @@index2ens ||= KEGG.identifiers.index(:persist => true)
   end
   def self.index2kegg
-    @@index2kegg ||= KEGG.identifiers.index(:target => "KEGG Gene ID", :persist => true).tap{|o| o.unnamed = true}
+    @@index2kegg ||= KEGG.identifiers.index(:target => "KEGG Gene ID", :persist => true)
   end
   def self.id2name(id)
     names[id]
   end
+  def self.name2id(name)
+    names.select{|id,n| n.downcase.index(name.downcase) == 0}.collect{|id,n| id} rescue []
+  end
   def self.description(id)
     descriptions[id]
   end
@@ -60,6 +65,7 @@ if defined? Entity
       name = KEGG.id2name(self)
       name.sub(/ - Homo.*/,'') unless name.nil?
     end
+    persist :name
     property :description => :single2array do
       KEGG.description(self)
@@ -67,9 +73,10 @@ if defined? Entity
     property :genes => :array2single do |*args|
       organism = args.first || self.organism
-      @genes ||= KEGG.index2genes.values_at(*self).
-        each{|pth| pth.organism = organism if pth.respond_to? :organism }
+      KEGG.index2genes.values_at(*self).
+        each{|gene| gene.organism = organism if gene.respond_to? :organism }
     end
+    persist :genes
   end
   if defined? Gene and Entity === Gene
@@ -85,50 +92,42 @@ if defined? Entity
         end
       end
-      def _from_kegg
-        return self.clean_annotations unless format == "KEGG Gene ID"
+      def from_kegg
+        return self unless format == "KEGG Gene ID"
         if Array === self
-          KEGG.index2ens.values_at(*self)
+          Gene.setup KEGG.index2ens.values_at(*self), "Ensembl Gene ID", organism
         else
-          KEGG.index2ens[self]
+          Gene.setup KEGG.index2ens[self], "Ensembl Gene ID", organism
         end
       end
-      def from_kegg
-        return self unless format == "KEGG Gene ID"
-        Gene.setup(_from_kegg, "Ensembl Gene ID", organism)
+      def self.gene_kegg_pathway_index
+        @@gene_kegg_pathway_index ||=
+          KEGG.gene_pathway.tsv(:persist => true, :key_field => "KEGG Gene ID", :fields => ["KEGG Pathway ID"], :type => :flat, :merge => true)
       end
-      property :_to => :array2single do |new_format|
-        return self if format == new_format
-        list = self._from_kegg
-        tsv = Translation.job(:tsv_translate, "", :organism => organism, :genes => list, :format => new_format).exec.tap{|o| o.unnamed = true}
-        tsv.values_at(*list)
-      end
-      property :to! => :array2single do |new_format|
-        return self if format == new_format
-        new = _to(new_format)
-        new.each_with_index do |n,i|
-          c = self.annotated_array_clean_get_brackets(i)
-          if c.nil? or n.nil?
-            self[i] = nil
-          else
-            c.replace n
-          end
+      property :to => :array2single do |new_format|
+        case
+        when format == new_format
+          self
+        when format == "KEGG Gene ID"
+          ensembl = from_kegg.clean_annotations
+          Gene.setup(Translation.job(:tsv_translate, "", :organism => organism, :genes => ensembl, :format => new_format).exec.values_at(*ensembl), new_format, organism)
+        when new_format == "KEGG Gene ID"
+          to_kegg
+        else
+          Gene.setup(Translation.job(:tsv_translate, "", :organism => organism, :genes => self, :format => new_format).exec.values_at(*self), new_format, organism)
         end
       end
+      persist :to
-      property :to => :array2single do |new_format|
-        return self if format == new_format
-        Gene.setup(_to(new_format), new_format, organism)
-      end
+      #property :to => :array2single do |new_format|
+      #  return self if format == new_format
+      #  to!(new_format).collect!{|v| Array === v ? v.first : v}
+      #end
       property :kegg_pathways => :array2single do
-        @kegg_pathways ||= KEGG.gene_pathway.tsv(:persist => true, :key_field => "KEGG Gene ID", :fields => ["KEGG Pathway ID"], :type => :flat, :merge => true).values_at(*self.to_kegg).
+        @kegg_pathways ||= Gene.gene_kegg_pathway_index.values_at(*self.to_kegg).
           each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| KeggPathway.setup(o, organism)}
       end
     end

data/lib/rbbt/sources/pharmagkb.rb CHANGED

@@ -5,5 +5,5 @@ module PharmaGKB
   self.pkgdir = "phgx"
   self.subdir = "share/pharmagkb"
-  PharmaGKB.claim PharmaGKB.root.find, :rake, Rbbt.share.install.PharmaGKB.Rakefile.find(:lib)
+  PharmaGKB.claim PharmaGKB.root, :rake, Rbbt.share.install.PharmaGKB.Rakefile.find(:lib)
 end

data/lib/rbbt/sources/pina.rb CHANGED

@@ -7,3 +7,29 @@ module Pina
   Pina.claim Pina.root.find, :rake, Rbbt.share.install.Pina.Rakefile.find(:lib)
 end
+if defined? Entity and defined? Gene and Entity === Gene
+  require 'rbbt/entity/gene'
+  require 'rbbt/entity/interactor'
+  require 'rbbt/sources/PSI_MI'
+  module Gene
+    property :pina_interactors => :array2single do
+      ens2uniprot = Organism.identifiers(organism).tsv :key_field => "Ensembl Gene ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :persist => true, :unnamed => true
+      pina        = Pina.protein_protein.tsv(:persist => true, :fields => ["Interactor UniProt/SwissProt Accession", "Method", "PMID"], :type => :double, :merge => true, :unnamed => true)
+      int = self.ensembl.collect do |ens|
+        uniprot = ens2uniprot[ens]
+        list = pina.values_at(*uniprot).compact.collect do |v|
+          Misc.zip_fields(v).collect do |o, method, articles|
+            Interactor.setup(o, PSI_MITerm.setup(method.split(";;")), PMID.setup(articles.split(";;")))
+          end
+        end.flatten.uniq
+        Gene.setup(list, "UniProt/SwissProt Accession", organism).extend(AnnotatedArray)
+      end
+      Gene.setup(int, "UniProt/SwissProt Accession", organism).extend(AnnotatedArray)
+    end
+  end
+end

data/lib/rbbt/sources/string.rb CHANGED

@@ -7,3 +7,22 @@ module STRING
   STRING.claim STRING.root.find, :rake, Rbbt.share.install.STRING.Rakefile.find(:lib)
 end
+if defined? Entity and defined? Gene and Entity === Gene
+  module Gene
+    property :string_interactors => :array2single do |*args|
+      threshold = args.first || 800
+      string = STRING.protein_protein.tsv(:unnamed => true, :persist => true, :type => :double)
+      all = self.ensembl.collect do |gene|
+        interactors = gene.proteins.collect{|protein| Misc.zip_fields((string[protein] || [[],[]])).select{|i, score| score.to_i > threshold}.collect{|ints,s| ints}}.compact.flatten.uniq
+        Protein.setup(interactors, "Ensembl Protein ID", organism).transcript.gene.compact.uniq
+      end
+      all.compact.first.annotate all if Annotated === all.compact.first
+      all
+    end
+    #persist :_ary_string_interactors
+  end
+end

data/share/install/Pina/Rakefile CHANGED

@@ -1,8 +1,8 @@
 require File.join(File.dirname(__FILE__),'../lib/rake_helper')
-define_source_tasks  "Homo sapiens-20110225.txt" => "http://csbi.ltdk.helsinki.fi/pina/download/Homo%20sapiens-20110225.txt"
+define_source_tasks  "Homo sapiens-20110628.txt" => "http://cbg.garvan.unsw.edu.au/pina/download/Homo%20sapiens-20110628.txt"
-process_tsv :protein_protein, 'Homo sapiens-20110225.txt',
+process_tsv :protein_protein, 'Homo sapiens-20110628.txt',
   :key         => 0,
   :fix         => lambda{|l| l.gsub("uniprotkb:", '').gsub("(gene name)",'').gsub("pubmed:",'').gsub("|", ';;').gsub(/\([^)]+\)/,'')},
   :fields      => [1,6,8],

data/share/install/STRING/Rakefile CHANGED

@@ -1,6 +1,6 @@
 require File.join(File.dirname(__FILE__),'../lib/rake_helper')
-define_source_tasks "protein_protein" => "http://string-db.org:8080/newstring_download/protein.links.v8.3.txt.gz"
+define_source_tasks "protein_protein" => "http://string-db.org/newstring_download/protein.links.v9.0.txt.gz"
 process_tsv :protein_protein, 'protein_protein', :grep => '9606\.ENSP', :fix => lambda{|l| l.gsub(/9606\./,'')}, :merge => true, :sep => "\s" do
   headers ['Ensembl Protein ID', 'Interactor Ensembl Protein ID', 'Score']

data/share/install/software/OncodriveFM ADDED

@@ -0,0 +1,13 @@
+#!/bin/bash
+INSTALL_HELPER_FILE="$1"
+RBBT_SOFTWARE_DIR="$2"
+source "$INSTALL_HELPER_FILE"
+name='OncodriveFM'
+url="http://bg.upf.edu/group/projects/oncodrivefm-1.1.0.tar.gz"
+install_src "$name" "$url"

data/test/rbbt/mutation/test_mutation_assessor.rb CHANGED

@@ -5,7 +5,7 @@ class TestMutationAssessor < Test::Unit::TestCase
   def test_predict_aminoacid_mutation
     mutations = {
-      "EGFR_HUMAN" => %w(R521K)
+      "EGFR_HUMAN" => %w(R521E)
     }
     assert_equal 1, MutationAssessor.predict(mutations).length
@@ -27,9 +27,5 @@ class TestMutationAssessor < Test::Unit::TestCase
     assert(MutationAssessor.chunked_predict(mutations).include? "EGFR_HUMAN R521K")
   end
 end

data/test/rbbt/mutation/test_oncodriveFM.rb ADDED

@@ -0,0 +1,13 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/mutation/oncodriveFM'
+class TestOncodriveFM < Test::Unit::TestCase
+  def test_CLL
+    require 'rbbt/workflow'
+    Workflow.require_workflow "StudyExplorer"
+    s = Study.setup("CLL")
+    puts OncodriveFM.process_cohort(s.cohort).select("p-value"){|v| not v.empty? and v.to_f < 0.05}
+  end
+end

data/test/rbbt/mutation/test_polyphen.rb CHANGED

@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
 require 'rbbt/mutation/polyphen'
 class TestPolyphen2 < Test::Unit::TestCase
-  def test_predict_disease
+  def _test_predict_disease
     accession = "A6NFZ4"
     mutation =  "Y34D"
@@ -11,10 +11,12 @@ class TestPolyphen2 < Test::Unit::TestCase
   def test_batch
     query =<<-EOF
-A6NFZ4 Y34D
+A6NFZ4 34 Y D
     EOF
-    assert_equal "probably damaging", Polyphen2::Batch.predict(query)["A6NFZ4_Y34D"]["prediction"].first
+    ddd Polyphen2::Batch.predict(query)["A6NFZ4:Y34D"]
+    assert_equal "probably damaging", Polyphen2::Batch.predict(query)["A6NFZ4:Y34D"]["prediction"]
+    assert_equal "probably damaging", Polyphen2::Batch.chunked_predict(query)["A6NFZ4:Y34D"]["prediction"]
   end
 end

data/test/rbbt/mutation/test_transFIC.rb ADDED

@@ -0,0 +1,14 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+require 'rbbt/mutation/transFIC'
+class TestTransFIC < Test::Unit::TestCase
+  def test_predict_aminoacid_mutation
+    mutations = [
+      "ENSP00000275493:R521K"
+    ]
+    puts TransFIC.predict(mutations)
+  end
+end

data/test/rbbt/sources/test_kegg.rb ADDED

@@ -0,0 +1,20 @@
+require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+require 'test/unit'
+require 'rbbt/util/tmpfile'
+require 'rbbt/entity/gene'
+require 'rbbt/sources/kegg'
+class TestKEGG < Test::Unit::TestCase
+  def test_kegg_gene
+    organism = "Hsa"
+    gene = Gene.setup "TP53", "Associated Gene Name", organism
+    assert_equal gene.organism, gene.to_kegg.from_kegg.organism
+    assert_equal "KEGG Gene ID", gene.to_kegg.format
+    assert_equal organism, gene.to_kegg.organism
+    assert_equal gene.ensembl, gene.to_kegg.ensembl
+    assert_equal gene.name, gene.to_kegg.ensembl.name
+    assert_equal gene.to_kegg.ensembl.name, gene.to_kegg.name
+  end
+end

data/test/test_helper.rb CHANGED

@@ -3,7 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 $LOAD_PATH.unshift(File.dirname(__FILE__))
 class Test::Unit::TestCase
-  def test_datafile(file)
-    File.join(File.dirname(__FILE__), 'data', file)
-  end
 end

metadata CHANGED

@@ -1,64 +1,55 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: rbbt-phgx
-version: !ruby/object:Gem::Version
-  hash: 23
+version: !ruby/object:Gem::Version
+  version: 2.0.0
   prerelease:
-  segments:
-  - 1
-  - 0
-  - 0
-  version: 1.0.0
 platform: ruby
-authors:
+authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-01-13 00:00:00 +01:00
-default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2012-12-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: rbbt-util
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 63
-        segments:
-        - 4
-        - 0
-        - 0
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
         version: 4.0.0
   type: :runtime
-  version_requirements: *id001
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 4.0.0
 description: Pharmaco-genomics related data sources
 email: miguel.vazquez@fdi.ucm.es
 executables: []
 extensions: []
-extra_rdoc_files:
+extra_rdoc_files:
 - LICENSE
-files:
+files:
 - LICENSE
 - lib/phgx.rb
 - lib/rbbt/mutation/fireDB.rb
 - lib/rbbt/mutation/mutation_assessor.rb
+- lib/rbbt/mutation/oncodriveFM.rb
 - lib/rbbt/mutation/polyphen.rb
 - lib/rbbt/mutation/sift.rb
 - lib/rbbt/mutation/snps_and_go.rb
+- lib/rbbt/mutation/transFIC.rb
 - lib/rbbt/sources/biogrid.rb
 - lib/rbbt/sources/cancer.rb
 - lib/rbbt/sources/dbsnp.rb
-- lib/rbbt/sources/hprd.rb
 - lib/rbbt/sources/kegg.rb
 - lib/rbbt/sources/matador.rb
 - lib/rbbt/sources/pharmagkb.rb
 - lib/rbbt/sources/pina.rb
-- lib/rbbt/sources/reactome.rb
 - lib/rbbt/sources/stitch.rb
 - lib/rbbt/sources/string.rb
 - share/Cancer/anais_annotations
@@ -66,68 +57,63 @@ files:
 - share/Cancer/cancer_genes.tsv
 - share/install/Biogrid/Rakefile
 - share/install/DBSNP/Rakefile
-- share/install/HPRD/Rakefile
 - share/install/KEGG/Rakefile
 - share/install/Matador/Rakefile
 - share/install/NCI/Rakefile
 - share/install/PharmaGKB/Rakefile
 - share/install/Pina/Rakefile
-- share/install/Reactome/Rakefile
 - share/install/STITCH/Rakefile
 - share/install/STRING/Rakefile
 - share/install/lib/rake_helper.rb
+- share/install/software/OncodriveFM
 - test/rbbt/sources/test_matador.rb
 - test/rbbt/sources/test_pharmagkb.rb
 - test/rbbt/sources/test_stitch.rb
 - test/rbbt/sources/test_cancer.rb
+- test/rbbt/sources/test_kegg.rb
 - test/rbbt/mutation/test_snps_and_go.rb
 - test/rbbt/mutation/test_fireDB.rb
 - test/rbbt/mutation/test_sift.rb
 - test/rbbt/mutation/test_polyphen.rb
 - test/rbbt/mutation/test_mutation_assessor.rb
+- test/rbbt/mutation/test_oncodriveFM.rb
+- test/rbbt/mutation/test_transFIC.rb
 - test/test_helper.rb
-has_rdoc: true
 homepage: http://github.com/mikisvaz/rbbt-phgx
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.6.2
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Pharmaco-genomics for the Ruby Bioinformatics Toolkit (rbbt)
-test_files:
+test_files:
 - test/rbbt/sources/test_matador.rb
 - test/rbbt/sources/test_pharmagkb.rb
 - test/rbbt/sources/test_stitch.rb
 - test/rbbt/sources/test_cancer.rb
+- test/rbbt/sources/test_kegg.rb
 - test/rbbt/mutation/test_snps_and_go.rb
 - test/rbbt/mutation/test_fireDB.rb
 - test/rbbt/mutation/test_sift.rb
 - test/rbbt/mutation/test_polyphen.rb
 - test/rbbt/mutation/test_mutation_assessor.rb
+- test/rbbt/mutation/test_oncodriveFM.rb
+- test/rbbt/mutation/test_transFIC.rb
 - test/test_helper.rb

data/lib/rbbt/sources/hprd.rb DELETED

@@ -1,6 +0,0 @@
-require 'phgx'
-module HPRD
-  extend Resource
-  data_module PhGx
-end

data/lib/rbbt/sources/reactome.rb DELETED

@@ -1,6 +0,0 @@
-require 'phgx'
-module Reactome
-  extend Resource
-  data_module PhGx
-end

data/share/install/HPRD/Rakefile DELETED

@@ -1,15 +0,0 @@
-require File.join(File.dirname(__FILE__),'../lib/rake_helper')
-file :protein_protein do |t|
-  begin
-    tsv = PhGx.share.hprd["BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt"].tsv :merge => true
-  rescue
-    raise "File BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt not found in 'share/hprd', download manually from http://www.hprd.org/"
-  end
-  tsv.key_field = "Associated Gene Name 1"
-  tsv.fields = ["HPRD id 1","RefSeq Protein ID 1","Associated Gene Name 2","HPRD id 2","RefSeq Protein ID 2", "Experiment type", "PMID"]
-  tsv.namespace = "Hsa"
-  Open.write(t.name, tsv.to_s)
-end

data/share/install/Reactome/Rakefile DELETED

@@ -1,36 +0,0 @@
-require File.join(File.dirname(__FILE__),'../lib/rake_helper')
-define_source_tasks  "human_ppi" => "http://www.genomeknowledge.org/download/current/homo_sapiens.interactions.txt.gz",
-  "protein_pathway" => "http://www.genomeknowledge.org/download/current/uniprot_2_pathways.stid.txt",
-  "pathway_genesets" => "http://www.genomeknowledge.org/download/current/ReactomePathways.gmt.zip"
-process_tsv :protein_protein, 'human_ppi',
-  :key         => 0,
-  :fix         => lambda{|l| l.gsub(/\t[a-z ]+:/i,"\t").gsub(/^[a-z ]+:/i,'')},
-  :fields      => [3,6,7,8],
-  :header_hash => "#",
-  :merge       => true,
-  :keep_empty  => true do
-  headers ['UniProt/SwissProt Accession', 'Interactor UniProt/SwissProt Accession', 'Interaction Type', 'Reactions Involved', 'Interaction PMIDS']
-end
-process_tsv :protein_pathway, 'protein_pathway',
-  :key         => 0,
-  :fix         => lambda{|l| l.gsub(/\t[a-z ]+:/i,"\t").gsub(/^[a-z ]+:/i,'')},
-  :fields      => [1,2],
-  :header_hash => "#",
-  :merge       => true,
-  :keep_empty  => true do
-  headers ['UniProt/SwissProt Accession', 'Pathway ID', 'Pathway Description']
-end
-process_tsv :pathway_genesets, 'pathway_genesets',
-  :key         => 0,
-  :fix         => lambda{|l| parts = l.split("\t"); [parts[0], parts[2..-1] * "|"] * "\t"},
-  :keep_empty  => true do
-  headers ['Reactome Pathway Name', 'Associated Gene Name']
-end