RubyGems - ms-error_rate - Versions diffs - 0.0.9 → 0.0.10 - Mend

ms-error_rate 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

data/.autotest +14 -0
data/.gitmodules +9 -0
data/History +16 -0
data/LICENSE +2 -0
data/Rakefile +52 -0
data/VERSION +1 -1
data/lib/ms/error_rate/decoy.rb +27 -0
data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
data/lib/ms/error_rate/qvalue.rb +93 -0
data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
data/lib/ms/error_rate/sbv.rb +111 -0
data/lib/ms/error_rate.rb +9 -0
data/lib/ms/ident.rb +125 -0
data/lib/support/sort_by_attributes.rb +51 -0
data/lib/transmembrane/phobius.rb +136 -0
data/lib/transmembrane/toppred.rb +368 -0
data/lib/transmembrane.rb +157 -0
data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
data/script/expert_addition.rb +26 -0
data/script/expert_list.rb +53 -0
data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
data/script/minimal_protein_set.rb +366 -0
data/script/unique_seq_stats.rb +72 -0
metadata +66 -14

data/lib/ms/ident.rb ADDED Viewed

@@ -0,0 +1,125 @@
+require 'ms/fasta'
+require 'ms/in_silico/digester'
module Ms
  # Helpers for building a "peptide centric" database (peptide sequence =>
  # ids of the proteins that contain it) from a protein fasta file.
  module Ident
    # captures the accession following "IPI:" up to the next pipe
    IPI_RE = /IPI:([\w\d\.]+)\|/
    # captures the accession following "gi|" up to the next pipe.  The first
    # pipe must be escaped: unescaped it is a regexp alternation, which
    # matches a bare "gi" and leaves capture group 1 nil.
    GI_RE = /gi\|([\w\d\.]+)\|/
    # the twenty standard amino acids
    STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
    # default options for peptide_centric_db
    DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 1, :min_length => 8, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}

    # Digests every protein in fasta_file and writes a file named
    # "<base>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml" next to
    # it, one "PEPTIDE: id1-id2-..." line per peptide.
    #
    # A temporary digestion file ("<base>.msd_clvg<N>.peptides") holding all
    # peptides for the given missed_cleavages is written first (min_length is
    # not applied to this file but to the final peptide centric db).
    #
    # opts (defaults in DEFAULT_PEPTIDE_CENTRIC_DB):
    #   :missed_cleavages            passed to enzyme.digest
    #   :min_length                  shorter peptides are dropped from the db
    #   :enzyme                      object responding to digest(sequence, missed_cleavages)
    #   :id_regexp                   regexp whose first capture is the protein id;
    #                                when nil it is looked up through Ms::Fasta
    #   :remove_digestion_file       delete the temporary digestion file at the end
    #   :cleave_initiator_methionine also emit N-terminal peptides without their
    #                                leading residue when the protein starts with "M"
    #   :expand_aa                   hash of letter => replacement letters
    #                                (see expand_peptides); nil or empty disables it
    #
    # Raises RuntimeError when no id_regexp can be determined or when a
    # protein header does not match it.  The return value is incidental
    # (whatever File.unlink returns, or nil).
    def self.peptide_centric_db(fasta_file, opts={})
      opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
      missed_cleavages = opts[:missed_cleavages]
      min_length = opts[:min_length]
      enzyme = opts[:enzyme]
      id_regexp = opts[:id_regexp]
      remove_digestion_file = opts[:remove_digestion_file]
      cleave_initiator_methionine = opts[:cleave_initiator_methionine]
      expand_aa = opts[:expand_aa]

      unless id_regexp
        id_regexp = Ms::Fasta.id_regexp(Ms::Fasta.filetype(fasta_file))
        raise RuntimeError, "fasta file type not recognized, supply id_regexp" unless id_regexp
      end

      # an empty expansion table would otherwise build the invalid regexp "[]"
      expand_aa = nil if expand_aa && expand_aa.empty?
      letters_to_expand_re = nil
      if expand_aa
        letters_to_expand_re = Regexp.new("[" + Regexp.escape(expand_aa.keys.join) + "]")
      end

      start_time = Time.now
      print "Digesting #{fasta_file} ..." if $VERBOSE
      base = fasta_file.chomp(File.extname(fasta_file))
      digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
      File.open(digestion_file, "w") do |fh|
        Ms::Fasta.open(fasta_file) do |fasta|
          fasta.each do |prot|
            peptides = enzyme.digest(prot.sequence, missed_cleavages)
            if cleave_initiator_methionine && prot.sequence[0,1] == "M"
              # every peptide sitting at the very beginning of the protein is
              # also emitted without its first residue
              n_terminal = peptides.select {|pep| prot.sequence[0,pep.size] == pep }
              peptides.push(*n_terminal.map {|pep| pep[1..-1] })
            end
            if expand_aa
              peptides = peptides.map do |pep|
                if pep =~ letters_to_expand_re
                  expand_peptides(pep, expand_aa)
                else
                  pep
                end
              end.flatten
            end
            # one line per protein: "<first header token>\t<pep> <pep> ..."
            fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
          end
        end
      end
      puts "#{Time.now - start_time} sec" if $VERBOSE

      start_time = Time.now
      print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
      hash = Hash.new {|h,k| h[k] = [] }
      IO.foreach(digestion_file) do |line|
        # chomp (not chomp!): chomp! returns nil when there is nothing to strip
        (prot, *peps) = line.chomp.split(/\s+/)
        md = prot && prot.match(id_regexp)
        unless md
          raise RuntimeError, "could not extract a protein id from #{prot.inspect} using #{id_regexp.inspect}"
        end
        id = md[1]
        peps.each do |pep|
          hash[pep] << id if pep.size >= min_length
        end
      end
      puts "#{Time.now - start_time} sec" if $VERBOSE

      base = digestion_file.chomp(File.extname(digestion_file))
      final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
      start_time = Time.now
      print "Writing results to #{final_outfile} ..." if $VERBOSE
      File.open(final_outfile, 'w') do |out|
        hash.each do |k,v|
          out.puts( "#{k}: #{v.join('-')}" )
        end
      end
      puts "#{Time.now - start_time} sec" if $VERBOSE

      File.unlink(digestion_file) if remove_digestion_file
    end

    # does combinatorial expansion of all letters requesting it.
    # expand_aa is hash like: {'X'=>STANDARD_AA}
    #
    # Returns an array of peptide strings: every occurrence of a key of
    # expand_aa in peptide is replaced, position by position, with each of
    # that key's replacements ([peptide] itself when nothing needs expanding).
    # assumes replacements are single characters, so positions stay aligned
    def self.expand_peptides(peptide, expand_aa)
      positions = []
      peptide.split('').each_with_index do |char,i|
        positions << [i, char] if expand_aa.key?(char)
      end
      expanded = [peptide]
      positions.each do |i,letter|
        expanded = expanded.map do |current_pep|
          expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
        end.flatten
      end
      expanded
    end
  end
end

data/lib/support/sort_by_attributes.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require 'set'
# Wraps an object so that it compares in the opposite direction of the
# wrapped object (used to get descending order out of sort_by).
class Reverser
  attr_accessor :obj
  def initialize(obj)
    @obj = obj
  end
  # Reversed comparison of the wrapped objects.  Follows the <=> convention
  # of returning nil (rather than raising NoMethodError) when other is not
  # a Reverser.
  def <=>(other)
    return nil unless other.is_a?(Reverser)
    other.obj <=> obj
  end
end
class Object
  # returns self wrapped in a Reverser
  def rev
    Reverser.new(self)
  end
end
module Enumerable
  # Provides sorting on multiple attributes (each directional) where atts is
  # an array of symbols.
  # the default is to sort ascending (small to large).
  # the option :down => Symbol or ArrayOfSymbols
  #   sort_by_attributes(:age,:height,:weight) # -> sorts by age, height, and weight
  #   sort_by_attributes(:age,:height,:weight, :down => :height) # -> same as above, but sorts height from large to small
  #   sort_by_attributes(:age,:height,:weight, :down => [:height,:weight]) # -> same as above, but sorts height and weight from large to small
  #
  # The options hash passed by the caller is only read, never modified.
  def sort_by_attributes(*atts)
    opts = atts.last.is_a?(Hash) ? atts.pop : {}
    # Array() turns nil into [], a lone symbol into [symbol] and leaves an
    # array untouched, so no write-back into the caller's hash is needed
    down = Set.new(Array(opts[:down]))
    sort_by do |obj|
      atts.map do |att|
        value = obj.send(att)
        down.include?(att) ? value.rev : value
      end
    end
  end
end

data/lib/transmembrane/phobius.rb ADDED Viewed

@@ -0,0 +1,136 @@
+require 'transmembrane'
class Phobius ; end
# A Hash of Phobius predictions that fills itself from a results file.
#
# This class will probably change its interface some in the future
# Phobius web portal:
#   http://phobius.sbc.su.se/
# How to run:
#   Select output format as 'Short'
#   then hit 'Submit Query'
# note: to implement some of the TransmembraneIndex features, the update_aaseq
# method must be called!
class Phobius::Index < Hash
  include TransmembraneIndex
  # loads the predictions found in file into this index (delegates to
  # Phobius.default_index)
  def initialize(file)
    Phobius.default_index(file, self)
  end
  # Turns a reference line into the key used by this index: everything
  # before the first space, with double quotes removed.
  # nil/false gives nil; an empty reference gives ''.
  # (we need to match whatever function phobius uses to generate identifiers
  # if we want derivative processes to be fast and accurate)
  def reference_to_key(reference)
    return nil unless reference
    return '' unless reference.size > 0
    first_space = reference.index(' ')
    token = first_space ? reference[0...first_space] : reference
    token.gsub('"','')
  end
end
class Phobius
  include TransmembraneIndex
  # Builds the default index: a :short parser reads file and adds its
  # entries to index (a plain Hash unless one is supplied).  Returns
  # whatever the parser's file_to_index returns.
  def self.default_index(file, index={})
    Phobius::Parser.new(:short).file_to_index(file, index)
  end
end
module Phobius::Parser
  # Factory: returns a new parser object for parser_type.  Only :short is
  # known; anything else raises ArgumentError.
  def self.new(parser_type=:short)
    unless :short == parser_type
      raise ArgumentError, "don't recognize parser type: #{parser_type}"
    end
    Phobius::ParserShort.new
  end
  # Opens file for reading and hands the filehandle to to_index (expected to
  # be supplied by the including class) along with index.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end
end
# Parser for the 'Short' output format of Phobius.
class Phobius::ParserShort
  include Phobius::Parser
  # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
  # and returns an array of hashes with the keys :start and :stop (Integers),
  # one hash per "<i or o><digits>-<digits>" stretch, in order of appearance
  def prediction_to_array(string)
    string.scan(/[io](\d+)-(\d+)/).map do |first, last|
      { :start => first.to_i, :stop => last.to_i }
    end
  end
  # Reads prediction lines from io into index and returns index.
  # Each entry has this form:
  #   identifier => {
  #     :num_certain_transmembrane_segments => Int,
  #     :signal_peptide => true, false or nil,
  #     :transmembrane_segments => [{:start => Int, :stop => Int}, ...]
  #   }
  # (:transmembrane_segments is only present when the count is > 0)
  # can parse io even if there is no header to key in on.
  def to_index(io, index={})
    init_pos = io.pos
    # look for the header within the first 11 lines; if there is none, go
    # back to where we started so that no data line is lost.
    # NOTE(review): 'SEQENCE' presumably mirrors the spelling used in
    # Phobius' own header line -- confirm before "correcting" it.
    found_header = false
    11.times do
      if io.gets =~ /SEQENCE/
        found_header = true
        break
      end
    end
    io.pos = init_pos unless found_header
    io.each do |line|
      fields = line.chomp.split(/\s+/)
      # only lines with exactly four whitespace separated columns are data
      next unless fields.size == 4
      (key, num_tms, signal_peptide, prediction) = fields
      num_tms = num_tms.to_i
      # 'Y' => true, '0' => false, anything else => nil
      has_signal_peptide = { 'Y' => true, '0' => false }[signal_peptide]
      record = {
        :num_certain_transmembrane_segments => num_tms,
        :signal_peptide => has_signal_peptide,
      }
      index[key] = record
      record[:transmembrane_segments] = prediction_to_array(prediction) if num_tms > 0
    end
    index
  end
end

data/lib/transmembrane/toppred.rb ADDED Viewed

@@ -0,0 +1,368 @@
+require 'transmem'
+require 'xml_style_parser'
class TopPred ; end
# A Hash of TopPred predictions that fills itself from a results file.
#
# This class will probably change its interface some in the future
# TopPred web portal:
#   http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
# How to run:
#   uncheck 'Produce hydrophobicity graph image (-g)'
#   choose 'Xml' or 'New: new text' output
#   type in your email, then hit 'Run toppred'
#
# NOTE(review): this file requires 'transmem' and mixes in TransmemIndex,
# while phobius.rb in the same release requires 'transmembrane' and mixes in
# TransmembraneIndex -- confirm that 'transmem'/TransmemIndex still exist.
class TopPred::Index < Hash
  include TransmemIndex
  # Turns a reference line into the key used by this index: everything
  # before the first space, with each character outside 0-9, a-z and A-Z
  # replaced by '_'.  nil/false gives nil.
  # (we need to match whatever function toppred uses to generate identifiers
  # if we want derivative processes to be fast and accurate)
  def reference_to_key(reference)
    return nil unless reference
    first_space = reference.index(' ')
    token = first_space ? reference[0...first_space] : reference
    token.gsub(/[^0-9a-zA-Z]/,'_')
  end
  # loads the predictions found in file into this index.  :default is the
  # only kind supported; any other kind aborts the program.
  def initialize(file, kind=:default)
    abort "can't do #{kind}" unless :default == kind
    TopPred.default_index(file, self)
  end
end
class TopPred
  include TransmemIndex
  # Builds the default index: the parser matching the detected file type
  # (see TopPred::Parser.filetype) reads file and adds its entries to index
  # (a plain Hash unless one is supplied).
  def self.default_index(file, index={})
    parser_type = TopPred::Parser.filetype(file)
    parser = TopPred::Parser.new(parser_type)
    parser.file_to_index(file, index)
  end
end
module TopPred::Parser
  # Looks at the first line of file and returns :xml (xml declaration),
  # :text (mentions 'Algorithm specific') or nil (anything else, including
  # an empty file)
  def self.filetype(file)
    first_line = File.open(file) {|fh| fh.gets }
    if /<\?xml version.*>/ === first_line
      :xml
    elsif /Algorithm specific/ === first_line
      :text
    end
  end
  # Factory: returns a new parser for parser_type (:xml or :text); any
  # other type aborts the program.
  def self.new(parser_type=:xml)
    klass =
      if :xml == parser_type
        TopPred::Parser_XML
      elsif :text == parser_type
        TopPred::Parser_Text
      else
        abort "don't recognize parser type: #{parser_type}"
      end
    klass.new
  end
  # Opens file for reading and hands the filehandle to to_index (expected to
  # be supplied by the including class) along with index.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end
  # Adds to every segment the stretch of aaseq (a String) that it covers and
  # returns segments (modified in place).  Positions are '1' indexed and
  # inclusive.  The kind of segment is decided by looking at the first one:
  #   arrays  [prob, first, last]  -> the sequence is pushed on: [prob, first, last, aaseq]
  #   hashes  (:start, :stop, ...) -> the sequence is stored under :aaseq
  def add_sequences_to_segments(segments, aaseq)
    arrays_given = segments.first.is_a?(Array)
    segments.each do |seg|
      if arrays_given
        first, last = seg[1], seg[2]
        seg.push( aaseq[first - 1, (last - first) + 1] )
      else
        first, last = seg[:start], seg[:stop]
        seg[:aaseq] = aaseq[first - 1, (last - first) + 1]
      end
    end
    segments
  end
end
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser
  # Factory: asks XMLStyleParser.choose_parser for the parser class to use
  # (presumably one of the classes nested under this module -- confirm
  # against XMLStyleParser), instantiates it and records on that instance
  # the method which #parse should invoke.
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    # #parse reads @method from the parser instance, so it has to be stored
    # there; setting it only on this module left it nil for every parser.
    parser.instance_variable_set(:@method, meth)
    # the module-level copy is kept as it was, in case anything inspects it
    @method = meth
    parser
  end
  # invokes the method chosen at construction time (:to_index by default)
  # with file as its only argument
  def parse(file)
    send(@method, file)
  end
end
# DOM style parsing of TopPred xml output.  get_root_node_from_io is called
# here but not defined here; the subclasses visible in this file supply it.
class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser
  # Example of a single index entry (shown as YAML, sequence shortened):
  #
  #   YAL010C:
  #     num_putative_transmembrane_segments: 1
  #     aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNK...
  #     best_structure_probability: 1.0
  #     transmembrane_segments:
  #     - aaseq: SLGAEFWLGLVSLSPGCSTTL
  #       stop: 252
  #       start: 232
  #       probability: 1.0
  #     num_certain_transmembrane_segments: 1
  #     num_found: 2
  #
  # Reads a <toppreds> document from io, adds one entry per <toppred>
  # element to index (keyed by the 'id' of its <sequence>) and returns index.
  # Aborts the program when the document does not have the expected shape.
  def to_index(io, index = {})
    get_root_node_from_io(io) do |root_n|
      abort unless root_n.name == 'toppreds'
      root_n.find('child::toppred').each do |toppred_n|
        record = {}
        sequence_n = toppred_n.find_first('child::sequence')
        index[sequence_n['id']] = record
        record[:aaseq] = sequence_n.content.gsub(/[\s\n]/,'')
        # the declared size must agree with the sequence actually read
        abort unless record[:aaseq].size == sequence_n['size'].to_i
        tmsummary_n = sequence_n.find_first('following-sibling::tmsummary')
        num_found = tmsummary_n['segments'].to_i
        record[:num_found] = num_found
        # nothing more is recorded when no segments were found
        next unless num_found > 0

        certain = 0
        putative = 0
        tmsummary_n.find('child::segment').each do |segment_n|
          abort unless segment_n.name == 'segment'
          # every type other than 'certain' is counted as putative
          if 'certain' == segment_n['type']
            certain += 1
          else
            putative += 1
          end
        end
        record[:num_putative_transmembrane_segments] = putative
        record[:num_certain_transmembrane_segments] = certain

        topologies_n = tmsummary_n.next
        abort unless topologies_n.name == 'topologies'
        # the topology with the highest 'prob' supplies the segments
        best_topology_n = topologies_n.find('child::topology').to_a.max {|a,b| a['prob'].to_f <=> b['prob'].to_f }
        tmsegments = []
        best_topology_n.find('child::tmsegment').each do |tmsegment_n|
          ## WARNING! it appears the probability is broken on xml output!!
          tmsegments << {
            :start => tmsegment_n['start'].to_i,
            :stop => tmsegment_n['stop'].to_i,
            :probability => tmsegment_n['prob'].to_f,
          }
        end
        add_sequences_to_segments(tmsegments, record[:aaseq])
        record[:transmembrane_segments] = tmsegments
      end
    end
    index
  end
end
# Parser for the text output of TopPred.
class TopPred::Parser_Text
  include TopPred::Parser
  # Reads io line by line, fills index and returns it.  Entries look like:
  #   identifier => {
  #     :aaseq => String,
  #     :num_found => Int,
  #     :num_certain_transmembrane_segments => Int,
  #     :num_putative_transmembrane_segments => Int,
  #     :best_structure_probability => Float,
  #     :transmembrane_segments => [
  #       {:probability => Float, :start => Int, :stop => Int, :aaseq => String}, ...
  #     ]
  #   }
  # A "Sequence : <id> (" line starts a new record; a "HEADER START STOP"
  # line adds the best structure to the record started most recently.
  def to_index(io, index={})
    record = nil
    io.each do |line|
      case line
      when /^Sequence : (.*?) +\(/
        record = {}
        index[$1.dup] = record
        record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, record)
      when /^HEADER\s+START\s+STOP/
        best = top_structure( read_structures(io) )
        record[:best_structure_probability] = best[:probability]
        segments = best[:tm]
        record[:transmembrane_segments] = segments
        add_sequences_to_segments(segments, record[:aaseq])
        # converts the [prob, start, stop, aaseq] arrays to hashes in place
        segment_arrays_to_hashes(segments)
      end
    end
    index
  end
  private
  # returns a list of all structures given a filehandle positioned just
  # after the first "HEADER START STOP ..." line.  Keeps reading structures
  # for as long as each one is followed by another such header line.
  def read_structures(fh)
    structures = [read_structure(fh)]
    until fh.eof?
      break unless fh.readline =~ /^HEADER\s+START\s+STOP/
      structures << read_structure(fh)
    end
    structures
  end
  # returns a hash with key :probability (the third whitespace separated
  # field of the next line, as a Float) and key :tm, an array of arrays:
  # [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    probability = fh.readline.split(/\s+/)[2].to_f
    { :probability => probability, :tm => read_segments(fh) }
  end
  # returns an array of transmembrane segments, one per TRANSMEM line:
  # [prob(Float), start(Int), stop(Int)]
  # returns after seeing '//' (or at the end of the file)
  def read_segments(fh)
    segments = []
    fh.each do |line|
      if line =~ /^TRANSMEM/
        # columns: header, start, stop, length, probability
        (_header, start, stop, _len, prob) = line.split(/\s+/)[0,5]
        segments << [prob.to_f, start.to_i, stop.to_i]
      elsif line =~ %r{//}
        break
      end
    end
    segments
  end
  # returns the structure with the highest :probability (first on tie)
  def top_structure(list)
    best = list.first
    best_prob = best[:probability]
    list.each do |candidate|
      next unless candidate[:probability] > best_prob
      best = candidate
      best_prob = candidate[:probability]
    end
    best
  end
  # concatenates the following lines (newlines removed) up to, but not
  # including, the first line without a word character or '*'
  def read_aaseq(fh)
    aaseq = ''
    fh.each do |line|
      text = line.chomp
      break unless text =~ /[\w\*]/
      aaseq << text
    end
    aaseq
  end
  # replaces (in place) every [prob, start, stop, aaseq] array of list with
  # the equivalent hash
  def segment_arrays_to_hashes(list)
    list.map! do |(prob, start, stop, aaseq)|
      { :probability => prob, :start => start, :stop => stop, :aaseq => aaseq }
    end
  end
  # returns [certain, putative]
  # expects first line to be a tm segment; counts lines by their last
  # column until a line with no columns (or the end of the file) is reached
  def num_certain_putative(fh)
    counts = { 'Certain' => 0, 'Putative' => 0 }
    fh.each do |line|
      certainty = line.chomp.split(/\s+/).last
      break unless certainty
      counts[certainty] += 1 if counts.key?(certainty)
    end
    [counts['Certain'], counts['Putative']]
  end
  # reads the summary that follows a sequence: stores :num_found in rec
  # and, unless it is zero, the certain/putative counts from the table that
  # starts at the "Helix Begin ..." line
  def read_segment_summary(fh, rec)
    fh.each do |line|
      case line
      when /Found: (.*?) segments/
        rec[:num_found] = $1.to_i
        break if rec[:num_found] == 0
      when /Helix\s+Begin/
        (cert, putat) = num_certain_putative(fh)
        rec[:num_certain_transmembrane_segments] = cert
        rec[:num_putative_transmembrane_segments] = putat
        break
      end
    end
  end
end
# TopPred xml parsing on top of libxml (XML::Parser)
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # Parses io and calls block with the root node of the document.
  # Returns the value of the block.
  #
  # Parser warnings are switched off while parsing because this doesn't
  # seem to work:
  #   XML::Parser.default_load_external_dtd = false
  # (There is a warning about not finding DTD)
  def get_root_node_from_io(io, &block)
    xml_parser_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      block.call(XML::Parser.io(io).parse.root)
    ensure
      # reset the warning level of XML::Parser even when parsing or the
      # block raises (or aborts), so this global setting never stays off
      XML::Parser.default_warnings = xml_parser_warnings
    end
  end
end
# TopPred xml parsing on top of AXML
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # parses io with ::AXML.parse and calls block with the result (which the
  # superclass's to_index treats as the root node); returns the value of
  # the block
  def get_root_node_from_io(io, &block)
    block.call(::AXML.parse(io))
  end
end