RubyGems - mspire - Versions diffs - 0.1.7 → 0.2.0 - Mend

mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

data/Rakefile +41 -14
data/bin/bioworks2excel.rb +1 -1
data/bin/bioworks_to_pepxml.rb +46 -59
data/bin/fasta_shaker.rb +1 -1
data/bin/filter.rb +6 -0
data/bin/find_aa_freq.rb +23 -0
data/bin/id_precision.rb +3 -2
data/bin/mzxml_to_lmat.rb +2 -1
data/bin/pepproph_filter.rb +1 -1
data/bin/precision.rb +1 -1
data/bin/protein_summary.rb +2 -451
data/bin/raw_to_mzXML.rb +55 -0
data/bin/srf_group.rb +26 -0
data/changelog.txt +7 -0
data/lib/align.rb +3 -3
data/lib/fasta.rb +6 -1
data/lib/gi.rb +9 -4
data/lib/roc.rb +2 -0
data/lib/sample_enzyme.rb +2 -1
data/lib/spec/mzxml/parser.rb +2 -43
data/lib/spec/mzxml.rb +65 -2
data/lib/spec_id/aa_freqs.rb +10 -7
data/lib/spec_id/bioworks.rb +67 -87
data/lib/spec_id/filter.rb +794 -0
data/lib/spec_id/precision.rb +29 -36
data/lib/spec_id/proph.rb +5 -3
data/lib/spec_id/protein_summary.rb +459 -0
data/lib/spec_id/sequest.rb +323 -271
data/lib/spec_id/srf.rb +189 -135
data/lib/spec_id.rb +276 -227
data/lib/spec_id_xml.rb +101 -0
data/lib/toppred.rb +18 -0
data/script/degenerate_peptides.rb +47 -0
data/script/filter-peps.rb +5 -1
data/test/tc_align.rb +1 -1
data/test/tc_bioworks.rb +25 -22
data/test/tc_bioworks_to_pepxml.rb +37 -4
data/test/tc_fasta.rb +3 -1
data/test/tc_fasta_shaker.rb +8 -6
data/test/tc_filter.rb +203 -0
data/test/tc_gi.rb +6 -9
data/test/tc_id_precision.rb +31 -0
data/test/tc_mzxml.rb +8 -6
data/test/tc_peptide_parent_times.rb +2 -1
data/test/tc_precision.rb +1 -1
data/test/tc_proph.rb +5 -5
data/test/tc_protein_summary.rb +36 -13
data/test/tc_sequest.rb +78 -33
data/test/tc_spec_id.rb +128 -6
data/test/tc_srf.rb +84 -38
metadata +67 -62
data/bin/fasta_cat.rb +0 -39
data/bin/fasta_cat_mod.rb +0 -59
data/bin/fasta_mod.rb +0 -57
data/bin/filter_spec_id.rb +0 -365
data/bin/raw2mzXML.rb +0 -21
data/script/gen_database_searching.rb +0 -258

data/lib/spec_id.rb CHANGED

@@ -1,14 +1,15 @@
 require 'ostruct'
-class SampleEnzyme ; end
+require 'set'
+require 'hash_by'
+require 'spec_id/precision'
 require 'roc'
 require 'sample_enzyme'  # for others
 require 'spec_id/bioworks'
 require 'spec_id/sequest'
 require 'spec_id/proph'
-require 'spec_id/precision'
+require 'spec_id_xml'
+class SampleEnzyme ; end
 class Mass
   # http://expasy.org/tools/findmod/findmod_masses.html
@@ -70,123 +71,132 @@ class Mass
   }
 end
-class SpecID
+module SpecID ; end
+class GenericSpecID ; include SpecID ; end
+module SpecID
   MONO = Mass::MONO
   AVG = Mass::AVG
-  attr_accessor :obj
-  attr_writer :peps, :prots
+  attr_accessor :peps, :prots
   # True if a high protein/peptide score is better than low, false otherwise
   # This is set automatically for known file types
   attr_accessor :hi_prob_best
+  # A relative pathname of the file the specid object is derived from
+  attr_accessor :filename
   # tp = file_type
-  def initialize(file=nil, tp=nil)
-    @obj = nil
-    @peps = nil
-    @prots = nil
-    @hi_prob_best = nil
-    if file
+  # Will return a SpecID object (really, the object corresponding to the
+  # file type which mixes in SpecID [is_a?(SpecID) == true])
+  # If no file is given, will return a GenericSpecID object.
+  def self.new(file=nil, tp=nil)
+    if file
       from_file(file, tp)
+    else
+      GenericSpecID.new
     end
   end
   # tp = file_type
-  def from_file(file, tp=nil)
+  # only takes an array if they are srf files!
+  def self.from_file(file, tp=nil)
+    obj = nil
     unless tp
-      tp = self.class.file_type(file)
+      tp = file_type(file)
     end
-    case tp
+    obj = case tp
+    when 'srg'
+      @hi_prob_best = false
+      SRFGroup.new(file)
     when 'bioworks'
-      @obj = SpecID::Bioworks.new(file)
       @hi_prob_best = false
+      Bioworks.new(file)
     when 'protproph'
-      @obj = SpecID::Proph::ProtSummary.new(file)
       @hi_prob_best = true
+      Proph::ProtSummary.new(file)
     else
       abort "UNRECOGNIZED file type for #{file}"
     end
+    obj
   end
   def inspect
     "<#{self.class} #peps=\"#{peps.size}\">"
   end
-  # returns the top peptide hits per file dta (first_scan + charge)
-  # all hits with same score as top score are returned
-  # assumes that all fields are strings...
-  # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
-  # deletes the protein array (but not relevant proteins)
-  # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
-  def top_peps_prefilter!
-    peps.each do |pep|
-      pep.xcorr = pep.xcorr.to_f
-      pep.deltacn = pep.deltacn.to_f
-      pep.deltamass = pep.deltamass.to_f
-      pep.mass = pep.mass.to_f
-      pep.charge = pep.charge.to_f
+  # takes a comma separated list  or array and extends the last to create an
+  # array of desired size
+  def self.extend_args(arg, desired_size)
+    arg_arr = arg
+    if arg.is_a? String
+      arg_arr = arg.split(',')
     end
-    # get the top peptide by firstscan/charge (equivalent to .out files)
-    top_peps = []
-    #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
-    self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
-      best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
-      top_score = best_to_worst.first.xcorr
-      best_to_worst.each do |pep|
-        if pep.xcorr == top_score
-          top_peps << pep
-        else ; break
-        end
+    new_arr = []
+    last_arg = arg_arr[0]
+    desired_size.times do |i|
+      if arg_arr[i]
+        new_arr[i] = arg_arr[i]
+        last_arg = new_arr[i]
+      else
+        new_arr[i] = last_arg
       end
     end
-    @peps = top_peps
-  end
-  # when kind == :common ; xcorr1, xcorr2, xcorr3, deltacn, rough_ppm
-  # interface very unstable.  For now, keeping it very loose...
-  # assumed that peptide xcorr, deltacn, deltamass, mass are Floats
-  # assumed that peptide charge is Integer
-  # returns prots
-  # must respond to 'peps'
-  def filter(kind, *args)
-    case kind
-    when :common
-      (x1, x2, x3, deltacn, rough_ppm) = args
-      # returns num proteins
-      peps_passed = self.peps.select do |pep|
-        # have to add the upper limit to deltacn because the lowest score is often
-        # assigned a 1.10 in bioworks!
-        pep_deltacn = pep.deltacn
-        pep_charge = pep.charge
-        (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
-        #truth = (pep_deltacn >= deltacn) and
-        (
-         (pep_charge == 1 && pep.xcorr >= x1) or
-         (pep_charge == 2 && pep.xcorr >= x2) or
-         (pep_charge == 3 && pep.xcorr >= x3)
-        ) and
-        ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
-      end
-      #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
-      hash = peps_passed.hash_by(:prot)
-      prots_passed = hash.map do |prot,pep_arr|
-        prot.peps = pep_arr
-        prot
+    new_arr
+  end
+  # takes an array of proteins, each having peps
+  # peptide grouping is done
+  # by-
+  # the protein with the most unique peptides ends up taking any
+  # degenerate peptides, tie goes to one with most hits total, then the one
+  # that had the top xcorr(s) (before removing any peptides).All other
+  # proteins with identical peptides will lose those peptides.  So, the rich
+  # stay rich, and the poor get poorer.
+  # returns an array of triplets where each is [prot, pep_hits,
+  # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
+  # peptide.  The internal links (prot.peps and pep.prots) is NOT modified!!
+  # update_prots == true will set each protein with the peptides found
+  def self.occams_razor(array_of_prots, update_prots=false)
+    peps_found = Set.new
+    to_sort = array_of_prots.map do |prot|
+      pps = prot.peps
+      peps_by_uniq_aaseq = pps.hash_by(:aaseq)
+      uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
+      xcorrs = pps.map {|pep| pep.xcorr }
+      silly = OpenStruct.new
+      # 0                1         2            3     4            5
+      [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq]
+    end
+    prot_triplets = []
+    to_sort.sort.reverse.each do |ar|
+      prot = ar[3]
+      ## overlapping set:
+      common = peps_found & ar[4]
+      ## find the uniq ones in our little set of peptides:
+      uniq = ar[4] - common
+      pep_hits = []
+      if uniq.size != 0
+        ## add to the found list:
+        peps_found.merge(uniq)
+        uniq.each do |seq|
+          pep_hits.push( *(ar[5][seq]) )
+        end
+        prot_triplets << [prot, pep_hits, uniq.to_a]
+        prot.peps = pep_hits if update_prots
       end
-      [prots_passed, peps_passed]
-      #[prots_passed, peps_passed, deltacnstar_cnt]
-    else
-      abort "#{kind} not implemented"
     end
+    prot_triplets
   end
   ## basically, this is the command line wrapper
   def self.precision(argv)
-    SpecID::Precision.new.run_cmd_line(argv)
+    Prec.new.run_cmd_line(argv)
   end
@@ -197,27 +207,64 @@ class SpecID
   def by_tps(classification_method, tp, fp)
     ROC.new.by_tps(classification_method, tp, fp)
   end
+  # from the unique set of peptide hits, create a separate peptide hit for
+  # each protein reference where that peptide only references that protein
+  # e.g. pep.prots = [(a single protein)]
+  def pep_prots
+    pps = []
+    peps.each do |pep|
+      pep.prots.map do |prt|
+        pep.dup
+        pep.prots = [prt]
+        pps << pep
+      end
+    end
+    pps
+  end
   # returns [tp, fp] based on the protein prefix for items where items =
   # (:prot|:peps)
+  # this may result in a duplication of some peptides if they match both
+  # normal and decoy proteins.  In this case, the protein arrays are split,
+  # too, so that each points only to its breed of protein.
   def classify_by_prefix(items, prefix, fp_on_match=true)
     regex = /^#{Regexp.escape(prefix)}/
-    myproc = case items
+    case items
     when :prots
-      proc { |prt|
+      myproc = proc { |prt|
         if prt.reference =~ regex ; !fp_on_match
         else ; fp_on_match end
       }
+      return classify(items, myproc)
     when :peps
-      proc { |pep|
-        if pep.prot.reference =~ regex ; !fp_on_match
-        else ; fp_on_match end
-      }
+      match = [] ; nomatch = []
+      peps.each do |pep|
+        match_prots = [] ; nomatch_prots = []
+        (hit, nohit) = pep.prots.partition do |prot|
+          prot.reference =~ regex
+        end
+        if hit.size == 0
+          nomatch << pep
+        elsif nohit.size == 0
+          match << pep
+        else ## both have hits
+          pep.prots = match_prots
+          nomatch_pep = pep.dup
+          nomatch_pep.prots = nomatch_prots
+          match << pep
+          nomatch << pep
+        end
+      end
+      if fp_on_match
+        return [nomatch , match]
+      else
+        return [match, nomatch]
+      end
     else
-      abort "no go"
+      abort "don't recognize "
     end
-    classify(items, myproc)
-  end
+end
   ###### ThIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETEED...
   #  # Returns tp, fp where each is an array of proteins where fp is determined
@@ -244,18 +291,6 @@ class SpecID
     [t,f]
   end
-  def peps
-    if @peps ; @peps
-    else @obj.peps
-    end
-  end
-  def prots
-    if @prots ; @prots
-    else @obj.prots
-    end
-  end
   # returns two arrays, true positives and false positives (determined by proc
   # classify_item_by) sorted by proc rank_item_by.  Items will be ranked from
   # lowest to highest based on the return value of rank_item_by. items is a
@@ -276,7 +311,7 @@ class SpecID
   # returns a proc for getting all probabilities so that an ascending sort
   # will put the best scores first
   def probability_proc
-    if @hi_prob_best
+    if hi_prob_best
       get_prob_proc = proc {|prt| prt.probability * -1 }
     else
       get_prob_proc = proc {|prt| prt.probability }
@@ -328,17 +363,13 @@ class SpecID
       if prt.reference =~ regex ; false
       else ; true end
     }
     real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
     (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
     [num_hits, precision]
   end
-  def method_missing(symbol, *args)
-    @obj.send(symbol, *args)
-  end
 #  # takes the existing spec_id object and marshals it into "file.msh"
 #  # a new file will always look for a file.msh to load
 #  def marshal(force=false)
@@ -348,7 +379,14 @@ class SpecID
 #  end
   # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
+  # 'srf' if SRF file, 'srg' if search results group file.
   def self.file_type(file)
+    if file =~ /\.srg$/
+      return 'srg'
+    end
+    if IO.read(file, 7,438) == 'Enzyme:'
+      return 'srf'
+    end
     File.open(file) do |fh|
       lines = ""
       4.times { lines << fh.readline }
@@ -397,7 +435,7 @@ class SpecID
     #peptides.each do |pep| print pep.class.to_s + " " end
     #puts peptides.first.is_a? Array
     #abort "DFHDFD"
-    peptides.collect{|pep| pep.peptide_probability }.sort
+    peptides.collect{|pep| pep.probability }.sort
   end
   # returns a sorted lists of probabilities based on all pepprots (a peptide
@@ -477,138 +515,149 @@ class SpecID
     end
     sorted_probabilities(min_peptides)
   end
-  # A Generic spectraID protein
-  class Prot
-    # probability is always a float!
-    attr_accessor :probability, :reference
-  end
-  class Pep
-    attr_accessor :probability
-    # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
-    # or last.
-    attr_accessor :sequence
-    attr_accessor :charge
-    # units can be :mmu, :amu, :ppm
-    def mass_accuracy(pep, unit=:ppm, mono=true)
-      # 10^6 * deltam accuracy/ m[measured]
-      # i.e., theoretical mass 1000, measured 999.9: 100ppm
-      # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
-      # pep.mass is the theoretical M+H of the peptide
-      # this assumes that the deltacn value we're being told is correct, but I
-      # have my suspicions (since the <mass> value is not accurate...)
-      ######## TO COMPLETE (and add to spec_id..?)
-      case unit
-      when :ppm
-      when :amu
-      when :mmu
-      end
-    end
-  end
 end
-# I would prefer to call this SpecID::XML, but I keep getting an error:
-# /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
-# constant XML referenced by SpecID::XML' This works around that for now.
-# Any major xml elements should return a newline at the end for simple
-# concatenation into a file
-module SpecIDXML
-  Special_chrs_hash = {
-    '"' => '&quot;',
-    '&' => '&amp;',
-    "'" => '&apos;',
-    '<' => '&lt;',
-    '>' => '&gt;',
-  }
-  # substitutes special xml chars
-  def escape_special_chars(string)
-    string.split('').map do |char|
-      if Special_chrs_hash.key? char ; Special_chrs_hash[char]
-      # if x = Special_chrs_hash[char] ; x  # <-- that's slightly slower
-      else ; char end
-    end.join
-  end
-  $DEPTH = 0
-  def tabs
-    # this is ugly
-    string = ""
-    $DEPTH.times { string << "\t" }
-    string
-  end
+# A Generic spectraID protein
+module SpecID::Prot
+  # probability is always a float!
+  attr_accessor :probability, :reference, :peps
-  def param_xml(symbol)
-    tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
+  def <=> (other)
+    self.reference <=> other.reference
   end
-  def params_xml(*symbol_list)
-    symbol_list.collect { |sy|
-      param_xml(sy)
-    }.join("\n") + "\n"
-  end
+end
-  def short_element_xml(element, att_list)
-    "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
+module SpecID::Pep
+   Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
+  attr_accessor :prots
+  attr_accessor :probability
+  # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
+  # or last.
+  attr_accessor :sequence
+  # the basic amino acid sequence (no leading or trailing '.' or amino acids)
+  # should not contain any special symbols, etc.
+  attr_accessor :aaseq
+  attr_accessor :charge
+  # removes nonstandard chars with Non_standard_amino_acid_char_re
+  # preserves A-Z and '.' and '-'
+  def self.remove_non_amino_acids(sequence)
+    sequence.gsub(Non_standard_amino_acid_char_re, '')
+  end
+  # remove_non_amino_acids && split_sequence
+  def self.prepare_sequence(val)
+    nv = remove_non_amino_acids(val)
+    split_sequence(nv)
+  end
+  def <=>(other)
+    aaseq <=> other.aaseq
+  end
+  # Returns prev, peptide, next from sequence.  Parse errors return
+  # nil,nil,nil
+  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
+  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
+  #   PEPTIDE.A    # -> -, PEPTIDE, A
+  #   A.PEPTIDE    # -> A, PEPTIDE, -
+  #   PEPTIDE      # -> nil,nil,nil
+  def self.split_sequence(val)
+    peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
+    pieces = val.split('.')
+    case pieces.size
+    when 3
+      peptide_prev_aa, peptide, peptide_next_aa = *pieces
+    when 2
+      if pieces[0].size > 1  ## N termini
+        peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
+      else  ## C termini
+        peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
+      end
+    when 1  ## this must be a parse error!
+      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+    when 0
+      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+    end
+    return peptide_prev_aa, peptide, peptide_next_aa
+  end
+  ##
+  def self.sequence_to_aaseq(sequence)
+    after_removed = remove_non_amino_acids(sequence)
+    pieces = after_removed.split('.')
+    case pieces.size
+    when 3
+      pieces[1]
+    when 2
+      if pieces[0].size > 1  ## N termini
+        pieces[0]
+      else  ## C termini
+        pieces[1]
+      end
+    when 1  ## this must be a parse error!
+      pieces[0] ## which is the peptide itself
+    else
+      abort "bad peptide sequence: #{sequence}"
+    end
   end
-  def short_element_xml_and_att_string(element, att_string)
-    "#{tabs}<#{element} #{att_string}/>\n"
-  end
+  # This will rapidly determine the list of proteins for which given
+  # peptides belong.  It is meant to be low level and fast (eventually),
+  # so it asks for the data in a format amenable to this.
+  # returns a mirror array where each entry is an array of Fasta::Prot
+  # objects where each protein contains the sequence
+  def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
+    prots = fasta_obj.prots
+    prot_seqs = prots.map do |prot|
+      prot.aaseq
+    end
-  # requires that obj have attribute '@xml_element_name'
-  # displays all *instance_variables* (does not call methods!)
-  def short_element_xml_from_instance_vars(element_name)
-    string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
-    "#{tabs}<#{element_name} #{string}/>\n"
-  end
+    groups = peptide_strings_list.map do |pep_seq|
+      prot_index = 0
+      protein_group = []
+      prot_seqs.each do |prot_seq|
+        if prot_seq.include? pep_seq
+          protein_group << prots[prot_index]
+        end
+        prot_index += 1
+      end
+      protein_group
+    end
-  # takes an element as a symbol and returns the
-  def element_xml_no_atts(element)
-    start = "#{tabs}<#{element}>\n"
-    $DEPTH += 1
-    if block_given? ; middle = yield else ; middle = '' end
-    $DEPTH -= 1
-    start + middle + "#{tabs}</#{element}>\n"
+    groups
   end
-  # takes an element as a symbol and returns the
-  def element_xml(element, att_list)
+  # units can be :mmu, :amu, :ppm
+  def mass_accuracy(pep, unit=:ppm, mono=true)
+    # 10^6 * deltam accuracy/ m[measured]
+    # i.e., theoretical mass 1000, measured 999.9: 100ppm
+    # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
+    # pep.mass is the theoretical M+H of the peptide
+    # this assumes that the deltacn value we're being told is correct, but I
+    # have my suspicions (since the <mass> value is not accurate...)
-    start = "#{tabs}<#{element} #{attrs_xml(att_list)}>\n"
-    $DEPTH += 1
-    if block_given? ; middle = yield else ; middle = '' end
-    $DEPTH -= 1
-    start + middle + "#{tabs}</#{element}>\n"
+    ######## TO COMPLETE (and add to spec_id..?)
+    case unit
+    when :ppm
+    when :amu
+    when :mmu
+    end
   end
+end
-  # element as symbol and att_string as attributes
-  # takes a block of whatever
-  def element_xml_and_att_string(element, att_string)
-    start = "#{tabs}<#{element} #{att_string}>\n"
-    $DEPTH += 1
-    if block_given? ; middle = yield else ; middle = '' end
-    $DEPTH -= 1
-    start + middle + "#{tabs}</#{element}>\n"
-  end
+class SpecID::GenericProt
+  include SpecID::Prot
+end
-  def attr_xml(symbol)
-    "#{symbol}=\"#{send(symbol)}\""
-  end
+class SpecID::GenericPep
+  include SpecID::Pep
+end
-  def attrs_xml(list_of_symbols)
-    list_of_symbols.collect {|sy|
-      attr_xml(sy)
-    }.join(" ")
-  end
-end