RubyGems - mspire - Versions diffs - 0.3.1 → 0.3.9 - Mend

mspire 0.3.1 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/Rakefile +2 -2
data/bin/bioworks_to_pepxml.rb +15 -3
data/bin/ms_to_lmat.rb +2 -1
data/bin/sqt_group.rb +26 -0
data/changelog.txt +36 -0
data/lib/ms/msrun.rb +3 -1
data/lib/ms/parser/mzdata/dom.rb +14 -14
data/lib/ms/scan.rb +3 -3
data/lib/mspire.rb +1 -1
data/lib/sample_enzyme.rb +39 -0
data/lib/spec_id.rb +18 -0
data/lib/spec_id/aa_freqs.rb +6 -9
data/lib/spec_id/digestor.rb +16 -17
data/lib/spec_id/mass.rb +63 -1
data/lib/spec_id/parser/proph.rb +101 -2
data/lib/spec_id/precision/filter.rb +3 -2
data/lib/spec_id/precision/filter/cmdline.rb +3 -1
data/lib/spec_id/precision/filter/output.rb +1 -0
data/lib/spec_id/precision/prob.rb +88 -21
data/lib/spec_id/precision/prob/cmdline.rb +28 -16
data/lib/spec_id/precision/prob/output.rb +8 -2
data/lib/spec_id/proph/pep_summary.rb +25 -12
data/lib/spec_id/sequest.rb +28 -0
data/lib/spec_id/sequest/pepxml.rb +142 -197
data/lib/spec_id/sqt.rb +349 -0
data/lib/spec_id/srf.rb +33 -23
data/lib/validator.rb +40 -57
data/lib/validator/aa.rb +3 -90
data/lib/validator/aa_est.rb +112 -0
data/lib/validator/cmdline.rb +163 -31
data/lib/validator/decoy.rb +15 -7
data/lib/validator/digestion_based.rb +5 -4
data/lib/validator/q_value.rb +32 -0
data/script/peps_per_bin.rb +67 -0
data/script/sqt_to_meta.rb +24 -0
data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
data/specs/bin/fasta_shaker_spec.rb +2 -2
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
data/specs/bin/filter_and_validate_spec.rb +25 -6
data/specs/bin/ms_to_lmat_spec.rb +2 -2
data/specs/bin/prob_validate_spec.rb +5 -3
data/specs/sample_enzyme_spec.rb +86 -1
data/specs/spec_helper.rb +11 -9
data/specs/spec_id/bioworks_spec.rb +2 -1
data/specs/spec_id/precision/filter_spec.rb +5 -5
data/specs/spec_id/precision/prob_spec.rb +0 -67
data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
data/specs/spec_id/protein_summary_spec.rb +4 -4
data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
data/specs/spec_id/sequest_spec.rb +38 -0
data/specs/spec_id/sqt_spec.rb +111 -3
data/specs/spec_id_spec.rb +2 -0
data/specs/transmem/phobius_spec.rb +3 -1
data/specs/transmem/toppred_spec.rb +1 -1
data/specs/validator/aa_est_spec.rb +66 -0
data/specs/validator/aa_spec.rb +1 -68
data/specs/validator/background_spec.rb +2 -0
data/specs/validator/bias_spec.rb +3 -27
data/specs/validator/decoy_spec.rb +2 -2
data/specs/validator/transmem_spec.rb +2 -1
data/test_files/small.sqt +87 -0
metadata +312 -293

data/lib/spec_id/parser/proph.rb CHANGED Viewed

@@ -8,6 +8,20 @@ module SpecID::Parser ; end
 class SpecID::Parser::PepProph
   include XMLStyleParser
+  # gets the protein (and adds the pephit to the protein)
+  def get_protein(search_hit, name, description, global_prot_hash)
+    prot =
+      if global_prot_hash.key?(name)
+        global_prot_hash[name]
+      else
+        prt = Proph::PepSummary::Prot.new([name, description, []])
+        global_prot_hash[name] = prt
+      end
+    prot.peps << search_hit
+    prot
+  end
   def initialize(parse_type=:spec_id, version='3.0')
     @method = parse_type
     @version = version
@@ -29,7 +43,10 @@ class SpecID::Parser::PepProph
   end
   # returns the spec_id object
+  # :global_prot_hash is a hash if you have multiple of these files to be
+  # combined
   def spec_id(file, opts={})
     raise NotImplementedError, "cannot do #{@version} yet" if @version.nil? or @version < '3.0'
     spec_id_obj =
       if x = opts[:spec_id]
@@ -37,11 +54,93 @@ class SpecID::Parser::PepProph
       else
         Proph::PepSummary.new
       end
+    global_prot_hash =
+      if y = opts[:global_prot_hash]
+        y
+      else
+        {}
+      end
     msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
     spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")
-    msms_run_summary_n = msms_pipeline_analysis_n.find_first('child::msms_run_summary')
-    spec_id_obj.from_pepxml_node(msms_run_summary_n)
+    spec_id_obj.msms_run_summaries = msms_pipeline_analysis_n.find('child::msms_run_summary').map do |msms_run_summary_n|
+      parse_msms_run_summary(msms_run_summary_n, global_prot_hash)
+    end
+    peps = []
+    spec_id_obj.msms_run_summaries.each do |mrs|
+      mrs.spectrum_queries.each do |sq|
+        sq.search_results.each do |sr|
+          peps.push( *(sr.search_hits) )
+        end
+      end
+    end
+    spec_id_obj.peps = peps
+    spec_id_obj.prots = global_prot_hash.values
+    spec_id_obj
+  end
+  # returns an msms_run_summary object
+  def parse_msms_run_summary(msms_run_summary_n, global_prot_hash)
+    msms_run_summary_obj = Sequest::PepXML::MSMSRunSummary.new
+    msms_run_summary_obj.from_pepxml_node(msms_run_summary_n)
+    sample_enzyme_n = msms_run_summary_n.find_first("child::sample_enzyme")
+    msms_run_summary_obj.sample_enzyme = SampleEnzyme.from_pepxml_node( sample_enzyme_n )
+    search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
+    spectrum_queries_nds = search_summary_n.find("following-sibling::spectrum_query")
+    msms_run_summary_obj.spectrum_queries = spectrum_queries_nds.map do |sq_n|
+      sq = Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n)
+      sq.search_results = sq_n.children.map do |sr_n|
+        sr = Sequest::PepXML::SearchResult.new
+        sr.search_hits = sr_n.children.map do |sh_n|
+          sh = Proph::PepSummary::Pep.new  # descended from SearchHit
+          sh.from_pepxml_node(sh_n)
+          sh.spectrum_query = sq
+          prots = [ get_protein(sh, sh_n['protein'], sh_n['protein_descr'], global_prot_hash) ]
+          ## alternative proteins:
+          if sh.num_tot_proteins > 1
+            sh_n.find('child::alternative_protein').each do |alt_prot_n|
+              prots << get_protein(sh, alt_prot_n['protein'], alt_prot_n['protein_descr'], global_prot_hash)
+            end
+          end
+          sh.prots = prots
+          if modinfo_node = sh_n.find_first("child::modification_info")
+            sh.modification_info = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
+          end
+          ## search scores:
+          sh_n.find("child::search_score").each do |ss_n|
+            case ss_n['name']
+            when 'deltacnstar'
+              sh.deltacnstar = ss_n['value'].to_i
+            when 'xcorr'
+              sh.xcorr = ss_n['value'].to_f
+            when 'deltacn'
+              sh.deltacn = ss_n['value'].to_f
+            when 'spscore'
+              sh.spscore = ss_n['value'].to_f
+            when 'sprank'
+              sh.sprank = ss_n['value'].to_i
+            end
+          end
+          sh
+        end
+        sr
+      end
+      sq
+    end
+    ## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
+    # into a better search summary object (to eventually deprecate the params object)
+    msms_run_summary_obj.search_summary = msms_run_summary_n
+    msms_run_summary_obj
   end
 end

data/lib/spec_id/precision/filter.rb CHANGED Viewed

@@ -295,7 +295,7 @@ class SpecID::Precision::Filter
           if opts[:hits_together]
             # we fake that the protein sets are together
-            decoy_validator_to_split_with = Validator::Decoy.new(unmerge_regexp)
+            decoy_validator_to_split_with = Validator::Decoy.new(:constraint => unmerge_regexp)
             decoy_peps.each do |pep|
               pep.prots.each {|prt| prt.reference = merge_prefix + prt.reference }
             end
@@ -599,13 +599,14 @@ class SpecID::Precision::Filter::Peps < Filter
   end
   # returns self for chaining
+  # (x3 is the xcorr cutoff applied to peptides with charge >= +3)
   def standard_sequest_filter(peps, x1,x2,x3,deltacn,ppm,include_deltacnstar=true)
     peps.select do |pep|
       pep_deltacn = pep.deltacn
       pep_charge = pep.charge
       ## The outer parentheses are critical to getting the correct answer!
-      _passing = ( (pep_deltacn >= deltacn) and ((pep_charge == 1 && pep.xcorr >= x1) or (pep_charge == 2 && pep.xcorr >= x2) or (pep_charge == 3 && pep.xcorr >= x3)) and ( pep.ppm <= ppm ))
+      _passing = ( (pep_deltacn >= deltacn) and ((pep_charge == 1 && pep.xcorr >= x1) or (pep_charge == 2 && pep.xcorr >= x2) or (pep_charge >= 3 && pep.xcorr >= x3)) and ( pep.ppm <= ppm ))
       if _passing
         if ((!include_deltacnstar) && (pep_deltacn > 1.0))

data/lib/spec_id/precision/filter/cmdline.rb CHANGED Viewed

@@ -131,8 +131,10 @@ module SpecID
             op.val_opt(:digestion, opts)
             op.val_opt(:bias, opts)
             op.val_opt(:bad_aa, opts)
+            op.val_opt(:bad_aa_est, opts)
             op.val_opt(:tmm, opts)
+            op.val_opt(:fasta, opts)
             op.val_opt(:tps, opts)
             op.separator ""
@@ -187,7 +189,7 @@ module SpecID
             if opts[:ties] == nil   # will be nil or false
               opts[:ties] = Validator::Cmdline::DEFAULTS[:ties]
             end
-            opts[:validators] = Validator::Cmdline.prepare_validators(opts, !opts[:ties], opts[:interactive], spec_id_obj)
+            opts[:validators] = Validator::Cmdline.prepare_validators(opts, !opts[:ties], opts[:interactive], opts[:postfilter], spec_id_obj)
             if opts[:output].size == 0
               opts[:output] = DEFAULTS[:output]

data/lib/spec_id/precision/filter/output.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'yaml'
 require 'table'
 require 'spec_id/precision/output'

data/lib/spec_id/precision/prob.rb CHANGED Viewed

@@ -34,11 +34,11 @@ class SpecID::Precision::Prob
   # opts may include:
   #   :proteins => true|*false
   #   :validators => array of Validator objects
+  #   adjusts the precision in the *probability* validators: adjusted =
+  #   (1+R)*prec / (R*prec + 1) where R is the decoy_to_target ratio
+  #   used in the decoy validator (R = 0.0 if no decoy validator)
   #   NOTE: if you have decoy data, you MUST pass in a decoy validator for the
-  #   decoy pephits to be removed from other validator analyses!
-  #   (precision based on peptide probabilities are adjusted to account for
-  #   the decoy peptides being present: Precision(no_decoy) = (2*Prec)/(Prec+1)
-  #   which is derived from the 50/50 rule for decoy vs. embedded false hits
+  #   decoy pephits to be removed from other validator analyses!
   #
   # returns a hash of data
   #   :pephits_precision => [{validator => <name>, values => [<precision>,...]},... ]
@@ -49,6 +49,8 @@ class SpecID::Precision::Prob
   #   :modified_peptides => array of modified sequence (only included if
   #   applicable)
   #
+  # NOTE: For protein prophet, the results are given on a peptide+charge
+  # basis.
   #
   # TODO: implement this guy:
   #   prothits_precision => {validator => <name>, values => {worst => ,
@@ -58,7 +60,7 @@ class SpecID::Precision::Prob
     opt = PN_DEFAULTS.merge(opts)
     out = {}
-    num_pephits = []  # NOTE!: these are aaseq/aaseq_mod + charge (not really a pephit, but BEST)
+    num_pephits = []  # NOTE!: these are aaseq/aaseq_mod + charge for Prophet
     val_hash = Hash.new {|hash,key| hash[key] = [] }
     val_calc_bkg_hash = Hash.new {|hash,key| hash[key] = [] }
     pepstrings = []
@@ -67,16 +69,25 @@ class SpecID::Precision::Prob
     probabilities = []
     found_modified_peptide = false
+    check_precisions = []
+    check_precisions_decoy = []
     # do we need to deal with decoy peptides? (true/false)
     validators = opt[:validators].map
     decoy_vals = validators.select {|val| val.class == Validator::Decoy }
     if decoy_vals.size > 1
       raise(ArgumentError, "only one decoy validator allowed!")
     else
       decoy_val = decoy_vals.first
+      if decoy_val
+        decoy_to_target_ratio = decoy_val.decoy_to_target_ratio
+      end
     end
     validators.delete(decoy_val)
     other_validators = validators
@@ -89,17 +100,48 @@ class SpecID::Precision::Prob
     n_count = 0
     d_count = 0
+    # this is a peptide prophet
+    is_peptide_prophet =
+      if spec_id.peps.first.respond_to?(:fval) ; true
+      else ;false
+      end
+    use_q_value = spec_id.peps.first.respond_to?(:q_value)
+    ## ORDER THE PEPTIDE HITS:
     ordered_peps =
-      if opt[:sort_by_init]
-        spec_id.peps.sort_by{|v| [v.initial_probability, v.n_instances,  ( v.is_nondegenerate_evidence ? 1 : 0 ), v.n_enzymatic_termini, ( v.is_contributing_evidence ? 1 : 0 ), v.n_sibling_peptides] }.reverse
+      if use_q_value
+        spec_id.peps.sort_by {|v| v.q_value }
+      elsif is_peptide_prophet
+        spec_id.peps.reject {|v| v.probability == -1.0}.sort_by {|v| v.probability }.reverse
       else
-        spec_id.peps.sort_by{|v| [v.nsp_adjusted_probability, v.initial_probability, v.n_instances,  ( v.is_nondegenerate_evidence ? 1 : 0 ), v.n_enzymatic_termini, ( v.is_contributing_evidence ? 1 : 0 ), v.n_sibling_peptides] }.reverse
+        if opt[:sort_by_init]
+          spec_id.peps.sort_by{|v| [v.initial_probability, v.n_instances,  ( v.is_nondegenerate_evidence ? 1 : 0 ), v.n_enzymatic_termini, ( v.is_contributing_evidence ? 1 : 0 ), v.n_sibling_peptides] }.reverse
+        else
+          spec_id.peps.sort_by{|v| [v.nsp_adjusted_probability, v.initial_probability, v.n_instances,  ( v.is_nondegenerate_evidence ? 1 : 0 ), v.n_enzymatic_termini, ( v.is_contributing_evidence ? 1 : 0 ), v.n_sibling_peptides] }.reverse
+        end
       end
+    # for probability based precision with decoy database (not using prophet's
+    # -d flag) we do this:
+    # foreach peptide.sorted_by_probability
+    #   1. update the running precision of the validator REGARDLESS of
+    #   decoy/target status of peptide. the internal hit counts are
+    #   incremented.
+    #   2. only increment reported HIT COUNTS on a non-decoy hit and record
+    #   the precision as (1+R)*prec / (R*prec + 1) where R is the ratio of
+    #   decoy hits to target hits.  If it is 1:1 (R = 1) then this becomes:
+    #   2*prec / (prec + 1)
+    ## WORK THROUGH EACH PEPTIDE:
     ordered_peps.each_with_index do |pep,i|
       # probability validators must work on the entire set of normal and decoy
       last_prob_values = probability_validators.map do |val|
-        val.increment_pephits_precision(pep)
+        reply = val.increment_pephits_precision(pep)
+        check_precisions << reply
+        reply
       end
       it_is_a_normal_pep =
@@ -113,13 +155,23 @@ class SpecID::Precision::Prob
           true
         end
+      if it_is_a_normal_pep
+        check_precisions_decoy << false
+      else
+        check_precisions_decoy << true
+      end
       if it_is_a_normal_pep
         n_count += 1
         # UPDATE validators:
-        val_hash[decoy_val] << decoy_precision
+        val_hash[decoy_val].push(decoy_precision) if decoy_val
         probability_validators.zip(last_prob_values) do |val,prec|
-          val_hash[val] << ( (prec * 2.0) / (prec + 1.0) )
+          if decoy_val
+            val_hash[val].push( ((decoy_to_target_ratio+1.0)*prec) / ((decoy_to_target_ratio*prec) + 1.0) )
+          else
+            val_hash[val] << prec
+          end
         end
         other_validators.each do |val|
           val_hash[val] << val.increment_pephits_precision(pep)
@@ -129,17 +181,28 @@ class SpecID::Precision::Prob
         end
         # UPDATE other basic useful information:
-        modified_pep_string =
-          if pep.mod_info
-            found_modified_peptide = true
-            pep.mod_info.modified_peptide
-          else
-            nil
-          end
-        modified_peptides << modified_pep_string
+        if pep.respond_to?(:mod_info)
+          modified_pep_string =
+            if pep.mod_info
+              found_modified_peptide = true
+              pep.mod_info.modified_peptide
+            else
+              nil
+            end
+          modified_peptides << modified_pep_string
+        else
+          modified_pep_string =
+            if pep.sequence =~ /[^A-Z\-\.]/
+              found_modified_peptide = true
+              pep.sequence
+            else
+              nil
+            end
+          modified_peptides << modified_pep_string
+        end
         pepcharges << pep.charge
         pepstrings << pep.aaseq
-        probabilities << pep.probability
+        probabilities << pep.probability  # this is the q_value if percolator
         num_pephits << (i+1)
       else
         d_count += 1
@@ -148,7 +211,11 @@ class SpecID::Precision::Prob
     if found_modified_peptide
       out[:modified_peptides] = modified_peptides
     end
-    out[:probabilities] = probabilities
+    if use_q_value
+      out[:q_values] = probabilities
+    else
+      out[:probabilities] = probabilities
+    end
     out[:count] = num_pephits
     out[:aaseqs] = pepstrings
     out[:charges] = pepcharges

data/lib/spec_id/precision/prob/cmdline.rb CHANGED Viewed

@@ -12,8 +12,13 @@ module SpecID
         COMMAND_LINE = {
           :sort_by_init => ['--sort_by_init', "sort the proteins based on init probability"],
+          :qval => ['--qval', "use percolator q-values to calculate precision"],
           :prob => ['--prob [TYPE]', "use prophet probabilites to calculate precision",
-                                     "TYPE = *nsp|init"],
+                                     "TYPE = nsp [default] prophet nsp",
+                                     "     (nsp also should be used for PeptideProphet results)",
+                                     "     = init (for ProteinProphet results) use initial",
+                                     "probability instead of nsp probability",
+        ],
           # OUTPUT
           :proteins => ["--proteins", "includes proteins (and validation)"],
           :output => ["-o", "--output format[:FILENAME]", "format to output filtering results.",
@@ -29,12 +34,11 @@ module SpecID
                                                ],
           # VALIDATION MODIFIERS:
-          :hits_separate => ["--hits_separate", "target/decoy hits are normally together when choosing",
-                                                "the top hit per peptide (in prefilter and postfilter)",
-                                                "in BOTH catenated and separate searches.  This flag",
-                                                "separates them when finding the top hit per scan.",
-                                                "[This option modifies behavior of --decoy options]"],
+          :pephits => ["--pephits <file>.srg", "an srg file pointing to the srf files for",
+                                               "the given -prot.xml run",
+                                               "[this or --digestion must be used for applicable]",
+                                               "validators (validators depending on a",
+                                               "false/total ratio)]"],
         }.merge( Validator::Cmdline::COMMAND_LINE )
@@ -60,9 +64,10 @@ module SpecID
               on(*COMMAND_LINE[arg]) {|v| opts[arg] = v}
             end
-            op.banner = "USAGE: #{File.basename($0)} [OPTS] <file>-prot.xml"
+            op.banner = "USAGE: #{File.basename($0)} [OPTS] <file>-prot.xml | <file>.sqg"
             op.separator ""
-            op.separator "    RETURNS: precision across the number of hits (based on probability)"
+            op.separator "    RETURNS: precision across the number of hits"
+            op.separator "             (based on probability or q-value)"
             op.separator "             (optional) other validation of the results."
             op.separator ""
@@ -90,12 +95,16 @@ module SpecID
             op.separator ""
             op.val_opt(:prob, opts)
+            op.val_opt(:qval, opts)
             op.val_opt(:decoy, opts)
+            op.val_opt(:pephits, opts)       # sets opts[:ties] = false
             op.val_opt(:digestion, opts)
             op.val_opt(:bias, opts)
             op.val_opt(:bad_aa, opts)
+            op.val_opt(:bad_aa_est, opts)
             op.val_opt(:tmm, opts)
+            op.val_opt(:fasta, opts)
             op.val_opt(:tps, opts)
             op.separator ""
@@ -108,16 +117,19 @@ module SpecID
           # prepare validators
           if args.size > 0
-            spec_id_obj =
-              if args[0] =~ /\.srf$/i
-                ::SpecID.new(args)
-              else
-                ::SpecID.new(args[0])
-              end
+            spec_id_obj = ::SpecID.new(args[0])
             if opts[:ties] == nil   # will be nil or false
               opts[:ties] = Validator::Cmdline::DEFAULTS[:ties]
             end
-            opts[:validators] = Validator::Cmdline.prepare_validators(opts, !opts[:ties], opts[:interactive], spec_id_obj)
+            postfilter =
+              if spec_id_obj.class == SQTGroup or spec_id_obj.class == Proph::PepSummary
+                puts 'making background estimates with: top_per_scan'
+                :top_per_scan
+              else
+                puts 'making background estimates with: top_per_aaseq_charge'
+                :top_per_aaseq_charge
+              end
+            opts[:validators] = Validator::Cmdline.prepare_validators(opts, !opts[:ties], opts[:interactive], postfilter, spec_id_obj)
             if opts[:output].size == 0
               opts[:output] = DEFAULTS[:output]