RubyGems - mspire - Versions diffs - 0.3.1 → 0.3.9 - Mend

mspire 0.3.1 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/Rakefile +2 -2
data/bin/bioworks_to_pepxml.rb +15 -3
data/bin/ms_to_lmat.rb +2 -1
data/bin/sqt_group.rb +26 -0
data/changelog.txt +36 -0
data/lib/ms/msrun.rb +3 -1
data/lib/ms/parser/mzdata/dom.rb +14 -14
data/lib/ms/scan.rb +3 -3
data/lib/mspire.rb +1 -1
data/lib/sample_enzyme.rb +39 -0
data/lib/spec_id.rb +18 -0
data/lib/spec_id/aa_freqs.rb +6 -9
data/lib/spec_id/digestor.rb +16 -17
data/lib/spec_id/mass.rb +63 -1
data/lib/spec_id/parser/proph.rb +101 -2
data/lib/spec_id/precision/filter.rb +3 -2
data/lib/spec_id/precision/filter/cmdline.rb +3 -1
data/lib/spec_id/precision/filter/output.rb +1 -0
data/lib/spec_id/precision/prob.rb +88 -21
data/lib/spec_id/precision/prob/cmdline.rb +28 -16
data/lib/spec_id/precision/prob/output.rb +8 -2
data/lib/spec_id/proph/pep_summary.rb +25 -12
data/lib/spec_id/sequest.rb +28 -0
data/lib/spec_id/sequest/pepxml.rb +142 -197
data/lib/spec_id/sqt.rb +349 -0
data/lib/spec_id/srf.rb +33 -23
data/lib/validator.rb +40 -57
data/lib/validator/aa.rb +3 -90
data/lib/validator/aa_est.rb +112 -0
data/lib/validator/cmdline.rb +163 -31
data/lib/validator/decoy.rb +15 -7
data/lib/validator/digestion_based.rb +5 -4
data/lib/validator/q_value.rb +32 -0
data/script/peps_per_bin.rb +67 -0
data/script/sqt_to_meta.rb +24 -0
data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
data/specs/bin/fasta_shaker_spec.rb +2 -2
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
data/specs/bin/filter_and_validate_spec.rb +25 -6
data/specs/bin/ms_to_lmat_spec.rb +2 -2
data/specs/bin/prob_validate_spec.rb +5 -3
data/specs/sample_enzyme_spec.rb +86 -1
data/specs/spec_helper.rb +11 -9
data/specs/spec_id/bioworks_spec.rb +2 -1
data/specs/spec_id/precision/filter_spec.rb +5 -5
data/specs/spec_id/precision/prob_spec.rb +0 -67
data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
data/specs/spec_id/protein_summary_spec.rb +4 -4
data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
data/specs/spec_id/sequest_spec.rb +38 -0
data/specs/spec_id/sqt_spec.rb +111 -3
data/specs/spec_id_spec.rb +2 -0
data/specs/transmem/phobius_spec.rb +3 -1
data/specs/transmem/toppred_spec.rb +1 -1
data/specs/validator/aa_est_spec.rb +66 -0
data/specs/validator/aa_spec.rb +1 -68
data/specs/validator/background_spec.rb +2 -0
data/specs/validator/bias_spec.rb +3 -27
data/specs/validator/decoy_spec.rb +2 -2
data/specs/validator/transmem_spec.rb +2 -1
data/test_files/small.sqt +87 -0
metadata +312 -293

data/lib/validator.rb CHANGED Viewed

@@ -1,19 +1,34 @@
 class Validator
+  # in the absence of digestion, does the spec_id type require pephits for
+  # validation?
+  def self.requires_pephits?(spec_id_obj)
+    case spec_id_obj
+    when Proph::ProtSummary : true
+    when Proph::PepSummary : true
+    when SQTGroup : true
+    else ; false
+    end
+  end
   Validator_to_string = {
     'Validator::AA' => 'badAA',
+    'Validator::AAEst' => 'badAAEst',
     'Validator::Decoy' => 'decoy',
     'Validator::Transmem::Protein' => 'tmm',
     'Validator::TruePos' => 'tps',
     'Validator::Bias' => 'bias',
     'Validator::Probability' => 'prob',
+    'Validator::QValue' => 'qval',
     :bad_aa => 'badAA',
+    :bad_aa_est => 'badAAEst',
     :decoy => 'decoy',
     :tmm => 'tmm',
     :tps => 'tps',
     :bias => 'bias',
     :prob => 'prob',
+    :qval => 'qval',
   }
   def initialize_increment
@@ -45,12 +60,12 @@ class Validator
     @increment_tps += tps.size
     @increment_fps += fps.size
     (num_tps, num_fps) =
-    if self.respond_to?(:calc_precision_prep)  # for digestion based validators
-      (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
-      [num_tps, num_fps]
-    else
-      [@increment_tps, @increment_fps]
-    end
+      if self.respond_to?(:calc_precision_prep)  # for digestion based validators
+        (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
+        [num_tps, num_fps]
+      else
+        [@increment_tps, @increment_fps]
+      end
     calc_precision(num_tps, num_fps)
   end
@@ -97,12 +112,16 @@ class Validator
       case val
       when Validator::TruePos
         hash.merge( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
+      when Validator::AAEst
+        %w(frequency background calculated_background).each do |cat|
+          hash[cat.to_sym] = val.send(cat.to_sym)
+        end
       when Validator::AA
-        %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
+        %w(false_to_total_ratio background calculated_background).each do |cat|
           hash[cat.to_sym] = val.send(cat.to_sym)
         end
       when Validator::Decoy
-        %w(correct_wins decoy_on_match).each do |cat|
+        %w(decoy_to_target_ratio correct_wins decoy_on_match).each do |cat|
           hash[cat.to_sym] = val.send(cat.to_sym)
         end
         hash[:constraint] = val.constraint.inspect if val.constraint
@@ -119,6 +138,8 @@ class Validator
         %w(prob_method).each do |cat|
           hash[cat.to_sym] = val.send(cat.to_sym)
         end
+      when Validator::QValue
+        # no params to add
       else ; raise ArgumentError, "Don't know the validator class #{val}"
       end
       klass_as_s = val.class.to_s
@@ -127,46 +148,6 @@ class Validator
       hash
     end
   end
-=begin
-  ## THIS IS WITH STRINGS AS KEYS!
-  # takes an array of validators and returns a fresh array where each has been
-  # turned into a sensible hash (with symbols as the keys!)
-  def self.sensible_validator_hashes(validators)
-    validators.map do |val|
-      hash = {}
-      case val
-      when Validator::TruePos
-        hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
-      when Validator::AA
-        %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
-          hash[cat] = val.send(cat.to_sym)
-        end
-      when Validator::Decoy
-        %w(correct_wins decoy_on_match).each do |cat|
-          hash[cat] = val.send(cat.to_sym)
-        end
-        hash['constraint'] = val.constraint.inspect if val.constraint
-      when Validator::Bias
-        %w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
-          hash[cat] = val.send(cat.to_sym)
-        end
-        hash['file'] = val.fasta.filename
-      when Validator::Transmem::Protein
-        %w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
-          hash[cat] = val.send(cat.to_sym)
-        end
-      when Validator::Probability
-      else ; raise ArgumentError, "Don't know the validator class #{val}"
-      end
-      klass_as_s = val.class.to_s
-      hash['type'] = Validator_to_string[klass_as_s]
-      hash['class'] = klass_as_s
-      hash
-    end
-  end
-=end
 end
 module Precision::Calculator
@@ -186,11 +167,11 @@ end
 # normal hits (which may be true or false) and the second are decoy hits.
 # edge case:  if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
 module Precision::Calculator::Decoy
-  def calc_precision(num_normal, num_decoy)
+  def calc_precision(num_normal, num_decoy, decoy_to_target_ratio=1.0)
     # will calculate as floats in case fractional amounts passed in for
     # whatever reason
     num_normal_f = num_normal.to_f
-    num_true_pos = num_normal.to_f - num_decoy
+    num_true_pos = num_normal_f - (num_decoy.to_f / decoy_to_target_ratio)
     precision =
       if num_normal_f == 0.0
         if num_decoy.to_f > 0.0
@@ -204,11 +185,13 @@ module Precision::Calculator::Decoy
   end
 end
-require 'validator/true_pos'
-require 'validator/aa'
-require 'validator/bias'
-require 'validator/decoy'
-require 'validator/transmem'
-require 'validator/probability'
-require 'validator/prot_from_pep'
+#require 'validator/true_pos'
+#require 'validator/aa'
+#require 'validator/aa_est'
+#require 'validator/bias'
+#require 'validator/decoy'
+#require 'validator/transmem'
+#require 'validator/probability'
+#require 'validator/q_value'
+#require 'validator/prot_from_pep'

data/lib/validator/aa.rb CHANGED Viewed

@@ -1,4 +1,3 @@
-require 'validator'  # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
 require 'validator/digestion_based'
 require 'fasta'
 require 'spec_id/aa_freqs'
@@ -12,12 +11,7 @@ class Validator::AA < Validator::DigestionBased
   # it is a false hit if the amino acid is located in the peptide
   attr_accessor :false_if_found
-  # if given, the frequency of the amino acid is used to estimate the false to
-  # total ratio based on the pephits given for pephit_precision.
-  # see Validator::AA.calc_frequency to calculate a frequency
-  attr_accessor :frequency
   DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
     :false_if_found => true,
   } )
@@ -34,21 +28,9 @@ class Validator::AA < Validator::DigestionBased
     end
   end
-  # takes a fasta object and sets the frequency based on constraint.
-  # constraint is one acceptable to initialize!
-  # returns self
-  def set_frequency(fasta_obj)
-    table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
-    @frequency = table[@constraint.to_sym]
-    self
-  end
   # right now only accepts single amino acids as constraints (as a string,
   # e.g. 'C', or symbol, e.g. :C)
   # options:
-  #  :frequency OR :false_to_total_ratio should be used (NOT both)
-  #  :frequency => Float, if the frequency of the amino acid is known (see
-  #                Validator::AA.calc_frequency)
   #  :false_to_total_ratio => if a true digestion was already performed (see
   #                           Validator::AA.calc_false_to_total_ratio)
   #  :false_if_found => it is a false positive if the amino acid is found.
@@ -56,80 +38,11 @@ class Validator::AA < Validator::DigestionBased
   def initialize(constraint, options={})
     @constraint = constraint.to_s
     opts = DEFAULTS.merge(options)
-    (@frequency, @false_to_total_ratio, @false_if_found, @background) = opts.values_at(:frequency, :false_to_total_ratio, :false_if_found, :background)
-  end
-  # if expected is 0 then will return precision = 1.0
-  def pephit_precision(peps)
-    if @frequency
-      (actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
-      if expected == 0.0
-        1.0
-      else
-        # what's this guy ?? good for??
-        fraction_of_expected = actual.to_f/expected
-        pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
-      end
-    elsif @false_to_total_ratio
-      super(peps)
-    else
-      raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
-    end
-  end
-  # returns (Actual(Int), Expected(Float)) based on how many peptides have at
-  # least one amino_acid, the frequency it is observed in background (then we
-  # can look at the size of each peptide and determine the likelihood of
-  # having the peptide with at least one amino acid).
-  # amino_acid should be a string (e.g., 'C')
-  def at_least_one(amino_acid, freq, amino_acid_seqs)
-    one_minus_freq = 1.0 - freq
-    probs = []
-    actual = 0
-    expected = 0.0
-    amino_acid_seqs.each do |aaseq|
-      expected += (1.0 - (one_minus_freq**aaseq.size))
-      if aaseq.include?(amino_acid)
-        actual += 1
-      end
-    end
-    [actual, expected]
-  end
-  # given: (actual # with 'AA', expected # with 'AA', total#peptides,
-  # mean_fraction_of_cysteines_true)
-  #
-  # PepHit('AA') = Peptide containing at least one 'AA'
-  #   # expected PepHit('AA')                 # observed Bad Pep ('AA')
-  #   ----------------------- proportional_to -------------------------
-  #   # total PepHits                         # Total Bad PepHit
-  #
-  #  returns the precision
-  #  the background correction factor will not reduce the actual count of
-  #  peptides to < 0.  One can still get negative precision scores, however,
-  #  depending on the other variables.
-  #  background is the number of peptides with the amino acid in the purest
-  #  sample over the total number of peps.
-  #---
-  # this is thoroughly explained in my 2007_09 presentations (inkscape)
-  #+++
-  def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
-    actual = actual.to_f
-    @calculated_background = actual / total_peps
-    actual -= (total_peps * background)
-    # We were doing it compared to the number expected.. but this is more
-    # clear
-    # actual/false_hits = expected/total_peps_passing
-    # false_hits = (total_peps_passing * actual) / expected
-    if actual < 0.0 ; actual = 0.0 end
-    total_number_false = (actual * total_peps).to_f / expected
-    #fppr = total_number_false / total_peps
-    prec = (total_peps - total_number_false) / total_peps
+    (@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
   end
   def to_param_string
-    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
+    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
   end
 end

data/lib/validator/aa_est.rb ADDED Viewed

@@ -0,0 +1,112 @@
+require 'validator/aa'
+class Validator ; end
+class Validator::AA ; end
+# A class that uses the peps given to it and a background frequency to
+# calculate the false_to_total_ratio at each turn.
+class Validator::AAEst < Validator::AA
+  attr_accessor :constraint
+  attr_accessor :false_if_found
+  # the frequency of the amino acid is used to estimate the false to
+  # total ratio based on the pephits given for pephit_precision.
+  # see Validator::AA.calc_frequency to calculate a frequency
+  # or use set_frequency to set from pep hits.
+  attr_accessor :frequency
+  DEFAULTS = {
+    :false_if_found => true
+  }.merge(Validator::DigestionBased::DEFAULTS)  # background 0.0
+  # only takes a string right now for constraint
+  def initialize(constraint, options={})
+    @constraint = constraint.to_s
+    opts = DEFAULTS.merge(options)
+    (@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
+  end
+  def pephit_precision(peps)
+    set_false_to_total_ratio(peps)
+    super(peps)
+  end
+  def set_false_to_total_ratio(peps)
+    if peps.size > 0
+      expected = 0.0
+      peps.each do |pep|
+        expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
+      end
+      @false_to_total_ratio = expected / peps.size
+    else
+      @false_to_total_ratio = 1.0
+    end
+  end
+  def set_ongoing_false_to_total_ratio(peps)
+    if peps.size > 0
+      peps.each do |pep|
+        @expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
+      end
+      # @increment_total_submitted should == @increment_tps and @increment_fps
+      # since these are either/or
+      @false_to_total_ratio = @expected / @increment_total_submitted
+    else
+      @false_to_total_ratio = 1.0
+    end
+  end
+  def to_param_string
+    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
+  end
+  # takes objects responding to aaseq and sets the frequency based on
+  # constraint.  constraint is one acceptable to initialize!  returns self
+  def set_frequency(objs)
+    table = SpecID::AAFreqs.new.calculate_frequencies(objs)
+    @frequency = table[@constraint.to_sym]
+    self
+  end
+  # if adding pephits in groups at a time, the entire group does not need to be
+  # queried, just the individual hit.  Use this OR pephit_precision (NOT
+  # both).  The initial query to this method will begin a running tally that
+  # is saved by the validator.
+  # takes either an array or a single pephit (determined by if it is a
+  # SpecID::Pep)
+  def increment_pephits_precision(peps)
+    tmp = $VERBOSE; $VERBOSE = nil
+    unless @increment_initialized
+      initialize_increment
+      @expected = 0.0
+    end
+    $VERBOSE = tmp
+    to_submit =
+      if peps.is_a? SpecID::Pep
+        [peps]
+      else
+        peps
+      end
+    @increment_total_submitted += to_submit.size
+    (tps, fps) = partition(to_submit)
+    #### THIS IS THE MAGIC FOR THIS VALIDATOR:
+    set_ongoing_false_to_total_ratio(to_submit)
+    @increment_tps += tps.size
+    @increment_fps += fps.size
+    (num_tps, num_fps) =
+      if self.respond_to?(:calc_precision_prep)  # for digestion based validators
+        (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
+        [num_tps, num_fps]
+      else
+        [@increment_tps, @increment_fps]
+      end
+    calc_precision(num_tps, num_fps)
+  end
+end

data/lib/validator/cmdline.rb CHANGED Viewed

@@ -1,14 +1,31 @@
 require 'validator'
+require 'validator/true_pos'
+require 'validator/aa'
+require 'validator/aa_est'
+require 'validator/bias'
+require 'validator/decoy'
+require 'validator/transmem'
+require 'validator/probability'
+require 'validator/q_value'
+require 'validator/prot_from_pep'
+## these are all for a stupid check...
+require 'spec_id/sqt'
+require 'spec_id/proph/prot_summary'
+require 'spec_id/proph/pep_summary'
 class Validator::Cmdline
   Validator_symbols_to_classes = {
     :tmm => Validator::Transmem::Protein,
     :decoy => Validator::Decoy,
     :bad_aa => Validator::AA,
+    :bad_aa_est => Validator::AAEst,
     :tps => Validator::TruePos,
     :bias => Validator::Bias,
     :prob => Validator::Probability,
+    :qval => Validator::QValue,
   }
   # was VAL_DEFAULTS
   DEFAULTS = {
@@ -24,11 +41,16 @@ class Validator::Cmdline
     {
       :hits_together => true,
       :decoy_on_match => true,
+      :decoy_to_target_ratio => 1.0,
     },
     :bad_aa =>
     {
       :false_if_found => true,
-      :estimate => true,
+      :bkg => 0.0,
+    },
+    :bad_aa_est =>
+    {
+      :false_if_found => true,
       :bkg => 0.0,
     },
     :bias =>
@@ -39,7 +61,7 @@ class Validator::Cmdline
     :ties => true,
   }
   COMMAND_LINE = {
-    :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
+    :decoy => ["--decoy /REGEXP/|FILENAME[,DTR,DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
                                                 "FILENAME of separate search on decoys.",
                                                 "All regular expressions must be surrounded by '/'",
                                                 "(no extended options [trailing modifiers]).",
@@ -50,21 +72,30 @@ class Validator::Cmdline
                                                 "    --decoy '/^\\s*REVERSE/'",
                                                 "If decoys proteins were searched in a separate file,",
                                                 "then give the FILENAME (e.g., --decoy decoy.srg)",
+                                                "DTR = Decoy to Target Ratio (default: #{DEFAULTS[:decoy][:decoy_to_target_ratio]})",
                                                 "DOM = *true/false, decoy on match",],
         :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
                                   "fasta file containing the true protein hits"],
          # may require digestion:
-        :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
-                                                         "information (that is shared between them).",
+        :fasta => ["--fasta FASTA", "fasta file for phobius transmembrane",
+                                    "(needed if PEPS options is not false)"],
+        :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "[not recommended]",
+                                                         "Creates the 'false/total' ratio with in silico",
+                                                         "digestion.  Otherwise, the 3rd-10th best hits (sorted by",
+                                                         "xcorr) are used.",
+                                                         "The following validators will use this",
+                                                         "information (shared between them) if option given",
                                                          "ORIG_FASTA = the fasta file used to do the run",
                                                          "PARAMS = the params file used to do the run",],
         :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
                                                   "PE = *true|false proteins in fasta file expected in sample",
                                                   "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
-        :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
+        :bad_aa => ["--bad_aa AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
                                                         "AA = The amino acid (e.g., 'C')",
-                                                        "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
                                                         "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
+        :bad_aa_est => ["--bad_aa_est AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
+                                                        "AA = The amino acid (e.g., 'C')",
+                                                        "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa_est][:bkg]}):",],
         :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
                                                          "phobius.small:",
@@ -110,17 +141,27 @@ class Validator::Cmdline
           end
         opts[:validators].push([:prob, mthd])
       },
+        :qval => lambda {|ar, opts| opts[:validators].push([:qval]) },
         :decoy => lambda {|ar, opts|
         myargs = [:decoy]
         first_arg = ar[0]
-        myargs[1] =
+        val_opts = {}
+        val_opts[:constraint] =
           if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
+            # cast as a regular expression if it has '/ /'
             Regexp.new(first_arg[1...-1])
           else
+            # assume that it is a filename
+            raise ArgumentError, "File does not exist: #{first_arg}\n(was this supposed to be a regular expression? if so, should be given: /#{first_arg}/)" unless File.exist?(first_arg)
             first_arg
           end
-        myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
-        opts[:validators].push(myargs)
+        val_opts[:decoy_to_target_ratio] = (ar[1] || DEFAULTS[:decoy][:decoy_to_target_ratio]).to_f
+        val_opts[:decoy_on_match] = self.boolean(ar[2], DEFAULTS[:decoy][:decoy_on_match])
+        myargs.push(val_opts)
+        opts[:validators].push(myargs)
+      },
+        :fasta => lambda {|arg, opts|
+        opts[:fasta] = Fasta.new(arg)
       },
         :digestion => lambda {|ar, opts|
         raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
@@ -138,6 +179,9 @@ class Validator::Cmdline
           else
             DEFAULTS[:bias][:bkg]
           end
+        if ar[3]
+          val_opts[:false_to_total_ratio] = ar[3].to_f
+        end
         myargs.push(val_opts)
         opts[:validators].push(myargs)
       },
@@ -146,16 +190,36 @@ class Validator::Cmdline
         myargs = [:bad_aa]
         myargs.push( ar[0] )
         val_opts = {}
-        val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
         val_opts[:background] =
-          if ar[2]
-            ar[2].to_f
+          if ar[1]
+            ar[1].to_f
           else
             DEFAULTS[:bad_aa][:bkg]
           end
+        if ar[2]
+          val_opts[:false_to_total_ratio] = ar[2].to_f
+        end
         myargs.push(val_opts)
         opts[:validators].push(myargs)
       },
+        :bad_aa_est => lambda {|ar, opts|
+        ## GET the FREQUENCY
+        myargs = [:bad_aa_est]
+        myargs.push( ar[0] )
+        val_opts = {}
+        val_opts[:background] =
+          if ar[1]
+            ar[1].to_f
+          else
+            DEFAULTS[:bad_aa_est][:bkg]
+          end
+        if ar[2]
+          val_opts[:frequency] = ar[2].to_f
+        end
+        myargs.push(val_opts)
+        opts[:validators].push(myargs)
+      },
         :tmm =>  lambda {|ar, opts|
         myargs = [:tmm]
         myargs.push( ar[0] )
@@ -177,16 +241,38 @@ class Validator::Cmdline
           if ar[4] ; ar[4].to_f
           else ; DEFAULTS[:tmm][:bkg]
           end
+        if ar[5]
+          val_opts[:false_to_total_ratio] = ar[5].to_f
+        end
         myargs.push(val_opts)
         opts[:validators].push( myargs )
       },
+      :pephits => lambda {|v,opts| opts[:pephits] = SpecID.new(v) },
       :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
       :false_on_tie => lambda {|v,opts| opts[:ties] = false },
       }
+      def self.requires_pephits?(spec_id_obj)
+        case spec_id_obj
+        when Proph::ProtSummary : true
+        # at least currently (subject to change)
+        when Proph::PepSummary : true
+        when SQTGroup
+          if spec_id_obj.peps.first.respond_to?(:q_value)
+            # it's percolator output and we don't have other hits to use
+            true
+          else
+            false
+          end
+        else ; false
+        end
+      end
       # remove the keys from opts involved in validators and return an array
       # of validators
-      def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
+      # postfilter is one of :top_per_scan, :top_per_aaseq,
+      # :top_per_aaseq_charge (of which last two are subsets of scan)
+      def self.prepare_validators(opts, false_on_tie, interactive, postfilter, spec_id)
         validator_args = opts[:validators]
         correct_wins = !false_on_tie
         need_false_to_total_ratio = []
@@ -199,7 +285,9 @@ class Validator::Cmdline
             case tp
             when :tmm
               val_args[1][:correct_wins] = correct_wins
-              val_args[1][:fasta] = opts[:digestion_objects][0]
+              if opts.key?(:fasta)
+                val_args[1][:fasta] = opts[:fasta]
+              end
               val_args
             when :bias
               val_args[1][:correct_wins] = correct_wins
@@ -208,10 +296,10 @@ class Validator::Cmdline
               val_args = [val_args[0], correct_wins]
               val_args
             when :decoy
-              val_args = [val_args[0], val_args[1], correct_wins]
+              val_args[0][:correct_wins] = correct_wins
               # don't delete the key here since we need the decoy = regexp key
               val_args
-            else ## bad_aa and prob are represented here:
+            else ## bad_aa, prob, and qval are represented here:
               val_args
             end
           val = Validator_symbols_to_classes[tp].new( *val_args )
@@ -219,10 +307,12 @@ class Validator::Cmdline
           if tp == :tmm
             transmem_vals << val
           end
-          potential_digestion_classes = /Transmem|AA|Bias/
+          potential_digestion_classes = /Transmem|AA|AAEst|Bias/
           if val.class.to_s =~ potential_digestion_classes
-            if val_args[1][:estimate] == true
-              need_frequency << val
+            if val.class.to_s == 'Validator::AAEst'
+              need_frequency.push(val) if val.frequency.nil?
+            elsif !(val.false_to_total_ratio.nil?)
+              $stderr.puts "using false_to_total_ratio: #{val.false_to_total_ratio}"
             else
               need_false_to_total_ratio << val
             end
@@ -230,20 +320,62 @@ class Validator::Cmdline
           val
         end
-        if need_false_to_total_ratio.size > 0
-          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
-          peps = Digestor.digest( *(opts[:digestion_objects]) )
-          need_false_to_total_ratio.each do |val|
-            val.set_false_to_total_ratio( peps )
-          end
-        end
-        if need_frequency.size > 0
-          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
-          need_frequency.each do |val|
-            val.set_frequency( opts[:digestion_objects][0] )
+        if ((need_false_to_total_ratio.size > 0) or (need_frequency.size > 0))
+          if opts.key?(:digestion_objects)
+            #raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
+            peps = Digestor.digest( *(opts[:digestion_objects]) )
+            need_false_to_total_ratio.each do |val|
+              val.set_false_to_total_ratio( peps )
+            end
+            if need_frequency.size > 0
+              need_frequency.each do |val|
+                val.set_frequency( opts[:digestion_objects][0] )
+              end
+            end
+            opts.delete(:digestion_objects)
+          else  ## do the new and improved selection of non-top hits to get false_to_total_ratios and freqs
+            $stderr.puts "...using pephits to calculate background ratios"
+            # first_index, last_index
+            pephits =
+              if opts[:pephits]  ## protein prophet (since it needs to get ratios somewhere)
+                $stderr.puts "using --pephits"
+                opts[:pephits].peps
+              elsif requires_pephits?(spec_id)
+                raise ArgumentError, "with objects of class '#{spec_id.class}', one of your validators requires --pephits or --digestion"
+              else
+                $stderr.puts "using given spec_id.peps"
+                spec_id.peps
+              end
+            not_first_or_second_peps = Sequest.other_hits_sorted_by_xcorr(pephits, 2, 9, [:base_name, :first_scan, :charge])
+            pephits =
+              case postfilter
+              when :top_per_scan
+                $stderr.puts "using top_per_scan" ; not_first_or_second_peps
+              when :top_per_aaseq
+                # it doesn't matter which one is given since validators are
+                # based on amino acid sequence
+                $stderr.puts 'using top_per_aaseq'
+                not_first_or_second_peps.hash_by(:aaseq).values.map {|pep| pep.first }
+              when :top_per_aaseq_charge
+                $stderr.puts 'using top_per_aaseq_charge'
+                not_first_or_second_peps.hash_by(:aaseq, :charge).values.map {|pep| pep.first }
+              else
+                raise ArgumentError, "must have a valid postfilter method, yours: '#{postfilter}'"
+              end
+            need_false_to_total_ratio.each do |val|
+              val.set_false_to_total_ratio( pephits )
+              $stderr.puts "false_to_total_ratio for #{val.class.to_s}: #{val.false_to_total_ratio}"
+            end
+            if need_frequency.size > 0
+              need_frequency.each do |val|
+                $stderr.puts "Setting frequency!"
+                val.set_frequency( pephits )
+              end
+            end
           end
         end
-        opts.delete(:digestion_objects)
         if (transmem_vals.size > 0)   #  and interactive   ## we'd like to just run this for interactive
           # This is overkill if we are doing a single filtering job, but it