RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/validator/background.rb ADDED Viewed

@@ -0,0 +1,73 @@
+require 'validator'
+require 'vec'
+require 'enumerator'
+class Validator ; end
+class Validator::Background
+  attr_accessor :data
+  def initialize(data=nil)
+    @data = data
+  end
+  def delete_nan!(vec)
+    vec.each_with_index do |v,i|
+      if v.nan?
+        vec[i] = 0
+      end
+    end
+  end
+  def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
+    data_vec = VecD[*@data]
+    delete_nan!(data_vec)
+    stdev_transform = data_vec.transform(9) {|vec| (stdev_factor * vec.sample_stats[1]) + vec.spread  }
+    smoothed_stdev = stdev_transform.transform(9) {|vec| vec.avg }
+    smoothed_stdev_derivs = smoothed_stdev.chim
+    last_0_index = index_of_last_0(smoothed_stdev_derivs)
+    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
+  end
+  # not really working right currently
+  def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
+    data_vec = VecD[*@data]
+    delete_nan!(data_vec)
+    drvs = data_vec.chim
+    # absolute value
+    drvs.each_with_index {|x,i| drvs[i] = x.abs }
+    mv_avg = drvs.transform(avg_points) {|v| v.avg }
+    last_0_index = index_of_last_0(mv_avg.chim)
+    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
+  end
+  def index_of_last_0(vec)
+    last_0_index = nil
+    vec.each_with_index do |v,i|
+      if v == 0
+        last_0_index = i
+      end
+    end
+    last_0_index
+  end
+  # returns the minimum value in the window centered on index
+  def min_in_window(vec, index, pre, post)
+    last_index = vec.size - 1
+    start = index - pre
+    stop = index + post
+    start = 0 if start < 0
+    stop = last_index if stop > last_index
+    vec[start..stop].min
+  end
+  # very simple, should work
+  def min_mesa(start, stop, points=3)
+    data_vec = VecD[*@data]
+    delete_nan!(data_vec)
+    smoothed = data_vec.transform(3) {|v| v.avg }
+    smoothed[start..stop].min
+  end
+end

data/lib/validator/bias.rb ADDED Viewed

@@ -0,0 +1,95 @@
+require 'validator'
+require 'validator/digestion_based'
+# class for any generic kind of bias.  For instance, a list of high abundance
+# proteins we would expect to see, or a list of low abundance proteins we
+# would not expect to see, or proteins that have been filtered out in some
+# way, etc.
+class Validator::Bias < Validator::DigestionBased
+  include Precision::Calculator
+  # a fasta object (by default containing proteins expected to be in the
+  # sample [see proteins_expected to modify that behavior])
+  attr_reader :fasta
+  # correct_wins means that only a single protein from a pep.aaseq must match
+  # the fasta object for the pep hit to be considered valid.  Otherwise, all
+  # must be a match (logic negated by proteins_expected)
+  attr_accessor :correct_wins
+  # proteins_expected==true means we expect to see the proteins in the sample
+  # proteins_expected==false means we do not expect to see these proteins in
+  # the sample
+  attr_accessor :proteins_expected
+  # a hash made by taking each fasta reference in fasta_object, (everything
+  # until a space) and setting the value to true.  It can be queried with the
+  # start of an fasta sequence
+  attr_accessor :short_reference_hash
+  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
+    :proteins_expected => true,
+    :correct_wins => true,
+  } )
+  # options:
+  #   (t = true, f = false, '*'= default)
+  #   :proteins_expected => *t/f  we expect to see the fasta proteins in our hit list
+  #   :correct_wins => *t/f  a single peptide hit from one of these proteins
+  #                    constitutes a true positive
+  #   :background => Float  (*0.0-1.0)
+  #   :false_to_total_ratio => Float (*nil by default)
+  def initialize(fasta_object, options={})
+    opts = DEFAULTS.merge(options)
+    (@proteins_expected, @correct_wins, @background, @false_to_total_ratio) = opts.values_at(:proteins_expected, :correct_wins, :background, :false_to_total_ratio)
+    @fasta = fasta_object
+    @header_split_hash = @fasta.prots.map {|prot| prot.reference }
+    @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
+  end
+  def self.make_short_reference_hash(fasta_object)
+    hash = {}
+    fasta_object.each do |prot|
+      hash[prot.first_entry] = true
+    end
+    hash
+  end
+  def partition(peps)
+    klass = self.class
+    cw =
+      if !@proteins_expected
+        !@correct_wins
+      else
+        @correct_wins
+      end
+    (tp, fp) =
+      if cw
+        peps.partition do |pep|
+          pep.prots.any? do |pepprot|
+            @short_reference_hash.key?( pepprot.first_entry )
+          end
+        end
+      else
+        peps.partition do |pep|
+          pep.prots.any? do |pepprot|
+            !@short_reference_hash.key?( pepprot.first_entry )
+          end
+        end
+      end
+    if !@correct_wins
+      tp, fp = fp, tp
+    end
+    [tp, fp]
+  end
+  # pephit_precision is done through inheritance
+  def to_param_string
+    "abundance=" +  ["{fasta=#{@fasta.filename}", "proteins_expected=#{@proteins_expected}", "correct_wins=#{@correct_wins}", "background=#{@background}}"].join(", ")
+  end
+end

data/lib/validator/cmdline.rb ADDED Viewed

@@ -0,0 +1,260 @@
+require 'validator'
+class Validator::Cmdline
+  Validator_symbols_to_classes = {
+    :tmm => Validator::Transmem::Protein,
+    :decoy => Validator::Decoy,
+    :bad_aa => Validator::AA,
+    :tps => Validator::TruePos,
+    :bias => Validator::Bias,
+    :prob => Validator::Probability,
+  }
+  # was VAL_DEFAULTS
+  DEFAULTS = {
+    :tmm =>
+    {
+      # file
+      :min_num_tmm_seqs => 1,
+      :expect_soluble => true,
+      :no_include_tm_peps => 0.8,
+      :bkg => 0.0,
+    },
+    :decoy =>
+    {
+      :hits_together => true,
+      :decoy_on_match => true,
+    },
+    :bad_aa =>
+    {
+      :false_if_found => true,
+      :estimate => true,
+      :bkg => 0.0,
+    },
+    :bias =>
+    {
+      :bkg => 0.0,
+      :proteins_expected => true,
+    },
+    :ties => true,
+  }
+  COMMAND_LINE = {
+    :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
+                                                "FILENAME of separate search on decoys.",
+                                                "All regular expressions must be surrounded by '/'",
+                                                "(no extended options [trailing modifiers]).",
+                                                "e.g., a run using concatenated reversed proteins that",
+                                                "includes 'REVERSE' in the fasta heading:",
+                                                "    --decoy /REVERSE/",
+                                                "Anything fancier should be quoted:",
+                                                "    --decoy '/^\\s*REVERSE/'",
+                                                "If decoys proteins were searched in a separate file,",
+                                                "then give the FILENAME (e.g., --decoy decoy.srg)",
+                                                "DOM = *true/false, decoy on match",],
+        :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
+                                  "fasta file containing the true protein hits"],
+         # may require digestion:
+        :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
+                                                         "information (that is shared between them).",
+                                                         "ORIG_FASTA = the fasta file used to do the run",
+                                                         "PARAMS = the params file used to do the run",],
+        :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
+                                                  "PE = *true|false proteins in fasta file expected in sample",
+                                                  "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
+        :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
+                                                        "AA = The amino acid (e.g., 'C')",
+                                                        "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
+                                                        "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
+        :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
+                                                         "phobius.small:",
+                                                         "http://phobius.cgb.ki.se/",
+                                                         "(select 'Short' output, and save output as file)",
+                                                         "toppred.out:",
+                                                         "http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html",
+                                                         "(output 'toppred.out' in 'New' or 'Xml' format)",
+                                                         "MIN = Int, minimum number transmembrane seqs (def: #{DEFAULTS[:tmm][:min_num_tmm_seqs]})",
+                                                         "SOL = true|false, this is a soluble fraction( def: #{DEFAULTS[:tmm][:expect_soluble]})",
+                                                         "PEPS = Float | false, don't consider tm peps (>= fraction",
+                                                         "                   tm content) (false skips) (def: #{DEFAULTS[:tmm][:no_include_tm_peps]})",
+                                                         "BKG = Float , background contaminating insoluble (def: #{DEFAULTS[:tmm][:bkg]})"],
+        # VALIDATION MODIFIERS
+        :false_on_tie => ["--false_on_tie", "if peptide belongs to correct AND incorrect proteins",
+                                            "it will be counted as correct"],
+  }
+      def self.boolean(arg, default)
+        case arg
+        when 'true' ; true
+        when 'false' ; false
+        else ; default
+        end
+      end
+      PrepArgs = {
+        :prob => lambda {|ar, opts|
+        mthd =
+          if ar
+            if ar == 'nsp'
+              :probability
+            elsif ar == 'init'
+              :initial_probability
+            else
+              raise ArgumentError, "--prob [arg], optional arg can only be 'nsp' or 'init'!"
+            end
+          else
+            :probability
+          end
+        opts[:validators].push([:prob, mthd])
+      },
+        :decoy => lambda {|ar, opts|
+        myargs = [:decoy]
+        first_arg = ar[0]
+        myargs[1] =
+          if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
+            Regexp.new(first_arg[1...-1])
+          else
+            first_arg
+          end
+        myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
+        opts[:validators].push(myargs)
+      },
+        :digestion => lambda {|ar, opts|
+        raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
+        opts[:digestion] = ar.dup
+        opts[:digestion_objects] = [Fasta.new(ar[0]), Sequest::Params.new(ar[1])]
+      },
+        :bias => lambda {|ar, opts|
+        myargs = [:bias]
+        myargs.push( Fasta.new(ar[0]) )
+        val_opts = {}
+        val_opts[:proteins_expected] = self.boolean(ar[1], DEFAULTS[:bias][:proteins_expected])
+        val_opts[:background] =
+          if ar[2]
+            ar[2].to_f
+          else
+            DEFAULTS[:bias][:bkg]
+          end
+        myargs.push(val_opts)
+        opts[:validators].push(myargs)
+      },
+        :bad_aa => lambda {|ar, opts|
+        ## GET the FREQUENCY
+        myargs = [:bad_aa]
+        myargs.push( ar[0] )
+        val_opts = {}
+        val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
+        val_opts[:background] =
+          if ar[2]
+            ar[2].to_f
+          else
+            DEFAULTS[:bad_aa][:bkg]
+          end
+        myargs.push(val_opts)
+        opts[:validators].push(myargs)
+      },
+        :tmm =>  lambda {|ar, opts|
+        myargs = [:tmm]
+        myargs.push( ar[0] )
+        val_opts = {}
+        val_opts[:min_num_tms] =
+          if ar[1] ; ar[1].to_i
+          else ; DEFAULTS[:tmm][:min_num_tmm_seqs]
+          end
+        val_opts[:soluble_fraction] = self.boolean(ar[2], DEFAULTS[:tmm][:expect_soluble])
+        val_opts[:no_include_tm_peps] =
+          if ar[3]
+            case ar[3]
+            when 'false' ; false
+            else ; ar[3].to_f
+            end
+          else ; DEFAULTS[:tmm][:no_include_tm_peps]
+          end
+        val_opts[:background] =
+          if ar[4] ; ar[4].to_f
+          else ; DEFAULTS[:tmm][:bkg]
+          end
+        myargs.push(val_opts)
+        opts[:validators].push( myargs )
+      },
+      :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
+      :false_on_tie => lambda {|v,opts| opts[:ties] = false },
+      }
+      # remove the keys from opts involved in validators and return an array
+      # of validators
+      def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
+        validator_args = opts[:validators]
+        correct_wins = !false_on_tie
+        need_false_to_total_ratio = []
+        need_frequency = []
+        transmem_vals = []
+        validators = validator_args.map do |args|
+          tp = args.shift
+          val_args = args.dup # protect the original keys
+          val_args =
+            case tp
+            when :tmm
+              val_args[1][:correct_wins] = correct_wins
+              val_args[1][:fasta] = opts[:digestion_objects][0]
+              val_args
+            when :bias
+              val_args[1][:correct_wins] = correct_wins
+              val_args
+            when :tps
+              val_args = [val_args[0], correct_wins]
+              val_args
+            when :decoy
+              val_args = [val_args[0], val_args[1], correct_wins]
+              # don't delete the key here since we need the decoy = regexp key
+              val_args
+            else ## bad_aa and prob are represented here:
+              val_args
+            end
+          val = Validator_symbols_to_classes[tp].new( *val_args )
+          # make some lists of validators based on pre-processing needs:
+          if tp == :tmm
+            transmem_vals << val
+          end
+          potential_digestion_classes = /Transmem|AA|Bias/
+          if val.class.to_s =~ potential_digestion_classes
+            if val_args[1][:estimate] == true
+              need_frequency << val
+            else
+              need_false_to_total_ratio << val
+            end
+          end
+          val
+        end
+        if need_false_to_total_ratio.size > 0
+          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
+          peps = Digestor.digest( *(opts[:digestion_objects]) )
+          need_false_to_total_ratio.each do |val|
+            val.set_false_to_total_ratio( peps )
+          end
+        end
+        if need_frequency.size > 0
+          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
+          need_frequency.each do |val|
+            val.set_frequency( opts[:digestion_objects][0] )
+          end
+        end
+        opts.delete(:digestion_objects)
+        if (transmem_vals.size > 0)   #  and interactive   ## we'd like to just run this for interactive
+          # This is overkill if we are doing a single filtering job, but it
+          # ensures that it works in all the ways I'm doing it.  Should
+          # refactor eventually !!
+          transmem_vals.each do |val|                      ## but, prob uses it too!
+            val.transmem_status_hash = val.create_transmem_status_hash(spec_id.peps)
+          end
+        end
+        validators
+      end
+end

data/lib/validator/decoy.rb ADDED Viewed

@@ -0,0 +1,94 @@
+require 'validator'
+class Validator::Decoy < Validator
+  include Precision::Calculator::Decoy
+  attr_accessor :constraint
+  attr_accessor :decoy_on_match
+  attr_accessor :correct_wins
+  attr_accessor :last_pep_was_decoy
+  attr_accessor :increment_normal
+  attr_accessor :increment_decoy
+  attr_accessor :increment_total_submitted
+  attr_reader :normal_peps_just_submitted
+  def initialize(constraint=nil, decoy_on_match = true, correct_wins = true)
+    @decoy_on_match = decoy_on_match
+    @correct_wins = correct_wins
+    @constraint = constraint
+  end
+  # returns [normal, decoy] (?? I think ??)
+  def partition(peps)
+    if @decoy_on_match
+      if @correct_wins
+        peps.partition do |pep|
+          !(pep.prots.all? {|prot| prot.reference.match(@constraint) })
+        end
+      else  # fp wins
+        peps.partition do |pep|
+          !(pep.prots.any? {|prot| prot.reference.match(@constraint) })
+        end
+      end
+    else
+      if @correct_wins
+        peps.partition do |pep|
+          pep.prots.any? {|prot| prot.reference.match(@constraint) }
+        end
+      else
+        peps.partition do |pep|
+          pep.prots.all? {|prot| prot.reference.match(@constraint) }
+        end
+      end
+    end
+  end
+  def initialize_increment
+    @increment_normal = 0
+    @increment_decoy = 0
+    @increment_total_submitted = 0
+    @increment_initialized = true
+  end
+  # does not deal in separate_peps right now!!
+  # will take an array or single peptide
+  def increment_pephits_precision(peps)
+    tmp = $VERBOSE; $VERBOSE = nil
+    initialize_increment unless @increment_initialized
+    $VERBOSE = tmp
+    to_submit =
+      if peps.is_a? SpecID::Pep
+        [peps]
+      else
+        peps
+      end
+    @increment_total_submitted += to_submit.size
+    (normal, decoy) = partition(to_submit)
+    @normal_peps_just_submitted = normal
+    @increment_normal += normal.size
+    @increment_decoy += decoy.size
+    calc_precision(@increment_normal, @increment_decoy)
+  end
+  def pephit_precision(peps, separate_peps=nil)
+    if separate_peps
+      calc_precision(peps.size, separate_peps.size)
+    else
+      (norm, decoy) = partition(peps)
+      calc_precision(norm.size, decoy.size)
+    end
+  end
+  def to_param_string
+    "decoy="+ ["{constraint=#{(constraint ? constraint.inspect : '')}", "decoy_on_match=#{@decoy_on_match}", "correct_wins=#{@correct_wins}}"].join(", ")
+  end
+end

data/lib/validator/digestion_based.rb ADDED Viewed

@@ -0,0 +1,69 @@
+require 'validator'
+require 'fasta'
+require 'spec_id/sequest/params'
+# objects of this class can calculate pephit_precision given an array of
+# SpecID::Pep objects using the pephit_precision method.
+class Validator::DigestionBased < Validator
+  DEFAULTS = {
+    :false_to_total_ratio => 1.0,
+    :background => 0.0,
+  }
+  # the number of tps
+  attr_accessor :increment_tps
+  # the number of fps
+  attr_accessor :increment_fps
+  # the total peptides submitted to the validator (regardless of tp, fp, or
+  # nil)
+  attr_accessor :increment_total_submitted
+  # the ratio of false hits to total peptides in the fasta file
+  attr_accessor :false_to_total_ratio
+  # the false_to_total_ratio calculated (but not applied)
+  attr_reader :calculated_background
+  # For a sample with no false hits in it, (under defaults) this is the
+  # fraction of peptides with the constraint over the total number of peptides
+  # from which these hits are derived.
+  attr_accessor :background
+  # expects that classes define a partition method, and a @background
+  def pephit_precision(peps)
+    ## this gives us the fraction that are transmembrane (under defaults):
+    (tps, fps) = partition(peps)
+    (num_tps, num_fps) = calc_precision_prep(tps.size, fps.size)
+    calc_precision(num_tps, num_fps)
+  end
+  # returns [num_tps, num_fps]
+  def calc_precision_prep(num_tps, num_fps)
+    total_peps_passing_partition = num_tps + num_fps
+    num_fps = adjust_fps_for_background(num_tps, num_fps, @background)
+    ## we must use the false_to_total_ratio to estimate how many are really
+    ## incorrect!
+    # FALSE/TOTAL  = FALSE(found)/TOTAL(found)
+    # TOTAL(found) = FALSE(found) * TOTAL/FALSE
+    #              = FALSE(found) / (FALSE/TOTAL)
+    total_false = num_fps / @false_to_total_ratio
+    # NOTE: the partition algorithm drops peptides that are transmembrane
+    # under certain options.  Thus, the total false estimate must be tempered
+    # by this lower number of total peptides.
+    adjusted_tps = total_peps_passing_partition.to_f - total_false
+    [adjusted_tps, total_false]
+  end
+  # returns self
+  # assumes partition returns (tps, fps)
+  def set_false_to_total_ratio(peps)
+    (tps, fps) = partition(peps)
+    @false_to_total_ratio = fps.size.to_f / (tps.size + fps.size)
+    self
+  end
+end

data/lib/validator/probability.rb ADDED Viewed

@@ -0,0 +1,48 @@
+class Validator::Probability
+  attr_accessor :prob_method
+  def initialize(prob_method=:probability)
+    @prob_method = prob_method
+  end
+  # objs should respond_to probability
+  def precision(objs)
+    return 1.0 if objs.size == 0
+    current_sum_one_minus_prob = 0.0
+    # this should work!
+    #objs.inject(0.0) {|sum,obj| sum + (1.0 - obj.probability) }
+    objs.each do |obj|
+      # SUM(1-probX)/#objs
+      current_sum_one_minus_prob += 1.0 - obj.send(@prob_method)
+    end
+    prec = 1.0 - (current_sum_one_minus_prob / objs.size)
+  end
+  # objs should respond_to probability
+  # These should be added from high probability(1.0) to low (0.0)
+  def increment_precision(objs)
+    if objs.is_a?(SpecID::Pep) or objs.is_a?(SpecID::Prot)
+      objs = [objs]
+    end
+    @total_objs ||= 0
+    @current_sum_one_minus_prob ||= 0.0
+    @total_objs += objs.size
+    objs.each do |obj|
+      @current_sum_one_minus_prob += 1.0 - obj.send(@prob_method)
+    end
+    prec = 1.0 - (@current_sum_one_minus_prob / @total_objs)
+  end
+  alias_method :pephit_precision, :precision
+  alias_method :prothit_precision, :precision
+  alias_method :increment_pephits_precision, :increment_precision
+end