RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/validator/prot_from_pep.rb ADDED Viewed

@@ -0,0 +1,234 @@
+require 'validator'
+require 'set'
+require 'group_by'
+require 'shuffle'
+# calculates protein hit precision based on peptide precision
+class Validator::ProtFromPep < Validator
+  # calculate protein precision based on the number of false peptides
+  # returns the precision based on the number of proteins *completely false*
+  # calculates the worst precision by assuming that proteins with the fewest
+  # peptides are all false (before prots with more pephits)
+  # note that this approaches the worst, but is not guaranteed to be worst
+  # unless each pephit maps to a single protein hit.
+  # [worst, normal_mean, normal_stddev]
+  # options
+  #    :num_its_normal => Integer, # num iterations for normal (d: 10)
+  #    :num_its_worstcase => Integer, # num iterations for worstcase (d: 10)
+  #
+  def prothit_precision(peps, num_false_pephits, opts={})
+    opts[:num_its_normal] ||= 10
+    opts[:num_its_worstcase] ||= 10
+    # get the num_peps_per_protein array
+    worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => opts[:num_its_worstcase])
+    (normal_mean, normal_stdev) = normal_prothit_precision( peps, num_false_pephits, :num_its => opts[:num_its_normal])
+    [worst, normal_mean, normal_stdev]
+  end
+  # returns an array of the number of peptide hits in each protein
+  def num_peps_per_protein(peps)
+    num_pephits_by_prot = Hash.new { 0 }
+    peps.each do |pep|
+      pep.prots.each do |prot|
+        num_pephits_by_prot[prot.reference] += 1
+      end
+    end
+    num_pephits_by_prot.values
+  end
+  # returns the worstcase precision.  This assumes that every small protein
+  # with the fewest peptide hits is completely 'filled' with incorrect hits in
+  # preference to any higher hit protein.
+  # Where each peptide hit maps to a single protein, this is guaranteed to be
+  # worst-case.  If this doesn't hold, there are some extreme cases where a
+  # poorer precision could be generated, but this is still probably fairly
+  # close.  Thus, a slightly different answer may be generated each time.
+  # ...variation is produced by shuffling the order of the proteins from which
+  # peptides are removed within groups of proteins having the same number of
+  # peptides.
+  # This method does NOT require that the prothits be updated to reflect only
+  # those pephits being passed in.
+  #
+  #   validator.worstcase_prothit_precision(peps, 14, 1) # => 0.232111
+  #
+  # options:
+  #   :num_its => Integer (default: 10) number of times to run (finds minimum)
+  #   :one_prot_per_pep => true | *false   assumes each peptide maps to a
+  #                                        single protein
+  def worstcase_prothit_precision(peps, num_false_pephits, opts = {})
+    num_its = opts[:num_its] || 10
+    one_prot_per_pep = opts[:one_prot_per_pep]  # nil or false still == false
+    one_prot_per_pep = false if one_prot_per_pep == nil
+    ##############################################
+    # The END Cases (can be dealt with quickly)
+    ##############################################
+    if num_false_pephits == 0
+      return 1.0
+    elsif num_false_pephits >=  peps.size
+      return 0.0
+    end
+    if one_prot_per_pep
+      num_peps_per_prot = num_peps_per_protein(peps)
+      return worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
+    else
+      #####################################
+      # HERE's the basic plan!!
+      #####################################
+      # order the proteins by num peptides
+      # create a set of peptides
+      # delete peptides from the proteins off the set o' peptides (ensuring that
+      # a deleted one cannot be deleted twice)
+      #####################################
+      # order the proteins by num peptides
+      # and create a hash that holds the peptides (given here) in those proteins
+      prots_to_peps_here = Hash.new {|h,k| h[k] = [] }
+      prots_to_peps_size = Hash.new { 0 }
+      pep_ids = []
+      pep_ids_to_prot_ids = Hash.new {|h,k| h[k] = [] }
+      peps.each do |pep|
+        #puts pep.prots.size
+        pep.prots.each do |prot|
+          #p prot.reference
+          prots_to_peps_here[prot] << pep
+          prots_to_peps_size[prot] += 1
+          pep_ids << pep
+          pep_ids_to_prot_ids[pep] << prot
+        end
+      end
+      prot_ids_listed_by_peps_size = prots_to_peps_size.keys
+      tot_num_prots = prot_ids_listed_by_peps_size.size
+      sample = Array.new(num_its)
+      srand( 777 )
+      precision_sample = (0...num_its).to_a.map do
+        num_false_pephits_counter = num_false_pephits
+        # create a set of peptides
+        pep_ids_set = pep_ids.to_set
+        # shuffle the proteins within size groups
+        finished = false
+        prot_ids_listed_by_peps_size.group_by {|prot_id| prots_to_peps_size[prot_id] }.sort.each do |k,group_of_proteins_with_same_pep_size|
+          group_of_proteins_with_same_pep_size.shuffle!
+          group_of_proteins_with_same_pep_size.each do |prot_id|
+            prots_to_peps_here[prot_id].each do |pep_id|
+              if pep_ids_set.include?(pep_id)  # if 1
+                # remove a peptide
+                pep_ids_set.delete(pep_id)
+                num_false_pephits_counter -= 1
+                if num_false_pephits_counter == 0  # if 2
+                  finished = true
+                end                                # close if 2
+              end                                  # close if 1
+              break if finished  # each pep
+            end
+            break if finished  # each prot
+          end
+          break if finished  # each group_of_proteins_with_same_pep_size
+        end # each group_of_proteins_with_same_pep_size
+        ## Figure out the number of proteins left!
+        proteins_still_around = pep_ids_set.inject(Set.new) {|protset,pep_id| protset.merge( pep_ids_to_prot_ids[pep_id]) }
+        proteins_still_around.size.to_f / tot_num_prots
+      end # a sample
+      return precision_sample.min
+    end # FINAL else
+  end
+  # returns the precision of the worst possible outcome
+  def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
+    completely_false_proteins = 0
+    num_peps_per_prot.sort.each do |num_peps|
+      num_false_pephits -= num_peps
+      if num_false_pephits >= 0
+        completely_false_proteins += 1
+      end
+      if num_false_pephits <= 0
+        break
+      end
+    end
+    num_prots = num_peps_per_prot.size
+    (num_prots - completely_false_proteins).to_f/num_prots
+  end
+  # normal as in a standard normal distribution of peptide hits per protein
+  # they are distributed randomly and the precision is assumed to take on a
+  # standard normal distribution.
+  # num_peps_per_protein is an array of the number of peptides per protein hit
+  # (these are the true hits)
+  # assumes that the number follows a gaussian distribution (binomial
+  # distributions tend toward gaussians, I believe, at large N)
+  # returns [mean_precision, stdev_precision]
+  # options:
+  #   :num_its => Integer (default: 10)
+  #
+  # if num_iterations is set at 1, then only the precision will be returned
+  # though random, the same seed is always used to start this process, meaning
+  # that the same results will be produced on consecutive attempts.
+  #
+  #   validator.normal_prothit_precision(peps, 13, :num_its => 1) # -> 0.95433
+  #   validator.normal_prothit_precision(peps, 13, :num_its => 2) # -> [0.92002, 1.2223]
+  def normal_prothit_precision( peps, num_false_pephits, opts={})
+    num_iterations = opts[:num_its] || 10
+    srand( 38272 )
+    ##############################################
+    # The END Cases (can be dealt with quickly)
+    ##############################################
+    if num_false_pephits == 0
+      if num_iterations == 1
+        return 1.0
+      else
+        return [1.0, 0.0]
+      end
+    elsif num_false_pephits >=  peps.size
+      if num_iterations == 1
+        return 0.0
+      else
+        return [0.0, 0.0]
+      end
+    end
+    ##############################################
+    # Everything else:
+    ##############################################
+    sample = Array.new(num_iterations)
+    base_indices = (0...(peps.size)).to_a
+    ### ACUTALLY, I THINK WE WANT TO CREATE AND MERGE!!!!
+    # This would mean that only a single hit would validate the protein
+    # if we are subtracting, then we lose the protein on a single peptide!!!!
+    prot_id_set = peps.inject(Set.new) do |prtset, pep|
+      prtset.merge( pep.prots.map {|prot| prot } )
+    end
+    tot_num_prots = prot_id_set.size
+    # could also merge off the good indices
+    # TODO: we should optimize based on how many false pephits given...
+    precision_sample = (0...num_iterations).to_a.map do
+      shuffled_indices = base_indices.map
+      shuffled_indices.shuffle!
+      good_indices = shuffled_indices[num_false_pephits..-1]
+      still_remaining = Set.new
+      peps.values_at(*good_indices).each do |pep|
+        still_remaining.merge(pep.prots.map {|prot| prot })
+      end
+      still_remaining.size.to_f / tot_num_prots
+    end
+    if num_iterations == 1
+      precision_sample.shift
+    else
+      #puts "PRECISION GROUP: "
+      #p precision_sample
+      sample_stats(precision_sample)
+    end
+  end
+end

data/lib/validator/transmem.rb ADDED Viewed

@@ -0,0 +1,272 @@
+require 'validator'
+require 'validator/digestion_based'
+require 'transmem'
+require 'fasta'
+require 'spec_id/digestor'
+require 'spec_id/sequest/params'
+require 'spec_id/sequest/pepxml'
+module Validator::Transmem ; end
+# objects of this class can calculate pephit_precision given an array of
+# SpecID::Pep objects using the pephit_precision method.
+class Validator::Transmem::Protein < Validator::DigestionBased
+  include Precision::Calculator
+  # a hash keyed by index reference which is true if >= min_num_tms
+  attr_accessor :transmem_by_ti_key
+  attr_accessor :transmem_index
+  # min_num_tms: Integer (1...), the min # certain transmembrane segments to
+  # consider the protein a transmembrane protein
+  attr_reader :min_num_tms
+  # soluble_fraction: *true/false
+  attr_accessor :soluble_fraction
+  # correct_wins: *true/false,
+  #   if the peptide is found in some proteins that are transmembrane and some
+  #   that are not, then if soluble_fraction==true, this peptide will be
+  #   considered non-transmembrane.  If soluble_fraction==false, then this
+  #   will be considered transmembrane.
+  attr_accessor :correct_wins
+  # no_include_tm_peps: false or Float (0.0-1.0), peptides that have a
+  #   fraction of amino acids that fall inside transmembrane sequences greater
+  #   than or equal to the value of the argument will not be considered in the final
+  #   calculation of peptide hit precision.  (A transmembrane segment is
+  #   likely to have very different properties than the rest of the peptides,
+  #   so the assumption of equally flyable peptides is broken unless these are
+  #   removed)  nil or false will skip this filter.  A reasonable value is
+  #   probably 0.7.
+  attr_accessor :no_include_tm_peps
+  # if nil, then this will be calculated whe pephit_precision is called.
+  attr_accessor :transmem_status_hash
+  # the file used (toppred or phobius file)
+  attr_accessor :transmem_file
+  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :min_num_tms => 1, :soluble_fraction  =>  true, :correct_wins  =>  true, :no_include_tm_peps => false, :transmem_status_hash => nil} )
+  # expects a toppred.out file (see transmem/toppred)
+  # other types of transmembrane predictions)
+  # fasta_obj is a Fasta object.
+  # sequest_params_obj is a Sequest::Params object.
+  # OPTIONS:
+  #   (see Validator::Transmem::Protein::DEFAULTS for defaults)
+  #
+  #   no_include_tm_peps: *false
+  #
+  # NOTE: if fasta_obj and sequest_params_obj are not passed in then
+  #   'false_to_total_ratio' must be set later.
+  def initialize(a_transmem_file, options={})
+    @transmem_file = a_transmem_file
+    opts = self.class::DEFAULTS.merge(options)
+    (@min_num_tms, @soluble_fraction, @correct_wins, @no_include_tm_peps, @background, @transmem_status_hash, @false_to_total_ratio, fasta) = opts.values_at(:min_num_tms, :soluble_fraction, :correct_wins, :no_include_tm_peps, :background, :transmem_status_hash, :false_to_total_ratio, :fasta)
+    # fasta object is used to update hte phobius index if given
+    # a hash by reference => true/false (depending on min_num_tms)
+    @transmem_index = TransmemIndex.new(@transmem_file, fasta)
+    @transmem_by_ti_key = create_transmem_by_ti_key_hash(@transmem_index, @min_num_tms)
+  end
+  # Designates each protein as transmembrane or not depending on :min_num_tms
+  # The hash is keyed by the TransmemIndex key.
+  def create_transmem_by_ti_key_hash(transmem_index, min_num_tms)
+    _transmem_by_ti_key = {}
+    num_certain_hash = transmem_index.num_certain_index
+    num_certain_hash.each do |id, num_certain|
+      if num_certain >= min_num_tms
+        _transmem_by_ti_key[id] = true
+      else
+        _transmem_by_ti_key[id] = false
+      end
+    end
+    _transmem_by_ti_key
+  end
+  # returns a hash where each protein (and peptide if given peps) is indexed
+  # with itself with true/false/nil depending on transmembrane status.  If
+  # given peptides, and :no_include_tm_peps is not false, will also set the
+  # attribute for peptides.
+  # the attribute (:no_include_tm_peps)
+  # NOTE: if given a list of peptides, this implementation will not overwrite a
+  # protein if it already has a true/false for transmem.  This is so that a
+  # lookup does not have to be performed if the value is already defined as
+  # the assumption is that many peptides will point to the same protein.
+  def create_transmem_status_hash(peps)
+    thash = {}
+    peps.each do |pep|
+      pep.prots.each do |prot|
+        if !thash.key?(prot)
+          #prot.transmem == nil
+          thash[prot] = @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
+        end
+      end
+      if @no_include_tm_peps
+        thash[pep] = pep_is_transmem?(pep)
+      end
+    end
+    thash
+  end
+  # sets the false_to_total_ratio and returns self for chaining.
+  # peps will usually be the peptides created by calling:
+  #     peps = Digestor.digest( fasta_obj, sequest_params_obj )
+  def set_false_to_total_ratio(peps)
+    tm_hash = create_transmem_status_hash(peps)
+    (tps, fps) = partition(peps, tm_hash)
+    @false_to_total_ratio = fps.size.to_f / (tps.size + fps.size)
+    self
+  end
+  def pephit_precision(peps)
+    if !@transmem_status_hash
+      @transmem_status_hash = create_transmem_status_hash(peps)
+    end
+    super(peps)
+  end
+  # regardless of transmembrane status of proteins peptide belongs to, asks
+  # what the avg overlap is with transmembrane sequences.
+  def pep_is_transmem?(pep)
+    prts = pep.prots
+    prts_w_keys = 0
+    sum_of_fractions = 0.0
+    prts.each do |prot|
+      key = @transmem_index.reference_to_key(prot.reference)
+      ans = @transmem_index.avg_overlap(key, pep.aaseq, :fraction)
+      if ans
+        sum_of_fractions += ans
+        prts_w_keys += 1
+      end
+    end
+    if prts_w_keys > 0
+      avg_of_fractions = sum_of_fractions / prts_w_keys
+      avg_of_fractions >= @no_include_tm_peps
+    else
+      nil
+    end
+  end
+  # each peptide must have prots and the prots must respond true/false to
+  # the 'transmem' method
+  # if given a hash, it will override the @transmem_status_hash
+  def partition(peps, transmem_status_hash=nil)
+    # The fast way to do this is to play with the logic
+    # For the insoluble fraction we calculate as if incorrect wins
+    # and swap the tp's and fp's (I've verified that this is correct
+    # empirically)
+    # the code could be cleaner here, but efforts to minimize calls in the
+    # inner loops create this structure...
+    tm_hash = transmem_status_hash || @transmem_status_hash
+    my_peps =
+      if @no_include_tm_peps
+        # remove all thos peps with fractional overlap >= @no_include
+        # [1,2,3,4].reject {|n| n >= 3}  #-> [1, 2]
+        # remove pep.transmem == true and pep.transmem == nil
+        if tm_hash
+          peps.reject do |pep|
+            tm_hash[pep] != false
+          end
+        else
+          peps.reject do |pep|
+            pep_is_transmem?(pep) != false
+          end
+        end
+      else
+        peps
+      end
+    cw = @correct_wins
+    sf = @soluble_fraction
+    if !sf
+      cw = !cw
+    end
+    tp = []
+    fp = []
+    if cw
+      my_peps.each do |pep|
+        one_prot_is_not_transmem = false
+        not_all_nil = false
+        if tm_hash
+          pep.prots.each do |prot|
+            tm_status = tm_hash[prot]
+            if tm_status == false
+              one_prot_is_not_transmem = true
+              break
+            elsif tm_status == true
+              not_all_nil = true
+            end
+          end
+        else
+          pep.prots.each do |prot|
+            tm_status = @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
+            if tm_status == false
+              one_prot_is_not_transmem = true
+              break
+            elsif tm_status == true
+              not_all_nil = true
+            end
+          end
+        end
+        if one_prot_is_not_transmem
+          tp << pep
+        else
+          if not_all_nil
+            fp << pep
+          end
+        end
+      end
+    else
+      my_peps.each do |pep|
+        one_prot_is_transmem = false
+        not_all_nil = false
+        if tm_hash
+          pep.prots.each do |prot|
+            tm_status = tm_hash[prot]
+            if tm_status == true
+              one_prot_is_transmem = true
+              break
+            elsif tm_status == false
+              not_all_nil = true
+            end
+          end
+        else
+          pep.prots.each do |prot|
+            tm_status = @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
+            if tm_status == true
+              one_prot_is_transmem = true
+              break
+            elsif tm_status == false
+              not_all_nil = true
+            end
+          end
+        end
+        if one_prot_is_transmem
+          fp << pep
+        else
+          if not_all_nil
+            tp << pep
+          end
+        end
+      end
+    end
+    if !sf # swap
+      fp,tp = tp,fp
+      cw = !cw
+    end
+    #puts "PARTITION ARRAY"
+    #p [tp, fp].map{|v| v.size}
+    [tp, fp]
+  end
+end

data/lib/validator/true_pos.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'validator'
+class Validator::TruePos < Validator
+  include Precision::Calculator
+  attr_reader :fasta
+  attr_accessor :correct_wins
+  # correct_wins means that only a single protein from a pep.aaseq must match
+  # the fasta object for the pep hit to be considered valid.  Otherwise, all
+  # must be a match
+  def initialize(fasta_obj, correct_wins = true)
+    @fasta = fasta_obj
+    @fasta_headers = @fasta.prots.map {|prot| prot.header }
+    @correct_wins = correct_wins
+  end
+  def partition(peps)
+    if @correct_wins
+      peps.partition do |pep|
+        @fasta_headers.any? do |header|
+          pep.prots.any? do |pepprot|
+            header.include? pepprot.reference
+          end
+        end
+      end
+    else
+      peps.partition do |pep|
+        pep.prots.all? do |pepprot|
+          @fasta_headers.any? do |header|
+            header.include? pepprot.reference
+          end
+        end
+      end
+    end
+  end
+  def pephit_precision(peps)
+    (tp, fp) = partition(peps)
+    calc_precision(tp.size, fp.size)
+  end
+  def to_param_string
+    "true_positives(tps)=" +  ["{fasta=#{@fasta.filename}", "correct_wins=#{@correct_wins}}"].join(", ")
+  end
+end