RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/lib/spec_id.rb CHANGED Viewed

@@ -7,7 +7,7 @@ require 'sample_enzyme'  # for others
 require 'spec_id/bioworks'
 require 'spec_id/sequest'
 require 'spec_id/proph'
-require 'spec_id/false_positive_rate'
+require 'spec_id/precision'
 class Mass
@@ -112,11 +112,12 @@ class SpecID
     "<#{self.class} #peps=\"#{peps.size}\">"
   end
-  # returns the top peptide hits per dta (first_scan + charge)
+  # returns the top peptide hits per file dta (first_scan + charge)
   # all hits with same score as top score are returned
   # assumes that all fields are strings...
   # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
   # deletes the protein array (but not relevant proteins)
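+  # hashes on [pep.base_name, pep.first_scan.to_i, pep.charge.to_i] (NOTE(review): the active hash_by call keys on [split_sequence(pep.sequence)[1], pep.charge.to_i] instead -- confirm)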
   def top_peps_prefilter!
     peps.each do |pep|
       pep.xcorr = pep.xcorr.to_f
@@ -127,7 +128,8 @@ class SpecID
     end
     # get the top peptide by firstscan/charge (equivalent to .out files)
     top_peps = []
-    self.peps.hash_by {|pep| [pep.first_scan.to_i, pep.charge.to_i]}.map do |k,v|
+    #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
+    self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
       best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
       top_score = best_to_worst.first.xcorr
       best_to_worst.each do |pep|
@@ -158,6 +160,7 @@ class SpecID
         pep_deltacn = pep.deltacn
         pep_charge = pep.charge
         (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
+        #truth = (pep_deltacn >= deltacn) and
         (
          (pep_charge == 1 && pep.xcorr >= x1) or
          (pep_charge == 2 && pep.xcorr >= x2) or
@@ -166,6 +169,8 @@ class SpecID
         ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
       end
+      #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
       hash = peps_passed.hash_by(:prot)
       prots_passed = hash.map do |prot,pep_arr|
@@ -173,14 +178,15 @@ class SpecID
         prot
       end
       [prots_passed, peps_passed]
+      #[prots_passed, peps_passed, deltacnstar_cnt]
     else
       abort "#{kind} not implemented"
     end
   end
   ## basically, this is the command line wrapper
-  def self.false_positive_rate(argv)
-    SpecID::FalsePositiveRate.new.run_cmd_line(argv)
+  def self.precision(argv)
+    SpecID::Precision.new.run_cmd_line(argv)
   end
@@ -266,16 +272,6 @@ class SpecID
     return tp, fp
   end
-  # type_of_analysis can be (:precision|...)
-  def area_under_curve(items, fp_prefix)
-    if items == :prots
-        (tp,fp) = classify_by_prefix(items, fp_prefix)
-      (tp, prec, fpr2) = tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
-      ############################################## HERERERERER!!!!
-    end
-  end
   # returns a proc for getting all probabilities so that an ascending sort
   # will put the best scores first
@@ -299,22 +295,43 @@ class SpecID
     end
   end
+  # sorts the probabilities and then
+  # calcs predicted number hits and precision for protein probabilities
+  # (summing probabilities)
+  # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
+  # called false positive rate
+  # SUM(1-probX)/#prots
+  def num_hits_and_ppv_for_protein_prophet_probabilities
+    current_sum_one_minus_prob = 0.0
+    num_prots = []
+    ppv = []
+    prot_cnt = 0
+    probs = prots.map {|v| v.probability}
+    sorted = probs.sort.reverse
+    sorted.each do |prob|
+      prot_cnt += 1
+      num_prots << prot_cnt
+      current_sum_one_minus_prob += 1.0 - prob
+      ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
+      # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
+    end
+    [num_prots, ppv]
+  end
   # convenience method for the common task of determining precision for
   # proteins (with decoy proteins found by prefix)
-  # returns (tps1, precs, fprs)
-  def tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
+  # returns (num_hits, precision)
+  def num_hits_and_ppv_for_prob(fp_prefix)
     regex = /^#{Regexp.escape(fp_prefix)}/
     prob_proc = probability_proc
     myproc = proc { |prt|
       if prt.reference =~ regex ; false
       else ; true end
     }
-    tp, fp = rank_and_classify(:prots, prob_proc, myproc)
-    tps1, precs = by_tps(:precision, tp, fp)
-    tps2, fprs = by_tps(:fpr2_times2, tp, fp)
-    if tps1 != tps2 ; puts "true positives not the same for precision and fpr2_times2. Exiting"
-    end
-    [tps1, precs, fprs]
+    real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
+    (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
+    [num_hits, precision]
   end
   def method_missing(symbol, *args)
@@ -389,11 +406,17 @@ class SpecID
     sorted_probabilities(peps)
   end
+  ##########################################################################
+  # WARNING! These might be dangerous to your health if there are multiple
+  # files collected in your bioworks file
+  ##########################################################################
   # (prob_list_by_min, prob_list_by_best10)
   # returns 2 sorted lists of probabilities based on:
   #   1. best peptide hit
   #   2. top 10 peptide hits
   # on a per scan basis
+  # NOTE: you may want to hash on base_name first!
   def pep_probs_by_scan
     hash = peps.hash_by(:first_scan, :last_scan)
     return min_and_best10(hash)
@@ -402,6 +425,7 @@ class SpecID
   #(prob_list_by_min, prob_list_by_best10)
   # same as pep_probs_by_scan but per charge state
+  # NOTE: you may want to hash on base_name first!
   def pep_probs_by_scan_charge
     hash = peps.hash_by(:first_scan, :last_scan, :charge)
     return min_and_best10(hash)
@@ -410,6 +434,7 @@ class SpecID
   # (prob_list_by_min)
   # hashes on seq-charge and returns the sorted list of probabilities of top
   # hit per seq-charge
+  # NOTE: you may want to hash on base_name first!
   def pep_probs_by_seq_charge
     hash = peps.hash_by(:sequence, :charge)
     min_peptides = hash.collect do |k,v|
@@ -418,6 +443,42 @@ class SpecID
     sorted_probabilities(min_peptides)
   end
+  ##########################################################################
+  # USE these if you have multiple files in your bioworks.xml file
+  ##########################################################################
+  # (prob_list_by_min, prob_list_by_best10)
+  # returns 2 sorted lists of probabilities based on:
+  #   1. best peptide hit
+  #   2. top 10 peptide hits
+  # on a per scan basis
+  # NOTE: unlike pep_probs_by_scan, this already hashes on base_name first
+  def pep_probs_by_bn_scan
+    hash = peps.hash_by(:base_name, :first_scan, :last_scan)
+    return min_and_best10(hash)
+  end
+  #(prob_list_by_min, prob_list_by_best10)
+  # same as pep_probs_by_bn_scan but per charge state
+  # NOTE: unlike pep_probs_by_scan_charge, this already hashes on base_name first
+  def pep_probs_by_bn_scan_charge
+    hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
+    return min_and_best10(hash)
+  end
+  # (prob_list_by_min)
+  # hashes on base_name-seq-charge and returns the sorted list of probabilities of top
+  # hit per base_name-seq-charge
+  # NOTE: unlike pep_probs_by_seq_charge, this already hashes on base_name first
+  def pep_probs_by_bn_seq_charge
+    hash = peps.hash_by(:base_name, :sequence, :charge)
+    min_peptides = hash.collect do |k,v|
+      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
+    end
+    sorted_probabilities(min_peptides)
+  end
   # A Generic spectraID protein
   class Prot
     # probability is always a float!
@@ -458,6 +519,23 @@ end
 # concatenation into a file
 module SpecIDXML
+  Special_chrs_hash = {
+    '"' => '&quot;',
+    '&' => '&amp;',
+    "'" => '&apos;',
+    '<' => '&lt;',
+    '>' => '&gt;',
+  }
+  # substitutes special xml chars
+  def escape_special_chars(string)
+    string.split('').map do |char|
+      if Special_chrs_hash.key? char ; Special_chrs_hash[char]
+      # if x = Special_chrs_hash[char] ; x  # <-- that's slightly slower
+      else ; char end
+    end.join
+  end
   $DEPTH = 0
   def tabs
@@ -486,6 +564,12 @@ module SpecIDXML
     "#{tabs}<#{element} #{att_string}/>\n"
   end
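+  # builds a self-closing element named by the element_name argument (NOTE(review): '@xml_element_name' is not read here -- confirm the old requirement is obsolete)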
+  # displays all *instance_variables* (does not call methods!)
+  def short_element_xml_from_instance_vars(element_name)
+    string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
+    "#{tabs}<#{element_name} #{string}/>\n"
+  end
   # takes an element as a symbol and returns the
   def element_xml_no_atts(element)

data/release_notes.txt ADDED Viewed

@@ -0,0 +1,11 @@
+Note: two potentially significant bugs in the software have been corrected (see the
+changelog).  I haven't finished modifying the tests to reflect these changes,
+but I wanted to get the faulty software off the top of the stack.  A new
+release will shortly follow that passes all tests.  Use this release only as a
+correction to the previous.
+tests currently failing:
+gi
+spec_id
+id_precision

data/script/estimate_fpr_by_cysteine.rb ADDED Viewed

@@ -0,0 +1,226 @@
+#!/usr/bin/ruby -w
+## The yeast Scal db mean background is: 0.00984
+## The yeast Cysteine background freq is: 0.0131986582396467
+pep_seq_re = /<search_hit .* peptide="(\w+)"/o
+pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
+if ARGV.size != 3
+  puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
+  puts "  outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
+  abort
+end
+def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+  File.open(base_toplot, "w") do |fh|
+    fh.puts 'XYData'
+    fh.puts base
+    fh.puts title
+    fh.puts xaxis
+    fh.puts yaxis
+    cats.each do |ar|
+      fh.puts ar.join(" & ")
+      ar.each do |a|
+        fh.puts hash[a].join(" ")
+      end
+    end
+  end
+end
+  ############################################################################
+#### DO NOT MODIFY THIS GUY!  HE IS TAKEN FROM bin/filter_spec_id.rb
+#### CHANGE HIM THERE (eventually we need to put him in a lib file)
+# (actual # with cys, expected # with cys, total#peptides,
+# mean_fraction_of_cysteines_true, std)
+# PepHit(C) = Peptide containing cysteine
+#   # Total PepHit(C)                   # Observed Bad Pep (C)
+#   ------------------ proportional_to  ----------------------
+#   # Total PepHit                      # Total Bad PepHit (X)
+def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
+  # the number of bona fide BAD cysteine hits
+  # (some of the cysteine hits (~5%) are true positives)
+  ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
+  if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
+  total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
+  fpr = total_number_false / total_peptides
+  [fpr, total_number_false]
+end
+############################################################################
+(cysteine_background_freq, background_freq, file) = ARGV
+cysteine_background_freq = cysteine_background_freq.to_f
+background_freq = background_freq.to_f
+seq_probs = []
+last_seq_prob = nil
+File.open(file) do |fh|
+  fh.each do |line|
+    if line =~ pep_seq_re
+      ar = Array.new(2)
+      ar[0] = $1
+      seq_probs << ar
+      last_seq_prob = ar
+    elsif line =~ pep_prob_re
+      last_seq_prob[1] = $1.to_f
+    end
+  end
+end
+#seq_probs.each do |seq|
+#  if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
+#    abort "BAD PARSING!!"
+#  end
+#end
+amino_acid_as_st = 'C'
+sorted = seq_probs.sort_by {|v| v[1] }.reverse
+## traverse the peptides
+actual_cys_containing_peps = 0
+expected_cys_containing_peps = 0.0
+current_sum_one_minus_prob = 0.0
+prob_estimated_fpr = 0.0
+pep_cnt = 0
+one_minus_freq = 1.0 - cysteine_background_freq
+## tabulate:
+pep_cnts = []
+probs = []
+prob_fprs = []
+prob_tps = []
+cys_fprs = []
+cys_tps = []
+fpr_diff = []
+sorted.each do |ar|
+  pep_cnt += 1
+  pep = ar[0]
+  prob = ar[1]
+  ## Cysteine FPR: ##
+  # Expected:
+  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
+  # Actual:
+  if pep.include?(amino_acid_as_st)
+    actual_cys_containing_peps += 1
+  end
+  (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
+  cys_tp = pep_cnt.to_f - total_num_false_by_cys
+  ## FPR by prob: ##
+  # SUM(1-probX)/#peps
+  current_sum_one_minus_prob += 1.0 - prob
+  prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
+  prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
+  ## GRAB or report the data:
+  pep_cnts << pep_cnt
+  probs << prob
+  prob_fprs << prob_estimated_fpr
+  prob_tps << prob_tp
+  cys_fprs << cys_fpr
+  cys_tps << cys_tp
+  fpr_diff << prob_estimated_fpr - cys_fpr
+  #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
+end
+hash = {
+  'pep_cnts' => pep_cnts,
+  'probs' => probs,
+  'prob_fprs' => prob_fprs,
+  'prob_tps' => prob_tps,
+  'cys_fprs' => cys_fprs,
+  'cys_tps' => cys_tps,
+  'fpr_diff' => fpr_diff,
+}
+real_base = file.sub(/\.xml/,'')
+## TPS vs FPR
+base = real_base.dup
+base << "." << "tps_vs_fpr"
+base_toplot = base + '.to_plot'
+title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
+xaxis = "TPs"
+yaxis = "FPR"
+cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+## PEPHITS vs FPR
+base = real_base.dup
+base << "." << "num_pep_hits_vs_fpr"
+base_toplot = base + '.to_plot'
+title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
+xaxis = "num peptide hits"
+yaxis = "FPR"
+cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+## PEPHITS VS FPR DIFF
+base = real_base.dup
+base << "." << "num_pep_hits_vs_fpr_diff"
+base_toplot = base + '.to_plot'
+title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
+xaxis = "num peptide hits"
+yaxis = "FPR diff (prob - cysteine)"
+cats = [['pep_cnts', 'fpr_diff']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+## PROB VS FPR DIFF
+base = real_base.dup
+base << "." << "prob_vs_fpr_diff"
+base_toplot = base + '.to_plot'
+title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
+xaxis = "peptide probability"
+yaxis = "FPR diff (prob - cysteine)"
+cats = [['probs', 'fpr_diff']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+=begin
+returns [number_of_prots, actual_fpr]
+def num_prots_above_fpr(prots, desired_fpr)
+  current_fpr_rate_percent = 0.0
+  previous_fpr_rate_percent = 0.0
+  current_sum_one_minus_prob = 0.0
+  proteins_within_fpr = 0
+  actual_fpr = nil
+  already_found = false
+  prot_cnt = 0
+  prots.each do |prot|
+    prot_cnt += 1
+    # SUM(1-probX)/#prots
+    current_sum_one_minus_prob += 1.0 - prot._probability.to_f
+    current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
+    if current_fpr_rate_percent > desired_fpr && !already_found
+      actual_fpr = previous_fpr_rate_percent
+      proteins_within_fpr = prot_cnt
+      already_found = true
+    end
+    previous_fpr_rate_percent = current_fpr_rate_percent
+  end
+  [proteins_within_fpr, actual_fpr]
+end
+=end

data/script/filter-peps.rb CHANGED Viewed

@@ -80,13 +80,13 @@ def number_passing(peps)
   np = {}
   np["PepProts"] = filter(peps).size
-  by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge).values
+  by_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge).values
   analyze(by_scan_charge, "ScanCharge", np)
-  by_scan = peps.hash_by(:first_scan, :last_scan).values
+  by_scan = peps.hash_by(:base_name, :first_scan, :last_scan).values
   analyze(by_scan, "Scan", np)
-  by_seq_charge = peps.hash_by(:sequence, :charge).values
+  by_seq_charge = peps.hash_by(:base_name, :sequence, :charge).values
   analyze(by_seq_charge, "SeqCharge", np)
   np

data/script/find_cysteine_background.rb ADDED Viewed

@@ -0,0 +1,137 @@
+#!/usr/bin/ruby -w
+require 'vec'
+# For the SCer yeast db and orbi mudpit7, the mean_actual_vs_expected fraction
+# is 0.0101409563168847
+# <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
+def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+  File.open(base_toplot, "w") do |fh|
+    fh.puts 'XYData'
+    fh.puts base
+    fh.puts title
+    fh.puts xaxis
+    fh.puts yaxis
+    cats.each do |ar|
+      fh.puts ar.join(" & ")
+      ar.each do |a|
+        fh.puts hash[a].join(" ")
+      end
+    end
+  end
+  system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
+end
+peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
+unless ARGV.size == 2
+  abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
+end
+(cysteine_background_freq, file) = ARGV
+# each pep = [nsp_prob, init_prob, SEQUENCE]
+peps = []
+File.open(file) do |fh|
+  fh.each do |line|
+    if line =~ peptide_re
+      peps << [$3.to_f,$2.to_f,$1]
+    end
+  end
+end
+amino_acid_as_st = 'C'
+one_minus_freq = 1.0 - cysteine_background_freq.to_f
+actual_cys_containing_peps = 0
+expected_cys_containing_peps = 0.0
+current_sum_one_minus_prob = 0.0
+prob_estimated_fpr = 0.0
+pep_cnt = 0
+the_probs = []
+the_fractions = []
+special_probs = []
+#peps.sort.reverse.each do |ar|
+#peps.sort.each do |ar|
+peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
+  (nsp_prob, init_prob, pep) = ar
+  ## Cysteine FPR: ##
+  # Expected:
+  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
+  # Actual:
+  if pep.include?(amino_acid_as_st)
+    actual_cys_containing_peps += 1
+  end
+  fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
+  special_prob = (3.0 * nsp_prob) + init_prob
+  ## Get the final fraction
+  #if special_prob < 4.0
+  #  #puts the_fractions.join(" ")
+  #  puts the_fractions.last
+  #  abort
+  #end
+  # gather data to plot
+  the_probs << nsp_prob
+  special_probs << special_prob
+  the_fractions << fraction_ac_exp
+end
+hash = {
+  'probs' => the_probs,
+  'fractions' => the_fractions,
+  'special_probs' => special_probs,
+}
+real_base = file.sub(/\.xml/,'')
+=begin
+## FLIPPED PROB VS ACTUAL/EXPECTED FRACTION
+base = real_base.dup
+base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
+base_toplot = base + '.to_plot'
+title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
+xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
+yaxis = "fraction with cysteines (actual/expected)"
+cats = [['probs', 'fractions']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+=end
+=begin
+## PROB VS ACTUAL/EXPECTED FRACTION
+base = real_base.dup
+base << "." << "prob_vs_actual_expected_fraction"
+base_toplot = base + '.to_plot'
+title = "peptide prob vs fraction with cysteines (actual/expected)"
+xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
+yaxis = "fraction with cysteines (actual/expected)"
+cats = [['probs', 'fractions']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
+=end
+## SPECIAL PROB VS ACTUAL/EXPECTED FRACTION
+base = real_base.dup
+base << "." << "special_prob_vs_actual_expected_fraction"
+base_toplot = base + '.to_plot'
+title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
+xaxis = "(3 * nsp_prob) + init_prob"
+yaxis = "fraction with cysteines (actual/expected)"
+cats = [['special_probs', 'fractions']]
+plot(base_toplot, base, title, xaxis, yaxis, hash, cats)

data/script/gen_database_searching.rb CHANGED Viewed

@@ -109,10 +109,12 @@ def run_sequest ; "Run Sequest with a Normal and an Inverse Database
 If you don't already have one, here's how to make an inverse database:
-    fasta_mod.rb invert <yourfile.fasta>
+    fasta_shaker.rb reverse <yourfile.fasta>
-This will create a file with the trailing tag '_INV.fasta'.  Just type
-`fasta_mod.rb` for more details.
+This will create a file with the trailing tag '_reverse.fasta'.  Just type
+`fasta_shaker.rb` for more details.
+Run sequest with 'report duplicate references' set to >= 40
 "
 end
@@ -166,11 +168,13 @@ def run_sequest ; "Run Sequest with a Concatenated Inverse Database
 If you don't already have one, here's how to make one:
-    fasta_cat_mod.rb invert <yourfile.fasta>
+    fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
+This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'.  Each
+inverted protein name will be prefixed with 'INV_'.  Just type
+`fasta_shaker.rb` for more details.
-This will create a file with the trailing tag '_CAT_INV.fasta'.  Each inverted
-protein name will be prefixed with 'INV_'.  Just type `fasta_cat_mod.rb` for
-more details.
+Run sequest with 'report duplicate references' set to >= 40
 "
 end