RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/bin/gi2annot.rb CHANGED Viewed

@@ -1,13 +1,6 @@
 #!/usr/bin/ruby -w
-require 'open-uri'
-require 'rexml/document'
-require 'rexml/streamlistener'
-BATCH_SIZE = 500
-$LOG = nil
-$ANNOTS = []
+require 'gi'
 if ARGV.size < 1
   puts "usage: #{File.basename(__FILE__)} <gi> ..."
@@ -15,108 +8,7 @@ if ARGV.size < 1
 end
-# db=
-# retstart=
-# retmax=
-class Listener
-  include REXML
-  include StreamListener
-  def initialize
-    @get_title = false
-  end
-  def tag_start(name, attributes)
-    #puts "NAME" + name
-    #p attributes
-    if name == "Item" && attributes["Name"] == "Title"
-      @get_title = true
-    end
-  end
-  def text(text)
-    #puts "TEXT: " + text + @get_title.to_s
-    if @get_title
-      #puts "GETTING TITLE!"
-      $ANNOTS.push text.chomp
-      @get_title = false
-    end
-  end
-end
-# Returns a list of Annotation strings
-def parse_etool_output(handle)
-  listener = Listener.new
-  parser = REXML::Parsers::StreamParser.new(handle, listener)
-  parser.parse
-  $ANNOTS
-end
-#$LOG = File.open("log.log", "w")
 gis = ARGV.to_a.dup
-while(true) do
-  batch = gis.slice!(0..BATCH_SIZE)
-  if batch.size == 0 then break end
-  string = batch.join(",")
-  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
-  #puts url
-  annots = []
-  open(url) do |handle|
-    annots = parse_etool_output(handle)
-  end
-  puts annots.join("\n")
-end
-#$LOG.close
-=begin
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
-<eSummaryResult>
-<DocSum>
-<Id>24115498</Id>
-<Item Name="Caption" Type="String">NP_710008</Item>
-<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
-<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
-<Item Name="Gi" Type="Integer">24115498</Item>
-<Item Name="CreateDate" Type="String">2002/10/16</Item>
-<Item Name="UpdateDate" Type="String">2006/04/03</Item>
-<Item Name="Flags" Type="Integer">512</Item>
-<Item Name="TaxId" Type="Integer">198214</Item>
-<Item Name="Status" Type="String">live</Item>
-<Item Name="ReplacedBy" Type="String"></Item>
-<Item Name="Comment" Type="String"><![CDATA[  ]]></Item>
-</DocSum>
-<DocSum>
-<Id>434011</Id>
-<Item Name="Caption" Type="String">CAA24741</Item>
-<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
-<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
-<Item Name="Gi" Type="Integer">434011</Item>
-<Item Name="CreateDate" Type="String">1983/12/06</Item>
-<Item Name="UpdateDate" Type="String">2005/04/18</Item>
-<Item Name="Flags" Type="Integer">0</Item>
-<Item Name="TaxId" Type="Integer">562</Item>
-<Item Name="Status" Type="String">live</Item>
-<Item Name="ReplacedBy" Type="String"></Item>
-<Item Name="Comment" Type="String"><![CDATA[  ]]></Item>
-</DocSum>
-</eSummaryResult>
+puts( GI.gi2annot(gis).join("\n") )
-=end

data/bin/id_class_anal.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'spec_id'
 require 'generator'
 require 'optparse'
 require 'ostruct'
+require 'roc'
 def file_noext(file)
   file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
@@ -21,7 +22,8 @@ jtplot_file = jtplot_base + '.toplot'
 OptionParser.new do |op|
   op.on("-p", "--prefix PREFIX", "prefix for false positive proteins") {|v| opt.p = v.split(',') }
   op.on("-j", "--jtplot", "output file '#{jtplot_file}' for jtp plotting program") {|v| opt.j = v }
-  op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
+#  op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
+  op.on("-a", "--area", "outputs area under the curve") {|v| opt.a = v }
 end.parse!
 if ARGV.size < 1
@@ -32,55 +34,59 @@ if ARGV.size < 1
   probabilities) or protein_prophet-prot.xml file which has been run with
   decoy proteins.
-  Outputs tp's, precision, and the false positive rate [as calculated by Gygi
-  2*(#mod/(#norm+#mod))].  Each of these will be in a column with a label at
-  the top.  Outputs columns (delimited by '\\t') to STDOUT.
-  To capture to file: #{File.basename(__FILE__)} protein_file.xml > out.csv
+  Outputs tp's and precision.
+  [The false positive predictive rate (FPPR) is 1 - precision]
+  The two columns will be labeled at the top.
+  (delimited by '\\t') to STDOUT.  To capture to file:
+  #{File.basename(__FILE__)} protein_file.xml > out.csv
-  Also takes gzipped (extension: xml.gz) files.
   OPTIONS:
   <s> = string
   -p  --prefix <s[,s...]>  Prefix(s) by which to determine decoy proteins (default #{def_pre})
   -j  --jtplot        outputs #{jtplot_file} for plotting by plot.rb
                       [% plot.rb -w lp --yrange n0.1:1.1 --noenhanced <file> ]
+  -a  --area          outputs area under the curve instead of tps/precision
   NOTE: protein prophet files not yet functional!!!
   ABBR:
     TP = True Positives
     FP = False Positives
     Prec = Precision = TP/(TP+FP)
-    FPR = False Positive Rate (as defined by Gygi) 2*[FP/(TP+FP)]
   "
   exit
 end
+###########################################################
+# I DON"T think option -e is functional yet...
+###########################################################
 files = ARGV.to_a
 out = nil
 if opt.j
   out = File.open(jtplot_file, "w")
-  lines = ['XYData', jtplot_base, "Classification Analysis", "Num TPs", "(Prec|FPR)"]
+  lines = ['XYData', jtplot_base, "Classification Analysis", "Num Hits", "Precision"]
   lines.each {|l| out.puts l}
 end
 headings = files.collect do |file|
-  %w(TP Prec FPR).collect {|v| v + " (#{file_noext(file)})" }
+  %w(TP Precision).collect {|v| v + " (#{file_noext(file)})" }
 end
-#headings = ["# True Positives", "Precision (TP/(TP+FP))", "FP Rate 2*(FP/(TP+FP))"]
-puts headings.flatten.join(delimiter)
 all_arrs = []
 files.each_with_index do |file,i|
   sp = SpecID.new(file)
-  #puts sp.prots.first.respond_to?
-  if opt.e
-    headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", ]
-    arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
-  else
-    headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
-    arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
+  headers = [file_noext(file)]
+  arrs = sp.num_hits_and_ppv_for_prob(opt.p[i])
+  if opt.a
+    (num_hits, prec) = arrs
+    roc = ROC.new
+    prec_area = roc.area_under_curve(num_hits, prec)
+    puts "#{file} (area under curve [num_hits, precision])"
+    puts "Prec [#TPPrec = TP/(TP+FP)]:\t#{prec_area}"
   end
   all_arrs.push(*arrs)
   lns = []
@@ -95,8 +101,12 @@ files.each_with_index do |file,i|
   end
 end
-SyncEnumerator.new(*all_arrs).each do |row|
-  puts row.join(delimiter)
+unless opt.a
+  puts headings.flatten.join(delimiter)
+  SyncEnumerator.new(*all_arrs).each do |row|
+    puts row.join(delimiter)
+  end
 end
 out.close if opt.j

data/bin/id_precision.rb CHANGED Viewed

@@ -13,7 +13,7 @@ opts = OptionParser.new do |op|
   op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml"
   op.separator ""
   op.separator "takes Bioworks 3.2 xml output files (with probabilities)"
-  op.separator "rank orders the probabilities and outputs tp's and sensitivity"
+  op.separator "rank orders the probabilities and outputs num hits and precision"
   op.separator "Also takes gzipped (xml.gz) files labeled as such"
   op.separator ""
   op.separator "Outputs a comma separated value to STDOUT (.csv)"
@@ -50,10 +50,11 @@ tp_obj.peps = tp
 two_lists = [tp_obj, fp_obj].map do |obj|
   list = []
   list.push( obj.pep_probs_by_pep_prots )
-  list.push( obj.pep_probs_by_seq_charge )
+  list.push( obj.pep_probs_by_bn_seq_charge )
   # These each have a by_min and a by_top10
-  list.push(*( obj.pep_probs_by_scan ) )
-  list.push(*( obj.pep_probs_by_scan_charge ) )
+  list.push(*( obj.pep_probs_by_bn_scan ) )
+  list.push(*( obj.pep_probs_by_bn_scan_charge ) )
   list
 end
@@ -61,19 +62,22 @@ end
 headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"]
 csv_headings = []
 headings.each do |head|
-  csv_headings << head + ": TP"
+  csv_headings << head + ": NH"
   csv_headings << head + ": PR"
 end
 pairs = two_lists[0].zip two_lists[1]
-roc = ROC.new
+roc = DecoyROC.new
 x_y= []
 area_under_curve = []
 #start_x = []
 #end_x = []
 pairs.each do |pair|
-  x,y = roc.tps_and_precision(pair[0], pair[1])
+  #x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1])
+  (num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1])
+  x = num_hits
+  y = ppv
   if $AREAS_ONLY
     x.unshift 0
     y.unshift 1.0
@@ -99,7 +103,7 @@ end
 # X axis is the number of peptides id# (i.e., # of peps in TP db)
 # Y axis is the precision = TP/(TP+FP)
-## Make some legend comments at the top of the file:
+puts "#  NH = number of hits"
 puts "#  TP = true positives"
 puts "#  FP = false positives"
 puts "#  PR = precision = TP/(TP+FP)"

data/bin/{false_positive_rate.rb → precision.rb} RENAMED Viewed

@@ -2,4 +2,4 @@
 require 'spec_id'
-SpecID.false_positive_rate(ARGV)
+SpecID.precision(ARGV)

data/bin/protein_summary.rb CHANGED Viewed

@@ -8,6 +8,7 @@ require 'spec_id'
 #############################################################
 # GLOBALS:
+PRECISION_PROGRAM_BASE = 'precision'
 DEF_PREFIX = "INV_"
 DEF_PERCENT_FP = "5.0"
 #############################################################
@@ -62,7 +63,7 @@ class Runner
             background-color: #FF0000;
             color: #FFFFFF
         }
-        div.file_info, div.software, div.fpr, div.num_proteins{
+        div.file_info, div.software, div.fppr, div.num_proteins{
             margin-left: 20px;
             margin-top: 20px;
         }
@@ -173,38 +174,38 @@ class Runner
   end
   # assumes that these are sorted on probability
-  # desired_fpr is a float
-  # returns [number_of_prots, actual_fpr]
-  def num_prots_above_fpr(prots, desired_fpr)
-    current_fpr_rate_percent = 0.0
-    previous_fpr_rate_percent = 0.0
+  # desired_fppr is a float
+  # returns [number_of_prots, actual_fppr]
+  def num_prots_above_fppr(prots, desired_fppr)
+    current_fppr_rate_percent = 0.0
+    previous_fppr_rate_percent = 0.0
     current_sum_one_minus_prob = 0.0
-    proteins_within_fpr = 0
-    actual_fpr = nil
+    proteins_within_fppr = 0
+    actual_fppr = nil
     already_found = false
     prot_cnt = 0
     prots.each do |prot|
       prot_cnt += 1
       # SUM(1-probX)/#prots
       current_sum_one_minus_prob += 1.0 - prot._probability.to_f
-      current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
+      current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
-      if current_fpr_rate_percent > desired_fpr && !already_found
-        actual_fpr = previous_fpr_rate_percent
-        proteins_within_fpr = prot_cnt
+      if current_fppr_rate_percent > desired_fppr && !already_found
+        actual_fppr = previous_fppr_rate_percent
+        proteins_within_fppr = prot_cnt
         already_found = true
       end
-      previous_fpr_rate_percent = current_fpr_rate_percent
+      previous_fppr_rate_percent = current_fppr_rate_percent
     end
-    [proteins_within_fpr, actual_fpr]
+    [proteins_within_fppr, actual_fppr]
   end
-    ####    #readable_previous_fpr_rate_percent = sprintf("%.2f", previous_fpr_rate_percent)
+    ####    #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
   # returns a string of the table rows
   # false_positive_rate (give as a %) is the cutoff mark
-  # returns the number of proteins at the desired_fpr (if given)
-  def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fpr, actual_percent_fp, peptide_count_filename=nil)
+  # returns the number of proteins at the desired_fppr (if given)
+  def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
     prot_cnt = 0
     uniq_prots.map do |prot|
       tr do
@@ -267,18 +268,20 @@ class Runner
     "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
   end
-  def proph_output(file, outfn, opt, fpr_output_as_html)
+  def proph_output(file, outfn, opt, fppr_output_as_html)
     header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'),  at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
     num_cols = header_anchors.size
     theaders = ths(header_anchors)
     root = AXML.parse_file(file)
     prots = []
-    ## find the min_prob at a fpr of XX
-    min_prob_redline = 1.01  # if no fpr is less than what they give, then all are redlined!
+    ## find the min_prob at a fppr of XX
+    min_prob_redline = 1.01  # if no fppr is less than what they give, then all are redlined!
-    if opt.c
+    if opt.c
       actual_percent_fp = opt.c.to_f
+    elsif opt.cut_at
+      actual_percent_fp = opt.cut_at.to_f
     else
       actual_percent_fp = nil
     end
@@ -289,20 +292,24 @@ class Runner
     end
     uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
     filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
-    output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
     ## num proteins above cutoff (if opt.c)
     num_prots_html = ''
-    if opt.c
-      (num_prots, actual_fpr) = num_prots_above_fpr(filtered_sorted_prots, opt.c.to_f)
-      num_prots_html = num_prots_to_html(opt.c.to_f, actual_fpr, num_prots)
+    if opt.c || opt.cut_at
+      (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
+      num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
+    end
+    if opt.cut_at
+      filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
     end
+    output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
     table_string = table do
       tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
     end
-    er_info = opt.fpr ? error_info(file) : ""
-    html_pieces = [outfn, header, fpr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
+    er_info = opt.precision ? error_info(file) : ""
+    html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
     print_html_pieces(*html_pieces)
   end # proph_output
@@ -311,7 +318,7 @@ class Runner
     "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
   end
-  def bioworks_output(file, outfn, opt, fpr_output_as_html)
+  def bioworks_output(file, outfn, opt, fppr_output_as_html)
     header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
     num_cols = header_anchors.size
     theaders = ths(header_anchors)
@@ -339,7 +346,7 @@ class Runner
     table_string = table do
       tr{theaders} + rows
     end
-    print_html_pieces(outfn, header, fpr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
+    print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
   end # bioworks_output
   def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
@@ -358,7 +365,7 @@ class Runner
     #puts lines ?? is this supposed to be commented out?
     lines = lines.reject do |obj| obj =~ /\*{10}/ end
     lines.map! do |line| "#{line}<br/>" end
-    "<div class=\"fpr\">
+    "<div class=\"fppr\">
     <h3>Classification Analysis</h3>
     #{lines.join("\n")}
     </div>"
@@ -366,7 +373,7 @@ class Runner
   # transforms the output string of file_as_decoy into html
   def prefix_as_decoy_to_html(string)
-    "<div class=\"fpr\">
+    "<div class=\"fppr\">
     <h3>Classification Analysis</h3>
     </div>" +
     string
@@ -384,21 +391,18 @@ class Runner
       op.separator "    outputs: <file>.summary.html"
       op.separator ""
       op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
-      op.separator("             if --fpr then -f is used to specify a file or prefix")
-      op.separator("             to indicate false positives.")
+      op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
+      op.separator("             if --precision then -f is used to specify a file or prefix")
+      op.separator("             that indicates the false positives.")
       op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
       op.separator ""
-      op.separator "Options for False Positive Rate:"
-      op.on("--fpr", "include output of false_positive_rate.rb,") {|v| opt.fpr = v}
-      op.separator("                                     type 'false_positive_rate.rb' for details")
-      op.separator("  These options are passed on:")
-      op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
-      op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
-      op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
+      op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
+      op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
+      op.separator("                                     type '#{PRECISION_PROGRAM_BASE}.rb' for details")
       op.separator ""
       op.separator "Specific to ProteinProphet (with no concatenated DB):"
-      op.on("-c", "--cutoff percent", "displays red line at given % fpr") {|v| opt.c = v }
+      op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
+      op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
     end
     opts.parse!
@@ -408,31 +412,23 @@ class Runner
       exit
     end
-    fpr_output_as_html = ''
+    fppr_output_as_html = ''
     files = argv.to_a
     files.each do |file|
       outfn = file.gsub(/\.xml$/, '.summary.html')
       ## False Positive Rate Calculation:
-      if opt.fpr
+      if opt.precision
         opt.o = outfn # won't actually be written over, but used
-        to_use_argv = create_false_positive_rate_argv(file, opt)
-        (out_string, opt, file_as_decoy) = SpecID::FalsePositiveRate.new.false_positive_rate(to_use_argv)
-        if file_as_decoy  ## need to wrap this guy up in some html
-          ## DISABLE the opt.f (it's a filename) so it doesn't interfere with
-          ## filtering:
-          opt.f = nil
-          fpr_output_as_html = file_as_decoy_to_html(out_string)
-        else
-          fpr_output_as_html = prefix_as_decoy_to_html(out_string)
-        end
+        to_use_argv = create_precision_argv(file, opt)
+        (out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
+        fppr_output_as_html = prefix_as_decoy_to_html(out_string)
       end
       case SpecID.file_type(file)
       when "protproph"
-        proph_output(file, outfn, opt, fpr_output_as_html)
+        proph_output(file, outfn, opt, fppr_output_as_html)
       when "bioworks"
-        bioworks_output(file, outfn, opt, fpr_output_as_html)
+        bioworks_output(file, outfn, opt, fppr_output_as_html)
       else
         abort "filetype for #{file} not recognized!"
       end
@@ -440,15 +436,12 @@ class Runner
   end # method go
-  def create_false_positive_rate_argv(file, opt)
+  def create_precision_argv(file, opt)
     # include only those options specific
     new_argv = [file]
     if opt.f ; new_argv << '-f' << opt.f end
-    if opt.g ; new_argv << '-g' end
-    if opt.p ; new_argv << '-p' end
-    if opt.n ; new_argv << '-n' end
     if opt.o ; new_argv << '-o' << opt.o end
-    new_argv
+    new_argv
   end
 end   # Runner

data/changelog.txt ADDED Viewed

@@ -0,0 +1,34 @@
+1. A couple of scripts and subroutines were hashing peptides but not on the file
+basename.  This would result in slightly incorrect results (any time there
+were overlapping scan numbers in multiple datasets, only the top one would be
+chosen).  The results would be correct for single runs.
+Output files that could be affected:
+*.top_per_scan.txt
+*.all_peps_per_scan.txt
+Scripts that could be affected:
+script/top_hit_per_scan.rb
+bin/filter_spec_id.rb
+script/filter-peps.rb
+bin/id_precision.rb
+Subroutines that were affected:
+spec_id.rb (pep_probs_by_* )
+spec_id.rb (top_peps_prefilter!)
+proph.rb uniq_by_seqcharge
+align.rb called uniq_by_seqcharge
+2. false_positive_rate.rb and protein_summary.rb (by extension) were using
+number of true positives on the x axis while in reality I was plotting the
+number of hits.  I've updated x axis labels to reflect this change.  In
+addition, since the term 'false positive rate' has such a distinct definition
+in classical ROC plots and binary statistics, I've decided to work primarily
+in terms of precision (TP/(TP+FP)).  I've purged the terms 'False Positive
+Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
+called the False Positive Predictive Rate (FPPR).  I will probably implement
+this in a future release.

data/lib/align.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 require 'spec/mzxml/parser'
-require 'hash_by'
 require 'spec/msrun'
 require 'spec_id/proph'
 require 'vec'