RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/Rakefile CHANGED Viewed

@@ -16,7 +16,7 @@ NAME = "mspire"
 lib_files = FL["lib/**/*"]
 test_dir_too = FL["test/**/*"]
-little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*"]
+little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "changelog.txt", "release_notes.txt", "{bin,script,tutorial}/**/*"]
 dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*", test_dir_too]
 dist_files = little_dist_files # comment out to include test files
@@ -107,12 +107,15 @@ end
 # PACKAGE / INSTALL / UNINSTALL
 ###############################################
+## To release a package on rubyforge:
+## Log in to rubyforge and go to the 'Files' tab
+## then "To create a new release click here"
 tm = Time.now
 spec = Gem::Specification.new do |s|
   s.platform = Gem::Platform::RUBY
   s.name = NAME
-  s.version = "0.1.5"
+  s.version = "0.1.7"
   s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
   s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
   s.email = "jprince@icmb.utexas.edu"

data/bin/bioworks_to_pepxml.rb CHANGED Viewed

@@ -7,8 +7,10 @@ DEFAULT_DATABASE_PATH = "/project/marcotte/marcotte/ms/database"
 DEFAULT_MZXML_PATH = "."
 DEFAULT_OUTDIR = "pepxml"
 DEFAULT_PARAMS_GLOB = "*.params"
+DEFAULT_PARAMS_FILE = Dir[DEFAULT_PARAMS_GLOB].first
 DEFAULT_PEPXML_VERSION = 18
 DEFAULT_MS_MODEL = 'LCQ'
+DEFAULT_MASS_ANALYZER = 'Ion Trap'
 ##############################################################
 require 'spec_id'
@@ -26,78 +28,120 @@ else
 end
 opt = OpenStruct.new
-opt.mspath = DEFAULT_MZXML_PATH
-opt.outdir = DEFAULT_OUTDIR
-opt.params = Dir[DEFAULT_PARAMS_GLOB].first
-opt.pepxml_version = DEFAULT_PEPXML_VERSION
-opt.model = DEFAULT_MS_MODEL
 opt_obj = OptionParser.new do |op|
-  op.banner = "\nusage: #{File.basename(__FILE__)} [options] bioworks.xml"
+  op.banner = "\nusage: #{File.basename(__FILE__)} [options] <file>.srf ...
+usage: #{File.basename(__FILE__)} [options] bioworks.xml"
   op.on_head "
-  Takes the xml exported output of Bioworks multi-consensus view (no filtering)
-  and outputs pepXML files (which can be fed into the trans-proteomic pipeline).
+  Takes .srf files or the xml exported output of Bioworks multi-consensus view
+  (no filtering) and outputs pepXML files (to feed the trans-proteomic pipeline).
 Options:"
-  op.on('-p', '--params file', "sequest params file  d: '#{opt.params}'") {|v| opt.params = v }
-  op.on('-d', '--dbpath path', "path to databases    d: '#{def_dbpath}'") {|v| opt.dbpath = v }
-  op.on('-m', '--mspath path', "path to MS files     d: '#{opt.mspath}'") {|v| opt.mspath = v }
-  op.on('-o', '--outdir path', "output directory     d: '#{opt.outdir}'") {|v| opt.outdir = v }
-  op.on('--model <LCQ|Orbi>', "MS model             d: '#{opt.model}'") {|v| opt.model = v }
-  op.on('-v', '--version pepxml_version', "pepxml version       d: '#{opt.pepxml_version}'") {|v| opt.pepxml_version = v.to_i }
-  op.on_tail "
+  op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
+  op.on('-o', '--outdir path', "output directory     d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
+  op.separator ""
+  op.separator "bioworks.xml files may require additional options:"
+  op.separator ""
+  op.on('-p', '--params file', "sequest params file  d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
+  op.on('-d', '--dbpath path', "path to databases    d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
+  op.on('-m', '--mspath path', "path to MS files     d: '#{DEFAULT_MZXML_PATH}'") {|v| opt.mspath = v }
+  op.on('--model <LCQ|Orbi|string>', "MS model             d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
+  op.on('--mass_analyzer <string>',  "Mass Analyzer        d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
+  op.on('-v', '--version pepxml_version', "pepxml version       d: '#{DEFAULT_PEPXML_VERSION}'") {|v| opt.pepxml_version = v.to_i }
+end
+more_notes = "
 Notes:
   mspath: Directory to RAW or mzXML (version 1) files.
           This option is not used with Bioworks 3.3 files.
   outdir: Path will be created if it does not already exist.
   model : LCQ -> 'LCQ Deca XP Plus'
         : Orbi -> 'LTQ Orbitrap'
+        : other string -> That's the string that will be used.
+  options with spaces should be quoted: e.g., \"Time of Flight\"
 Database Path:
-  If dbpath opt is given it will be used as the database path (overriding all).
   If the database path in the sequest.params file is valid, that will be used.
-  If no database_path is given, will try (in order):
+  Otherwise, will try (in order):
+      1. --dbpath or -d option
       1. environmental variable BIOWORKS_DBPATH (currently: '#{db_env_var}')
       2. constant at top of this script         (currently: '#{DEFAULT_DATABASE_PATH}')
   "
-end
 opt_obj.parse!
+# intercept before argv count
+if opt.help
+  puts opt_obj
+  puts more_notes
+  exit
+end
 if ARGV.size < 1
   puts opt_obj
   exit
 end
-case opt.model
-when "LCQ"
-  model = 'LCQ Deca XP Plus'
-when "Orbi"
-  model = 'LTQ Orbitrap'
-else
-  abort "Bad MS model argument: #{opt.model}"
+opt.outdir ||= DEFAULT_OUTDIR
+## Create outdir if it does not exist
+if opt.outdir
+  FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
 end
-## Ensure params file exists (unless opt given)
-params_obj = SpecID::Sequest::Params.new(opt.params)
-# Ensure the database exists!
-if opt.dbpath
-  params_obj.database_path = opt.dbpath
+files = ARGV.to_a
+if files[0] =~ /\.srf/i
+  opt.dbpath ||= def_dbpath
+  files.each do |file|
+    hash = {
+      :backup_db_path => opt.dbpath || def_dbpath,
+      :out_path => opt.outdir,
+    }
+    xml_obj = SpecID::Sequest::PepXML.new_from_srf(file, hash)
+    xml_obj.to_pepxml(xml_obj.base_name + ".xml")
+  end
 else
+  ## Ensure params file exists (unless opt given)
+  opt.params ||= DEFAULT_PARAMS_FILE
+  params_obj = SpecID::Sequest::Params.new(opt.params)
+  # Ensure the database exists!
   unless File.exist?( params_obj.database )
-    params_obj.database_path = def_dbpath
+    if opt.dbpath
+      params_obj.database_path = opt.dbpath
+    else
+      params_obj.database_path = def_dbpath
+    end
   end
-end
-## Create dbpath if does not exist
-FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
+  opt.mspath ||= DEFAULT_MZXML_PATH
+  opt.pepxml_version ||= DEFAULT_PEPXML_VERSION
+  opt.model ||= DEFAULT_MS_MODEL
+  opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
+  case opt.model
+  when "LCQ"
+    model = 'LCQ Deca XP Plus'
+  when "Orbi"
+    model = 'LTQ Orbitrap'
+  else
+    model = opt.model
+  end
-bioworks = ARGV[0]
-xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
-xml_objs.each do |obj|
-  obj.to_pepxml(obj.base_name + ".xml")
+  bioworks = files[0]
+  xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
+  xml_objs.each do |obj|
+    obj.to_pepxml(obj.base_name + ".xml")
+  end
 end

data/bin/fasta_shaker.rb ADDED Viewed

@@ -0,0 +1,100 @@
+#!/usr/bin/ruby
+# This is my second attempt at writing a simple interface for messing with
+# fasta files.  Achieving simplicity (and power) is challenging.  It usually
+# only happens on the second (or sometimes more) try.  Of course, in
+# retrospect the simple solution seems sooo obvious.  But it's deceptive.
+# It takes work to achieve simplicity for complex tasks.  That's my thought
+# for the day.
+# fasta_shaker as in a salt shaker.  Shake up your fasta proteins and let them
+# season your dinner (hopefully a protein dinner).  Mmmm.  Don't they taste
+# good all mixed up?  If you want, you can think of it as a pepper shaker.
+# I don't usually comment on my scripts (in my script, anyway), but this one
+# came out so nice and clean that I feel like I have room to spare.
+require 'fasta'
+require 'cmdparse'
+opt = {}
+opts = OptionParser.new do |op|
+  prog = File.basename(__FILE__)
+  op.banner = "usage: #{prog} <method> [OPTIONS] <file>.fasta"
+  op.separator "   <method> = reverse | shuffle"
+  op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
+  op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
+  op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
+  op.on("-f", "--fraction <float>", "creates some fraction of proteins") {|v| opt[:fraction] = v }
+  op.separator "        [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
+  op.separator "         (after any given prefix) so that proteins are unique]"
+  op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
+  op.separator "EXAMPLES: "
+  op.separator "   #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
+  op.separator "   #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
+  op.separator "   #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
+  op.separator "   #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
+end
+opts.parse!
+if ARGV.size < 2
+  puts opts
+  exit
+end
+(method, file) = ARGV
+if opt[:cat] && !opt[:prefix]
+  puts "WARNING: concatenated proteins don't have unique headers"
+  puts "[you probably wanted to use the '--prefix' option!]"
+end
+# OUT filename:
+unless opt[:out]
+  filebase = file.sub(/\..*$/,'')
+  parts = [filebase]
+  parts << 'cat' if opt[:cat]
+  parts << method
+  parts << 'prefix' << opt[:prefix] if opt[:prefix]
+  parts << 'fraction' << opt[:fraction] if opt[:fraction]
+  parts << 'tryptic_peptides' if opt[:tryptic_peptides]
+  opt[:out] = parts.join("_") << ".fasta"
+end
+## READ the file
+fasta = Fasta.new.read_file(file)
+## CAT (save an original copy)
+fasta_orig = fasta.dup if opt[:cat]
+## FRACTION the proteins
+if f = opt[:fraction]
+  prefix = nil
+  f = f.to_f
+  if f > 1.0
+    prefix = proc {|cnt| "f#{cnt}_" }
+  end
+  fasta = fasta.fraction_of_prots(f, prefix)
+end
+## PREFIX the proteins
+if pre = opt[:prefix]
+  fasta.header_prefix!(pre)
+end
+## MODIFY the proteins
+fasta.aaseq!((method + '!').to_sym, opt[:tryptic_peptides])
+## CAT (finish it up)
+if opt[:cat]
+  fasta_orig << fasta
+  fasta = fasta_orig
+end
+## WRITE out the file
+fasta.write_file(opt[:out])

data/bin/filter_spec_id.rb CHANGED Viewed

@@ -1,10 +1,15 @@
 #!/usr/bin/ruby -w
 require 'spec_id'
-require 'hash_by'
 require 'optparse'
 require 'ostruct'
+require 'spec_id/aa_freqs'
+########################################################
+WRITE_MARSHAL = true
+TABULATE_DATA = true
+WRITE_CYS_FIND = false
+########################################################
 opt = OpenStruct.new
 opt.x1 = 1.0
@@ -14,14 +19,19 @@ opt.c = 0.5
 opt.rppm = 1000.0
 opt.false = false
+# prints shortened number for display
+def short(num)
+  sprintf( "%.3f",num)
+end
 opts = OptionParser.new do |op|
-  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml [decoy.xml]"
-  op.separator("prints number of proteins (and FPR if decoy.xml)")
+  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
+  op.separator("prints number of proteins (and FPR if -f option)")
   op.separator ""
   op.separator("** only takes the top hit per scan+charge")
-  op.separator("** Excludes all deltacn's over 1.0")
-  op.separator("   (in BioworksBrowser worst hits often given deltacn of 1.1)")
+  op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
+  op.separator("   (these are peptides who are the only hit with xcorr > 0)")
   op.separator ""
   op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge  d: #{opt.x1}") {|v| opt.x1 = v.to_f}
   op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge  d: #{opt.x2}") {|v| opt.x2 = v.to_f}
@@ -30,40 +40,151 @@ opts = OptionParser.new do |op|
   op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass)  d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
   op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
   op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
+  op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
+  op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
+  op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
+  op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
 end
+$cys_mean = nil
+$cys_stdev = nil
 # fpr is a SpecID obj that is the false positives
-def filter_round(files, spec_ids, kind, args, fpr=nil, interactive=false)
+# cysteines holds a SpecID::AAFreqs object, a Float (cysteine frequency), or nil
+def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
   (x1, x2, x3, deltacn, rppm) = args
+  combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
   puts "=========================================================================="
   puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
   # push fpr on the end for the calculations
   if fpr ; spec_ids.push(fpr) ; end
-  arr_of_prots_and_peps = spec_ids.map do |spec_id|
-    prots_and_peps = spec_id.filter(kind, *args)
+  arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
+    (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
+    if cysteines
+      if cysteines.is_a? Float
+        freq = cysteines
+      else
+        freq = cysteines.aafreqs[:C]
+      end
+      (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
+      [prots, peps, deltacnstar_cnt, [ac,exp]]
+    else
+      [prots, peps, deltacnstar_cnt]
+    end
   end
-  arr_of_num_of_prots = arr_of_prots_and_peps.map {|ar| ar[0].size }
-  arr_of_num_of_peps = arr_of_prots_and_peps.map {|ar| ar[1].size }
+  arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
+  arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
+  deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
+  cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
   prot_nums = arr_of_num_of_prots
   pep_nums = arr_of_num_of_peps
+  ## files = [file1, file2, file3]
+  ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
+  ## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
   files.each_with_index do |file,i|
     if !interactive
       puts "#{file} [prots]:\t#{prot_nums[i]}"
-      puts "#{file}  [peps]:\t#{pep_nums[i]}"
+      puts "#{file}  [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
     else
       puts "file#{i+1} [prots]:  #{prot_nums[i]}"
-      puts "file#{i+1}  [peps]:  #{pep_nums[i]}"
+      puts "file#{i+1}  [peps]:  #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
     end
     if fpr
-      puts "FPR [prots] :  " + sprintf( "%.3f", 100.0*(prot_nums[-1].to_f/prot_nums[0].to_f) ) + " % (#{prot_nums[-1]})"
-      puts "FPR  [peps] :  " + sprintf( "%.3f", 100.0*(pep_nums[-1].to_f/pep_nums[0].to_f) ) + " % (#{pep_nums[-1]})"
+      #puts "FPR [prots] :  " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
+      #puts "FPR  [peps] :  " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
+      ## For separate searches: every false positive = one less TP
+      ## For concatenated searches: every false positive is one less TP
+      ## THAT's what I've been doing already !
+      prot_tps = prot_nums[i] - prot_nums[-1]
+      pep_tps = pep_nums[i] - pep_nums[-1]
+      prot_fps = prot_nums[i] - prot_tps
+      pep_fps = pep_nums[i] - pep_tps
+      prot_fpr = prot_fps.to_f/prot_nums[i].to_f
+      pep_fpr = pep_fps.to_f/pep_nums[i].to_f
+      # those are the same!
+      puts "FPR [prots] :  " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
+      puts "FPR  [peps] :  " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
+    end
+    if cysteines
+      (ac, exp) = cys_reports[i]
+      (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
+      fraction_of_expected = ac.to_f/exp
+      cys_tps = pep_nums[i] - total_num_false
+      puts "CYSTEINE FPR: "
+      puts "  (# peps containing >= 1 cysteines)"
+      puts "              actual: #{ac}"
+      puts "fraction of expected: #{short(fraction_of_expected)}"
+      puts "     expected # FP's: " + short(total_num_false)
+      puts "      estimated  FPR: " + short( 100.0*cys_fprate ) + " % "
+      puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
+      puts "Combined Score & FPR"
+      puts "#{combined_score}\t#{cys_fprate}"
+      puts "Combined Score & fraction of expected"
+      #puts "#{combined_score} #{fraction_of_expected}"
+      to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
+      puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
+      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
+      puts to_tab.join("\t") if TABULATE_DATA
+    end
+    if $true_pos_aaseqs
+      peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
+      real_tps = 0
+      real_fps = 0
+      # could also do with partition
+      peps.each do |pep|
+        if pep.sequence =~ /\.([\w\*]+)\.?/
+          if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
+            real_tps += 1
+          else
+            real_fps += 1
+          end
+        else
+          abort "Couldn't Match: #{pep.sequence}"
+        end
+      end
+      if peps.size > 0
+        real_fpr = real_fps.to_f/peps.size
+      else
+        real_fpr = 0.0
+      end
+      puts "REAL FPR: #{real_fpr}"
+      puts "REAL #TP: #{real_tps}"
+      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
+      puts to_tab.join("\t") if TABULATE_DATA
     end
   end
   #puts files.join(' | ')
   #puts nums.join(' | ')
 end
+# (actual # with cys, expected # with cys, total#peptides,
+# mean_fraction_of_cysteines_true, std)
+# PepHit(C) = Peptide containing cysteine
+#   # Total PepHit(C)                   # Observed Bad Pep (C)
+#   ------------------ proportional_to  ----------------------
+#   # Total PepHit                      # Total Bad PepHit (X)
+def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
+  # the number of bona fide BAD cysteine hits
+  # (some of the cysteine hits (~5%) are true positives)
+  ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
+  if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
+  total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
+  fpr = total_number_false / total_peptides
+  [fpr, total_number_false]
+end
 # assumes its already chomped
 # updates the 5 globals
 def prep_reply(reply, base)
@@ -108,8 +229,22 @@ def prep_reply(reply, base)
 end
 def file_to_prefiltered_spec_id(file)
-  spec_id = SpecID.new(file)
-  spec_id.top_peps_prefilter!
+  spec_id = nil
+  marshal_file = file + ".prefiltered.msh"
+  if File.exist?(marshal_file)
+    File.open(marshal_file) do |fh|
+      spec_id = Marshal.load(fh)
+    end
+  else
+    spec_id = SpecID.new(file)
+    spec_id.top_peps_prefilter!
+    ## marshal it!
+    if WRITE_MARSHAL
+      File.open(marshal_file, "w") do |fh|
+        Marshal.dump(spec_id,fh)
+      end
+    end
+  end
   spec_id
 end
@@ -123,7 +258,6 @@ def interactive_help
   puts "'q' to quit"
 end
 opts.parse!
 if ARGV.size < 1
@@ -140,6 +274,28 @@ arr_of_spec_ids = files.map do |file|
 end
 fpr = nil
+cysteines = nil
+if opt.cysteines
+  puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
+  if File.exist? opt.cysteines
+    cysteines = SpecID::AAFreqs.new(opt.cysteines)
+  else
+    cysteines = opt.cysteines.to_f
+  end
+  if opt.cback
+    ($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
+  end
+end
+$true_pos_aaseqs = nil
+if opt.true_pos
+  puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
+  fasta = Fasta.new.read_file(opt.true_pos)
+  $true_pos_aaseqs = fasta.prots.map do |prot|
+    prot.aaseq.chomp
+  end
+end
 if opt.false
   # its a file if it exists
   if File.exist? opt.false
@@ -163,12 +319,18 @@ end
 base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
-if opt.i
+if opt.from_file
+  lines = IO.readlines(opt.from_file)
+  lines.each do |line|
+    line.chomp!
+    answer = prep_reply(line, base_args)
+    next if answer == false
+    base_args = answer
+    filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
+  end
+elsif opt.i
   interactive_help
   puts "*******************************************************"
   puts "Number of proteins in files (this order):"
@@ -187,13 +349,13 @@ if opt.i
         interactive_help
       else
         base_args = answer
-        filter_round(files, arr_of_spec_ids, :common, base_args, fpr, true)
+        filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
         break
       end
     end
   end
 else
-  filter_round(files, arr_of_spec_ids, :common, base_args, fpr, false)
+  filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
 end