RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/Rakefile CHANGED Viewed

@@ -16,7 +16,7 @@ NAME = "mspire"
 lib_files = FL["lib/**/*"]
 test_dir_too = FL["test/**/*"]
-little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*"]
+little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "changelog.txt", "release_notes.txt", "{bin,script,tutorial}/**/*"]
 dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*", test_dir_too]
 dist_files = little_dist_files # comment out to include test files
@@ -107,12 +107,15 @@ end
 # PACKAGE / INSTALL / UNINSTALL
 ###############################################
+## To release a package on rubyforge:
+## Log in to rubyforge and go to the 'Files' tab
+## then "To create a new release click here"
 tm = Time.now
 spec = Gem::Specification.new do |s|
   s.platform = Gem::Platform::RUBY
   s.name = NAME
-  s.version = "0.1.5"
+  s.version = "0.1.7"
   s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
   s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
   s.email = "jprince@icmb.utexas.edu"

data/bin/bioworks_to_pepxml.rb CHANGED Viewed

@@ -7,8 +7,10 @@ DEFAULT_DATABASE_PATH = "/project/marcotte/marcotte/ms/database"
 DEFAULT_MZXML_PATH = "."
 DEFAULT_OUTDIR = "pepxml"
 DEFAULT_PARAMS_GLOB = "*.params"
+DEFAULT_PARAMS_FILE = Dir[DEFAULT_PARAMS_GLOB].first
 DEFAULT_PEPXML_VERSION = 18
 DEFAULT_MS_MODEL = 'LCQ'
+DEFAULT_MASS_ANALYZER = 'Ion Trap'
 ##############################################################
 require 'spec_id'
@@ -26,78 +28,120 @@ else
 end
 opt = OpenStruct.new
-opt.mspath = DEFAULT_MZXML_PATH
-opt.outdir = DEFAULT_OUTDIR
-opt.params = Dir[DEFAULT_PARAMS_GLOB].first
-opt.pepxml_version = DEFAULT_PEPXML_VERSION
-opt.model = DEFAULT_MS_MODEL
 opt_obj = OptionParser.new do |op|
-  op.banner = "\nusage: #{File.basename(__FILE__)} [options] bioworks.xml"
+  op.banner = "\nusage: #{File.basename(__FILE__)} [options] <file>.srf ...
+usage: #{File.basename(__FILE__)} [options] bioworks.xml"
   op.on_head "
-  Takes the xml exported output of Bioworks multi-consensus view (no filtering)
-  and outputs pepXML files (which can be fed into the trans-proteomic pipeline).
+  Takes .srf files or the xml exported output of Bioworks multi-consensus view
+  (no filtering) and outputs pepXML files (to feed the trans-proteomic pipeline).
 Options:"
-  op.on('-p', '--params file', "sequest params file  d: '#{opt.params}'") {|v| opt.params = v }
-  op.on('-d', '--dbpath path', "path to databases    d: '#{def_dbpath}'") {|v| opt.dbpath = v }
-  op.on('-m', '--mspath path', "path to MS files     d: '#{opt.mspath}'") {|v| opt.mspath = v }
-  op.on('-o', '--outdir path', "output directory     d: '#{opt.outdir}'") {|v| opt.outdir = v }
-  op.on('--model <LCQ|Orbi>', "MS model             d: '#{opt.model}'") {|v| opt.model = v }
-  op.on('-v', '--version pepxml_version', "pepxml version       d: '#{opt.pepxml_version}'") {|v| opt.pepxml_version = v.to_i }
-  op.on_tail "
+  op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
+  op.on('-o', '--outdir path', "output directory     d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
+  op.separator ""
+  op.separator "bioworks.xml files may require additional options:"
+  op.separator ""
+  op.on('-p', '--params file', "sequest params file  d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
+  op.on('-d', '--dbpath path', "path to databases    d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
+  op.on('-m', '--mspath path', "path to MS files     d: '#{DEFAULT_MZXML_PATH}'") {|v| opt.mspath = v }
+  op.on('--model <LCQ|Orbi|string>', "MS model             d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
+  op.on('--mass_analyzer <string>',  "Mass Analyzer        d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
+  op.on('-v', '--version pepxml_version', "pepxml version       d: '#{DEFAULT_PEPXML_VERSION}'") {|v| opt.pepxml_version = v.to_i }
+end
+more_notes = "
 Notes:
   mspath: Directory to RAW or mzXML (version 1) files.
           This option is not used with Bioworks 3.3 files.
   outdir: Path will be created if it does not already exist.
   model : LCQ -> 'LCQ Deca XP Plus'
         : Orbi -> 'LTQ Orbitrap'
+        : other string -> That's the string that will be used.
+  options with spaces should be quoted: e.g., \"Time of Flight\"
 Database Path:
-  If dbpath opt is given it will be used as the database path (overriding all).
   If the database path in the sequest.params file is valid, that will be used.
-  If no database_path is given, will try (in order):
+  Otherwise, will try (in order):
+      1. --dbpath or -d option
       1. environmental variable BIOWORKS_DBPATH (currently: '#{db_env_var}')
       2. constant at top of this script         (currently: '#{DEFAULT_DATABASE_PATH}')
   "
-end
 opt_obj.parse!
+# intercept before argv count
+if opt.help
+  puts opt_obj
+  puts more_notes
+  exit
+end
 if ARGV.size < 1
   puts opt_obj
   exit
 end
-case opt.model
-when "LCQ"
-  model = 'LCQ Deca XP Plus'
-when "Orbi"
-  model = 'LTQ Orbitrap'
-else
-  abort "Bad MS model argument: #{opt.model}"
+opt.outdir ||= DEFAULT_OUTDIR
+## Create outdir if it does not exist
+if opt.outdir
+  FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
 end
-## Ensure params file exists (unless opt given)
-params_obj = SpecID::Sequest::Params.new(opt.params)
-# Ensure the database exists!
-if opt.dbpath
-  params_obj.database_path = opt.dbpath
+files = ARGV.to_a
+if files[0] =~ /\.srf/i
+  opt.dbpath ||= def_dbpath
+  files.each do |file|
+    hash = {
+      :backup_db_path => opt.dbpath || def_dbpath,
+      :out_path => opt.outdir,
+    }
+    xml_obj = SpecID::Sequest::PepXML.new_from_srf(file, hash)
+    xml_obj.to_pepxml(xml_obj.base_name + ".xml")
+  end
 else
+  ## Ensure params file exists (unless opt given)
+  opt.params ||= DEFAULT_PARAMS_FILE
+  params_obj = SpecID::Sequest::Params.new(opt.params)
+  # Ensure the database exists!
   unless File.exist?( params_obj.database )
-    params_obj.database_path = def_dbpath
+    if opt.dbpath
+      params_obj.database_path = opt.dbpath
+    else
+      params_obj.database_path = def_dbpath
+    end
   end
-end
-## Create dbpath if does not exist
-FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
+  opt.mspath ||= DEFAULT_MZXML_PATH
+  opt.pepxml_version ||= DEFAULT_PEPXML_VERSION
+  opt.model ||= DEFAULT_MS_MODEL
+  opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
+  case opt.model
+  when "LCQ"
+    model = 'LCQ Deca XP Plus'
+  when "Orbi"
+    model = 'LTQ Orbitrap'
+  else
+    model = opt.model
+  end
-bioworks = ARGV[0]
-xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
-xml_objs.each do |obj|
-  obj.to_pepxml(obj.base_name + ".xml")
+  bioworks = files[0]
+  xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
+  xml_objs.each do |obj|
+    obj.to_pepxml(obj.base_name + ".xml")
+  end
 end

data/bin/fasta_shaker.rb ADDED Viewed

@@ -0,0 +1,100 @@
+#!/usr/bin/ruby
+# This is my second attempt at writing a simple interface for messing with
+# fasta files.  Achieving simplicity (and power) is challenging.  It usually
+# only happens on the second (or sometimes more) try.  Of course, in
+# retrospect the simple solution seems sooo obvious.  But it's deceptive.
+# It takes work to achieve simplicity for complex tasks.  That's my thought
+# for the day.
+# fasta_shaker as in a salt shaker.  Shake up your fasta proteins and let them
+# season your dinner (hopefully a protein dinner).  Mmmm.  Don't they taste
+# good all mixed up?  If you want, you can think of it as a pepper shaker.
+# I don't usually comment on my scripts (in my script, anyway), but this one
+# came out so nice and clean that I feel like I have room to spare.
+require 'fasta'
+require 'cmdparse'
+opt = {}
+opts = OptionParser.new do |op|
+  prog = File.basename(__FILE__)
+  op.banner = "usage: #{prog} <method> [OPTIONS] <file>.fasta"
+  op.separator "   <method> = reverse | shuffle"
+  op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
+  op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
+  op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
+  op.on("-f", "--fraction <float>", "creates some fraction of proteins") {|v| opt[:fraction] = v }
+  op.separator "        [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
+  op.separator "         (after any given prefix) so that proteins are unique]"
+  op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
+  op.separator "EXAMPLES: "
+  op.separator "   #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
+  op.separator "   #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
+  op.separator "   #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
+  op.separator "   #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
+end
+opts.parse!
+if ARGV.size < 2
+  puts opts
+  exit
+end
+(method, file) = ARGV
+if opt[:cat] && !opt[:prefix]
+  puts "WARNING: concatenated proteins don't have unique headers"
+  puts "[you probably wanted to use the '--prefix' option!]"
+end
+# OUT filename:
+unless opt[:out]
+  filebase = file.sub(/\..*$/,'')
+  parts = [filebase]
+  parts << 'cat' if opt[:cat]
+  parts << method
+  parts << 'prefix' << opt[:prefix] if opt[:prefix]
+  parts << 'fraction' << opt[:fraction] if opt[:fraction]
+  parts << 'tryptic_peptides' if opt[:tryptic_peptides]
+  opt[:out] = parts.join("_") << ".fasta"
+end
+## READ the file
+fasta = Fasta.new.read_file(file)
+## CAT (save an original copy)
+fasta_orig = fasta.dup if opt[:cat]
+## FRACTION the proteins
+if f = opt[:fraction]
+  prefix = nil
+  f = f.to_f
+  if f > 1.0
+    prefix = proc {|cnt| "f#{cnt}_" }
+  end
+  fasta = fasta.fraction_of_prots(f, prefix)
+end
+## PREFIX the proteins
+if pre = opt[:prefix]
+  fasta.header_prefix!(pre)
+end
+## MODIFY the proteins
+fasta.aaseq!((method + '!').to_sym, opt[:tryptic_peptides])
+## CAT (finish it up)
+if opt[:cat]
+  fasta_orig << fasta
+  fasta = fasta_orig
+end
+## WRITE out the file
+fasta.write_file(opt[:out])

data/bin/filter_spec_id.rb CHANGED Viewed

@@ -1,10 +1,15 @@
 #!/usr/bin/ruby -w
 require 'spec_id'
-require 'hash_by'
 require 'optparse'
 require 'ostruct'
+require 'spec_id/aa_freqs'
+########################################################
+WRITE_MARSHAL = true
+TABULATE_DATA = true
+WRITE_CYS_FIND = false
+########################################################
 opt = OpenStruct.new
 opt.x1 = 1.0
@@ -14,14 +19,19 @@ opt.c = 0.5
 opt.rppm = 1000.0
 opt.false = false
+# returns the number formatted to three decimal places (for display)
+def short(num)
+  sprintf( "%.3f",num)
+end
 opts = OptionParser.new do |op|
-  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml [decoy.xml]"
-  op.separator("prints number of proteins (and FPR if decoy.xml)")
+  op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
+  op.separator("prints number of proteins (and FPR if -f option)")
   op.separator ""
   op.separator("** only takes the top hit per scan+charge")
-  op.separator("** Excludes all deltacn's over 1.0")
-  op.separator("   (in BioworksBrowser worst hits often given deltacn of 1.1)")
+  op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
+  op.separator("   (these are peptides who are the only hit with xcorr > 0)")
   op.separator ""
   op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge  d: #{opt.x1}") {|v| opt.x1 = v.to_f}
   op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge  d: #{opt.x2}") {|v| opt.x2 = v.to_f}
@@ -30,40 +40,151 @@ opts = OptionParser.new do |op|
   op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass)  d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
   op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
   op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
+  op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
+  op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
+  op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
+  op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
 end
+$cys_mean = nil
+$cys_stdev = nil
 # fpr is a SpecID obj that is the false positives
-def filter_round(files, spec_ids, kind, args, fpr=nil, interactive=false)
+# cysteines holds an AAFreqs object, a Float cysteine frequency, or nil
+def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
   (x1, x2, x3, deltacn, rppm) = args
+  combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
   puts "=========================================================================="
   puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
   # push fpr on the end for the calculations
   if fpr ; spec_ids.push(fpr) ; end
-  arr_of_prots_and_peps = spec_ids.map do |spec_id|
-    prots_and_peps = spec_id.filter(kind, *args)
+  arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
+    (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
+    if cysteines
+      if cysteines.is_a? Float
+        freq = cysteines
+      else
+        freq = cysteines.aafreqs[:C]
+      end
+      (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
+      [prots, peps, deltacnstar_cnt, [ac,exp]]
+    else
+      [prots, peps, deltacnstar_cnt]
+    end
   end
-  arr_of_num_of_prots = arr_of_prots_and_peps.map {|ar| ar[0].size }
-  arr_of_num_of_peps = arr_of_prots_and_peps.map {|ar| ar[1].size }
+  arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
+  arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
+  deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
+  cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
   prot_nums = arr_of_num_of_prots
   pep_nums = arr_of_num_of_peps
+  ## files = [file1, file2, file3]
+  ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
+  ## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
   files.each_with_index do |file,i|
     if !interactive
       puts "#{file} [prots]:\t#{prot_nums[i]}"
-      puts "#{file}  [peps]:\t#{pep_nums[i]}"
+      puts "#{file}  [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
     else
       puts "file#{i+1} [prots]:  #{prot_nums[i]}"
-      puts "file#{i+1}  [peps]:  #{pep_nums[i]}"
+      puts "file#{i+1}  [peps]:  #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
     end
     if fpr
-      puts "FPR [prots] :  " + sprintf( "%.3f", 100.0*(prot_nums[-1].to_f/prot_nums[0].to_f) ) + " % (#{prot_nums[-1]})"
-      puts "FPR  [peps] :  " + sprintf( "%.3f", 100.0*(pep_nums[-1].to_f/pep_nums[0].to_f) ) + " % (#{pep_nums[-1]})"
+      #puts "FPR [prots] :  " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
+      #puts "FPR  [peps] :  " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
+      ## For separate searches: every false positive = one less TP
+      ## For concatenated searches: every false positive is one less TP
+      ## THAT's what I've been doing already !
+      prot_tps = prot_nums[i] - prot_nums[-1]
+      pep_tps = pep_nums[i] - pep_nums[-1]
+      prot_fps = prot_nums[i] - prot_tps
+      pep_fps = pep_nums[i] - pep_tps
+      prot_fpr = prot_fps.to_f/prot_nums[i].to_f
+      pep_fpr = pep_fps.to_f/pep_nums[i].to_f
+      # those are the same!
+      puts "FPR [prots] :  " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
+      puts "FPR  [peps] :  " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
+    end
+    if cysteines
+      (ac, exp) = cys_reports[i]
+      (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
+      fraction_of_expected = ac.to_f/exp
+      cys_tps = pep_nums[i] - total_num_false
+      puts "CYSTEINE FPR: "
+      puts "  (# peps containing >= 1 cysteines)"
+      puts "              actual: #{ac}"
+      puts "fraction of expected: #{short(fraction_of_expected)}"
+      puts "     expected # FP's: " + short(total_num_false)
+      puts "      estimated  FPR: " + short( 100.0*cys_fprate ) + " % "
+      puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
+      puts "Combined Score & FPR"
+      puts "#{combined_score}\t#{cys_fprate}"
+      puts "Combined Score & fraction of expected"
+      #puts "#{combined_score} #{fraction_of_expected}"
+      to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
+      puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
+      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
+      puts to_tab.join("\t") if TABULATE_DATA
+    end
+    if $true_pos_aaseqs
+      peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
+      real_tps = 0
+      real_fps = 0
+      # could also do with partition
+      peps.each do |pep|
+        if pep.sequence =~ /\.([\w\*]+)\.?/
+          if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
+            real_tps += 1
+          else
+            real_fps += 1
+          end
+        else
+          abort "Couldn't Match: #{pep.sequence}"
+        end
+      end
+      if peps.size > 0
+        real_fpr = real_fps.to_f/peps.size
+      else
+        real_fpr = 0.0
+      end
+      puts "REAL FPR: #{real_fpr}"
+      puts "REAL #TP: #{real_tps}"
+      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
+      puts to_tab.join("\t") if TABULATE_DATA
     end
   end
   #puts files.join(' | ')
   #puts nums.join(' | ')
 end
+# (actual # with cys, expected # with cys, total#peptides,
+# mean_fraction_of_cysteines_true, std)
+# PepHit(C) = Peptide containing cysteine
+#   # Total PepHit(C)                   # Observed Bad Pep (C)
+#   ------------------ proportional_to  ----------------------
+#   # Total PepHit                      # Total Bad PepHit (X)
+def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
+  # the number of bona fide BAD cysteine hits
+  # (some of the cysteine hits (~5%) are true positives)
+  ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
+  if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
+  total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
+  fpr = total_number_false / total_peptides
+  [fpr, total_number_false]
+end
 # assumes its already chomped
 # updates the 5 globals
 def prep_reply(reply, base)
@@ -108,8 +229,22 @@ def prep_reply(reply, base)
 end
 def file_to_prefiltered_spec_id(file)
-  spec_id = SpecID.new(file)
-  spec_id.top_peps_prefilter!
+  spec_id = nil
+  marshal_file = file + ".prefiltered.msh"
+  if File.exist?(marshal_file)
+    File.open(marshal_file) do |fh|
+      spec_id = Marshal.load(fh)
+    end
+  else
+    spec_id = SpecID.new(file)
+    spec_id.top_peps_prefilter!
+    ## marshal it!
+    if WRITE_MARSHAL
+      File.open(marshal_file, "w") do |fh|
+        Marshal.dump(spec_id,fh)
+      end
+    end
+  end
   spec_id
 end
@@ -123,7 +258,6 @@ def interactive_help
   puts "'q' to quit"
 end
 opts.parse!
 if ARGV.size < 1
@@ -140,6 +274,28 @@ arr_of_spec_ids = files.map do |file|
 end
 fpr = nil
+cysteines = nil
+if opt.cysteines
+  puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
+  if File.exist? opt.cysteines
+    cysteines = SpecID::AAFreqs.new(opt.cysteines)
+  else
+    cysteines = opt.cysteines.to_f
+  end
+  if opt.cback
+    ($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
+  end
+end
+$true_pos_aaseqs = nil
+if opt.true_pos
+  puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
+  fasta = Fasta.new.read_file(opt.true_pos)
+  $true_pos_aaseqs = fasta.prots.map do |prot|
+    prot.aaseq.chomp
+  end
+end
 if opt.false
   # its a file if it exists
   if File.exist? opt.false
@@ -163,12 +319,18 @@ end
 base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
-if opt.i
+if opt.from_file
+  lines = IO.readlines(opt.from_file)
+  lines.each do |line|
+    line.chomp!
+    answer = prep_reply(line, base_args)
+    next if answer == false
+    base_args = answer
+    filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
+  end
+elsif opt.i
   interactive_help
   puts "*******************************************************"
   puts "Number of proteins in files (this order):"
@@ -187,13 +349,13 @@ if opt.i
         interactive_help
       else
         base_args = answer
-        filter_round(files, arr_of_spec_ids, :common, base_args, fpr, true)
+        filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
         break
       end
     end
   end
 else
-  filter_round(files, arr_of_spec_ids, :common, base_args, fpr, false)
+  filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
 end