RubyGems - mspire - Versions diffs - 0.2.1 → 0.2.2 - Mend

mspire 0.2.1 → 0.2.2

Files changed (10) hide show

data/Rakefile +1 -1
data/changelog.txt +9 -0
data/lib/spec_id.rb +22 -18
data/lib/spec_id/filter.rb +7 -4
data/lib/spec_id/precision.rb +5 -4
data/lib/spec_id/protein_summary.rb +19 -12
data/test/tc_precision.rb +4 -4
data/test/tc_protein_summary.rb +1 -1
data/test/tc_spec_id.rb +3 -3
metadata +2 -2

data/Rakefile CHANGED Viewed

@@ -140,7 +140,7 @@ tm = Time.now
 spec = Gem::Specification.new do |s|
   s.platform = Gem::Platform::RUBY
   s.name = NAME
-  s.version = "0.2.1"
+  s.version = "0.2.2"
   s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
   s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
   s.email = "jprince@icmb.utexas.edu"

data/changelog.txt CHANGED Viewed

@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
 Can read .srf files (nearly interchangeable with bioworks files)
 Redid filter.rb
+## version 0.2.1
+minor bugfix
+## version 0.2.2
+Made compatible with the Bioworks fasta file reverser and updated the tutorial.
+Removed the classify_by_prefix routine in favor of classify_by_false_flag, which
+has a prefix option.

data/lib/spec_id.rb CHANGED Viewed

@@ -223,13 +223,7 @@ module SpecID
     pps
   end
-  # returns [tp, fp] based on the protein prefix for items where items =
-  # (:prot|:peps)
-  # this may result in a duplication of some peptides if they match both
-  # normal and decoy proteins.  In this case, the protein arrays are split,
-  # too, so that each points only to its breed of protein.
-  def classify_by_prefix(items, prefix, fp_on_match=true)
-    regex = /^#{Regexp.escape(prefix)}/
+  def classify_by_regex(items, regex, fp_on_match=true)
     case items
     when :prots
       myproc = proc { |prt|
@@ -264,15 +258,21 @@ module SpecID
     else
       abort "don't recognize "
     end
-end
+  end
-  ###### ThIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETEED...
-  #  # Returns tp, fp where each is an array of proteins where fp is determined
-  #  # by a protein's reference matching the prefix.  fp is a protein matching!
-  #  def classify_prots_by_prefix(prefix)
-  #    regex = /^#{Regexp.escape(prefix)}/
-  #    classify(:prots, proc {|prot| prot.reference })
-  #  end
+  # returns [tp, fp] for items (:prots|:peps), classified by the false flag.
+  # flag is escaped (matched literally); if prefix is true the match is
+  # anchored with ^, otherwise flag may match anywhere.
+  # this may result in a duplication of some peptides if they match both
+  # normal and decoy proteins; the protein arrays are then split as well.
+  def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
+    if prefix
+      regex = /^#{Regexp.escape(flag)}/
+    else
+      regex = /#{Regexp.escape(flag)}/
+    end
+    classify_by_regex(items, regex, fp_on_match)
+  end
   # Returns (match, nomatch)
   # items = symbol (:prots, :peps)
@@ -354,10 +354,14 @@ end
   end
   # convenience method for the common task of determining precision for
-  # proteins (with decoy proteins found by prefix)
+  # proteins (with decoy proteins found by false_flag)
   # returns (num_hits, precision)
-  def num_hits_and_ppv_for_prob(fp_prefix)
-    regex = /^#{Regexp.escape(fp_prefix)}/
+  def num_hits_and_ppv_for_prob(false_flag, prefix=false)
+    if prefix
+      regex = /^#{Regexp.escape(false_flag)}/
+    else
+      regex = /#{Regexp.escape(false_flag)}/
+    end
     prob_proc = probability_proc
     myproc = proc { |prt|
       if prt.reference =~ regex ; false

data/lib/spec_id/filter.rb CHANGED Viewed

@@ -200,7 +200,7 @@ class SpecID::Filter
             new_spec_ids << spec_id
             file_to_prefiltered_spec_id(prefix_or_file, opt)
           else
-            (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
+            (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
             fps_specid = spec_id.class.new
             tps_specid = spec_id.class.new
@@ -339,8 +339,10 @@ class SpecID::Filter
       op.on("-p", "--ppm N", Float,     "<= ppm               d: #{opt.ppm}") {|v| opt.ppm = v}
       op.separator "                                     if bioworks.xml, = 10^6deltamass/mass"
       op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
-      op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
-      op.separator("                                     last given will apply to remaining files")
+      op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
+      op.separator("                                     e.g., for Bioworks: 'REVERSE'")
+      op.separator("                                     (last given will apply to remaining files)")
+      op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
       op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
         v[0] = get_cys_freq(v[0])
         opt.cys = v
@@ -354,7 +356,8 @@ class SpecID::Filter
       op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
       op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
       op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
-      op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
+      ## NEED TO IMPLEMENT THIS:
+      #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
       op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
     end

data/lib/spec_id/precision.rb CHANGED Viewed

@@ -245,11 +245,12 @@ class Prec
       op.separator ""
       op.separator "Options:"
-      op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
+      op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
       op.separator ""
-      op.separator "        If searched with a concatenated DB, give a PREFIX to decoy proteins."
-      op.separator "        If files have different prefixes, separate with commas."
+      op.separator "        If searched with a concatenated DB, give a false flag to decoy proteins."
+      op.separator "        If files have different flags, separate with commas."
       op.separator "        If searched with a separate decoy DB, give the FILE name of decoy data"
+      op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
       op.separator ""
       ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
       op.separator ""
@@ -374,7 +375,7 @@ Example:
       sp = SpecID.new(file)
       #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
       if opt.f
-        (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
+        (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
         all_arrs[i] << [num_hits,ppv]
         key[i] << ["Precision",  ["# hits", "Prec (decoy)"]]
       end

data/lib/spec_id/protein_summary.rb CHANGED Viewed

@@ -145,9 +145,13 @@ class ProteinSummary
     end
   end
-  def prefix_to_regex(prefix)
-    if prefix
-      /^#{Regexp.escape(prefix)}/
+  def flag_to_regex(flag, prefix=false)
+    if flag
+      if prefix
+        /^#{Regexp.escape(flag)}/
+      else
+        /#{Regexp.escape(flag)}/
+      end
     else
       nil
     end
@@ -164,12 +168,12 @@ class ProteinSummary
   end
   # filters on the false positive regex and sorts by prot probability
-  def filter_and_sort(uniq_prots, prefix=nil)
-    prefix_re = prefix_to_regex(prefix)
+  # flag   = string marking false (decoy) proteins; nil disables filtering
+  # prefix = if true, flag is matched only as a prefix of the protein name
+  # returns the proteins sorted by [_probability, parent._probability], reversed
+  def filter_and_sort(uniq_prots, flag=nil, prefix=false)
+    false_flag_re = flag_to_regex(flag, prefix)
     sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
-    ## filter on prefix
-    if prefix
-      sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
+    ## filter on the false flag: test the regex (nil when no flag was given),
+    ## not the prefix option, which only selects how the flag is matched
+    if false_flag_re
+      sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
     end
     sorted
   end
@@ -292,7 +296,7 @@ class ProteinSummary
       end
     end
     uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
-    filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
+    filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
     ## num proteins above cutoff (if opt.c)
     num_prots_html = ''
@@ -322,7 +326,7 @@ class ProteinSummary
   # takes spec_id object
   # the outfn is the output filename
   # opt is an OpenStruct that holds opt.f = the false prefix
-  def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
+  def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
     fppr_output_as_html ||= ''
     header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
     num_cols = header_anchors.size
@@ -330,9 +334,8 @@ class ProteinSummary
     proteins = spec_id.prots
     protein_num = 0
     rows = ""
-    prefix_re = prefix_to_regex(false_prefix)
     proteins.each do |prot|
-      if false_prefix && prot.reference =~ prefix_re
+      if false_flag_re && prot.reference =~ false_flag_re
         next
       end
       uniq_peps = Hash.new {|h,k| h[k] = true; }
@@ -393,7 +396,8 @@ class ProteinSummary
       op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
       op.separator "    outputs: <file>.summary.html"
       op.separator ""
-      op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
+      op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
+      op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
       op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
       op.separator("             if --precision then -f is used to specify a file or prefix")
       op.separator("             that indicates the false positives.")
@@ -434,7 +438,9 @@ class ProteinSummary
         proph_output(file, outfn, opt, fppr_output_as_html)
       when "bioworks"
         spec_id = SpecID.new(file)
-        bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
+        false_regex = flag_to_regex(opt.f, opt.prefix)
+        bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
       else
         abort "filetype for #{file} not recognized!"
       end
@@ -445,6 +451,7 @@ class ProteinSummary
   def create_precision_argv(file, opt)
     # include only those options specific
     new_argv = [file]
+    if opt.prefix ; new_argv << '--prefix' end
     if opt.f ; new_argv << '-f' << opt.f end
     if opt.o ; new_argv << '-o' << opt.o end
     new_argv

data/test/tc_precision.rb CHANGED Viewed

@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
   end
   def test_basic_cat
-    output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
+    output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
     puts output
     assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
   end
   def test_multiple_files
-    output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
+    output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
     assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
     assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
     [@tf_html, @tf_png].each do |file|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
   def test_area_under_curve
     file = @tfiles + 'ppv_area.txt'
-    `#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
+    `#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`  # --prefix: match SHUFF_ as a prefix only
     assert(File.exist?(file), "file #{file} exists")
     output = IO.read(file)
     assert_match(/Prec.*7.39206/, output, "consistency check")
     File.unlink file
     outfile = File.join(File.dirname(__FILE__), 'other.html')
-    `#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
+    `#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`  # same flag handling as above
     File.unlink outfile
     File.unlink File.join(File.dirname(__FILE__),'other.png')
   end

data/test/tc_protein_summary.rb CHANGED Viewed

@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
   def test_proph_with_precision
     #puts @cmd
-    runit "#{@tf_proph_cat_inv} -f INV_ --precision"
+    runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
     html =  IO.read(@tf_proph_cat_inv_summary_html)
     assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
     assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")

data/test/tc_spec_id.rb CHANGED Viewed

@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
     assert_equal(106, sp.prots.size)
   end
-  def test_classify_by_prefix
+  def test_classify_by_false_flag
     file = @tfiles + "bioworks_with_INV_small.xml"
     sp = SpecID.new(file)
     assert_equal(19, sp.prots.size)
-    (tp, fp) = sp.classify_by_prefix(:prots, "INV_")
+    (tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)  # args: fp_on_match=true, prefix=true
     assert_equal(4, fp.size, "num false pos")
     assert_equal(15, tp.size, "num true pos")
   end
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
     (tps, ys) = roc.tps_and_ppv(tp, fp)
     assert_equal(exp_tp, tps)
     assert_equal(exp_fp, ys)
-    (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
+    (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
     # @TODO: assert these guys for consistencies sake:
     assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
     # Consistency check only:

metadata CHANGED Viewed

@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: mspire
 version: !ruby/object:Gem::Version
-  version: 0.2.1
-date: 2007-04-30 00:00:00 -05:00
+  version: 0.2.2
+date: 2007-05-08 00:00:00 -05:00
 summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
 require_paths:
 - lib