RubyGems - mspire - Versions diffs - 0.2.1 → 0.2.2 - Mend

mspire 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/Rakefile +1 -1
data/changelog.txt +9 -0
data/lib/spec_id.rb +22 -18
data/lib/spec_id/filter.rb +7 -4
data/lib/spec_id/precision.rb +5 -4
data/lib/spec_id/protein_summary.rb +19 -12
data/test/tc_precision.rb +4 -4
data/test/tc_protein_summary.rb +1 -1
data/test/tc_spec_id.rb +3 -3
metadata +2 -2

data/Rakefile CHANGED Viewed

@@ -140,7 +140,7 @@ tm = Time.now
 spec = Gem::Specification.new do |s|
   s.platform = Gem::Platform::RUBY
   s.name = NAME
-  s.version = "0.2.1"
+  s.version = "0.2.2"
   s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
   s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
   s.email = "jprince@icmb.utexas.edu"

data/changelog.txt CHANGED Viewed

@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
 Can read .srf files (nearly interchangeable with bioworks files)
 Redid filter.rb
+## version 0.2.1
+minor bugfix
+## version 0.2.2
+made compatible with Bioworks fasta file reverser and updated tutorial.
+Killed classify_by_prefix routine in favor of classify_by_false_flag which has
+a prefix option

data/lib/spec_id.rb CHANGED Viewed

@@ -223,13 +223,7 @@ module SpecID
     pps
   end
-  # returns [tp, fp] based on the protein prefix for items where items =
-  # (:prot|:peps)
-  # this may result in a duplication of some peptides if they match both
-  # normal and decoy proteins.  In this case, the protein arrays are split,
-  # too, so that each points only to its breed of protein.
-  def classify_by_prefix(items, prefix, fp_on_match=true)
-    regex = /^#{Regexp.escape(prefix)}/
+  def classify_by_regex(items, regex, fp_on_match=true)
     case items
     when :prots
       myproc = proc { |prt|
@@ -264,15 +258,21 @@ module SpecID
     else
       abort "don't recognize "
     end
-end
+  end
-  ###### ThIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETEED...
-  #  # Returns tp, fp where each is an array of proteins where fp is determined
-  #  # by a protein's reference matching the prefix.  fp is a protein matching!
-  #  def classify_prots_by_prefix(prefix)
-  #    regex = /^#{Regexp.escape(prefix)}/
-  #    classify(:prots, proc {|prot| prot.reference })
-  #  end
+  # returns [tp, fp] based on the protein prefix for items where items =
+  # (:prot|:peps)
+  # this may result in a duplication of some peptides if they match both
+  # normal and decoy proteins.  In this case, the protein arrays are split,
+  # too, so that each points only to its breed of protein.
+  def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
+    if prefix
+      regex = /^#{Regexp.escape(flag)}/
+    else
+      regex = /#{Regexp.escape(flag)}/
+    end
+    classify_by_regex(items, regex, fp_on_match)
+  end
   # Returns (match, nomatch)
   # items = symbol (:prots, :peps)
@@ -354,10 +354,14 @@ end
   end
   # convenience method for the common task of determining precision for
-  # proteins (with decoy proteins found by prefix)
+  # proteins (with decoy proteins found by false_flag)
   # returns (num_hits, precision)
-  def num_hits_and_ppv_for_prob(fp_prefix)
-    regex = /^#{Regexp.escape(fp_prefix)}/
+  def num_hits_and_ppv_for_prob(false_flag, prefix=false)
+    if prefix
+      regex = /^#{Regexp.escape(false_flag)}/
+    else
+      regex = /#{Regexp.escape(false_flag)}/
+    end
     prob_proc = probability_proc
     myproc = proc { |prt|
       if prt.reference =~ regex ; false

data/lib/spec_id/filter.rb CHANGED Viewed

@@ -200,7 +200,7 @@ class SpecID::Filter
             new_spec_ids << spec_id
             file_to_prefiltered_spec_id(prefix_or_file, opt)
           else
-            (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
+            (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
             fps_specid = spec_id.class.new
             tps_specid = spec_id.class.new
@@ -339,8 +339,10 @@ class SpecID::Filter
       op.on("-p", "--ppm N", Float,     "<= ppm               d: #{opt.ppm}") {|v| opt.ppm = v}
       op.separator "                                     if bioworks.xml, = 10^6deltamass/mass"
       op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
-      op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
-      op.separator("                                     last given will apply to remaining files")
+      op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
+      op.separator("                                     e.g., for Bioworks: 'REVERSE'")
+      op.separator("                                     (last given will apply to remaining files)")
+      op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
       op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
         v[0] = get_cys_freq(v[0])
         opt.cys = v
@@ -354,7 +356,8 @@ class SpecID::Filter
       op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
       op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
       op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
-      op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
+      ## NEED TO IMPLEMENT THIS:
+      #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
       op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
     end

data/lib/spec_id/precision.rb CHANGED Viewed

@@ -245,11 +245,12 @@ class Prec
       op.separator ""
       op.separator "Options:"
-      op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
+      op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
       op.separator ""
-      op.separator "        If searched with a concatenated DB, give a PREFIX to decoy proteins."
-      op.separator "        If files have different prefixes, separate with commas."
+      op.separator "        If searched with a concatenated DB, give a false flag to decoy proteins."
+      op.separator "        If files have different flags, separate with commas."
       op.separator "        If searched with a separate decoy DB, give the FILE name of decoy data"
+      op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
       op.separator ""
       ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
       op.separator ""
@@ -374,7 +375,7 @@ Example:
       sp = SpecID.new(file)
       #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
       if opt.f
-        (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
+        (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
         all_arrs[i] << [num_hits,ppv]
         key[i] << ["Precision",  ["# hits", "Prec (decoy)"]]
       end

data/lib/spec_id/protein_summary.rb CHANGED Viewed

@@ -145,9 +145,13 @@ class ProteinSummary
     end
   end
-  def prefix_to_regex(prefix)
-    if prefix
-      /^#{Regexp.escape(prefix)}/
+  def flag_to_regex(flag, prefix=false)
+    if flag
+      if prefix
+        /^#{Regexp.escape(flag)}/
+      else
+        /#{Regexp.escape(flag)}/
+      end
     else
       nil
     end
@@ -164,12 +168,12 @@ class ProteinSummary
   end
   # filters on the false positive regex and sorts by prot probability
-  def filter_and_sort(uniq_prots, prefix=nil)
-    prefix_re = prefix_to_regex(prefix)
+  def filter_and_sort(uniq_prots, flag=nil, prefix=false)
+    false_flag_re = flag_to_regex(flag, prefix)
     sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
     ## filter on prefix
     if prefix
-      sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
+      sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
     end
     sorted
   end
@@ -292,7 +296,7 @@ class ProteinSummary
       end
     end
     uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
-    filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
+    filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
     ## num proteins above cutoff (if opt.c)
     num_prots_html = ''
@@ -322,7 +326,7 @@ class ProteinSummary
   # takes spec_id object
   # the outfn is the output filename
   # opt is an OpenStruct that holds opt.f = the false prefix
-  def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
+  def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
     fppr_output_as_html ||= ''
     header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
     num_cols = header_anchors.size
@@ -330,9 +334,8 @@ class ProteinSummary
     proteins = spec_id.prots
     protein_num = 0
     rows = ""
-    prefix_re = prefix_to_regex(false_prefix)
     proteins.each do |prot|
-      if false_prefix && prot.reference =~ prefix_re
+      if false_flag_re && prot.reference =~ false_flag_re
         next
       end
       uniq_peps = Hash.new {|h,k| h[k] = true; }
@@ -393,7 +396,8 @@ class ProteinSummary
       op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
       op.separator "    outputs: <file>.summary.html"
       op.separator ""
-      op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
+      op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
+      op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
       op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
       op.separator("             if --precision then -f is used to specify a file or prefix")
       op.separator("             that indicates the false positives.")
@@ -434,7 +438,9 @@ class ProteinSummary
         proph_output(file, outfn, opt, fppr_output_as_html)
       when "bioworks"
         spec_id = SpecID.new(file)
-        bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
+        false_regex = flag_to_regex(opt.f, opt.prefix)
+        bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
       else
         abort "filetype for #{file} not recognized!"
       end
@@ -445,6 +451,7 @@ class ProteinSummary
   def create_precision_argv(file, opt)
     # include only those options specific
     new_argv = [file]
+    if opt.prefix ; new_argv << '--prefix' end
     if opt.f ; new_argv << '-f' << opt.f end
     if opt.o ; new_argv << '-o' << opt.o end
     new_argv

data/test/tc_precision.rb CHANGED Viewed

@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
   end
   def test_basic_cat
-    output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
+    output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
     puts output
     assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
   end
   def test_multiple_files
-    output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
+    output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
     assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
     assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
     [@tf_html, @tf_png].each do |file|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
   def test_area_under_curve
     file = @tfiles + 'ppv_area.txt'
-    `#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
+    `#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
     assert(File.exist?(file), "file #{file} exists")
     output = IO.read(file)
     assert_match(/Prec.*7.39206/, output, "consistency check")
     File.unlink file
     outfile = File.join(File.dirname(__FILE__), 'other.html')
-    `#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
+    `#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
     File.unlink outfile
     File.unlink File.join(File.dirname(__FILE__),'other.png')
   end

data/test/tc_protein_summary.rb CHANGED Viewed

@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
   def test_proph_with_precision
     #puts @cmd
-    runit "#{@tf_proph_cat_inv} -f INV_ --precision"
+    runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
     html =  IO.read(@tf_proph_cat_inv_summary_html)
     assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
     assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")

data/test/tc_spec_id.rb CHANGED Viewed

@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
     assert_equal(106, sp.prots.size)
   end
-  def test_classify_by_prefix
+  def test_classify_by_false_flag
     file = @tfiles + "bioworks_with_INV_small.xml"
     sp = SpecID.new(file)
     assert_equal(19, sp.prots.size)
-    (tp, fp) = sp.classify_by_prefix(:prots, "INV_")
+    (tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)
     assert_equal(4, fp.size, "num false pos")
     assert_equal(15, tp.size, "num true pos")
   end
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
     (tps, ys) = roc.tps_and_ppv(tp, fp)
     assert_equal(exp_tp, tps)
     assert_equal(exp_fp, ys)
-    (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
+    (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
     # @TODO: assert these guys for consistencies sake:
     assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
     # Consistency check only:

metadata CHANGED Viewed

@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: mspire
 version: !ruby/object:Gem::Version
-  version: 0.2.1
-date: 2007-04-30 00:00:00 -05:00
+  version: 0.2.2
+date: 2007-05-08 00:00:00 -05:00
 summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
 require_paths:
 - lib