RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/lib/spec_id/precision.rb ADDED Viewed

@@ -0,0 +1,427 @@
+require 'optparse'
+require 'ostruct'
+require 'generator'
+require 'gnuplot'
+require 'roc'
+class String
+  def margin
+    self.gsub(/^\s*\|/,'')
+  end
+end
+class SpecID ; end
+class SpecID::Precision ; end
+module SpecID::Precision::PlotHelper
+  PLOT_TYPE = 'XYData'
+  TITLE = 'Precision (Positive Predictive Value)'
+  XAXIS = 'Num Hits (excludes known false positives)'
+  EXT = '.toplot'
+  IMAGE_EXT = '.png'
+  def create_to_plot_file(all_arrs, key, files, filename_noext)
+    ## CREATE the PLOT IMAGE:
+    to_plot = filename_noext + EXT
+    png = filename_noext + IMAGE_EXT
+    File.open(to_plot,'w') do |out|
+      out.puts PLOT_TYPE
+      out.puts filename_noext
+      out.puts TITLE
+      out.puts XAXIS
+      out.puts escape_to_gnuplot(y_axis_label(key))
+      files.each_with_index do |file,i|
+        #p key[i]
+        #p all_arrs[i]
+        key[i].each_with_index do |k,j|
+          out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
+          out.puts all_arrs[i][j][0].join(' ')
+          out.puts all_arrs[i][j][1].join(' ')
+        end
+      end
+    end
+  end
+  ## outputs a .toplot file based on filename_noext, creates a png file, and
+  ## writes  html to fh that will load the png file up
+  ## This is a self contained module that can be swapped out for a
+  ## completely different plotting program if desired.
+  def plot_figure(all_arrs, key, files, filename_noext)
+    ## CREATE the PLOT IMAGE:
+    to_plot = filename_noext+'.toplot'
+    png = filename_noext+'.png'
+    Gnuplot.open do |gp|
+      Gnuplot::Plot.new( gp ) do |plot|
+        plot.terminal "png noenhanced"
+        plot.output png
+        plot.title TITLE
+        plot.xlabel XAXIS
+        plot.ylabel escape_to_gnuplot(y_axis_label(key))
+        plot.style "line 1 lt 1"
+        plot.style "line 2 lt 12"
+        #plot.style  "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
+        plot.yrange "[-0.05:#{1.05 + 0.020*files.size}]"
+        files.each_with_index do |file,i|
+          key[i].each_with_index do |k,j|
+            plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
+              ds.with = "lines"
+              ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
+            end
+          end
+        end
+      end
+    end
+    ## CREATE the HTML to load the plot:
+    basename_filename_noext = File.basename(filename_noext)
+    output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
+    #output << "<caption align=\"bottom\">Additional views of this data may be obtained by using the <span class=\"code\">plot.rb</span> command on '#{to_plot}' (type <span class=\"code\">plot.rb</span> for more details). Plot generated with command: &nbsp;&nbsp; <span class=\"code\">#{plot_cmd}</span></caption>\n"
+    output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
+    output << "</table></div>\n"
+    output
+  end  # plot_figure
+end
+module SpecID::Precision::HTML
+  # html and body tags
+  def html
+        "|<html>
+         |#{yield}
+         |</html>\n".margin
+  end
+  def body
+        "|<body>
+         |  #{yield}
+         |</body>\n".margin
+  end
+  def header
+        "|<head>
+         |  #{style}
+         |</head>\n".margin
+  end
+  def td
+        "<td>#{yield}</td>"
+  end
+  def style
+        '
+     <style type="text/css">
+        div#tp_table {
+          text-align: center;
+          margin-top: 50px;
+          margin-bottom: 50px;
+        }
+        span.code {
+        font-family: Courier,Monospace;
+        font-size: 80%;
+        }
+          table {
+              border-width:1px;
+              border-color:#CCCCCC;
+              border-collapse: collapse;
+          }
+          caption {
+            font-size: 90%;
+          }
+          td,th {
+              padding-top: 2px;
+              padding-bottom: 2px;
+              padding-left: 1;
+              padding-right: 1;
+          }
+          th.small {
+            font-size: 80%;
+            font-weight: normal;
+            padding: 1px;
+          }
+          td.redline {
+              background-color: #FF0000;
+              color: #FFFFFF
+          }
+      div#plot {
+        margin: 30px;
+     text-align:center
+      }
+      hr {color: sienna}
+      body { font-size: 8pt; font-family: Arial,Helvetica,Times}
+      </style>
+        '
+  end
+  def table
+        "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">
+         |  #{yield}
+         |</table>\n".margin
+  end
+  def tr
+        "|<tr>
+         |  #{yield}
+         |</tr>\n".margin
+  end
+end # module HTML
+class SpecID::Precision
+  include SpecID::Precision::PlotHelper
+  ###########################################################
+  # GLOBAL SETTINGS:
+  DEF_PREFIX = "INV_"
+  DATA_PREC = 4   # decimal places of precision for ppv data
+  STDOUT_JTPLOT_BASE = "ppv"  # if there is no outfile
+  ###########################################################
+  include SpecID::Precision::HTML
+  ## returns an html string
+  def precision(argv)
+    opt = parse_args(argv)
+    files = argv.to_a
+    out_string = prefix_as_decoy(files, opt)
+    [out_string, opt]
+  end
+  def run_cmd_line(argv)
+    output_string, opt, file_as_decoy = precision(argv)
+    if file_as_decoy
+      puts output_string
+    else
+      ## open file and write to it..
+      if opt.o == 'STDOUT'
+        print output_string
+      else
+        File.open(opt.o,'w') do |fh| fh.print output_string end
+      end
+    end
+  end
+  # returns the outfile with no extension
+  def outfile_noext(opt)
+    if opt == 'STDOUT'
+      "#{STDOUT_JTPLOT_BASE}"
+    else
+      opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '')
+    end
+  end
+  def file_noext(file)
+    file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
+  end
+  def parse_args(argv)
+    opt = OpenStruct.new
+    opt.o = 'STDOUT'
+    opts = OptionParser.new do |op|
+      op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
+      op.separator ""
+      op.separator "Abbreviations and Definitions:"
+      op.separator "  TP = True Positives"
+      op.separator "  FP = False Positives"
+      op.separator "  Precision = Positive Predictive Value = [TP/(TP+FP)]"
+      op.separator ""
+      op.separator "Output: "
+      op.separator "  1. Decoy as separate search: PPV to STDOUT"
+      op.separator "  2. Decoy proteins from concatenated database: '.html'"
+      op.separator ""
+      op.separator "Options:"
+      op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
+      op.separator ""
+      op.separator "        If searched with a concatenated DB, give a PREFIX to decoy proteins."
+      op.separator "        If files have different prefixes, separate with commas."
+      op.separator "        If searched with a separate decoy DB, give the FILE name of decoy data"
+      op.separator ""
+      ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
+      op.separator ""
+      op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") {|v| opt.o = v}
+      op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
+      op.on("-j", "--plot_file", "output to_plot file") {|v| opt.j = v}
+      op.on_tail("
+Example:
+  For a search on a concatenated database where the decoy proteins have
+  been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
+  output:
+    #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml
+  ")
+    end
+    opts.parse!(argv)
+    if argv.size < 1
+      puts opts
+      exit
+    end
+    opt
+  end
+  # takes a comma separated list and extends the last to create an array of
+  # desired size
+  def prefixes(arg, desired_size)
+    arg_arr = arg.split(',')
+    new_arr = []
+    last_arg = arg_arr[0]
+    desired_size.times do |i|
+      if arg_arr[i]
+        new_arr[i] = arg_arr[i]
+        last_arg = new_arr[i]
+      else
+        new_arr[i] = last_arg
+      end
+    end
+    new_arr
+  end
+  ## collapses arrays to one level deep so we can sync them up
+  def arrays_to_one_level_deep(all_arrs)
+    mostly_flat = []
+    all_arrs.each do |per_file|
+      per_file.each do |per_style|
+        mostly_flat << per_style[0]
+        mostly_flat << per_style[1]
+      end
+    end
+    mostly_flat
+  end
+  # prints rows and th for the data
+  def table_cells(all_arrs, key)
+    ## columns specific headings:
+    all_string = ""
+    all_string << tr do
+      line = ""
+      key.each do |per_file|
+        per_file.each do |per_ds|
+          line << "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
+        end
+      end
+      line
+    end
+    mostly_flat = arrays_to_one_level_deep(all_arrs)
+    SyncEnumerator.new(*mostly_flat).each do |row|
+      all_string << tr do
+        string = row.map {|it|
+          sty="%d"
+          if it.class == Float ; sty="%.#{DATA_PREC}f" end
+          td{ sprintf(sty,it)}
+        }.join
+      end
+    end
+    all_string
+  end
+  def html_table_output(all_arrs, key, files, filename_noext)
+    num_datasets_per_file = all_arrs.first.size
+    num_cols_per_dataset = 2
+    big_colspan = num_datasets_per_file * num_cols_per_dataset
+    output = table do
+      tr do
+        files.map do |file|
+        "<th colspan=\"#{big_colspan}\">#{file}</th>"
+        end.join
+      end +
+        tr do
+        key.map do |arr|
+          arr.map do |ds|
+        "<th colspan=\"2\">#{ds.first}</th>"
+          end
+        end
+        end +
+          table_cells(all_arrs, key)
+    end
+    "<div id=\"tp_table\">" + output + "</div>"
+  end
+  def y_axis_label(key)
+    ## We only take the keys for the first file, as it's assumed that the major
+    ## labels will be identical for all of them
+    labels = key.first.map {|tp| tp.first }
+    labels.join "  |  "
+  end
+  # escapes any ' chars
+  def escape_to_gnuplot(string)
+    # long way, but it works.
+    new_string = ""
+    string.split(//).each do |chr|
+      if chr == "'" ; new_string << "\\" end
+      new_string << chr
+    end
+    new_string
+  end
+  def prefix_as_decoy(files, opt)
+    $stderr.puts "using prefix #{opt.f} ..."
+    if opt.f
+      prefix_arr = prefixes(opt.f, files.size)
+    end
+    all_arrs = []
+    key = []
+    out_noext = outfile_noext(opt.o)
+    files.each_with_index do |file,i|
+      all_arrs[i] = []
+      key[i] = []
+      sp = SpecID.new(file)
+      #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
+      if opt.f
+        (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
+        all_arrs[i] << [num_hits,ppv]
+        key[i] << ["Precision",  ["#TP", "Prec = TP/(TP+FP)"]]
+      else
+        ## These are just from protein prophet probabilities:
+        (num_hits, ppv) = sp.num_hits_and_ppv_for_protein_prophet_probabilities
+        all_arrs[i] << [num_hits,ppv]
+        key[i] << ["Precision",  ["#TP", "Prec = TP/(TP+FP)"]]
+      end
+    end
+    string = ''
+    if opt.a
+      roc = ROC.new
+      #string << "***********************************************************\n"
+      #string << "AREA UNDER CURVE:\n"
+      key.each_with_index do |file,i|
+        string << "#{files[i]} (area under curve)\n"
+        key[i].each_index do |j|
+          string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
+          num_hits = all_arrs[i][j][0]
+          oth = all_arrs[i][j][1]
+          string << roc.area_under_curve(num_hits, oth).to_s << "\n"
+        end
+      end
+      #string << "***********************************************************\n"
+    else
+      if opt.j
+        create_to_plot_file(all_arrs, key, files, out_noext)
+      end
+      string = html do
+        header +
+          body do
+          plot_figure(all_arrs, key, files, out_noext) +
+            html_table_output(all_arrs, key, files, out_noext)
+          end
+      end
+    end
+    string
+  end
+end # class SpecID

data/lib/spec_id/proph.rb CHANGED Viewed

@@ -14,7 +14,7 @@ class Proph
 class Parser
   def root_el(file)
-    XMLTree.parse_file(file)
+    AXML.parse_file(file)
   end
 end
@@ -275,7 +275,7 @@ class Pep::Parser < Parser
       ## file from peptideAtlas:
       search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
-      search_result_regex2 = /<search_result spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
+      search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
       search_hit_regex = /<search_hit .*peptide="(\w+)" /o
       peptide_h = {}