RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/lib/spec_id/false_positive_rate.rb DELETED Viewed

@@ -1,476 +0,0 @@
-# for false_positive_rate:
-require 'optparse'
-require 'ostruct'
-require 'generator'
-require 'gnuplot'
-require 'roc'
-class String
-  def margin
-    self.gsub(/^\s*\|/,'')
-  end
-end
-class SpecID
-  class FalsePositiveRate
-    module HTML
-      # html and body tags
-      def html
-        "|<html>
-         |#{yield}
-         |</html>\n".margin
-      end
-      def body
-        "|<body>
-         |  #{yield}
-         |</body>\n".margin
-      end
-      def header
-        "|<head>
-         |  #{style}
-         |</head>\n".margin
-      end
-      def td
-        "<td>#{yield}</td>"
-      end
-      def style
-        '
-     <style type="text/css">
-        div#tp_table {
-          text-align: center;
-          margin-top: 50px;
-          margin-bottom: 50px;
-        }
-        span.code {
-        font-family: Courier,Monospace;
-        font-size: 80%;
-        }
-          table {
-              border-width:1px;
-              border-color:#CCCCCC;
-              border-collapse: collapse;
-          }
-          caption {
-            font-size: 90%;
-          }
-          td,th {
-              padding-top: 2px;
-              padding-bottom: 2px;
-              padding-left: 1;
-              padding-right: 1;
-          }
-          th.small {
-            font-size: 80%;
-            font-weight: normal;
-            padding: 1px;
-          }
-          td.redline {
-              background-color: #FF0000;
-              color: #FFFFFF
-          }
-      div#plot {
-        margin: 30px;
-     text-align:center
-      }
-      hr {color: sienna}
-      body { font-size: 8pt; font-family: Arial,Helvetica,Times}
-      </style>
-        '
-      end
-      def table
-        "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">
-         |  #{yield}
-         |</table>\n".margin
-      end
-      def tr
-        "|<tr>
-         |  #{yield}
-         |</tr>\n".margin
-      end
-    end # module HTML
-  end # class FalsePositiveRate
-end #class SpecID
-class SpecID
-  class FalsePositiveRate
-    ###########################################################
-    # GLOBAL SETTINGS:
-    DEF_PREFIX = "INV_"
-    DATA_PREC = 4   # decimal places of precision or fpr data
-    STDOUT_JTPLOT_BASE = "fpr"  # if there is no outfile
-    ###########################################################
-    include SpecID::FalsePositiveRate::HTML
-    ## returns either an ascii string (if file_as_decoy) or
-    ## returns an html string
-    def false_positive_rate(argv)
-      opt = parse_args(argv)
-      files = argv.to_a
-      file_as_decoy = false
-      if File.exist? opt.f
-        file_as_decoy = true
-        out_string = file_as_decoy(files, opt)
-      else
-        out_string = prefix_as_decoy(files, opt)
-      end
-      return [out_string, opt, file_as_decoy]
-    end
-    def run_cmd_line(argv)
-      output_string, opt, file_as_decoy = false_positive_rate(argv)
-      if file_as_decoy
-        puts output_string
-      else
-        ## open file and write to it..
-        if opt.o == 'STDOUT'
-          print output_string
-        else
-          File.open(opt.o,'w') do |fh| fh.print output_string end
-        end
-      end
-    end
-    # returns the outfile with no extension
-    def outfile_noext(opt)
-      if opt == 'STDOUT'
-      "#{STDOUT_JTPLOT_BASE}"
-      else
-        opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '')
-      end
-    end
-    def file_noext(file)
-      file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
-    end
-    def parse_args(argv)
-      opt = OpenStruct.new
-      opt.f = DEF_PREFIX
-      opt.o = 'STDOUT'
-      opts = OptionParser.new do |op|
-        op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
-        op.separator ""
-        op.separator "Abbreviations and Definitions:"
-        op.separator "  TP = True Positives"
-        op.separator "  FP = False Positive"
-        op.separator "  FPR = False Positive Rate = [FP/(TP+FP)] (between 0 and 1)"
-        op.separator "  FPR2 = Gygi's estimation of FPR = [2*FPR]"
-        op.separator "  Precision = [TP/(TP+FP)]"
-        op.separator ""
-        op.separator "Output: "
-        op.separator "  1. Decoy as separate search: FPR to STDOUT"
-        op.separator "  2. Decoy proteins from concatenated database: 'fpr.html'"
-        op.separator ""
-        op.separator "Options:"
-        op.on("-f", "--fp_data <prefix_or_file>", "PREFIX (def: #{DEF_PREFIX}) -or- decoy FILE") {|v| opt.f = v }
-        op.separator ""
-        op.separator "        If searched with a concatenated DB, give a PREFIX to decoy proteins."
-        op.separator "        If files have different prefixes, separate with commas."
-        op.separator "        If searched with a separate decoy DB, give the FILE name of decoy data"
-        op.separator ""
-        op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
-        ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
-        op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
-        op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
-        op.separator ""
-        op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") {|v| opt.o = v}
-        op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
-        op.on_tail("
-Examples:
-  1. For a search on a concatenated database where the decoy proteins have
-  been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
-  output:
-    #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml
-  2. To determine the false positive rate of a search (and fpr2 and precision)
-  using a normal and decoy database search, filter both the normal and decoy
-  datasets identically, export to xml and run like this (only works for
-  Bioworks xml export):
-    #{File.basename(__FILE__)} -tp -f decoy_bioworks.xml bioworks.xml
-  ")
-      end
-      opts.parse!(argv)
-      if argv.size < 1
-        puts opts
-        exit
-      end
-      opt
-    end
-    # takes a comma separated list and extends the last to create an array of
-    # desired size
-    def prefixes(arg, desired_size)
-      arg_arr = arg.split(',')
-      new_arr = []
-      last_arg = arg_arr[0]
-      desired_size.times do |i|
-        if arg_arr[i]
-          new_arr[i] = arg_arr[i]
-          last_arg = new_arr[i]
-        else
-          new_arr[i] = last_arg
-        end
-      end
-      new_arr
-    end
-    ## collapses arrays to one level deep so we can sync them up
-    def arrays_to_one_level_deep(all_arrs)
-      mostly_flat = []
-      all_arrs.each do |per_file|
-        per_file.each do |per_style|
-          mostly_flat << per_style[0]
-          mostly_flat << per_style[1]
-        end
-      end
-      mostly_flat
-    end
-    # prints rows and th for the data
-    def table_cells(all_arrs, key)
-      ## columns specific headings:
-      all_string = ""
-      all_string << tr do
-        line = ""
-        key.each do |per_file|
-          per_file.each do |per_ds|
-            line << "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
-          end
-        end
-        line
-      end
-      mostly_flat = arrays_to_one_level_deep(all_arrs)
-      SyncEnumerator.new(*mostly_flat).each do |row|
-        all_string << tr do
-          string = row.map {|it|
-            sty="%d"
-            if it.class == Float ; sty="%.#{DATA_PREC}f" end
-            td{ sprintf(sty,it)}
-          }.join
-        end
-      end
-      all_string
-    end
-    def html_table_output(all_arrs, key, files, filename_noext)
-      num_datasets_per_file = all_arrs.first.size
-      num_cols_per_dataset = 2
-      big_colspan = num_datasets_per_file * num_cols_per_dataset
-      output = table do
-        tr do
-          files.map do |file|
-        "<th colspan=\"#{big_colspan}\">#{file}</th>"
-          end.join
-        end +
-          tr do
-          key.map do |arr|
-            arr.map do |ds|
-        "<th colspan=\"2\">#{ds.first}</th>"
-            end
-          end
-          end +
-            table_cells(all_arrs, key)
-      end
-    "<div id=\"tp_table\">" + output + "</div>"
-    end
-    def y_axis_label(key)
-      ## We only take the keys for the first file, as it's assumed that the major
-      ## labels will be identical for all of them
-      labels = key.first.map {|tp| tp.first }
-      labels.join "  |  "
-    end
-    # escapes any ' chars
-    def escape_to_gnuplot(string)
-      # long way, but it works.
-      new_string = ""
-      string.split(//).each do |chr|
-        if chr == "'" ; new_string << "\\" end
-        new_string << chr
-      end
-      new_string
-    end
-    ## outputs a .toplot file based on filename_noext, creates a png file, and
-    ## writes  html to fh that will load the png file up
-    ## This is a self contained module that can be swapped out for a
-    ## completely different plotting program if desired.
-    def plot_figure(all_arrs, key, files, filename_noext)
-      ## CREATE the PLOT IMAGE:
-      to_plot = filename_noext+'.toplot'
-      png = filename_noext+'.png'
-      Gnuplot.open do |gp|
-        Gnuplot::Plot.new( gp ) do |plot|
-          plot.terminal "png noenhanced"
-          plot.output png
-          plot.title "Classification Analysis"
-          plot.xlabel 'Num True Positives'
-          plot.ylabel escape_to_gnuplot(y_axis_label(key))
-          plot.style "line 1 lt 1"
-          plot.style "line 2 lt 12"
-          #plot.style  "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
-          plot.yrange "[-0.05:#{1.0 + 0.2*files.size}]"
-          files.each_with_index do |file,i|
-            key[i].each_with_index do |k,j|
-              plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
-              ds.with = "lines"
-              ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
-            end
-          end
-        end
-      end
-    end
-=begin
-      ## CREATE the PLOT IMAGE:
-      to_plot = filename_noext+'.toplot'
-      png = filename_noext+'.png'
-      File.open(to_plot,'w') do |out|
-        out.puts 'XYData'
-        out.puts filename_noext
-        out.puts "Classification Analysis"
-        out.puts 'Num True Positives'
-        out.puts escape_to_gnuplot(y_axis_label(key))
-        files.each_with_index do |file,i|
-          #p key[i]
-          #p all_arrs[i]
-          key[i].each_with_index do |k,j|
-            out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
-            out.puts all_arrs[i][j][0].join(' ')
-            out.puts all_arrs[i][j][1].join(' ')
-          end
-        end
-      end
-      num_files = files.size
-      if $".include? 'plotter.rb'
-        cmd = "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
-        plot_cmd = "plot.rb #{cmd}"
-        Plotter.new.plot_string "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
-        unless File.file? png
-          abort "Fatal Error in plotting cmd=\"#{plot_cmd}\":\n#{reply}"
-        end
-      else
-        warn "plotter.rb not found, not png plot image available"
-      end
-=end
-      ## CREATE the HTML to load the plot:
-      basename_filename_noext = File.basename(filename_noext)
-      output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
-      #output << "<caption align=\"bottom\">Additional views of this data may be obtained by using the <span class=\"code\">plot.rb</span> command on '#{to_plot}' (type <span class=\"code\">plot.rb</span> for more details). Plot generated with command: &nbsp;&nbsp; <span class=\"code\">#{plot_cmd}</span></caption>\n"
-      output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
-      output << "</table></div>\n"
-      output
-    end  # plot_figure
-    def file_as_decoy(files, opt)
-      bio = SpecID::Bioworks.new
-      puts "Calculating false positive rates using '#{opt.f}' as decoy ..."
-      fps = bio.num_prots(opt.f)
-      out = ""
-      files.each do |file|
-        tps = bio.num_prots(file)
-        out << "*****************************************************\n"
-        out << sprintf("%-36s  # TP : #{tps}\n", file)
-        out << sprintf("%-36s  # FP : #{fps}\n", opt.f)
-        out << sprintf("        False Positive Rate   [FP/(TP+FP)] : %.3f\n", fps.to_f/(tps+fps)) unless opt.n
-        out << sprintf(" Gygi's False Positive Rate 2*[FP/(TP+FP)] : %.3f\n", 2.0*fps/(tps+fps)) if opt.g
-        out << sprintf("                    Precision [TP/(TP+FP)] : %.3f\n", tps.to_f/(tps+fps)) if opt.p
-        out << "*****************************************************\n"
-      end
-      out
-    end
-    def prefix_as_decoy(files, opt)
-      #puts "Calculating false positive rates using prefix #{opt.f} ..."
-      prefix_arr = prefixes(opt.f, files.size)
-      all_arrs = []
-      key = []
-      out_noext = outfile_noext(opt.o)
-      files.each_with_index do |file,i|
-        all_arrs[i] = []
-        key[i] = []
-        sp = SpecID.new(file)
-        #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
-        (tp, prec, fpr2) = sp.tps_and_precision_and_fpr2_times2_for_prob(prefix_arr[i])
-        if opt.g
-          all_arrs[i] << [tp,fpr2]
-          key[i] <<  ["Gygi FPR", ["#TP", "Gygi FPR = 2*FP/(TP+FP)"]]
-        end
-        if opt.p
-          all_arrs[i] << [tp,prec]
-          key[i] <<  ["Prec", ["#TP", "Prec = TP/(TP+FP)"]]
-        end
-        unless opt.n
-          ## Add the fpr datasets
-          fpr = fpr2.map {|v| v/2.0}
-          all_arrs[i] << [tp,fpr]
-          key[i] <<  ["FPR", ["#TP", "FPR = FP/(TP+FP)"]]
-        end
-      end
-      string = ''
-      if opt.a
-        roc = ROC.new
-        #string << "***********************************************************\n"
-        #string << "AREA UNDER CURVE:\n"
-        key.each_with_index do |file,i|
-          string << "#{files[i]} (area under curve)\n"
-          key[i].each_index do |j|
-            string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
-            tps = all_arrs[i][j][0]
-            oth = all_arrs[i][j][1]
-            string << roc.area_under_curve(tps, oth).to_s << "\n"
-          end
-        end
-        #string << "***********************************************************\n"
-      else
-        string = html do
-          header +
-            body do
-            plot_figure(all_arrs, key, files, out_noext) +
-              html_table_output(all_arrs, key, files, out_noext)
-            end
-        end
-      end
-      string
-    end
-  end # class FalsePositiveRate
-end # class SpecID

data/test/tc_gi2annot.rb DELETED Viewed

@@ -1,12 +0,0 @@
-require 'test/unit'
-class Gi2AnnotTest < Test::Unit::TestCase
-  ROOT_DIR = File.join(File.dirname(__FILE__), '..')
-  def test_query
-    # @TODO: Add in some kind of skip if there is no internet connection
-    assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", `ruby -I#{File.join(ROOT_DIR, 'lib')} #{File.join(ROOT_DIR, 'bin', 'gi2annot.rb')} 16130548`)
-  end
-end