RubyGems - mspire - Versions diffs - 0.1.7 → 0.2.0 - Mend

mspire 0.1.7 → 0.2.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (57)

data/Rakefile +41 -14
data/bin/bioworks2excel.rb +1 -1
data/bin/bioworks_to_pepxml.rb +46 -59
data/bin/fasta_shaker.rb +1 -1
data/bin/filter.rb +6 -0
data/bin/find_aa_freq.rb +23 -0
data/bin/id_precision.rb +3 -2
data/bin/mzxml_to_lmat.rb +2 -1
data/bin/pepproph_filter.rb +1 -1
data/bin/precision.rb +1 -1
data/bin/protein_summary.rb +2 -451
data/bin/raw_to_mzXML.rb +55 -0
data/bin/srf_group.rb +26 -0
data/changelog.txt +7 -0
data/lib/align.rb +3 -3
data/lib/fasta.rb +6 -1
data/lib/gi.rb +9 -4
data/lib/roc.rb +2 -0
data/lib/sample_enzyme.rb +2 -1
data/lib/spec/mzxml/parser.rb +2 -43
data/lib/spec/mzxml.rb +65 -2
data/lib/spec_id/aa_freqs.rb +10 -7
data/lib/spec_id/bioworks.rb +67 -87
data/lib/spec_id/filter.rb +794 -0
data/lib/spec_id/precision.rb +29 -36
data/lib/spec_id/proph.rb +5 -3
data/lib/spec_id/protein_summary.rb +459 -0
data/lib/spec_id/sequest.rb +323 -271
data/lib/spec_id/srf.rb +189 -135
data/lib/spec_id.rb +276 -227
data/lib/spec_id_xml.rb +101 -0
data/lib/toppred.rb +18 -0
data/script/degenerate_peptides.rb +47 -0
data/script/filter-peps.rb +5 -1
data/test/tc_align.rb +1 -1
data/test/tc_bioworks.rb +25 -22
data/test/tc_bioworks_to_pepxml.rb +37 -4
data/test/tc_fasta.rb +3 -1
data/test/tc_fasta_shaker.rb +8 -6
data/test/tc_filter.rb +203 -0
data/test/tc_gi.rb +6 -9
data/test/tc_id_precision.rb +31 -0
data/test/tc_mzxml.rb +8 -6
data/test/tc_peptide_parent_times.rb +2 -1
data/test/tc_precision.rb +1 -1
data/test/tc_proph.rb +5 -5
data/test/tc_protein_summary.rb +36 -13
data/test/tc_sequest.rb +78 -33
data/test/tc_spec_id.rb +128 -6
data/test/tc_srf.rb +84 -38
metadata +67 -62
data/bin/fasta_cat.rb +0 -39
data/bin/fasta_cat_mod.rb +0 -59
data/bin/fasta_mod.rb +0 -57
data/bin/filter_spec_id.rb +0 -365
data/bin/raw2mzXML.rb +0 -21
data/script/gen_database_searching.rb +0 -258

data/bin/protein_summary.rb CHANGED Viewed

@@ -1,455 +1,6 @@
 #!/usr/bin/ruby -w
-require 'axml'
-require 'hash_by'
-require 'optparse'
-require 'ostruct'
-require 'spec_id'
-#############################################################
-# GLOBALS:
-PRECISION_PROGRAM_BASE = 'precision'
-DEF_PREFIX = "INV_"
-DEF_PERCENT_FP = "5.0"
-#############################################################
-# @TODO: add group probability title (showin all group probabilities) for protein prob
-#class String
-#  def margin
-#    self.gsub(/^\s*\|/,'')
-#  end
-#end
-class Runner
-  module HTML
-    def header
-    %Q{<html>
-    <head>
-    #{style}
-    </head>
-    <body>
-    <script type="text/javascript">
-    <!--
-    function toggle_vis(id) {
-      var e = document.getElementById(id);
-      if(e.style.display == 'none')
-        e.style.display = 'block';
-      else
-        e.style.display = 'none';
-    }
-    //-->
-    </script>
-    }
-      end
-      def style
-   '
-   <style type="text/css">
-        table {
-            border-width:1px;
-            border-color:#DDDDDD;
-            border-collapse: collapse;
-        }
-        td,th {
-            padding-top: 2px;
-            padding-bottom: 2px;
-            padding-left: 5;
-            padding-right: 5;
-        }
-        td.redline {
-            background-color: #FF0000;
-            color: #FFFFFF
-        }
-        div.file_info, div.software, div.fppr, div.num_proteins{
-            margin-left: 20px;
-            margin-top: 20px;
-        }
-        div.main {
-          margin-left: 10px;
-          margin-right: 10px;
-          margin-top: 50px;
-          margin-bottom: 50px;
-        }
-    div#error {
-      margin: 30px;
-   text-align:center
-    }
-    hr {color: sienna}
-    body { font-size: 8pt; font-family: Arial,Helvetica,Times}
-    </style>
-      '
-      end
-      # an anchor and a title
-      def at(display, title)
-    "<a title=\"#{title}\">#{display}</a>"
-      end
-      def trailer
-    %q{
-    </body>
-    </html>
-    }
-      end
-      def tr
-    "|<tr>
-     |  #{yield}
-     |</tr>\n".margin
-      end
-      def table
-    "|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">
-     |  #{yield}
-     |</table></div>\n".margin
-      end
-      def tds(arr)
-        arr.map {|v| "<td>#{v}</td>"}.join
-      end
-      def ths(arr)
-        str = arr.map {|v| "<th>#{v}</th>"}.join
-        str << "\n"
-      end
-    end
-  end
-class Runner
-  include Runner::HTML
-  def ref_html(gi, name)
-  "<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
-  end
-  # Takes the -prot.xml filename and grabs the png file (if available)
-  def error_info(prot_file_name)
-    img = prot_file_name.gsub('.xml', '.png')
-    img_bn = File.basename(img)
-      "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
-  end
-  # attempts to get the NCBI gi code
-  def accession(name)
-    if (name.include? '|') && (name[0,3] == 'gi|')
-      name.split('|')[1]
-    else
-      name
-    end
-  end
-  def prefix_to_regex(prefix)
-    if prefix
-      /^#{Regexp.escape(prefix)}/
-    else
-      nil
-    end
-  end
-  # given a list of proteins, output a tab delimited textfile with protein
-  # name and the total number of peptides found
-  def output_peptide_counts_file(prots, filename)
-    File.open(filename, "w") do |fh_out|
-      prots.each do |prot|
-        fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
-      end
-    end
-  end
-  # filters on the false positive regex and sorts by prot probability
-  def filter_and_sort(uniq_prots, prefix=nil)
-    prefix_re = prefix_to_regex(prefix)
-    sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
-    ## filter on prefix
-    if prefix
-      sorted = sorted.reject {|prot| prot.reference =~ prefix_re }
-    end
-    sorted
-  end
-  # assumes that these are sorted on probability
-  # desired_fppr is a float
-  # returns [number_of_prots, actual_fppr]
-  def num_prots_above_fppr(prots, desired_fppr)
-    current_fppr_rate_percent = 0.0
-    previous_fppr_rate_percent = 0.0
-    current_sum_one_minus_prob = 0.0
-    proteins_within_fppr = 0
-    actual_fppr = nil
-    already_found = false
-    prot_cnt = 0
-    prots.each do |prot|
-      prot_cnt += 1
-      # SUM(1-probX)/#prots
-      current_sum_one_minus_prob += 1.0 - prot._probability.to_f
-      current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
-      if current_fppr_rate_percent > desired_fppr && !already_found
-        actual_fppr = previous_fppr_rate_percent
-        proteins_within_fppr = prot_cnt
-        already_found = true
-      end
-      previous_fppr_rate_percent = current_fppr_rate_percent
-    end
-    [proteins_within_fppr, actual_fppr]
-  end
-    ####    #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
-  # returns a string of the table rows
-  # false_positive_rate (give as a %) is the cutoff mark
-  # returns the number of proteins at the desired_fppr (if given)
-  def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
-    prot_cnt = 0
-    uniq_prots.map do |prot|
-      tr do
-        prot_cnt += 1
-        gi = accession(prot._protein_name)
-        tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
-      end
-    end.join
-  end
-  def print_html_pieces(file, *pieces)
-    File.open(file, "w") do |out|
-      pieces.each do |piece|
-        out.print piece
-      end
-    end
-  end
-  def file_info(file)
-    "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
-    <br/>Last Modified: #{File.mtime(file)}
-    <br/>Size: #{File.size(file)/1000} KB
-    </div>"
-  end
-  def bioworks_script_info(obj)
-    version = "3.2??"
-    if obj.version
-      version = obj.version
-    end
-    script_info{"Bioworks version #{version}"}
-  end
-  def protproph_script_info
-    begin
-      where = `which xinteract`
-      reply = `#{where}`
-    rescue Exception
-      reply = ""
-    end
-    prophet = "TPP (version unknown)"  # put your version here if you can't get it dynamically
-    if reply =~ /xinteract.*?\((TPP .*)\)/
-      prophet = $1.dup
-    end
-    script_info { "ProteinProphet from: #{prophet}" }
-  end
-  def mspire_version
-    string = "mspire"
-    begin
-      if `gem list --local mspire` =~ /mspire \((.*?)\)/
-        string << (" v" + $1)
-      end
-    rescue Exception
-    end
-    string
-  end
-  def script_info
-    "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
-  end
-  def proph_output(file, outfn, opt, fppr_output_as_html)
-    header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'),  at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
-    num_cols = header_anchors.size
-    theaders = ths(header_anchors)
-    root = AXML.parse_file(file)
-    prots = []
-    ## find the min_prob at a fppr of XX
-    min_prob_redline = 1.01  # if no fppr is less than what they give, then all are redlined!
-    if opt.c
-      actual_percent_fp = opt.c.to_f
-    elsif opt.cut_at
-      actual_percent_fp = opt.cut_at.to_f
-    else
-      actual_percent_fp = nil
-    end
-    root.protein_group.each do |group|
-      group.protein.each do |prt|
-        prots << prt
-      end
-    end
-    uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
-    filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
-    ## num proteins above cutoff (if opt.c)
-    num_prots_html = ''
-    if opt.c || opt.cut_at
-      (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
-      num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
-    end
-    if opt.cut_at
-      filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
-    end
-    output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
-    table_string = table do
-      tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
-    end
-    er_info = opt.precision ? error_info(file) : ""
-    html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
-    print_html_pieces(*html_pieces)
-  end # proph_output
-  # given a list of peptide sequences creates javascript to hide/show them
-  def peptide_cell(prot_num, peptide_sequences)
-    "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
-  end
-  def bioworks_output(file, outfn, opt, fppr_output_as_html)
-    header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
-    num_cols = header_anchors.size
-    theaders = ths(header_anchors)
-    bio_obj = SpecID.new(file)
-    proteins = bio_obj.prots
-    protein_num = 0
-    rows = ""
-    prefix_re = prefix_to_regex(opt.f)
-    proteins.each do |prot|
-      if opt.f && prot.reference =~ prefix_re
-        next
-      end
-      uniq_peps = Hash.new {|h,k| h[k] = true; }
-      protein_num += 1
-      prot.peps.each do |pep|
-        uniq_peps[pep.sequence.split('.')[1]] = true
-      end
-      pieces = prot.reference.split(' ')
-      long_prot_name = pieces.shift
-      annotation = pieces.join(' ')
-      accession = prot.accession
-      if accession == '0' ; accession = long_prot_name end
-      rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
-    end
-    table_string = table do
-      tr{theaders} + rows
-    end
-    print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
-  end # bioworks_output
-  def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
-    actual_cutoff = sprintf("%.3f", actual_cutoff)
-    desired_cutoff = sprintf("%.3f", desired_cutoff)
-    "<div class=\"num_proteins\"><h3>False Positive Rate Information</h3>
-    Desired FPR: #{desired_cutoff} %<br/>
-    Actual FPR: #{actual_cutoff} %<br/>
-    Number of Proteins at Actual FPR: #{num_proteins}
-    </div>"
-  end
-  # transforms the output string of file_as_decoy into html
-  def file_as_decoy_to_html(string)
-    lines = string.split("\n")
-    #puts lines ?? is this supposed to be commented out?
-    lines = lines.reject do |obj| obj =~ /\*{10}/ end
-    lines.map! do |line| "#{line}<br/>" end
-    "<div class=\"fppr\">
-    <h3>Classification Analysis</h3>
-    #{lines.join("\n")}
-    </div>"
-  end
-  # transforms the output string of file_as_decoy into html
-  def prefix_as_decoy_to_html(string)
-    "<div class=\"fppr\">
-    <h3>Classification Analysis</h3>
-    </div>" +
-    string
-  end
-  def go(argv)
-    @orig_argv = argv.dup
-    dup_argv = argv.dup
-    opt = OpenStruct.new
-    opt.f = DEF_PREFIX
-    opts = OptionParser.new do |op|
-      op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
-      op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
-      op.separator "    outputs: <file>.summary.html"
-      op.separator ""
-      op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
-      op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
-      op.separator("             if --precision then -f is used to specify a file or prefix")
-      op.separator("             that indicates the false positives.")
-      op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
-      op.separator ""
-      op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
-      op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
-      op.separator("                                     type '#{PRECISION_PROGRAM_BASE}.rb' for details")
-      op.separator ""
-      op.separator "Specific to ProteinProphet (with no concatenated DB):"
-      op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
-      op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
-    end
-    opts.parse!
-    if argv.size < 1
-      puts opts
-      exit
-    end
-    fppr_output_as_html = ''
-    files = argv.to_a
-    files.each do |file|
-      outfn = file.gsub(/\.xml$/, '.summary.html')
-      ## False Positive Rate Calculation:
-      if opt.precision
-        opt.o = outfn # won't actually be written over, but used
-        to_use_argv = create_precision_argv(file, opt)
-        (out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
-        fppr_output_as_html = prefix_as_decoy_to_html(out_string)
-      end
-      case SpecID.file_type(file)
-      when "protproph"
-        proph_output(file, outfn, opt, fppr_output_as_html)
-      when "bioworks"
-        bioworks_output(file, outfn, opt, fppr_output_as_html)
-      else
-        abort "filetype for #{file} not recognized!"
-      end
-    end
-  end # method go
-  def create_precision_argv(file, opt)
-    # include only those options specific
-    new_argv = [file]
-    if opt.f ; new_argv << '-f' << opt.f end
-    if opt.o ; new_argv << '-o' << opt.o end
-    new_argv
-  end
-end   # Runner
-##################################################################
-# MAIN
-##################################################################
-Runner.new.go(ARGV)
+require 'spec_id/protein_summary'
+ProteinSummary.new.create_from_command_line_args(ARGV)

data/bin/raw_to_mzXML.rb ADDED Viewed

@@ -0,0 +1,55 @@
+#!/usr/bin/ruby -w
+require 'optparse'
+require 'spec/mzxml'
+require 'fileutils'
+progname = File.basename(__FILE__)
+opt = {}
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{progname} [OPTIONS] <file>.RAW ..."
+  op.separator ""
+  op.on("-p", "--profile", "uses profile output instead of centroid (default)") {|v| opt[:profile] = v}
+end
+opts.parse!
+if ARGV.size == 0
+  puts opts
+  exit
+end
+converter = Spec::MzXML.find_mzxml_converter
+if converter
+  $stderr.puts "using #{converter} to convert files"
+else
+  puts "cannot find [#{Spec::MzXML::Potential_mzxml_converters.join(', ')}] in the paths:"
+  puts ENV['PATH'].split(/[:;]/).join(", ")
+  abort
+end
+files = ARGV.to_a
+files.each do |file|
+  puts "******************************************"
+  puts "Converting: #{file}"
+  if converter =~ /readw/
+    centroid_or_profile = 'c'
+    if opt[:profile]
+      centroid_or_profile = 'p'
+    end
+    outfile = file.sub(/\.RAW$/i, '.mzXML')
+    cmd = "#{converter} #{file} #{centroid_or_profile} #{outfile}"
+    puts "Performing: '#{cmd}'"
+    puts `#{cmd}`
+  else
+    ## t2x only outputs in cwd!
+    Dir.chdir(File.dirname(file)) do |dir|
+      puts "Performing: '#{cmd}' in #{dir}"
+      puts `#{cmd}`
+      system "#{converter} #{File.basename(file)}"
+    end
+  end
+  puts "******************************************"
+end

data/bin/srf_group.rb ADDED Viewed

@@ -0,0 +1,26 @@
+#!/usr/bin/ruby
+require 'optparse'
+require 'spec_id/srf'
+$OUTFILE = 'bioworks.srg'
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} <file1>.srf <file2>.srf ..."
+  op.separator "outputs: 'bioworks.srg'"
+  op.separator ""
+  op.separator "    A '.srg' file is an ascii text file with a list"
+  op.separator "    of the srf files (full path names) in that group."
+  op.separator ""
+  op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE }
+end
+if ARGV.size == 0
+  puts opts
+end
+obj = SRFGroup.new
+obj.filenames = ARGV.to_a
+obj.to_srg($OUTFILE)

data/changelog.txt CHANGED Viewed

@@ -1,4 +1,6 @@
+## version 0.1.7
 1. A couple of scripts and subroutines were hashing peptides but not on the file
 basename.  This would result in slightly incorrect results (any time there
 were overlapping scan numbers in multiple datasets, only the top one would be
@@ -31,4 +33,9 @@ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
 called the False Positive Predictive Rate (FPPR).  I will probably implement
 this in a future release.
+## version 0.2.0
+** This is a definite code breaker **
+Revamped the way SpecID works (it is now subclassed).  Since I want to return
+the specific object that the file specifies, I use 'create' now instead of
+'new' (which forces one to return *that* class.

data/lib/align.rb CHANGED Viewed

@@ -24,12 +24,12 @@ class Align
       scanindex_by_basename_noext[runindex.basename_noext] = runindex.scans_by_num
     end
-    dta_filenames = SpecID::Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
+    dta_filenames = Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
-    parser = SpecID::Proph::Prot::Parser.new
+    parser = Proph::Prot::Parser.new
     parser.get_prots_and_peps(prot_xml, prot_prob, pep_init_prob, pep_nsp_prob, "regex")
     peptides = parser.peps
-    peptides = SpecID::Proph::Pep.uniq_by_seqcharge(peptides)
+    peptides = Proph::Pep.uniq_by_seqcharge(peptides)
     ## we update each peptide with a list of dtafilenames
     ## then we update with a parallel list of scans (one for each dtafn...
     ## unless there are multiple scans associated with each filename

data/lib/fasta.rb CHANGED Viewed

@@ -1,5 +1,8 @@
 require 'sample_enzyme'
+require 'each_index'
+tmp = $VERBOSE ; $VERBOSE = nil
 class String
   def each_index
@@ -21,6 +24,8 @@ class String
   end
 end
+$VERBOSE = tmp
 class Fasta
@@ -259,9 +264,9 @@ class Fasta
 end
 class Fasta::Prot
-  attr_accessor :header, :aaseq
   # header given as full line with starting '>' (but no newline chars!).
   # aaseq also given without any newline chars
+  attr_accessor :header, :aaseq
   def initialize(header=nil, aaseq=nil)
     @header = header || ''
     if aaseq

data/lib/gi.rb CHANGED Viewed

@@ -40,19 +40,24 @@ class GI
   BATCH_SIZE = 500
   # takes an array of gi numbers and returns an array of annotation
   # This allows use of the batch search mode on NCBI
+  # returns nil if no internet connection
   def self.gi2annot(list_of_gi_numbers)
+    annots = []
     loop do
       batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
       if batch.size == 0 then break end
       string = batch.join(",")
       url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
       #puts url
-      annots = []
-      open(url) do |handle|
-        annots = parse_etool_output(handle)
+      begin
+        open(url) do |handle|
+          annots.push( *(parse_etool_output(handle)) )
+        end
+      rescue SocketError
+        return nil
       end
-      annots
     end
+    annots
   end
   protected

data/lib/roc.rb CHANGED Viewed

@@ -90,6 +90,8 @@ end
 # For calculating precision given lists of hits and decoy hits.  The hits are
 # assumed to have false positives within them that can be estimated from the
 # number of decoy hits at the same rate
+# NOTE: this class assumes that lower scores are better.  Negate your scores
+# if this is not the case.
 class DecoyROC < ROC
   # returns the [num_hits, num_tps, precision] as a function of true

data/lib/sample_enzyme.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 module SpecIDXML; end
-require 'spec_id'
+require 'spec_id_xml'
 require 'strscan'
 class SampleEnzyme