RubyGems - mspire - Versions diffs - 0.4.9 → 0.5.0 - Mend

mspire 0.4.9 → 0.5.0

Files changed (255) hide show

data/README +27 -17
data/changelog.txt +31 -62
data/lib/ms/calc.rb +32 -0
data/lib/ms/data/interleaved.rb +60 -0
data/lib/ms/data/lazy_io.rb +73 -0
data/lib/ms/data/lazy_string.rb +15 -0
data/lib/ms/data/simple.rb +59 -0
data/lib/ms/data/transposed.rb +41 -0
data/lib/ms/data.rb +57 -0
data/lib/ms/format/format_error.rb +12 -0
data/lib/ms/spectrum.rb +25 -384
data/lib/ms/support/binary_search.rb +126 -0
data/lib/ms.rb +10 -10
metadata +38 -350
data/INSTALL +0 -58
data/README.rdoc +0 -18
data/Rakefile +0 -330
data/bin/aafreqs.rb +0 -23
data/bin/bioworks2excel.rb +0 -14
data/bin/bioworks_to_pepxml.rb +0 -148
data/bin/bioworks_to_pepxml_gui.rb +0 -225
data/bin/fasta_shaker.rb +0 -5
data/bin/filter_and_validate.rb +0 -5
data/bin/gi2annot.rb +0 -14
data/bin/id_class_anal.rb +0 -112
data/bin/id_precision.rb +0 -172
data/bin/ms_to_lmat.rb +0 -67
data/bin/pepproph_filter.rb +0 -16
data/bin/prob_validate.rb +0 -6
data/bin/protein_summary.rb +0 -6
data/bin/protxml2prots_peps.rb +0 -32
data/bin/raw_to_mzXML.rb +0 -55
data/bin/run_percolator.rb +0 -122
data/bin/sqt_group.rb +0 -26
data/bin/srf_group.rb +0 -27
data/bin/srf_to_sqt.rb +0 -40
data/lib/align/chams.rb +0 -78
data/lib/align.rb +0 -154
data/lib/archive/targz.rb +0 -94
data/lib/bsearch.rb +0 -120
data/lib/core_extensions.rb +0 -16
data/lib/fasta.rb +0 -626
data/lib/gi.rb +0 -124
data/lib/group_by.rb +0 -10
data/lib/index_by.rb +0 -11
data/lib/merge_deep.rb +0 -21
data/lib/ms/converter/mzxml.rb +0 -77
data/lib/ms/gradient_program.rb +0 -170
data/lib/ms/msrun.rb +0 -244
data/lib/ms/msrun_index.rb +0 -108
data/lib/ms/parser/mzdata/axml.rb +0 -67
data/lib/ms/parser/mzdata/dom.rb +0 -175
data/lib/ms/parser/mzdata/libxml.rb +0 -7
data/lib/ms/parser/mzdata.rb +0 -31
data/lib/ms/parser/mzxml/axml.rb +0 -70
data/lib/ms/parser/mzxml/dom.rb +0 -182
data/lib/ms/parser/mzxml/hpricot.rb +0 -253
data/lib/ms/parser/mzxml/libxml.rb +0 -19
data/lib/ms/parser/mzxml/regexp.rb +0 -122
data/lib/ms/parser/mzxml/rexml.rb +0 -72
data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
data/lib/ms/parser/mzxml.rb +0 -282
data/lib/ms/parser.rb +0 -108
data/lib/ms/precursor.rb +0 -25
data/lib/ms/scan.rb +0 -81
data/lib/mspire.rb +0 -4
data/lib/pi_zero.rb +0 -244
data/lib/qvalue.rb +0 -161
data/lib/roc.rb +0 -187
data/lib/sample_enzyme.rb +0 -160
data/lib/scan_i.rb +0 -21
data/lib/spec_id/aa_freqs.rb +0 -170
data/lib/spec_id/bioworks.rb +0 -497
data/lib/spec_id/digestor.rb +0 -138
data/lib/spec_id/mass.rb +0 -179
data/lib/spec_id/parser/proph.rb +0 -335
data/lib/spec_id/precision/filter/cmdline.rb +0 -218
data/lib/spec_id/precision/filter/interactive.rb +0 -134
data/lib/spec_id/precision/filter/output.rb +0 -148
data/lib/spec_id/precision/filter.rb +0 -637
data/lib/spec_id/precision/output.rb +0 -60
data/lib/spec_id/precision/prob/cmdline.rb +0 -160
data/lib/spec_id/precision/prob/output.rb +0 -94
data/lib/spec_id/precision/prob.rb +0 -249
data/lib/spec_id/proph/pep_summary.rb +0 -104
data/lib/spec_id/proph/prot_summary.rb +0 -484
data/lib/spec_id/proph.rb +0 -4
data/lib/spec_id/protein_summary.rb +0 -489
data/lib/spec_id/sequest/params.rb +0 -316
data/lib/spec_id/sequest/pepxml.rb +0 -1458
data/lib/spec_id/sequest.rb +0 -33
data/lib/spec_id/sqt.rb +0 -349
data/lib/spec_id/srf.rb +0 -973
data/lib/spec_id.rb +0 -778
data/lib/spec_id_xml.rb +0 -99
data/lib/transmem/phobius.rb +0 -147
data/lib/transmem/toppred.rb +0 -368
data/lib/transmem.rb +0 -157
data/lib/validator/aa.rb +0 -48
data/lib/validator/aa_est.rb +0 -112
data/lib/validator/background.rb +0 -77
data/lib/validator/bias.rb +0 -95
data/lib/validator/cmdline.rb +0 -431
data/lib/validator/decoy.rb +0 -107
data/lib/validator/digestion_based.rb +0 -70
data/lib/validator/probability.rb +0 -51
data/lib/validator/prot_from_pep.rb +0 -234
data/lib/validator/q_value.rb +0 -32
data/lib/validator/transmem.rb +0 -272
data/lib/validator/true_pos.rb +0 -46
data/lib/validator.rb +0 -197
data/lib/xml.rb +0 -38
data/lib/xml_style_parser.rb +0 -119
data/lib/xmlparser_wrapper.rb +0 -19
data/release_notes.txt +0 -2
data/script/compile_and_plot_smriti_final.rb +0 -97
data/script/create_little_pepxml.rb +0 -61
data/script/degenerate_peptides.rb +0 -47
data/script/estimate_fpr_by_cysteine.rb +0 -226
data/script/extract_gradient_programs.rb +0 -56
data/script/find_cysteine_background.rb +0 -137
data/script/genuine_tps_and_probs.rb +0 -136
data/script/get_apex_values_rexml.rb +0 -44
data/script/histogram_probs.rb +0 -61
data/script/mascot_fix_pepxml.rb +0 -123
data/script/msvis.rb +0 -42
data/script/mzXML2timeIndex.rb +0 -25
data/script/peps_per_bin.rb +0 -67
data/script/prep_dir.rb +0 -121
data/script/simple_protein_digestion.rb +0 -27
data/script/smriti_final_analysis.rb +0 -103
data/script/sqt_to_meta.rb +0 -24
data/script/top_hit_per_scan.rb +0 -67
data/script/toppred_to_yaml.rb +0 -47
data/script/tpp_installer.rb +0 -249
data/specs/align_spec.rb +0 -79
data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
data/specs/bin/fasta_shaker_spec.rb +0 -259
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
data/specs/bin/filter_and_validate_spec.rb +0 -180
data/specs/bin/ms_to_lmat_spec.rb +0 -34
data/specs/bin/prob_validate_spec.rb +0 -86
data/specs/bin/protein_summary_spec.rb +0 -14
data/specs/fasta_spec.rb +0 -354
data/specs/gi_spec.rb +0 -22
data/specs/load_bin_path.rb +0 -7
data/specs/merge_deep_spec.rb +0 -13
data/specs/ms/gradient_program_spec.rb +0 -77
data/specs/ms/msrun_spec.rb +0 -498
data/specs/ms/parser_spec.rb +0 -92
data/specs/ms/spectrum_spec.rb +0 -87
data/specs/pi_zero_spec.rb +0 -115
data/specs/qvalue_spec.rb +0 -39
data/specs/roc_spec.rb +0 -251
data/specs/rspec_autotest.rb +0 -149
data/specs/sample_enzyme_spec.rb +0 -126
data/specs/spec_helper.rb +0 -135
data/specs/spec_id/aa_freqs_spec.rb +0 -52
data/specs/spec_id/bioworks_spec.rb +0 -148
data/specs/spec_id/digestor_spec.rb +0 -75
data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
data/specs/spec_id/precision/filter/output_spec.rb +0 -31
data/specs/spec_id/precision/filter_spec.rb +0 -246
data/specs/spec_id/precision/prob_spec.rb +0 -44
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
data/specs/spec_id/protein_summary_spec.rb +0 -189
data/specs/spec_id/sequest/params_spec.rb +0 -68
data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
data/specs/spec_id/sequest_spec.rb +0 -38
data/specs/spec_id/sqt_spec.rb +0 -246
data/specs/spec_id/srf_spec.rb +0 -172
data/specs/spec_id/srf_spec_helper.rb +0 -139
data/specs/spec_id_helper.rb +0 -33
data/specs/spec_id_spec.rb +0 -366
data/specs/spec_id_xml_spec.rb +0 -33
data/specs/transmem/phobius_spec.rb +0 -425
data/specs/transmem/toppred_spec.rb +0 -298
data/specs/transmem_spec.rb +0 -60
data/specs/transmem_spec_shared.rb +0 -64
data/specs/validator/aa_est_spec.rb +0 -66
data/specs/validator/aa_spec.rb +0 -40
data/specs/validator/background_spec.rb +0 -67
data/specs/validator/bias_spec.rb +0 -122
data/specs/validator/decoy_spec.rb +0 -51
data/specs/validator/fasta_helper.rb +0 -26
data/specs/validator/prot_from_pep_spec.rb +0 -141
data/specs/validator/transmem_spec.rb +0 -146
data/specs/validator/true_pos_spec.rb +0 -58
data/specs/validator_helper.rb +0 -33
data/specs/xml_spec.rb +0 -12
data/test_files/000_pepxml18_small.xml +0 -206
data/test_files/020a.mzXML.timeIndex +0 -4710
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
data/test_files/4-03-03_small-prot.xml +0 -321
data/test_files/4-03-03_small.xml +0 -3876
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +0 -5999
data/test_files/bioworks31.params +0 -77
data/test_files/bioworks32.params +0 -62
data/test_files/bioworks33.params +0 -63
data/test_files/bioworks_single_run_small.xml +0 -7237
data/test_files/bioworks_small.fasta +0 -212
data/test_files/bioworks_small.params +0 -63
data/test_files/bioworks_small.phobius +0 -109
data/test_files/bioworks_small.toppred.out +0 -2847
data/test_files/bioworks_small.xml +0 -5610
data/test_files/bioworks_with_INV_small.xml +0 -3753
data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +0 -304
data/test_files/messups.fasta +0 -297
data/test_files/opd1/000.my_answer.100lines.xml +0 -101
data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
data/test_files/opd1/000_020_3prots-prot.xml +0 -62
data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
data/test_files/opd1/sequest.3.1.params +0 -77
data/test_files/opd1/sequest.3.2.params +0 -62
data/test_files/opd1/twenty_scans.mzXML +0 -418
data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +0 -9
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
data/test_files/pepproph_small.xml +0 -4691
data/test_files/phobius.small.noheader.txt +0 -50
data/test_files/phobius.small.small.txt +0 -53
data/test_files/s01_anC1_ld020mM.key.txt +0 -25
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +0 -297
data/test_files/small.sqt +0 -87
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +0 -14340
data/test_files/tf_bioworks2excel.txt.actual +0 -1035
data/test_files/toppred.small.out +0 -416
data/test_files/toppred.xml.out +0 -318
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
data/test_files/yeast_gly_small-prot.xml +0 -265
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
data/test_files/yeast_gly_small.xml +0 -3807
data/test_files/yeast_gly_small2.parentTimes +0 -6

data/lib/spec_id.rb DELETED Viewed

@@ -1,778 +0,0 @@
-require 'ostruct'
-require 'set'
-require 'hash_by'
-require 'roc'
-require 'sample_enzyme'  # for others
-require 'spec_id/bioworks'
-require 'spec_id/sequest'
-require 'spec_id/proph/prot_summary'
-require 'spec_id/proph/pep_summary'
-require 'spec_id_xml'
-require 'spec_id/sqt'
-require 'spec_id/mass'
-require 'fasta'
-module ProteinReferenceable ; end
-class SampleEnzyme ; end
-module SpecID ; end
-class GenericSpecID ; include SpecID ; end
-module SpecID
-  MONO = Mass::MONO
-  AVG = Mass::AVG
-  attr_accessor :peps, :prots
-  # True if a high protein/peptide score is better than low, false otherwise
-  # This is set automatically for known file types
-  attr_accessor :hi_prob_best
-  # A relative pathname of the file the specid object is derived from
-  attr_accessor :filename
-  # tp = file_type
-  # Will return a SpecID object (really, the object corresponding to the
-  # file type which mixes in SpecID [is_a?(SpecID) == true])
-  # If no file is given, will return a GenericSpecID object.
-  # If file is an array, this is assumed to be a group of srf files which is
-  # converted into an SRFGroup Object and run.
-  def self.new(file=nil, tp=nil)
-    # this will need to be specialized for other groups later
-    if file.is_a?(Array)
-      # takes an array of srf filenames
-      SRFGroup.new(file)
-    elsif file
-      from_file(file, tp)
-    else
-      GenericSpecID.new
-    end
-  end
-  # tp = file_type
-  # a single srf file will be packaged into an SRFGroup object
-  def self.from_file(file, tp=nil)
-    obj = nil
-    unless tp
-      tp = file_type(file)
-    end
-    obj = case tp
-    when 'srf'
-      #@hi_prob_best = false
-      SRFGroup.new([file])
-    when 'srg'
-      #@hi_prob_best = false
-      SRFGroup.new(file)
-    when 'bioworks'
-      #@hi_prob_best = false
-      Bioworks.new(file)
-    when 'protproph'
-      #@hi_prob_best = true
-      Proph::ProtSummary.new(file)
-    when 'pepproph'
-      Proph::PepSummary.new(file)
-    when 'sqg'
-      SQTGroup.new(file)
-    when 'sqt'
-      SQTGroup.new([file])
-    else
-      abort "UNRECOGNIZED file type for #{file}"
-    end
-    obj
-  end
-  def inspect
-    peps_string =
-      if peps
-        "peps(#)=#{peps.size}"
-      else
-        "peps=(nil)"
-      end
-    "<#{self.class} #{peps_string}>"
-  end
-  # given some list of SpecID::Pep based objects, returns the list of proteins
-  # associated with those peptides
-  # kind must be a symbol:
-  # :no_update (current proteins are returned, but their peps attribute
-  # is not updated)
-  # :update (current proteins returned with peps attribute updated)
-  # :new (new proteins are created complete with peps attribute)
-  def self.protein_list(pephits, kind=:no_update)
-    orig_pephits_prts = []
-    if kind == :new
-      new_prots = {}
-      pephits.each_with_index do |pep,i|
-        orig_pephits_prts[i] = pep.prots
-        peps_new_prts = pep.prots.map do |prt|
-          if new_prots.key? prt.reference
-            already_exists = new_prots[prt.reference]
-          else
-            np = prt.dup
-            np.peps = []
-            new_prots[np.reference] = np
-            np
-          end
-        end
-        pep.prots = peps_new_prts
-      end
-    end
-    if kind == :update
-      pephits.each do |pep|
-        pep.prots.each do |prt|
-          prt.peps = []
-        end
-      end
-    end
-    prot_set = {}
-    pephits.each do |pep|
-      prts = pep.prots
-      prts.each do |prt|
-        prot_set[ prt.reference ] = prt
-      end
-      if (kind == :update || kind == :new)
-        prts.each do |prt|
-          prt.peps << pep
-        end
-      end
-    end
-    ## Reset the original protein hits
-    if kind == :new
-      pephits.each_with_index do |pep,i|
-        pep.prots = orig_pephits_prts[i]
-      end
-    end
-    prot_set.values
-  end
-  # takes a comma separated list (String) or an array and repeats the last
-  # element as needed to create an array of desired size
-  def self.extend_args(arg, desired_size)
-    arg_arr = arg
-    if arg.is_a? String
-      arg_arr = arg.split(',')
-    end
-    new_arr = []
-    last_arg = arg_arr[0]
-    desired_size.times do |i|
-      if arg_arr[i]
-        new_arr[i] = arg_arr[i]
-        last_arg = new_arr[i]
-      else
-        new_arr[i] = last_arg
-      end
-    end
-    new_arr
-  end
-  # takes an array of proteins, each having peps
-  # peptide grouping is done
-  # by-
-  # the protein with the most unique peptides ends up taking any
-  # degenerate peptides, tie goes to one with most hits total, then the one
-  # that had the top xcorr(s) (before removing any peptides).  All other
-  # proteins with identical peptides will lose those peptides.  So, the rich
-  # stay rich, and the poor get poorer.
-  # returns an array of triplets where each is [prot, pep_hits,
-  # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
-  # peptide.  The internal links (prot.peps and pep.prots) are NOT modified
-  # unless update_prots == true, in which case each returned protein's peps
-  # is set to the peptide hits found for it.
-  def self.occams_razor(array_of_prots, update_prots=false)
-    peps_found = Set.new
-    to_sort = array_of_prots.map do |prot|
-      pps = prot.peps
-      peps_by_uniq_aaseq = pps.hash_by(:aaseq)
-      uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
-      xcorrs = pps.map {|pep| pep.xcorr }
-      silly = OpenStruct.new
-      # 0                1         2            3     4            5
-      [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq]
-    end
-    prot_triplets = []
-    to_sort.sort.reverse.each do |ar|
-      prot = ar[3]
-      ## overlapping set:
-      common = peps_found & ar[4]
-      ## find the uniq ones in our little set of peptides:
-      uniq = ar[4] - common
-      pep_hits = []
-      if uniq.size != 0
-        ## add to the found list:
-        peps_found.merge(uniq)
-        uniq.each do |seq|
-          pep_hits.push( *(ar[5][seq]) )
-        end
-        prot_triplets << [prot, pep_hits, uniq.to_a]
-        prot.peps = pep_hits if update_prots
-      end
-    end
-    prot_triplets
-  end
-  # returns number of true positives (array) and the specified output (as
-  # parallel array).  Requires the classification method and a sorted array of
-  # tp values and an array of fp values.
-  # (This is simply a wrapper around ROC#by_tps method!)
-  def by_tps(classification_method, tp, fp)
-    ROC.new.by_tps(classification_method, tp, fp)
-  end
-  # from the unique set of peptide hits, create a separate peptide hit for
-  # each protein reference where that peptide only references that protein
-  # e.g. pep.prots = [(a single protein)]
-  def pep_prots
-    pps = []
-    peps.each do |pep|
-      pep.prots.map do |prt|
-        pep.dup
-        pep.prots = [prt]
-        pps << pep
-      end
-    end
-    pps
-  end
-  def self.prots?(ar)
-    ar.first.is_a? SpecID::Prot
-  end
-  def self.peps?(ar)
-    ar.first.is_a? SpecID::Pep
-  end
-  # for older stuff
-  def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
-    objects =
-      case items
-      when :prots
-        prots
-      when :peps
-        peps
-      end
-    SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
-  end
-  # includes the peptide hit in both
-  # returns (target, decoy)
-  # (for peps) ties can be :both, true (target wins), false (decoy wins)
-  # regardless of ties behavior, will partition out the proteins to be
-  # appropriate for the peptide
-  def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
-    if items.size == 0
-      return [[],[]]
-    elsif prots?(items)
-      myproc = proc { |prt|
-        if prt.reference =~ regex ; !decoy_on_match
-        else ; decoy_on_match end
-      }
-      return classify(items, myproc)
-    elsif peps?(items)
-      match = [] ; nomatch = []
-      items.each do |pep|
-        (match_prots, nomatch_prots) = pep.prots.partition do |prot|
-          prot.reference =~ regex
-        end
-        if match_prots.size == 0
-          nomatch << pep
-        elsif nomatch_prots.size == 0
-          match << pep
-        else ## both have hits
-          pep.prots = match_prots
-          nomatch_pep = pep.dup
-          nomatch_pep.prots = nomatch_prots
-          # resolve ties
-          case ties
-          when true
-            if decoy_on_match
-              nomatch << pep
-            else
-              match << pep
-            end
-          when false
-            if decoy_on_match
-              match << pep
-            else
-              nomatch << pep
-            end
-          when :both
-            match << pep
-            nomatch << pep
-          else ; raise ArgumentError
-          end
-        end
-      end
-      if decoy_on_match
-        return [nomatch , match]
-      else
-        return [match, nomatch]
-      end
-    else
-      raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
-    end
-  end
-  # returns [tp, fp] based on the protein prefix for items where items =
-  # (:prots|:peps)
-  # this may result in a duplication of some peptides if they match both
-  # normal and decoy proteins.  In this case, the protein arrays are split,
-  # too, so that each points only to its breed of protein.
-  def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
-    if prefix
-      regex = /^#{Regexp.escape(flag)}/
-    else
-      regex = /#{Regexp.escape(flag)}/
-    end
-    classify_by_regex(items, regex, decoy_on_match)
-  end
-  # Returns (match, nomatch)
-  # items = symbol (:prots, :peps)
-  # Returns two arrays, those returning true from classify_item_by and those
-  # returning false
-  def classify(items, classify_item_by)
-    its = send(items)
-    f = []; t = []
-    its.each do |it|
-      if classify_item_by.call(it)
-        t << it
-      else
-        f << it
-      end
-    end
-    [t,f]
-  end
-  # returns two arrays, true positives and false positives (determined by proc
-  # classify_item_by) sorted by proc rank_item_by.  Items will be ranked from
-  # lowest to highest based on the return value of rank_item_by. items is a
-  # symbol (:prots or :peps)
-  def rank_and_classify(items, rank_item_by, classify_item_by)
-    its = send(items)
-    #its.each do |it| puts it.probability.to_s ; puts it.reference end
-    doublets = its.collect do |item|
-      [ rank_item_by.call(item),
-        classify_item_by.call(item) ]
-    end
-    roc = ROC.new
-    tp, fp = roc.doublets_to_separate(doublets)
-    return tp, fp
-  end
-  # returns a proc for getting all probabilities so that an ascending sort
-  # will put the best scores first
-  def probability_proc
-    if hi_prob_best
-      get_prob_proc = proc {|prt| prt.probability * -1 }
-    else
-      get_prob_proc = proc {|prt| prt.probability }
-    end
-    get_prob_proc
-  end
-  def separate_by_prefix(items, fp_prefix)
-    its = send(items)
-    if items == :prots
-    elsif items == :peps
-      abort "not implemented yet"
-    else
-      abort "no other items recognized yet"
-    end
-  end
-  # sorts the probabilities and then
-  # calcs predicted number hits and precision for protein probabilities
-  # (summing probabilities)
-  # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
-  # called false positive rate
-  # SUM(1-probX)/#prots
-  def num_hits_and_ppv_for_protein_prophet_probabilities
-    current_sum_one_minus_prob = 0.0
-    num_prots = []
-    ppv = []
-    prot_cnt = 0
-    probs = prots.map {|v| v.probability}
-    sorted = probs.sort.reverse
-    sorted.each do |prob|
-      prot_cnt += 1
-      num_prots << prot_cnt
-      current_sum_one_minus_prob += 1.0 - prob
-      ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
-      # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
-    end
-    [num_prots, ppv]
-  end
-  # convenience method for the common task of determining precision for
-  # proteins (with decoy proteins found by false_flag)
-  # returns (num_hits, precision)
-  def num_hits_and_ppv_for_prob(false_flag, prefix=false)
-    if prefix
-      regex = /^#{Regexp.escape(false_flag)}/
-    else
-      regex = /#{Regexp.escape(false_flag)}/
-    end
-    prob_proc = probability_proc
-    myproc = proc { |prt|
-      if prt.reference =~ regex ; false
-      else ; true end
-    }
-    real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
-    (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
-    [num_hits, precision]
-  end
-#  # takes the existing spec_id object and marshals it into "file.msh"
-#  # a new file will always look for a file.msh to load
-#  def marshal(force=false)
-#    if !(File.exist? @marshal_file)| force
-#      File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) }
-#    end
-#  end
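-  # Returns 'srg' or 'sqg' based on the file extension (.srg, .sqg);
-  # otherwise inspects the file contents and returns 'srf' if SRF file,
-  # 'bioworks' if bioworks xml, 'protproph' if Protein prophet, 'pepproph'
-  # if Peptide prophet, or 'sqt' if SQT file.  Returns nil if none of these
-  # is recognized.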
-  def self.file_type(file)
-    if file =~ /\.srg$/
-      return 'srg'
-    elsif file =~ /\.sqg$/
-      return 'sqg'
-    end
-    if IO.read(file, 7,438) == 'Enzyme:'
-      return 'srf'
-    end
-    File.open(file) do |fh|
-      lines = ""
-      8.times { lines << fh.readline }
-      if lines =~ /<bioworksinfo>/
-        return 'bioworks'
-      elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
-        return 'protproph'
-      elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
-        return 'pepproph'
-      end
-      # assumes the header of a sqt file is less than 200 lines ...
-      200.times do
-        line = fh.gets
-        if line
-          lines << line
-        else ; break
-        end
-      end
-      if lines =~ /^H\tDatabase/ and lines =~ /^H\tSQTGenerator/
-        return 'sqt'
-      end
-    end
-  end
-  ##############################################
-  # These are pretty specific to Smriti's needs:
-  # Given a hash of peptide arrays by some attribute key
-  # Returns two sorted arrays of probabilities: the first built from the
-  # minimum peptide_probability hit of each peptide array, the second from
-  # the 10 lowest peptide_probability hits of each peptide array
-  def min_and_best10(hash)
-    ## choose the min probability and sort by prob
-    min_peptides = hash.collect do |k,v|
-      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
-    end
-    #puts min_peptides[0] # -> Bioworks::Pep
-    min_sorted_peps = sorted_probabilities(min_peptides)
-    #puts min_sorted_peps[0] # -> probability (Float)
-    peptides_by_tens = []
-    hash.each do |k,v|
-      arr = v.sort_by {|pep| pep.peptide_probability }.slice(0,10)
-      peptides_by_tens.push(*arr)
-    end
-    top_10_sorted_peps = sorted_probabilities(peptides_by_tens)
-    #puts top_10_sorted_peps[0] # -> float
-    #puts "size: top_10_sorted_peps.size : #{top_10_sorted_peps.size}"
-    #puts "size: min_sorted_peps.size : #{min_sorted_peps.size}"
-    #p top_10_sorted_peps
-    #p min_sorted_peps
-    return min_sorted_peps, top_10_sorted_peps
-  end
-  # Returns a list of sorted probabilities given the array of peptides
-  def sorted_probabilities(peptides)
-    #puts peptides.first.peptide_probability.class
-    #peptides.each do |pep| print pep.class.to_s + " " end
-    #puts peptides.first.is_a? Array
-    #abort "DFHDFD"
-    peptides.collect{|pep| pep.probability }.sort
-  end
-  # returns a sorted list of probabilities based on all pepprots (a peptide
-  # associated with a protein)
-  def pep_probs_by_pep_prots
-    sorted_probabilities(peps)
-  end
-  ##########################################################################
-  # WARNING! These might be dangerous to your health if there are multiple
-  # files collected in your bioworks file
-  ##########################################################################
-  # (prob_list_by_min, prob_list_by_best10)
-  # returns 2 sorted lists of probabilities based on:
-  #   1. best peptide hit
-  #   2. top 10 peptide hits
-  # on a per scan basis
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_scan
-    hash = peps.hash_by(:first_scan, :last_scan)
-    return min_and_best10(hash)
-  end
-  #(prob_list_by_min, prob_list_by_best10)
-  # same as pep_probs_by_scan but per charge state
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_scan_charge
-    hash = peps.hash_by(:first_scan, :last_scan, :charge)
-    return min_and_best10(hash)
-  end
-  # (prob_list_by_min)
-  # hashes on seq-charge and returns the sorted list of probabilities of top
-  # hit per seq-charge
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_seq_charge
-    hash = peps.hash_by(:sequence, :charge)
-    min_peptides = hash.collect do |k,v|
-      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
-    end
-    sorted_probabilities(min_peptides)
-  end
-  ##########################################################################
-  # USE these if you have multiple files in your bioworks.xml file
-  ##########################################################################
-  # (prob_list_by_min, prob_list_by_best10)
-  # returns 2 sorted lists of probabilities based on:
-  #   1. best peptide hit
-  #   2. top 10 peptide hits
-  # on a per scan basis
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_bn_scan
-    hash = peps.hash_by(:base_name, :first_scan, :last_scan)
-    return min_and_best10(hash)
-  end
-  #(prob_list_by_min, prob_list_by_best10)
-  # same as pep_probs_by_scan but per charge state
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_bn_scan_charge
-    hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
-    return min_and_best10(hash)
-  end
-  # (prob_list_by_min)
-  # hashes on seq-charge and returns the sorted list of probabilities of top
-  # hit per seq-charge
-  # NOTE: you may want to hash on base_name first!
-  def pep_probs_by_bn_seq_charge
-    hash = peps.hash_by(:base_name, :sequence, :charge)
-    min_peptides = hash.collect do |k,v|
-      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
-    end
-    sorted_probabilities(min_peptides)
-  end
-end
-# A Generic spectraID protein
-module SpecID::Prot
-  include ProteinReferenceable
-  # probability is always a float!
-  attr_accessor :probability, :reference, :peps
-  def <=> (other)
-    self.reference <=> other.reference
-  end
-  def inspect
-    pep_string =
-      if peps
-      ", @peps(#)=#{peps.size}"
-      end
-    "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
-  end
-end
-module SpecID::Pep
-   Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
-  attr_accessor :prots
-  attr_accessor :probability
-  # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
-  # or last.
-  attr_accessor :sequence
-  # the basic amino acid sequence (no leading or trailing '.' or amino acids)
-  # should not contain any special symbols, etc.
-  attr_accessor :aaseq
-  attr_accessor :charge
-  # removes nonstandard chars with Non_standard_amino_acid_char_re
-  # preserves A-Z and '.' and '-'
-  def self.remove_non_amino_acids(sequence)
-    sequence.gsub(Non_standard_amino_acid_char_re, '')
-  end
-  # remove_non_amino_acids && split_sequence
-  def self.prepare_sequence(val)
-    nv = remove_non_amino_acids(val)
-    split_sequence(nv)
-  end
-  def <=>(other)
-    aaseq <=> other.aaseq
-  end
-  # Returns prev, peptide, next from sequence.  Parse errors return
-  # nil,nil,nil
-  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
-  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
-  #   PEPTIDE.A    # -> -, PEPTIDE, A
-  #   A.PEPTIDE    # -> A, PEPTIDE, -
-  #   PEPTIDE      # -> nil,nil,nil
-  def self.split_sequence(val)
-    peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
-    pieces = val.split('.')
-    case pieces.size
-    when 3
-      peptide_prev_aa, peptide, peptide_next_aa = *pieces
-    when 2
-      if pieces[0].size > 1  ## N termini
-        peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
-      else  ## C termini
-        peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
-      end
-    when 1  ## this must be a parse error!
-      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
-    when 0
-      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
-    end
-    return peptide_prev_aa, peptide, peptide_next_aa
-  end
-  ##
-  def self.sequence_to_aaseq(sequence)
-    after_removed = remove_non_amino_acids(sequence)
-    pieces = after_removed.split('.')
-    case pieces.size
-    when 3
-      pieces[1]
-    when 2
-      if pieces[0].size > 1  ## N termini
-        pieces[0]
-      else  ## C termini
-        pieces[1]
-      end
-    when 1  ## this must be a parse error!
-      pieces[0] ## which is the peptide itself
-    else
-      abort "bad peptide sequence: #{sequence}"
-    end
-  end
-  # This will rapidly determine the list of proteins to which the given
-  # peptides belong.  It is meant to be low level and fast (eventually),
-  # so it asks for the data in a format amenable to this.
-  # returns a mirror array where each entry is an array of Fasta::Prot
-  # objects where each protein contains the sequence
-  def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
-    prots = fasta_obj.prots
-    prot_seqs = prots.map do |prot|
-      prot.aaseq
-    end
-    groups = peptide_strings_list.map do |pep_seq|
-      prot_index = 0
-      protein_group = []
-      prot_seqs.each do |prot_seq|
-        if prot_seq.include? pep_seq
-          protein_group << prots[prot_index]
-        end
-        prot_index += 1
-      end
-      protein_group
-    end
-    groups
-  end
-  # units can be :mmu, :amu, :ppm
-  def mass_accuracy(pep, unit=:ppm, mono=true)
-    # 10^6 * deltam accuracy/ m[measured]
-    # i.e., theoretical mass 1000, measured 999.9: 100ppm
-    # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
-    # pep.mass is the theoretical M+H of the peptide
-    # this assumes that the deltacn value we're being told is correct, but I
-    # have my suspicions (since the <mass> value is not accurate...)
-    ######## TO COMPLETE (and add to spec_id..?)
-    case unit
-    when :ppm
-    when :amu
-    when :mmu
-    end
-  end
-  # calls the method associated with each key and returns the value
-  def values_at(*args)
-    args.map do |arg|
-      send(arg)
-    end
-  end
-  def inspect
-    prot_string =
-      if prots
-      ", @prots(#)=#{prots.size}"
-      end
-    "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
-  end
-end
-class SpecID::GenericProt
-  include SpecID::Prot
-end
-class SpecID::GenericPep
-  include SpecID::Pep
-end