RubyGems - mspire - Versions diffs - 0.4.9 → 0.5.0 - Mend

mspire 0.4.9 → 0.5.0

Files changed (255) hide show

data/README +27 -17
data/changelog.txt +31 -62
data/lib/ms/calc.rb +32 -0
data/lib/ms/data/interleaved.rb +60 -0
data/lib/ms/data/lazy_io.rb +73 -0
data/lib/ms/data/lazy_string.rb +15 -0
data/lib/ms/data/simple.rb +59 -0
data/lib/ms/data/transposed.rb +41 -0
data/lib/ms/data.rb +57 -0
data/lib/ms/format/format_error.rb +12 -0
data/lib/ms/spectrum.rb +25 -384
data/lib/ms/support/binary_search.rb +126 -0
data/lib/ms.rb +10 -10
metadata +38 -350
data/INSTALL +0 -58
data/README.rdoc +0 -18
data/Rakefile +0 -330
data/bin/aafreqs.rb +0 -23
data/bin/bioworks2excel.rb +0 -14
data/bin/bioworks_to_pepxml.rb +0 -148
data/bin/bioworks_to_pepxml_gui.rb +0 -225
data/bin/fasta_shaker.rb +0 -5
data/bin/filter_and_validate.rb +0 -5
data/bin/gi2annot.rb +0 -14
data/bin/id_class_anal.rb +0 -112
data/bin/id_precision.rb +0 -172
data/bin/ms_to_lmat.rb +0 -67
data/bin/pepproph_filter.rb +0 -16
data/bin/prob_validate.rb +0 -6
data/bin/protein_summary.rb +0 -6
data/bin/protxml2prots_peps.rb +0 -32
data/bin/raw_to_mzXML.rb +0 -55
data/bin/run_percolator.rb +0 -122
data/bin/sqt_group.rb +0 -26
data/bin/srf_group.rb +0 -27
data/bin/srf_to_sqt.rb +0 -40
data/lib/align/chams.rb +0 -78
data/lib/align.rb +0 -154
data/lib/archive/targz.rb +0 -94
data/lib/bsearch.rb +0 -120
data/lib/core_extensions.rb +0 -16
data/lib/fasta.rb +0 -626
data/lib/gi.rb +0 -124
data/lib/group_by.rb +0 -10
data/lib/index_by.rb +0 -11
data/lib/merge_deep.rb +0 -21
data/lib/ms/converter/mzxml.rb +0 -77
data/lib/ms/gradient_program.rb +0 -170
data/lib/ms/msrun.rb +0 -244
data/lib/ms/msrun_index.rb +0 -108
data/lib/ms/parser/mzdata/axml.rb +0 -67
data/lib/ms/parser/mzdata/dom.rb +0 -175
data/lib/ms/parser/mzdata/libxml.rb +0 -7
data/lib/ms/parser/mzdata.rb +0 -31
data/lib/ms/parser/mzxml/axml.rb +0 -70
data/lib/ms/parser/mzxml/dom.rb +0 -182
data/lib/ms/parser/mzxml/hpricot.rb +0 -253
data/lib/ms/parser/mzxml/libxml.rb +0 -19
data/lib/ms/parser/mzxml/regexp.rb +0 -122
data/lib/ms/parser/mzxml/rexml.rb +0 -72
data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
data/lib/ms/parser/mzxml.rb +0 -282
data/lib/ms/parser.rb +0 -108
data/lib/ms/precursor.rb +0 -25
data/lib/ms/scan.rb +0 -81
data/lib/mspire.rb +0 -4
data/lib/pi_zero.rb +0 -244
data/lib/qvalue.rb +0 -161
data/lib/roc.rb +0 -187
data/lib/sample_enzyme.rb +0 -160
data/lib/scan_i.rb +0 -21
data/lib/spec_id/aa_freqs.rb +0 -170
data/lib/spec_id/bioworks.rb +0 -497
data/lib/spec_id/digestor.rb +0 -138
data/lib/spec_id/mass.rb +0 -179
data/lib/spec_id/parser/proph.rb +0 -335
data/lib/spec_id/precision/filter/cmdline.rb +0 -218
data/lib/spec_id/precision/filter/interactive.rb +0 -134
data/lib/spec_id/precision/filter/output.rb +0 -148
data/lib/spec_id/precision/filter.rb +0 -637
data/lib/spec_id/precision/output.rb +0 -60
data/lib/spec_id/precision/prob/cmdline.rb +0 -160
data/lib/spec_id/precision/prob/output.rb +0 -94
data/lib/spec_id/precision/prob.rb +0 -249
data/lib/spec_id/proph/pep_summary.rb +0 -104
data/lib/spec_id/proph/prot_summary.rb +0 -484
data/lib/spec_id/proph.rb +0 -4
data/lib/spec_id/protein_summary.rb +0 -489
data/lib/spec_id/sequest/params.rb +0 -316
data/lib/spec_id/sequest/pepxml.rb +0 -1458
data/lib/spec_id/sequest.rb +0 -33
data/lib/spec_id/sqt.rb +0 -349
data/lib/spec_id/srf.rb +0 -973
data/lib/spec_id.rb +0 -778
data/lib/spec_id_xml.rb +0 -99
data/lib/transmem/phobius.rb +0 -147
data/lib/transmem/toppred.rb +0 -368
data/lib/transmem.rb +0 -157
data/lib/validator/aa.rb +0 -48
data/lib/validator/aa_est.rb +0 -112
data/lib/validator/background.rb +0 -77
data/lib/validator/bias.rb +0 -95
data/lib/validator/cmdline.rb +0 -431
data/lib/validator/decoy.rb +0 -107
data/lib/validator/digestion_based.rb +0 -70
data/lib/validator/probability.rb +0 -51
data/lib/validator/prot_from_pep.rb +0 -234
data/lib/validator/q_value.rb +0 -32
data/lib/validator/transmem.rb +0 -272
data/lib/validator/true_pos.rb +0 -46
data/lib/validator.rb +0 -197
data/lib/xml.rb +0 -38
data/lib/xml_style_parser.rb +0 -119
data/lib/xmlparser_wrapper.rb +0 -19
data/release_notes.txt +0 -2
data/script/compile_and_plot_smriti_final.rb +0 -97
data/script/create_little_pepxml.rb +0 -61
data/script/degenerate_peptides.rb +0 -47
data/script/estimate_fpr_by_cysteine.rb +0 -226
data/script/extract_gradient_programs.rb +0 -56
data/script/find_cysteine_background.rb +0 -137
data/script/genuine_tps_and_probs.rb +0 -136
data/script/get_apex_values_rexml.rb +0 -44
data/script/histogram_probs.rb +0 -61
data/script/mascot_fix_pepxml.rb +0 -123
data/script/msvis.rb +0 -42
data/script/mzXML2timeIndex.rb +0 -25
data/script/peps_per_bin.rb +0 -67
data/script/prep_dir.rb +0 -121
data/script/simple_protein_digestion.rb +0 -27
data/script/smriti_final_analysis.rb +0 -103
data/script/sqt_to_meta.rb +0 -24
data/script/top_hit_per_scan.rb +0 -67
data/script/toppred_to_yaml.rb +0 -47
data/script/tpp_installer.rb +0 -249
data/specs/align_spec.rb +0 -79
data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
data/specs/bin/fasta_shaker_spec.rb +0 -259
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
data/specs/bin/filter_and_validate_spec.rb +0 -180
data/specs/bin/ms_to_lmat_spec.rb +0 -34
data/specs/bin/prob_validate_spec.rb +0 -86
data/specs/bin/protein_summary_spec.rb +0 -14
data/specs/fasta_spec.rb +0 -354
data/specs/gi_spec.rb +0 -22
data/specs/load_bin_path.rb +0 -7
data/specs/merge_deep_spec.rb +0 -13
data/specs/ms/gradient_program_spec.rb +0 -77
data/specs/ms/msrun_spec.rb +0 -498
data/specs/ms/parser_spec.rb +0 -92
data/specs/ms/spectrum_spec.rb +0 -87
data/specs/pi_zero_spec.rb +0 -115
data/specs/qvalue_spec.rb +0 -39
data/specs/roc_spec.rb +0 -251
data/specs/rspec_autotest.rb +0 -149
data/specs/sample_enzyme_spec.rb +0 -126
data/specs/spec_helper.rb +0 -135
data/specs/spec_id/aa_freqs_spec.rb +0 -52
data/specs/spec_id/bioworks_spec.rb +0 -148
data/specs/spec_id/digestor_spec.rb +0 -75
data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
data/specs/spec_id/precision/filter/output_spec.rb +0 -31
data/specs/spec_id/precision/filter_spec.rb +0 -246
data/specs/spec_id/precision/prob_spec.rb +0 -44
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
data/specs/spec_id/protein_summary_spec.rb +0 -189
data/specs/spec_id/sequest/params_spec.rb +0 -68
data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
data/specs/spec_id/sequest_spec.rb +0 -38
data/specs/spec_id/sqt_spec.rb +0 -246
data/specs/spec_id/srf_spec.rb +0 -172
data/specs/spec_id/srf_spec_helper.rb +0 -139
data/specs/spec_id_helper.rb +0 -33
data/specs/spec_id_spec.rb +0 -366
data/specs/spec_id_xml_spec.rb +0 -33
data/specs/transmem/phobius_spec.rb +0 -425
data/specs/transmem/toppred_spec.rb +0 -298
data/specs/transmem_spec.rb +0 -60
data/specs/transmem_spec_shared.rb +0 -64
data/specs/validator/aa_est_spec.rb +0 -66
data/specs/validator/aa_spec.rb +0 -40
data/specs/validator/background_spec.rb +0 -67
data/specs/validator/bias_spec.rb +0 -122
data/specs/validator/decoy_spec.rb +0 -51
data/specs/validator/fasta_helper.rb +0 -26
data/specs/validator/prot_from_pep_spec.rb +0 -141
data/specs/validator/transmem_spec.rb +0 -146
data/specs/validator/true_pos_spec.rb +0 -58
data/specs/validator_helper.rb +0 -33
data/specs/xml_spec.rb +0 -12
data/test_files/000_pepxml18_small.xml +0 -206
data/test_files/020a.mzXML.timeIndex +0 -4710
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
data/test_files/4-03-03_small-prot.xml +0 -321
data/test_files/4-03-03_small.xml +0 -3876
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +0 -5999
data/test_files/bioworks31.params +0 -77
data/test_files/bioworks32.params +0 -62
data/test_files/bioworks33.params +0 -63
data/test_files/bioworks_single_run_small.xml +0 -7237
data/test_files/bioworks_small.fasta +0 -212
data/test_files/bioworks_small.params +0 -63
data/test_files/bioworks_small.phobius +0 -109
data/test_files/bioworks_small.toppred.out +0 -2847
data/test_files/bioworks_small.xml +0 -5610
data/test_files/bioworks_with_INV_small.xml +0 -3753
data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +0 -304
data/test_files/messups.fasta +0 -297
data/test_files/opd1/000.my_answer.100lines.xml +0 -101
data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
data/test_files/opd1/000_020_3prots-prot.xml +0 -62
data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
data/test_files/opd1/sequest.3.1.params +0 -77
data/test_files/opd1/sequest.3.2.params +0 -62
data/test_files/opd1/twenty_scans.mzXML +0 -418
data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +0 -9
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
data/test_files/pepproph_small.xml +0 -4691
data/test_files/phobius.small.noheader.txt +0 -50
data/test_files/phobius.small.small.txt +0 -53
data/test_files/s01_anC1_ld020mM.key.txt +0 -25
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +0 -297
data/test_files/small.sqt +0 -87
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +0 -14340
data/test_files/tf_bioworks2excel.txt.actual +0 -1035
data/test_files/toppred.small.out +0 -416
data/test_files/toppred.xml.out +0 -318
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
data/test_files/yeast_gly_small-prot.xml +0 -265
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
data/test_files/yeast_gly_small.xml +0 -3807
data/test_files/yeast_gly_small2.parentTimes +0 -6

data/lib/pi_zero.rb DELETED Viewed

@@ -1,244 +0,0 @@
-require 'rsruby'
-require 'vec'
-require 'vec/r'
-require 'enumerator'
-module PiZero  # helpers for estimating pi_0 / frit and for computing p-values from score lists
-  class << self  # every method below is defined on the PiZero module itself
-    # takes a sorted array of p-values (floats between 0 and 1 inclusive)
-    # returns [thresholds_ar, instantaneous pi_0 calculations_ar]
-    # evenly incremented values will be used by default:
-    # :start=>0.0, :stop=>0.9, :step=>0.05
-    def pi_zero_hats(sorted_pvals, args={})
-      defaults = {:start => 0.0, :stop=>0.9, :step=>0.05 }
-      margs = defaults.merge( args )
-      (start, stop, step) = margs.values_at(:start, :stop, :step)
-      # From Storey et al. PNAS 2003:
-      lambdas = []                 # lambda
-      pi_zeros = []                # pi_0
-      total = sorted_pvals.size  # m
-      # totally inefficient implementation (with correct logic):
-      # TODO: implement this efficiently
-      start.step(stop, step) do |lam|
-        lambdas << lam
-        (greater, less) = sorted_pvals.partition {|pval| pval > lam }  # only greater is used below
-        pi_zeros.push( greater.size.to_f / ( total * (1.0 - lam) ) )  # count(p > lam) / (m * (1 - lam))
-      end
-      [lambdas, pi_zeros]
-    end
-=begin
-    def plateau_height_with_gsl(x, y)
-      require 'gsl'
-      x_deltas = (0...(x.size-1)).to_a.map do |i|
-        x[i+1] - x[i]
-      end
-      y_deltas = (0...(y.size-1)).to_a.map do |i|
-        y[i+1] - y[i]
-      end
-      new_xs = x.dup
-      new_ys = y.dup
-      x_deltas.reverse.each do |delt|
-        new_xs.push( new_xs.last + delt )
-      end
-      y_cnt = y.size
-      y_deltas.reverse.each do |delt|
-        y_cnt -= 1
-        new_ys.push( y[y_cnt] - delt )
-      end
-      x_vec = GSL::Vector.alloc(new_xs)
-      y_vec = GSL::Vector.alloc(new_ys)
-      coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
-      coef.eval(x.last)
-      #x2 = GSL::Vector::linspace(0,2.4,20)
-      #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
-    end
-=end
-    # expecting x and y to make a scatter plot descending to a plateau on the
-    # right side (which is assumed to be of increasing noise as it goes to the
-    # right)
-    # returns the height of the plateau at the right edge
-    #
-    # *
-    #   *
-    #     *
-    #       **
-    #          ** ***         *    *
-    #                    ***** **** ***
-    def plateau_height(x, y)
-      r = RSRuby.instance
-      answ = r.smooth_spline(x,y, :df => 3)  # presumably R's smooth.spline called through RSRuby - TODO confirm
-      ## to plot it!
-      r.plot(x,y, :ylab=>"pi_zeros or frit")  # NOTE(review): plotting happens on every call, not only when debugging
-      r.lines(answ['x'], answ['y'])
-      r.points(answ['x'], answ['y'])
-      sleep(4)  # NOTE(review): unconditional 4 second pause, presumably so the plot can be viewed
-      answ['y'].last  # last fitted y value is returned as the plateau height
-    end
-    def plateau_exponential(x,y)  # NOTE(review): unfinished - always raises NotImplementedError after the fit
-      require 'gsl'
-      xvec = GSL::Vector.alloc(x)
-      yvec = GSL::Vector.alloc(y)
-      a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec))  # linear fit of log(y) against x
-      x2 = GSL::Vector.linspace(0, 1.2, 20)
-      exp_a = GSL::Sf::exp(a2)
-      out_y = exp_a*GSL::Sf::exp(b2*x2)  # computed but never returned or used
-      raise NotImplementedError, "need to grab out the answer"
-      #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4")
-    end
-    # returns a conservative (but close) estimate of pi_0 given p-values
-    # following Storey et al. 2003, PNAS.
-    def pi_zero(pvals)
-      sorted_pvals = pvals.sort
-      plateau_height( *(pi_zero_hats(sorted_pvals)) )  # splats [lambdas, pi_zeros] as the x and y arguments
-    end
-    # returns an array where the left values have been filled in using the
-    # similar values on the right side of the distribution.  These values are
-    # pushed onto the end of the array in no guaranteed order.
-    # extends a distribution on the left side where it is missing since
-    # xcorr values <= 0.0 are not reported
-    #     **
-    #    *  *
-    #   *    *
-    #          *
-    #            *
-    #                   *
-    #  Grabs the right tail from above and inverts it to the left side (less
-    #  than zero), creating a more full distribution.  raises an ArgumentError
-    #  if values_chopped_at_zero.size == 0
-    #  this method would be more robust with some smoothing.
-    #  Method currently only meant for large amounts of data.
-    #  input data does not need to be sorted
-    def extend_distribution_left_of_zero(values_chopped_at_zero)
-      sz = values_chopped_at_zero.size
-      raise ArgumentError, "array.size must be > 0" if sz == 0
-      num_bins = (Math.log10(sz) * 100).round  # NOTE(review): evaluates to 0 bins when sz == 1
-      vec = VecD.new(values_chopped_at_zero)
-      (bins, freqs) = vec.histogram(num_bins)
-      start_i = 0
-      freqs.each_with_index do |f,i|  # locate the first bin holding a positive numeric frequency
-        if f.is_a?(Numeric) && f > 0
-          start_i = i
-          break
-        end
-      end
-      match_it = freqs[start_i]
-      # scanning from the right: find the first position whose left-hand neighbour frequency is >= match_it
-      index_to_chop_at = -1
-      rev_freqs = freqs.reverse
-      rev_freqs.each_with_index do |freq,rev_i|
-        if match_it - rev_freqs[rev_i+1] <= 0  # NOTE(review): rev_i+1 is out of range on the final iteration
-          index_to_chop_at = freqs.size - 1 - rev_i
-          break
-        end
-      end
-      cut_point = bins[index_to_chop_at]  # stays bins[-1] when the loop above never breaks
-      values_chopped_at_zero + values_chopped_at_zero.select {|v| v >= cut_point }.map {|v| cut_point - v }
-    end
-    # assumes the decoy_vals follows a normal distribution
-    def p_values(target_vals, decoy_vals)
-      (mean, stdev) = VecD.new(decoy_vals).sample_stats
-      r = RSRuby.instance  # r is not referenced again in this method
-      vec = VecD.new(target_vals)
-      right_tailed = true
-      vec.p_value_normal(mean, stdev, right_tailed)
-    end
-    def p_values_for_sequest(target_hits, decoy_hits)  # p-values of target xcorr values against the left-extended decoy xcorr values
-      dh_vals = decoy_hits.map {|v| v.xcorr }
-      new_decoy_vals = PiZero.extend_distribution_left_of_zero(dh_vals)
-      #File.open("target.yml", 'w') {|out| out.puts new_decoy_vals.join(" ") }
-      #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") }
-      #abort 'checking'
-      p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals )
-    end
-#### NEED TO VERIFY if this is PIT or PI_ZERO!
-=begin
-    # takes a list of booleans with true being a target hit and false being a
-    # decoy hit and returns the pi_zero using the smooth method
-    # Should be ordered from best to worst (i.e., one expects more true values
-    # at the beginning of the list)
-    def pi_zero_from_booleans(booleans)
-      targets = 0
-      decoys = 0
-      xs = []
-      ys = []
-      booleans.reverse.each_with_index do |v,index|
-        if v
-          targets += 1
-        else
-          decoys += 1
-        end
-        if decoys > 0
-          xs << index
-          ys << targets.to_f / decoys
-        end
-      end
-      ys.reverse!
-      plateau_height(xs, ys)
-    end
-=end
-    # returns fraction of incorrect target hits (frit) (this is the percent
-    # incorrect targets [PIT] expressed as a fraction rather than percent)
-    # takes two parallel arrays consisting of the total number of hits (this
-    # will typically be the total # target hits) at that point and the
-    # precision (ranging from: [0,1]) (typically determined by counting the
-    # number of decoy hits).  Expects the number of total hits to be
-    # monotonically increasing and the precision to roughly start high and
-    # decrease as more hits (of lesser quality) are added.
-    def frit_from_precision(total_num_hits_ar, precision_ar)
-      instant_pi_zeros = []
-      total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0|  # adjacent [hits, precision] pairs, largest hit count first
-        (x1, y1) = dp1
-        (x0, y0) = dp0
-        instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0)  # change in hits*(1 - precision) per added hit
-      end
-      instant_pi_zeros.reverse!
-      plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros)  # first x is dropped so x and y have equal length
-    end
-    # Takes an array of doublets ([[int, int], [int, int]...]) where the first
-    # value is the number of target hits and the second is the number of decoy
-    # hits.  Expects that best hits are at the beginning of the list.  Assumes
-    # that each sum is a subset of the following group (shown as actual hits
-    # rather than number of hits):
-    #
-    #    [[target, target, target, decoy], [target, target, target, decoy,
-    #    target, decoy, target], [target, target, target, decoy, target,
-    #    decoy, target, decoy, target, target]]
-    #
-    # This assumption may be relaxed somewhat and should still give good
-    # results.
-    def frit_from_groups(array_of_doublets)
-      frits = []
-      array_of_doublets.reverse.each_cons(2) do |two_doublets|
-        bigger, smaller = two_doublets
-        num_targets = bigger[0] - smaller[0]
-        num_decoy = bigger[1] - smaller[1]
-        num_targets = 0 if num_targets < 0
-        num_decoy = 0 if num_targets < 0  # NOTE(review): num_targets was just clamped, so this never fires - probably meant num_decoy < 0
-        if num_decoy > 0
-          frits << (num_targets.to_f / num_decoy)
-        end
-      end
-      frits.reverse!
-      xs = (0...(frits.size)).to_a  # x values are simply 0, 1, 2, ... one per retained ratio
-      plateau_height(xs, frits)
-    end
-  end
-end

data/lib/qvalue.rb DELETED Viewed

@@ -1,161 +0,0 @@
-begin
-require 'rsruby'
-rescue LoadError
-  puts "You must have the rsruby gem installed to use the qvalue module"
-  puts $!
-  raise LoadError
-end
-require 'vec'
-# Adapted from qvalue.R by Alan Dabney and John Storey which was LGPL licensed
-class VecD  # no superclass is given here; presumably reopens the VecD class from the 'vec' library - TODO confirm
-  Default_lambdas = []
-  0.0.step(0.9,0.05) {|v| Default_lambdas << v }  # fills Default_lambdas with 0.0, 0.05, ... up to 0.9
-  Default_smooth_df = 3
-  # returns the pi_zero estimate by taking the fraction of all p-values >=
-  # lambd and dividing by (1-lambd) and guaranteed to be <= 1
-  def pi_zero_at_lambda(lambd)
-    v = (self.select{|v| v >= lambd}.size.to_f/self.size) / (1 - lambd)
-    [v, 1].min
-  end
-  # returns a parallel array (VecI) of how many are <= in the array
-  # roughly: VecD[1,8,10,8,9,10].num_le => VecI[1, 3, 6, 3, 4, 6]
-  def num_le
-    hash = Hash.new {|h,k| h[k] = [] }
-    self.each_with_index do |v,i|  # value => list of the original positions holding that value
-      hash[v] << i
-    end
-    num_le_ar = []
-    sorted = self.sort
-    count = 0
-    sorted.each_with_index do |v,i|
-      back = 1
-      count += 1
-      if v == sorted[i-back]  # NOTE(review): at i == 0 this compares against sorted[-1], the last element
-        while (sorted[i-back] == v)
-          num_le_ar[i-back] = count
-          back -= 1  # NOTE(review): decrementing walks forward, so earlier ties look like they keep stale counts - probably meant back += 1
-        end
-      else
-        num_le_ar[i] = count
-      end
-    end
-    ret = VecI.new(self.size)
-    num_le_ar.zip(sorted) do |n,v|  # write each count back to the original positions of its value
-      indices = hash[v]
-      indices.each do |i|
-        ret[i] = n
-      end
-    end
-    ret
-  end
-  Default_pi_zero_args = {:lambda_vals => Default_lambdas, :method => :smooth, :log_transform => false }
-  # returns the Pi_0 for given p-values (the values in self)
-  #   lambda_vals = Float or Array of floats of size >= 4.  value(s) within (0,1)
-  #   A single value given then the pi_zero is calculated at that point,
-  #   superseding the method or log_transform arguments
-  #   method = :smooth or :bootstrap
-  #   log_transform = true or false
-  def pi_zero(lambda_vals=Default_pi_zero_args[:lambda_vals], method=Default_pi_zero_args[:method], log_transform=Default_pi_zero_args[:log_transform])
-    if self.min < 0 || self.max > 1  # NOTE(review): a value of exactly 1 passes although the message says [0,1)
-      raise ArgumentError, "p-values must be within [0,1)"
-    end
-    if lambda_vals.is_a? Numeric
-      lambda_vals = [lambda_vals]
-    end
-    if lambda_vals.size != 1 && lambda_vals.size < 4
-      raise ArgumentError, "#{tun_arg} must have 1 or 4 or more values"  # NOTE(review): tun_arg is not defined in the visible code
-    end
-    if lambda_vals.any? {|v| v < 0 || v >= 1}
-      raise ArgumentError, "#{tun_arg} vals must be within [0,1)"  # NOTE(review): same undefined tun_arg
-    end
-    pi_zeros = lambda_vals.map {|val| self.pi_zero_at_lambda(val) }
-    r = RSRuby.instance
-    r.plot(lambda_vals,pi_zeros, :ylab=>"instantaneous pi_zeros")  # NOTE(review): plotting runs on every call
-    answ = r.smooth_spline(lambda_vals, pi_zeros, :df => Default_smooth_df)  # answ is only used for the plot lines below
-    r.lines(answ['x'], answ['y'])
-    r.points(answ['x'], answ['y'])
-    sleep(20)  # NOTE(review): unconditional 20 second pause on every call
-    answer =  # this assignment is the last expression, so its value is what the method returns
-      if lambda_vals.size == 1
-        pi_zeros.first
-      else
-        case method
-        when :smooth
-          r = RSRuby.instance
-          calc_pi_zero = lambda do |_pi_zeros|
-            hash = r.smooth_spline(lambda_vals, _pi_zeros, :df => Default_smooth_df)
-            hash['y'][VecD.new(lambda_vals).max_indices.max]  # fitted value at the largest lambda (max_indices presumably gives positions of the maximum)
-          end
-          if log_transform
-            pi_zeros.log_space {|log_vals| calc_pi_zero.call(log_vals) }
-          else
-            calc_pi_zero.call(pi_zeros)
-          end
-        when :bootstrap
-          min_pi0 = pi_zeros.min
-          lsz = lambda_vals.size
-          mse = VecD.new(lsz, 0)
-          pi0_boot = VecD.new(lsz, 0)
-          sz = self.size
-          100.times do   #  for(i in 1:100) {
-            p_boot = self.shuffle  # NOTE(review): if shuffle only permutes, every round yields the same pi0_boot - confirm resampling was intended
-            (0...lsz).each do |i|
-              pi0_boot[i] = ( p_boot.select{|v| v > lambda_vals[i] }.size.to_f/p_boot.size ) / (1-lambda_vals[i])
-            end
-            mse = mse + ( (pi0_boot-min_pi0)**2 )
-          end
-          #  pi0 <- min(pi0[mse==min(mse)])
-          pi_zero = pi_zeros.values_at(*(mse.min_indices)).min
-          [pi_zero,1].min
-        else
-          raise ArgumentError, ":pi_zero_method must be :smooth or :bootstrap!"
-        end
-      end
-  end
-  # Returns a VecD filled with parallel q-values
-  # assumes that vec is filled with p values
-  # see pi_zero method for arguments, these should be named as symbols in the
-  # pi_zero_args hash.
-  #     robust = true or false    an indicator of whether it is desired to make
-  #                           the estimate more robust for small p-values and
-  #                           a direct finite sample estimate of pFDR
-  # A q-value can be thought of as the global positive false discovery rate
-  # at a particular p-value
-  def qvalues(robust=false, pi_zero_args={})
-    sz = self.size
-    pi0_args = Default_pi_zero_args.merge(pi_zero_args)
-    self.pi_zero(*(pi0_args.values_at(:lambda_vals, :method, :log_transform)))  # NOTE(review): result is discarded; bare pi_zero below re-invokes the method with default arguments
-    raise RuntimeError, "pi0 <= 0 ... check your p-values!!" if pi_zero <= 0
-    num_le_ar = self.num_le
-    qvalues =
-      if robust
-        den = self.map {|val| 1 - ((1 - val)**(sz)) }
-        self * (pi_zero * sz) / ( num_le_ar * den)
-      else
-        self * (pi_zero * sz) / num_le_ar
-      end
-    u_ar = self.order  # presumably the indices that sort self ascending - TODO confirm
-    qvalues[u_ar[sz-1]] = [qvalues[u_ar[sz-1]],1].min
-    (0...sz-1).each do |i|  # NOTE(review): ascending pass reads qvalues[u_ar[i+1]] before it is updated, so a minimum carries only one step - confirm vs. a descending pass
-      qvalues[u_ar[i]] = [qvalues[u_ar[i]],qvalues[u_ar[i+1]],1].min
-    end
-    qvalues
-  end
-end

data/lib/roc.rb DELETED Viewed

@@ -1,187 +0,0 @@
-# Class for all types of classification analysis:
-# receiver-operator-characteristics, precision-recall, etc..  Some definitions
-# from (Davis & Goadrich. Proceedings of the 23rd
-# International Conference on Machine Learning, Pittsburgh, PA, 2006):
-#   Recall              = TP/(TP+FN) [aka, Sensitivity]
-#   Precision           = TP/(TP+FP) [aka, Positive Predictive Value]
-#   True Positive Rate  = TP/(TP+FN)
-#   False Positive Rate = FP/(FP+TN)
-#
-# Keys to some abbreviations used in this class:
-#   pred = number predicted to be correct
-#   tps = number of true positives
-#   ppv = positive predictive value
-#   om_ppv = one minus positive predictive value = FP/(TP+FP)
-#
-# NOTE: this class assumes that lower scores are better.  Negate your scores
-# if this is not the case.
-#
-# For estimation of false positive rates using a decoy database strategy, see
-# the DecoyROC class.
-class ROC
-  # returns area under the curve found by trapezoids
-  # x and y specify the coordinates to use
-  # x should be monotonic increasing
-  def area_under_curve(x,y)
-    area = 0.0
-    (0...(x.size-1)).each do |i|
-      # determine which is larger
-      if y[i+1] >= y[i]
-        y1 = y[i+1]; y0 = y[i]
-      else
-        y0 = y[i+1]; y1 = y[i]
-      end
-      area += (x[i+1]-x[i]).to_f * ( y0.to_f + (y1-y0).to_f/2 )  # width * (lower height + half the height difference)
-    end
-    area
-  end
-  # takes two lists of values and makes doublets [[val, boolean],...]
-  def separate_to_doublets(tps, fps)
-    true_doublets = tps.map {|v| [v, 0] }  # 0/1 tags make a true value sort ahead of an equal false value
-    false_doublets = fps.map {|v| [v, 1] }
-    all_doublets = true_doublets + false_doublets
-    all_doublets.sort!
-    all_doublets.map {|v| ((v[1] == 0) ? [v[0], true] : [v[0], false]) }  # swap the numeric tag for a boolean
-  end
-  # given an array of doublets where each doublet is a value and a boolean,
-  # sorts the list and divides it into two arrays (tps, fps) of the values.
-  # The output can then be fed into many of the other routines.
-  def doublets_to_separate(list)
-    tp = []; fp = []
-    list.each do |dbl|
-      if dbl[1]
-        tp << dbl
-      else
-        fp << dbl
-      end
-    end
-    [tp,fp].collect do |arr|
-      arr.collect! {|dbl| dbl[0] }  # keep only the value of each doublet
-      arr.sort
-    end
-  end
-  # Base function for tps calculations: returns [num_tps_ar, ppv_ar]
-  def tps_and_ppv(tp, fp)  # presumably expects tp and fp already sorted ascending - TODO confirm
-    tp_i = 0
-    fp_i = 0
-    x = []
-    y = []
-    num_tps = 0
-    while tp_i < tp.size
-      while fp_i < fp.size && tp[tp_i] >= fp[fp_i]  # advance past every fp value <= the current tp value
-        fp_i += 1
-      end
-      unless tp[tp_i] == tp[tp_i+1]  # skip until the last of a run of equal tp values (tp[tp_i+1] is nil at the end)
-        # get the correct number of each
-        num_tps = tp_i + 1
-        num_fps = fp_i
-        x << num_tps
-        y << num_tps.to_f/(num_tps+num_fps)
-      end
-      tp_i += 1
-    end
-    return x, y
-  end
-  # takes previously sorted doublets [value, boolean]
-  def numhits_and_ppv(doublets)  # returns [num_hits_ar, ppv_ar]
-    x = []
-    y = []
-    tps = 0
-    fps = 0
-    doublets.each_with_index do |d,i|
-      if d[1] ; tps += 1
-      else ; fps += 1 end
-      if (i+1 == doublets.size) || (d[0] != doublets[i+1][0])  # record a point at the end of each run of equal values
-        num_hits = tps + fps
-        x << num_hits
-        y <<  tps.to_f/num_hits
-      end
-    end
-    [x, y]
-  end
-end
-# For calculating precision given lists of hits and decoy hits.  The hits are
-# assumed to have false positives within them that can be estimated from the
-# number of decoy hits at the same rate
-# NOTE: this class assumes that lower scores are better.  Negate your scores
-# if this is not the case.
-class DecoyROC < ROC
-  # returns the [num_hits, num_tps, precision] as a function of true
-  # positives.  Method will return precisely what is calculated (meaning some
-  # answers may seem bizarre if you have better decoy hits than real).
-  def pred_and_tps_and_ppv(hits, decoy_hits)  # presumably expects both arrays sorted ascending - TODO confirm
-    hits_i = 0
-    decoy_i = 0
-    num_hits_ar = []
-    num_tps_ar = []
-    ppv_ar = []
-    while hits_i < hits.size
-      while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]  # count decoys scoring <= the current hit
-        decoy_i += 1
-      end
-      unless hits[hits_i] == hits[hits_i+1]  # one point per run of equal hit values (hits[hits_i+1] is nil at the end)
-        ## determine the number of false positives
-        tot_num_hits = hits_i+1
-        num_tps = tot_num_hits - decoy_i  # decoys counted so far stand in for the false positives
-        num_hits_ar << tot_num_hits
-        num_tps_ar << num_tps
-        ppv_ar << ( num_tps.to_f/tot_num_hits )
-      end
-      hits_i += 1
-    end
-    [num_hits_ar, num_tps_ar, ppv_ar]
-  end
-  # returns [num_hits, precision] as a function of num hits.  decoy hits are
-  # seen merely as indicators of the number of false hits in the dataset.
-  # This is the same algorithm as pred_and_tps_and_ppv, just eliminates
-  # unneeded calcs
-  def pred_and_ppv(hits, decoy_hits)
-    hits_i = 0
-    decoy_i = 0
-    num_hits_ar = []
-    ppv_ar = []
-    while hits_i < hits.size
-      while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]  # count decoys scoring <= the current hit
-        decoy_i += 1
-      end
-      unless hits[hits_i] == hits[hits_i+1]  # one point per run of equal hit values
-        ## determine the number of false positives
-        tot_num_hits = hits_i+1
-        num_tps = tot_num_hits - decoy_i  # decoys counted so far stand in for the false positives
-        num_hits_ar << tot_num_hits
-        ppv_ar << ( num_tps.to_f/tot_num_hits )
-      end
-      hits_i += 1
-    end
-    [num_hits_ar, ppv_ar]
-  end
-end