mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/pi_zero.rb
DELETED
@@ -1,244 +0,0 @@
|
|
1
|
-
require 'rsruby'
|
2
|
-
require 'vec'
|
3
|
-
require 'vec/r'
|
4
|
-
require 'enumerator'
|
5
|
-
|
6
|
-
|
7
|
-
module PiZero
|
8
|
-
class << self
|
9
|
-
# Computes instantaneous pi_0 estimates over a grid of lambda thresholds,
# following Storey et al. PNAS 2003:
#
#   pi_0(lambda) = (number of p-values > lambda) / (m * (1 - lambda))
#
# sorted_pvals:: array of p-values (floats between 0 and 1 inclusive)
# args:: optional overrides for the lambda grid.  Evenly incremented values
#        are used by default: :start=>0.0, :stop=>0.9, :step=>0.05
#
# returns [thresholds_ar, instantaneous pi_0 calculations_ar] (parallel
# arrays, one entry per lambda)
def pi_zero_hats(sorted_pvals, args={})
  defaults = {:start => 0.0, :stop => 0.9, :step => 0.05}
  (start, stop, step) = defaults.merge(args).values_at(:start, :stop, :step)

  total = sorted_pvals.size  # m
  lambdas = []               # lambda
  pi_zeros = []              # pi_0

  start.step(stop, step) do |lam|
    lambdas << lam
    # only the number of p-values above lambda matters, so count them in
    # place instead of partitioning into two throwaway arrays
    num_greater = sorted_pvals.count { |pval| pval > lam }
    pi_zeros << num_greater.to_f / (total * (1.0 - lam))
  end
  [lambdas, pi_zeros]
end
|
32
|
-
|
33
|
-
=begin
|
34
|
-
def plateau_height_with_gsl(x, y)
|
35
|
-
require 'gsl'
|
36
|
-
x_deltas = (0...(x.size-1)).to_a.map do |i|
|
37
|
-
x[i+1] - x[i]
|
38
|
-
end
|
39
|
-
y_deltas = (0...(y.size-1)).to_a.map do |i|
|
40
|
-
y[i+1] - y[i]
|
41
|
-
end
|
42
|
-
new_xs = x.dup
|
43
|
-
new_ys = y.dup
|
44
|
-
x_deltas.reverse.each do |delt|
|
45
|
-
new_xs.push( new_xs.last + delt )
|
46
|
-
end
|
47
|
-
|
48
|
-
y_cnt = y.size
|
49
|
-
y_deltas.reverse.each do |delt|
|
50
|
-
y_cnt -= 1
|
51
|
-
new_ys.push( y[y_cnt] - delt )
|
52
|
-
end
|
53
|
-
|
54
|
-
x_vec = GSL::Vector.alloc(new_xs)
|
55
|
-
y_vec = GSL::Vector.alloc(new_ys)
|
56
|
-
coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
|
57
|
-
coef.eval(x.last)
|
58
|
-
#x2 = GSL::Vector::linspace(0,2.4,20)
|
59
|
-
#graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
|
60
|
-
end
|
61
|
-
=end
|
62
|
-
|
63
|
-
# Expects x and y to make a scatter plot descending to a plateau on the
# right side (which is assumed to be of increasing noise as it goes to the
# right).  Fits a smoothing spline (:df => 3) through the points via R and
# returns the fitted height at the right edge, i.e. the last fitted y.
#
# x, y:: parallel arrays of coordinates
# plot:: when true (the default, matching the historical behaviour) the raw
#        points and the fitted spline are drawn with R and the call then
#        blocks for 4 seconds so the plot can be inspected.  Pass false to
#        get just the number, with no plotting and no delay.
def plateau_height(x, y, plot = true)
  r = RSRuby.instance
  answ = r.smooth_spline(x, y, :df => 3)

  if plot
    r.plot(x, y, :ylab => "pi_zeros or frit")
    r.lines(answ['x'], answ['y'])
    r.points(answ['x'], answ['y'])
    sleep(4)
  end

  answ['y'].last
end
|
85
|
-
|
86
|
-
# Unfinished: fits a straight line to (x, log(y)) with GSL, evaluates the
# resulting exponential on a 20 point grid over [0, 1.2], and then always
# raises NotImplementedError because extracting the plateau value from the
# fit was never written.  'gsl' is loaded lazily on first use.
def plateau_exponential(x,y)
  require 'gsl'
  intercept, slope, = GSL::Fit.linear(GSL::Vector.alloc(x), GSL::Sf::log(GSL::Vector.alloc(y)))
  grid = GSL::Vector.linspace(0, 1.2, 20)
  scale = GSL::Sf::exp(intercept)
  # fitted curve; computed but not yet used (see the raise below)
  _fitted = scale * GSL::Sf::exp(slope * grid)
  raise NotImplementedError, "need to grab out the answer"
end
|
98
|
-
|
99
|
-
# Returns a conservative (but close) estimate of pi_0 for the given p-values,
# following Storey et al. 2003, PNAS: the p-values are sorted, instantaneous
# pi_0 estimates are computed over the default lambda grid, and the height of
# the plateau at the right edge of that curve is returned.
def pi_zero(pvals)
  lambdas, estimates = pi_zero_hats(pvals.sort)
  plateau_height(lambdas, estimates)
end
|
105
|
-
|
106
|
-
# Extends a distribution on the left side where it is missing (since xcorr
# values <= 0.0 are not reported) by mirroring part of its right tail.
#
# The values are histogrammed into round(log10(size) * 100) bins.  The
# frequency of the left-most populated bin is the reference; scanning in from
# the right, the cut point is the bin just right of the first bin whose
# frequency is >= that reference.  Every value >= the cut point is reflected
# to (cut_point - value), i.e. to zero or below, and those reflected values
# are appended to a copy of the input in no guaranteed order.
#
# Raises an ArgumentError if values_chopped_at_zero.size == 0.
# This method would be more robust with some smoothing and is currently only
# meant for large amounts of data.  Input data does not need to be sorted.
def extend_distribution_left_of_zero(values_chopped_at_zero)
  count = values_chopped_at_zero.size
  raise ArgumentError, "array.size must be > 0" if count == 0

  (bins, freqs) = VecD.new(values_chopped_at_zero).histogram((Math.log10(count) * 100).round)

  # left-most bin holding at least one value (stays 0 if none qualifies)
  first_i = 0
  freqs.each_with_index do |freq, i|
    next unless freq.is_a?(Numeric) && freq > 0
    first_i = i
    break
  end
  reference = freqs[first_i]

  # walk in from the right until the next bin to the left is at least as
  # populated as the reference bin
  chop_i = -1
  from_right = freqs.reverse
  from_right.each_with_index do |_freq, ri|
    if (reference - from_right[ri + 1]) <= 0
      chop_i = freqs.size - 1 - ri
      break
    end
  end

  cut_point = bins[chop_i]
  mirrored = values_chopped_at_zero.select { |v| v >= cut_point }.map { |v| cut_point - v }
  values_chopped_at_zero + mirrored
end
|
149
|
-
|
150
|
-
# Returns right-tailed p-values for target_vals under a normal distribution
# whose mean and standard deviation are the sample statistics of decoy_vals
# (i.e. the decoy values are assumed to be normally distributed).
def p_values(target_vals, decoy_vals)
  decoy_mean, decoy_stdev = VecD.new(decoy_vals).sample_stats
  # NOTE(review): the R handle is fetched but not used in this method;
  # the call is kept in case it is what starts the embedded R session -- confirm
  RSRuby.instance
  VecD.new(target_vals).p_value_normal(decoy_mean, decoy_stdev, true)
end
|
158
|
-
|
159
|
-
# Returns p-values for the xcorr scores of target_hits, using the xcorr
# scores of decoy_hits as the null distribution.  The decoy scores are first
# extended left of zero (see PiZero.extend_distribution_left_of_zero) before
# being handed to p_values.  Each hit must respond to #xcorr.
def p_values_for_sequest(target_hits, decoy_hits)
  decoy_xcorrs = decoy_hits.map { |hit| hit.xcorr }
  extended_decoys = PiZero.extend_distribution_left_of_zero(decoy_xcorrs)
  target_xcorrs = target_hits.map { |hit| hit.xcorr }
  p_values(target_xcorrs, extended_decoys)
end
|
167
|
-
|
168
|
-
#### NEED TO VERIFY if this is PIT or PI_ZERO!
|
169
|
-
=begin
|
170
|
-
# takes a list of booleans with true being a target hit and false being a
|
171
|
-
# decoy hit and returns the pi_zero using the smooth method
|
172
|
-
# Should be ordered from best to worst (i.e., one expects more true values
|
173
|
-
# at the beginning of the list)
|
174
|
-
def pi_zero_from_booleans(booleans)
|
175
|
-
targets = 0
|
176
|
-
decoys = 0
|
177
|
-
xs = []
|
178
|
-
ys = []
|
179
|
-
booleans.reverse.each_with_index do |v,index|
|
180
|
-
if v
|
181
|
-
targets += 1
|
182
|
-
else
|
183
|
-
decoys += 1
|
184
|
-
end
|
185
|
-
if decoys > 0
|
186
|
-
xs << index
|
187
|
-
ys << targets.to_f / decoys
|
188
|
-
end
|
189
|
-
end
|
190
|
-
ys.reverse!
|
191
|
-
plateau_height(xs, ys)
|
192
|
-
end
|
193
|
-
=end
|
194
|
-
|
195
|
-
# Returns the fraction of incorrect target hits (frit); this is the percent
# incorrect targets [PIT] expressed as a fraction rather than a percent.
#
# Takes two parallel arrays: the total number of hits at each point (this
# will typically be the total # of target hits) and the precision at that
# point (in [0,1], typically determined by counting decoy hits).  Expects the
# number of total hits to be monotonically increasing and the precision to
# roughly start high and decrease as more hits (of lesser quality) are added.
#
# For each pair of neighbouring points the instantaneous estimate is the
# increase in estimated incorrect hits, hits * (1 - precision), divided by
# the increase in hits.  The plateau height of those estimates (plotted
# against total_num_hits_ar[1..-1]) is returned.
def frit_from_precision(total_num_hits_ar, precision_ar)
  # the arrays are paired up from their right-hand ends
  worst_first = total_num_hits_ar.reverse.zip(precision_ar.reverse)

  estimates = (0...(worst_first.size - 1)).map do |i|
    more_hits, more_prec = worst_first[i]
    fewer_hits, fewer_prec = worst_first[i + 1]
    ((more_hits * (1.0 - more_prec)) - (fewer_hits * (1.0 - fewer_prec))) / (more_hits - fewer_hits)
  end

  plateau_height(total_num_hits_ar[1..-1], estimates.reverse)
end
|
213
|
-
|
214
|
-
# Takes an array of doublets ([[int, int], [int, int]...]) where the first
# value is the number of target hits and the second is the number of decoy
# hits.  Expects that best hits are at the beginning of the list.  Assumes
# that each group is a subset of the following group (shown as actual hits
# rather than number of hits):
#
#     [[target, target, target, decoy],
#      [target, target, target, decoy, target, decoy, target],
#      [target, target, target, decoy, target, decoy, target, decoy,
#       target, target]]
#
# This assumption may be relaxed somewhat and should still give good
# results.
#
# For each pair of neighbouring groups the ratio (newly added targets /
# newly added decoys) is recorded, skipping pairs that add no decoys; the
# plateau height of those ratios (plotted against 0, 1, 2, ...) is returned.
def frit_from_groups(array_of_doublets)
  frits = []
  array_of_doublets.reverse.each_cons(2) do |bigger, smaller|
    # a negative difference means the subset assumption was violated for
    # this pair; clamp it to "nothing added"
    num_targets = [bigger[0] - smaller[0], 0].max
    num_decoy = [bigger[1] - smaller[1], 0].max
    frits << (num_targets.to_f / num_decoy) if num_decoy > 0
  end
  frits.reverse!
  plateau_height((0...(frits.size)).to_a, frits)
end
|
242
|
-
|
243
|
-
end
|
244
|
-
end
|
data/lib/qvalue.rb
DELETED
@@ -1,161 +0,0 @@
|
|
1
|
-
|
2
|
-
begin
|
3
|
-
require 'rsruby'
|
4
|
-
rescue LoadError
|
5
|
-
puts "You must have the rsruby gem installed to use the qvalue module"
|
6
|
-
puts $!
|
7
|
-
raise LoadError
|
8
|
-
end
|
9
|
-
require 'vec'
|
10
|
-
|
11
|
-
# Adapted from qvalue.R by Alan Dabney and John Storey which was LGPL licensed
|
12
|
-
|
13
|
-
class VecD
|
14
|
-
# default lambda grid for pi_zero estimation: 0.0 up to 0.9 in steps of 0.05
Default_lambdas = 0.0.step(0.9, 0.05).to_a

# value passed as :df to R's smooth_spline when smoothing the pi_zero curve
Default_smooth_df = 3
|
18
|
-
|
19
|
-
# Returns the pi_zero estimate at a single threshold: the fraction of all
# p-values (the values in self) that are >= lambd, divided by (1 - lambd).
# The result is guaranteed to be <= 1.
def pi_zero_at_lambda(lambd)
  at_or_above = self.select { |pval| pval >= lambd }
  estimate = (at_or_above.size.to_f / self.size) / (1 - lambd)
  [estimate, 1].min
end
|
25
|
-
|
26
|
-
# Returns a parallel array (VecI) holding, for each element, how many
# elements of self are <= that element.
# roughly: VecD[1,8,10,8,9,10].num_le => VecI[1, 3, 6, 3, 4, 6]
#
# Works for any size, including a single element or a vector whose values
# are all equal (every entry is then self.size).
def num_le
  sorted = sort

  # le_counts[i] = number of elements <= sorted[i].  Walking from the right,
  # the count for a run of equal values is the position of the run's last
  # member, so it only changes when a new run starts.
  le_counts = Array.new(sorted.size)
  run_count = sorted.size
  (sorted.size - 1).downto(0) do |i|
    run_count = i + 1 unless i + 1 < sorted.size && sorted[i + 1] == sorted[i]
    le_counts[i] = run_count
  end

  # original positions of each distinct value
  positions = Hash.new { |h, k| h[k] = [] }
  each_with_index { |v, i| positions[v] << i }

  ret = VecI.new(size)
  sorted.each_with_index do |v, i|
    positions[v].each { |orig_i| ret[orig_i] = le_counts[i] }
  end
  ret
end
|
57
|
-
|
58
|
-
Default_pi_zero_args = {:lambda_vals => Default_lambdas, :method => :smooth, :log_transform => false }
|
59
|
-
|
60
|
-
# Returns the Pi_0 for given p-values (the values in self).
#
# lambda_vals:: Float or Array of floats of size >= 4, value(s) within [0,1).
#               If a single value is given then the pi_zero is calculated at
#               that point, superseding the method and log_transform
#               arguments.
# method:: :smooth or :bootstrap
# log_transform:: true or false (only consulted by the :smooth method)
#
# Raises ArgumentError if any p-value lies outside [0,1], if lambda_vals has
# 2 or 3 values, if any lambda lies outside [0,1), or if method is unknown.
#
# This method does no plotting and never sleeps; it only computes the
# estimate.
def pi_zero(lambda_vals=Default_pi_zero_args[:lambda_vals], method=Default_pi_zero_args[:method], log_transform=Default_pi_zero_args[:log_transform])
  if self.min < 0 || self.max > 1
    raise ArgumentError, "p-values must be within [0,1]"
  end

  lambda_vals = [lambda_vals] if lambda_vals.is_a? Numeric
  if lambda_vals.size != 1 && lambda_vals.size < 4
    raise ArgumentError, "lambda_vals must have 1 or 4 or more values"
  end
  if lambda_vals.any? {|v| v < 0 || v >= 1}
    raise ArgumentError, "lambda_vals vals must be within [0,1)"
  end

  pi_zeros = lambda_vals.map {|val| pi_zero_at_lambda(val) }

  # a single lambda gives a single estimate; no smoothing or bootstrap needed
  return pi_zeros.first if lambda_vals.size == 1

  case method
  when :smooth
    r = RSRuby.instance
    # fit a smoothing spline over (lambda, pi_0) and read the fitted value
    # at the largest lambda
    calc_pi_zero = lambda do |_pi_zeros|
      hash = r.smooth_spline(lambda_vals, _pi_zeros, :df => Default_smooth_df)
      hash['y'][VecD.new(lambda_vals).max_indices.max]
    end
    if log_transform
      pi_zeros.log_space {|log_vals| calc_pi_zero.call(log_vals) }
    else
      calc_pi_zero.call(pi_zeros)
    end
  when :bootstrap
    min_pi0 = pi_zeros.min
    lsz = lambda_vals.size
    mse = VecD.new(lsz, 0)
    pi0_boot = VecD.new(lsz, 0)
    100.times do   # for(i in 1:100) {
      # NOTE(review): qvalue.R draws a sample WITH replacement at this point;
      # whether VecD#shuffle does so cannot be seen from here -- confirm
      p_boot = self.shuffle
      (0...lsz).each do |i|
        pi0_boot[i] = ( p_boot.select{|v| v > lambda_vals[i] }.size.to_f/p_boot.size ) / (1-lambda_vals[i])
      end
      mse = mse + ( (pi0_boot-min_pi0)**2 )
    end
    #  pi0 <- min(pi0[mse==min(mse)])
    best = pi_zeros.values_at(*(mse.min_indices)).min
    [best,1].min
  else
    raise ArgumentError, ":pi_zero_method must be :smooth or :bootstrap!"
  end
end
|
127
|
-
|
128
|
-
# Returns a VecD filled with parallel q-values; assumes that self is filled
# with p-values.  A q-value can be thought of as the global positive false
# discovery rate at a particular p-value.
#
# robust:: true or false; an indicator of whether it is desired to make the
#          estimate more robust for small p-values and a direct finite
#          sample estimate of pFDR
# pi_zero_args:: hash of options for the pi_zero method, named as symbols
#                (:lambda_vals, :method, :log_transform); anything omitted
#                falls back to Default_pi_zero_args
#
# Raises RuntimeError if the pi_zero estimate is <= 0.
def qvalues(robust=false, pi_zero_args={})
  sz = self.size
  pi0_args = Default_pi_zero_args.merge(pi_zero_args)
  # estimate pi_0 once, with the caller's options, and reuse it below
  pi0 = pi_zero(*(pi0_args.values_at(:lambda_vals, :method, :log_transform)))
  raise RuntimeError, "pi0 <= 0 ... check your p-values!!" if pi0 <= 0

  num_le_ar = num_le
  qvals =
    if robust
      den = self.map {|val| 1 - ((1 - val)**(sz)) }
      self * (pi0 * sz) / ( num_le_ar * den)
    else
      self * (pi0 * sz) / num_le_ar
    end

  # presumably the indices that sort self ascending (like R's order) -- confirm
  u_ar = self.order

  # Enforce monotonicity: walk from the largest p-value down to the smallest
  # so each q-value is capped by the (already final) q-value of the next
  # larger p-value, as in qvalue.R:  for(i in (m-1):1)
  qvals[u_ar[sz-1]] = [qvals[u_ar[sz-1]],1].min
  (sz-2).downto(0) do |i|
    qvals[u_ar[i]] = [qvals[u_ar[i]],qvals[u_ar[i+1]],1].min
  end
  qvals
end
|
159
|
-
end
|
160
|
-
|
161
|
-
|
data/lib/roc.rb
DELETED
@@ -1,187 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
# Class for all types of classification analysis:
|
6
|
-
# receiver-operator-characteristics, precision-recall, etc.. Some definitions
|
7
|
-
# from (Davis & Goadrich. Proceedings of the 23rd
|
8
|
-
# International Conference on Machine Learning, Pittsburgh, PA, 2006):
|
9
|
-
# Recall = TP/(TP+FN) [aka, Sensitivity]
|
10
|
-
# Precision = TP/(TP+FP) [aka, Positive Predictive Value]
|
11
|
-
# True Positive Rate = TP/(TP+FN)
|
12
|
-
# False Positive Rate = FP/(FP+TN)
|
13
|
-
#
|
14
|
-
# Keys to some abbreviations used in this class:
|
15
|
-
# pred = number predicted to be correct
|
16
|
-
# tps = number of true positives
|
17
|
-
# ppv = positive predictive value
|
18
|
-
# om_ppv = one minus positive predictive value = FP/(TP+FP)
|
19
|
-
#
|
20
|
-
# NOTE: this class assumes that lower scores are better. Negate your scores
|
21
|
-
# if this is not the case.
|
22
|
-
#
|
23
|
-
# For estimation of false positive rates using a decoy database strategy, see
|
24
|
-
# the DecoyROC class.
|
25
|
-
class ROC
|
26
|
-
|
27
|
-
|
28
|
-
# Returns the area under the curve, found by summing trapezoids.
# x and y are parallel sequences giving the coordinates to use;
# x should be monotonic increasing.
def area_under_curve(x,y)
  (0...(x.size-1)).inject(0.0) do |area, i|
    # order the two heights so the trapezoid is "lower rectangle plus half
    # of the difference"
    lower, upper = (y[i+1] >= y[i]) ? [y[i], y[i+1]] : [y[i+1], y[i]]
    width = (x[i+1]-x[i]).to_f
    area + width * ( lower.to_f + (upper-lower).to_f/2 )
  end
end
|
44
|
-
|
45
|
-
# Takes two lists of values (true positives, false positives) and returns
# one sorted list of doublets [[val, boolean],...], where the boolean is
# true for values that came from tps.  On equal values the true doublet
# sorts before the false one.
def separate_to_doublets(tps, fps)
  # tag with 0/1 so that a plain array sort orders by value, then by tag
  tagged = tps.map {|score| [score, 0] } + fps.map {|score| [score, 1] }
  tagged.sort.map {|score, tag| [score, tag == 0] }
end
|
53
|
-
|
54
|
-
# Given an array of doublets where each doublet is a value and a boolean,
# divides it into two sorted arrays (tps, fps) of the values: tps holds the
# values whose boolean is truthy, fps the rest.  The output can then be fed
# into many of the other routines.
def doublets_to_separate(list)
  list.partition {|dbl| dbl[1] }.map do |group|
    group.map {|dbl| dbl[0] }.sort
  end
end
|
71
|
-
|
72
|
-
# Base function for tps calculations.
# tp and fp are score lists (lower is better), each expected in ascending
# order.  Returns [num_tps_ar, ppv_ar]: for each distinct true-positive
# score, the number of true positives at or below it and the precision
# tps / (tps + fps), where fps counts the false-positive scores <= it.
def tps_and_ppv(tp, fp)
  counts = []
  ppvs = []
  fp_seen = 0

  tp.each_with_index do |score, idx|
    # consume every false positive scoring at or below this true positive
    fp_seen += 1 while fp_seen < fp.size && score >= fp[fp_seen]

    # only report once per run of tied true-positive scores (at its end)
    next if score == tp[idx + 1]

    tp_seen = idx + 1
    counts << tp_seen
    ppvs << tp_seen.to_f / (tp_seen + fp_seen)
  end

  return counts, ppvs
end
|
97
|
-
|
98
|
-
# Takes previously sorted doublets [value, boolean] and returns
# [num_hits_ar, ppv_ar]: at the end of each run of equal values, the number
# of hits seen so far and the fraction of those whose boolean was truthy.
def numhits_and_ppv(doublets)
  hit_counts = []
  precisions = []
  true_hits = 0

  doublets.each_with_index do |doublet, i|
    true_hits += 1 if doublet[1]

    at_run_end = (i + 1 == doublets.size) || (doublet[0] != doublets[i + 1][0])
    next unless at_run_end

    # every doublet seen so far is a hit, true or false
    seen = i + 1
    hit_counts << seen
    precisions << true_hits.to_f / seen
  end

  [hit_counts, precisions]
end
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
|
120
|
-
# For calculating precision given lists of hits and decoy hits. The hits are
|
121
|
-
# assumed to have false positives within them that can be estimated from the
|
122
|
-
# number of decoy hits at the same rate
|
123
|
-
# NOTE: this class assumes that lower scores are better. Negate your scores
|
124
|
-
# if this is not the case.
|
125
|
-
class DecoyROC < ROC
|
126
|
-
|
127
|
-
# Returns [num_hits_ar, num_tps_ar, precision_ar], one entry per distinct
# hit score.  hits and decoy_hits are score lists (lower is better), each
# expected in ascending order.  The number of decoy hits scoring at or below
# a hit is taken as the number of false positives among the hits so far.
# The method returns precisely what is calculated (meaning some answers may
# seem bizarre if you have better decoy hits than real).
def pred_and_tps_and_ppv(hits, decoy_hits)
  totals = []
  true_counts = []
  precisions = []
  decoys_seen = 0

  hits.each_with_index do |score, idx|
    decoys_seen += 1 while decoys_seen < decoy_hits.size && score >= decoy_hits[decoys_seen]

    # report once per run of tied hit scores (at its end)
    next if score == hits[idx + 1]

    total = idx + 1
    # estimated true positives: hits so far minus decoys so far
    estimated_tps = total - decoys_seen

    totals << total
    true_counts << estimated_tps
    precisions << ( estimated_tps.to_f/total )
  end

  [totals, true_counts, precisions]
end
|
156
|
-
|
157
|
-
# Returns [num_hits_ar, precision_ar] as a function of the number of hits.
# Decoy hits are seen merely as indicators of the number of false hits in
# the dataset.  This is the same algorithm as pred_and_tps_and_ppv; it just
# leaves out the unneeded list of true-positive counts.
def pred_and_ppv(hits, decoy_hits)
  totals = []
  precisions = []
  decoy_pos = 0

  hits.each_index do |pos|
    current = hits[pos]
    while decoy_pos < decoy_hits.size && current >= decoy_hits[decoy_pos]
      decoy_pos += 1
    end

    # tied hit scores are reported once, at the last member of the tie
    unless current == hits[pos + 1]
      total = pos + 1
      totals << total
      precisions << ( (total - decoy_pos).to_f/total )
    end
  end

  [totals, precisions]
end
|
186
|
-
|
187
|
-
end
|