mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/pi_zero.rb
DELETED
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
require 'rsruby'
|
|
2
|
-
require 'vec'
|
|
3
|
-
require 'vec/r'
|
|
4
|
-
require 'enumerator'
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
# Estimation of pi_0 and related quantities (p-values from a decoy
# distribution, fraction of incorrect target hits).  Every method is defined
# on the module itself and is called as PiZero.method_name(...).
module PiZero
  class << self
    # Computes an instantaneous pi_0 estimate at each lambda threshold.
    #
    # sorted_pvals:: array of p-values (floats between 0 and 1 inclusive);
    #                only counted here, so ordering does not affect the result
    # args::         optional overrides of the lambda grid.  Evenly
    #                incremented values are used by default:
    #                :start=>0.0, :stop=>0.9, :step=>0.05
    #
    # returns [thresholds_ar, instantaneous pi_0 calculations_ar]
    def pi_zero_hats(sorted_pvals, args={})
      defaults = {:start => 0.0, :stop=>0.9, :step=>0.05 }
      (start, stop, step) = defaults.merge( args ).values_at(:start, :stop, :step)

      # From Storey et al. PNAS 2003:
      lambdas = []   # lambda
      pi_zeros = []  # pi_0
      total = sorted_pvals.size # m

      # totally inefficient implementation (with correct logic):
      # TODO: implement this efficiently
      start.step(stop, step) do |lam|
        lambdas << lam
        num_greater = sorted_pvals.count {|pval| pval > lam }
        pi_zeros.push( num_greater.to_f / ( total * (1.0 - lam) ) )
      end
      [lambdas, pi_zeros]
    end

=begin
    def plateau_height_with_gsl(x, y)
      require 'gsl'
      x_deltas = (0...(x.size-1)).to_a.map do |i|
        x[i+1] - x[i]
      end
      y_deltas = (0...(y.size-1)).to_a.map do |i|
        y[i+1] - y[i]
      end
      new_xs = x.dup
      new_ys = y.dup
      x_deltas.reverse.each do |delt|
        new_xs.push( new_xs.last + delt )
      end

      y_cnt = y.size
      y_deltas.reverse.each do |delt|
        y_cnt -= 1
        new_ys.push( y[y_cnt] - delt )
      end

      x_vec = GSL::Vector.alloc(new_xs)
      y_vec = GSL::Vector.alloc(new_ys)
      coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
      coef.eval(x.last)
      #x2 = GSL::Vector::linspace(0,2.4,20)
      #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
    end
=end

    # expecting x and y to make a scatter plot descending to a plateau on the
    # right side (which is assumed to be of increasing noise as it goes to the
    # right)
    # returns the height of the plateau at the right edge, taken as the last
    # y value of a smoothing spline (df = 3) fitted through R
    #
    #    *
    #     *
    #      *
    #       **
    #         **  ***    *  *
    #           *****  **** ***
    #
    # plot:: when true, also draws the points and the fitted spline with R and
    #        pauses 4 seconds so the figure can be looked at.  Off by default
    #        so that callers are not blocked by the plot.
    def plateau_height(x, y, plot=false)
      r = RSRuby.instance
      answ = r.smooth_spline(x,y, :df => 3)
      if plot
        r.plot(x,y, :ylab=>"pi_zeros or frit")
        r.lines(answ['x'], answ['y'])
        r.points(answ['x'], answ['y'])
        sleep(4)
      end

      answ['y'].last
    end

    # Unfinished: fits a line to log(y) with GSL and evaluates the resulting
    # exponential on a 20 point grid over [0, 1.2], but always raises
    # NotImplementedError because the answer is never extracted.
    def plateau_exponential(x,y)
      require 'gsl'
      xvec = GSL::Vector.alloc(x)
      yvec = GSL::Vector.alloc(y)
      a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec))
      x2 = GSL::Vector.linspace(0, 1.2, 20)
      exp_a = GSL::Sf::exp(a2)
      out_y = exp_a*GSL::Sf::exp(b2*x2)
      raise NotImplementedError, "need to grab out the answer"
      #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4")

    end

    # returns a conservative (but close) estimate of pi_0 given p-values
    # following Storey et al. 2003, PNAS.
    # (the p-values are sorted, run through pi_zero_hats with its default
    # lambda grid and the result is handed to plateau_height)
    def pi_zero(pvals)
      sorted_pvals = pvals.sort
      plateau_height( *(pi_zero_hats(sorted_pvals)) )
    end

    # returns an array where the left values have been filled in using the
    # similar values on the right side of the distribution.  These values are
    # pushed onto the end of the array in no guaranteed order.
    # extends a distribution on the left side where it is missing since
    # xcorr values <= 0.0 are not reported
    #       **
    #      *  *
    #     *    *
    #           *
    #            *
    #             *
    # Grabs the right tail from above and inverts it to the left side (less
    # than zero), creating a more full distribution.  raises an ArgumentError
    # if values_chopped_at_zero.size == 0
    # this method would be more robust with some smoothing.
    # Method currently only meant for large amounts of data.
    # input data does not need to be sorted
    def extend_distribution_left_of_zero(values_chopped_at_zero)
      sz = values_chopped_at_zero.size
      raise ArgumentError, "array.size must be > 0" if sz == 0
      num_bins = (Math.log10(sz) * 100).round
      vec = VecD.new(values_chopped_at_zero)
      (bins, freqs) = vec.histogram(num_bins)

      # index of the first bin holding a positive count
      start_i = 0
      freqs.each_with_index do |f,i|
        if f.is_a?(Numeric) && f > 0
          start_i = i
          break
        end
      end
      match_it = freqs[start_i]

      # walk in from the right edge; stop at the bin whose left neighbour has
      # a frequency at least as large as match_it
      index_to_chop_at = -1
      rev_freqs = freqs.reverse
      # stop one short of the end: rev_freqs[rev_i+1] does not exist for the
      # final element (reading it used to raise a TypeError).  When nothing
      # matches, index_to_chop_at stays -1, i.e. the last bin.
      (0...(rev_freqs.size - 1)).each do |rev_i|
        if match_it - rev_freqs[rev_i+1] <= 0
          index_to_chop_at = freqs.size - 1 - rev_i
          break
        end
      end
      cut_point = bins[index_to_chop_at]
      # mirror everything at or above the cut point to the other side of zero
      values_chopped_at_zero + values_chopped_at_zero.select {|v| v >= cut_point }.map {|v| cut_point - v }
    end

    # assumes the decoy_vals follows a normal distribution
    # returns the right-tailed p-value of each target value under a normal
    # distribution with the sample mean and stdev of the decoy values
    def p_values(target_vals, decoy_vals)
      (mean, stdev) = VecD.new(decoy_vals).sample_stats
      # NOTE(review): r is never used below; the call is kept in case
      # p_value_normal relies on the R instance already existing -- confirm
      r = RSRuby.instance
      vec = VecD.new(target_vals)
      right_tailed = true
      vec.p_value_normal(mean, stdev, right_tailed)
    end

    # p-values for target hits from the xcorr values of the decoy hits (each
    # hit must respond to xcorr).  The decoy xcorr distribution is first
    # extended left of zero with extend_distribution_left_of_zero.
    def p_values_for_sequest(target_hits, decoy_hits)
      dh_vals = decoy_hits.map {|v| v.xcorr }
      new_decoy_vals = PiZero.extend_distribution_left_of_zero(dh_vals)
      #File.open("target.yml", 'w') {|out| out.puts new_decoy_vals.join(" ") }
      #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") }
      #abort 'checking'
      p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals )
    end

    #### NEED TO VERIFY if this is PIT or PI_ZERO!
=begin
    # takes a list of booleans with true being a target hit and false being a
    # decoy hit and returns the pi_zero using the smooth method
    # Should be ordered from best to worst (i.e., one expects more true values
    # at the beginning of the list)
    def pi_zero_from_booleans(booleans)
      targets = 0
      decoys = 0
      xs = []
      ys = []
      booleans.reverse.each_with_index do |v,index|
        if v
          targets += 1
        else
          decoys += 1
        end
        if decoys > 0
          xs << index
          ys << targets.to_f / decoys
        end
      end
      ys.reverse!
      plateau_height(xs, ys)
    end
=end

    # returns fraction of incorrect target hits (frit) (this is the percent
    # incorrect targets [PIT] expressed as a fraction rather than percent)
    # takes two parallel arrays consisting of the total number of hits (this
    # will typically be the total # target hits) at that point and the
    # precision (ranging from: [0,1]) (typically determined by counting the
    # number of decoy hits).  Expects the number of total hits to be
    # monotonically increasing and the precision to roughly start high and
    # decrease as more hits (of lesser quality) are added.
    def frit_from_precision(total_num_hits_ar, precision_ar)
      instant_pi_zeros = []
      # walk from the largest hit count back to the smallest, taking the
      # change in (hits * (1 - precision)) per additional hit
      total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0|
        (x1, y1) = dp1
        (x0, y0) = dp0
        instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0)
      end
      instant_pi_zeros.reverse!
      plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros)
    end

    # Takes an array of doublets ([[int, int], [int, int]...]) where the first
    # value is the number of target hits and the second is the number of decoy
    # hits.  Expects that best hits are at the beginning of the list.  Assumes
    # that each sum is a subset of the following group (shown as actual hits
    # rather than number of hits):
    #
    #     [[target, target, target, decoy], [target, target, target, decoy,
    #     target, decoy, target], [target, target, target, decoy, target,
    #     decoy, target, decoy, target, target]]
    #
    # This assumption may be relaxed somewhat and should still give good
    # results.
    def frit_from_groups(array_of_doublets)
      frits = []
      array_of_doublets.reverse.each_cons(2) do |two_doublets|
        bigger, smaller = two_doublets
        # hits added between the smaller group and the bigger one
        num_targets = bigger[0] - smaller[0]
        num_decoy = bigger[1] - smaller[1]
        num_targets = 0 if num_targets < 0
        num_decoy = 0 if num_decoy < 0
        # groups that add no decoy hits are skipped (no ratio can be formed)
        if num_decoy > 0
          frits << (num_targets.to_f / num_decoy)
        end
      end
      frits.reverse!
      xs = (0...(frits.size)).to_a
      plateau_height(xs, frits)
    end

  end
end
|
data/lib/qvalue.rb
DELETED
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
begin
|
|
3
|
-
require 'rsruby'
|
|
4
|
-
rescue LoadError
|
|
5
|
-
puts "You must have the rsruby gem installed to use the qvalue module"
|
|
6
|
-
puts $!
|
|
7
|
-
raise LoadError
|
|
8
|
-
end
|
|
9
|
-
require 'vec'
|
|
10
|
-
|
|
11
|
-
# Adapted from qvalue.R by Alan Dabney and John Storey which was LGPL licensed
|
|
12
|
-
|
|
13
|
-
# q-value support for a vector of p-values (the values held in self).
class VecD
  # default lambda grid: 0.0, 0.05, ... up to 0.9
  Default_lambdas = []
  0.0.step(0.9,0.05) {|v| Default_lambdas << v }

  # degrees of freedom handed to R's smooth_spline
  Default_smooth_df = 3

  # returns the pi_zero estimate by taking the fraction of all p-values at or
  # above lambd and dividing by (1-lambd); guaranteed to be <= 1
  def pi_zero_at_lambda(lambd)
    frac_at_or_above = self.select {|pval| pval >= lambd }.size.to_f / self.size
    [frac_at_or_above / (1 - lambd), 1].min
  end

  # returns a parallel array (VecI) of how many are <= in the array
  # roughly: VecD[1,8,10,8,9,10].num_le => VecI[1, 3, 6, 3, 4, 6]
  def num_le
    # In sorted order the last position (1-based) at which a value occurs is
    # the number of elements <= that value, however many ties there are.
    count_le = {}
    self.sort.each_with_index {|v,i| count_le[v] = i + 1 }
    ret = VecI.new(self.size)
    self.each_with_index {|v,i| ret[i] = count_le[v] }
    ret
  end

  Default_pi_zero_args = {:lambda_vals => Default_lambdas, :method => :smooth, :log_transform => false }

  # returns the Pi_0 for given p-values (the values in self)
  # lambda_vals = Float or Array of floats of size >= 4.  value(s) within [0,1)
  #    A single value given then the pi_zero is calculated at that point,
  #    superseding the method or log_transform arguments
  # method = :smooth or :bootstrap
  # log_transform = true or false
  #
  # raises ArgumentError if a p-value lies outside [0,1], if lambda_vals has
  # 2 or 3 values, if a lambda lies outside [0,1), or if method is neither
  # :smooth nor :bootstrap
  def pi_zero(lambda_vals=Default_pi_zero_args[:lambda_vals], method=Default_pi_zero_args[:method], log_transform=Default_pi_zero_args[:log_transform])
    if self.min < 0 || self.max > 1
      raise ArgumentError, "p-values must be within [0,1]"
    end

    if lambda_vals.is_a? Numeric
      lambda_vals = [lambda_vals]
    end
    if lambda_vals.size != 1 && lambda_vals.size < 4
      raise ArgumentError, "lambda_vals must have 1 or 4 or more values"
    end
    if lambda_vals.any? {|v| v < 0 || v >= 1}
      raise ArgumentError, "lambda_vals vals must be within [0,1)"
    end

    pi_zeros = lambda_vals.map {|val| self.pi_zero_at_lambda(val) }

    if lambda_vals.size == 1
      pi_zeros.first
    else
      case method
      when :smooth
        r = RSRuby.instance
        # fitted spline value at the position of the largest lambda
        calc_pi_zero = lambda do |_pi_zeros|
          hash = r.smooth_spline(lambda_vals, _pi_zeros, :df => Default_smooth_df)
          hash['y'][VecD.new(lambda_vals).max_indices.max]
        end
        if log_transform
          pi_zeros.log_space {|log_vals| calc_pi_zero.call(log_vals) }
        else
          calc_pi_zero.call(pi_zeros)
        end
      when :bootstrap
        min_pi0 = pi_zeros.min
        lsz = lambda_vals.size
        mse = VecD.new(lsz, 0)
        pi0_boot = VecD.new(lsz, 0)
        sz = self.size
        100.times do    #  for(i in 1:100) {
          # bootstrap replicate: draw sz p-values with replacement (a plain
          # shuffle would give the same estimates on every round)
          p_boot = Array.new(sz) { self[rand(sz)] }
          (0...lsz).each do |i|
            pi0_boot[i] = ( p_boot.select{|v| v > lambda_vals[i] }.size.to_f/p_boot.size ) / (1-lambda_vals[i])
          end
          mse = mse + ( (pi0_boot-min_pi0)**2 )
        end
        #  pi0 <- min(pi0[mse==min(mse)])
        best = pi_zeros.values_at(*(mse.min_indices)).min
        [best,1].min
      else
        raise ArgumentError, ":pi_zero_method must be :smooth or :bootstrap!"
      end
    end
  end

  # Returns a VecD filled with parallel q-values
  # assumes that vec is filled with p values
  # see pi_zero method for arguments, these should be named as symbols in the
  # pi_zero_args hash.
  # robust = true or false    an indicator of whether it is desired to make
  #                           the estimate more robust for small p-values and
  #                           a direct finite sample estimate of pFDR
  # A q-value can be thought of as the global positive false discovery rate
  # at a particular p-value
  #
  # raises RuntimeError if the estimated pi0 is <= 0
  def qvalues(robust=false, pi_zero_args={})
    sz = self.size
    pi0_args = Default_pi_zero_args.merge(pi_zero_args)
    # estimate pi0 once, with the caller's arguments, and reuse it below
    pi0 = self.pi_zero(*(pi0_args.values_at(:lambda_vals, :method, :log_transform)))
    raise RuntimeError, "pi0 <= 0 ... check your p-values!!" if pi0 <= 0
    num_le_ar = self.num_le
    qvals =
      if robust
        den = self.map {|val| 1 - ((1 - val)**(sz)) }
        self * (pi0 * sz) / ( num_le_ar * den)
      else
        self * (pi0 * sz) / num_le_ar
      end

    # NOTE(review): assumes order returns the indices that sort self
    # ascending (as R's order does) -- confirm
    u_ar = self.order

    # Cap at 1 and make the q-values non-decreasing in p: start from the
    # largest p-value and carry the running minimum down to the smallest
    # (qvalue.R: for(i in (m-1):1) ...).
    qvals[u_ar[sz-1]] = [qvals[u_ar[sz-1]],1].min
    (sz-2).downto(0) do |i|
      qvals[u_ar[i]] = [qvals[u_ar[i]],qvals[u_ar[i+1]],1].min
    end
    qvals
  end
end
|
|
160
|
-
|
|
161
|
-
|
data/lib/roc.rb
DELETED
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# Class for all types of classification analysis:
|
|
6
|
-
# receiver-operator-characteristics, precision-recall, etc.. Some definitions
|
|
7
|
-
# from (Davis & Goadrich. Proceedings of the 23rd
|
|
8
|
-
# International Conference on Machine Learning, Pittsburgh, PA, 2006):
|
|
9
|
-
# Recall = TP/(TP+FN) [aka, Sensitivity]
|
|
10
|
-
# Precision = TP/(TP+FP) [aka, Positive Predictive Value]
|
|
11
|
-
# True Positive Rate = TP/(TP+FN)
|
|
12
|
-
# False Positive Rate = FP/(FP+TN)
|
|
13
|
-
#
|
|
14
|
-
# Keys to some abbreviations used in this class:
|
|
15
|
-
# pred = number predicted to be correct
|
|
16
|
-
# tps = number of true positives
|
|
17
|
-
# ppv = positive predictive value
|
|
18
|
-
# om_ppv = one minus positive predictive value = FP/(TP+FP)
|
|
19
|
-
#
|
|
20
|
-
# NOTE: this class assumes that lower scores are better. Negate your scores
|
|
21
|
-
# if this is not the case.
|
|
22
|
-
#
|
|
23
|
-
# For estimation of false positive rates using a decoy database strategy, see
|
|
24
|
-
# the DecoyROC class.
|
|
25
|
-
class ROC

  # returns area under the curve found by trapezoids
  # x and y specify the coordinates to use (parallel arrays)
  # x should be monotonic increasing
  # returns 0.0 when there are fewer than two points
  def area_under_curve(x,y)
    x.zip(y).each_cons(2).inject(0.0) do |area, ((x0, ya), (x1, yb))|
      lo, hi = [ya, yb].minmax
      # rectangle up to the lower y plus the triangle on top of it
      area + (x1 - x0).to_f * ( lo.to_f + (hi - lo).to_f/2 )
    end
  end

  # takes two lists of values and makes doublets [[val, boolean],...]
  # sorted by value; values from tps are tagged true and values from fps
  # false, and on equal values the true doublet comes first
  def separate_to_doublets(tps, fps)
    # tag with 0/1 so a plain sort puts true positives first on ties
    tagged = tps.map {|v| [v, 0] } + fps.map {|v| [v, 1] }
    tagged.sort.map {|val, tag| [val, tag == 0] }
  end

  # given an array of doublets where each doublet is a value and a boolean,
  # divides it into two arrays (tps, fps) of the values, each sorted.
  # The output can then be fed into many of the other routines.
  def doublets_to_separate(list)
    list.partition {|dbl| dbl[1] }.map do |group|
      group.map {|dbl| dbl[0] }.sort
    end
  end

  # Base function for tps calculations
  # tp and fp are the scores of the true and false positives; the merge below
  # relies on both being sorted ascending.
  # returns [num_tps_ar, ppv_ar]: one entry per distinct tp score, counting
  # the false positives whose score is <= that tp score
  def tps_and_ppv(tp, fp)
    fp_i = 0
    x = []
    y = []

    tp.each_with_index do |score, tp_i|
      fp_i += 1 while fp_i < fp.size && score >= fp[fp_i]
      # only report once per run of equal scores (at its last member)
      next if score == tp[tp_i+1]
      # get the correct number of each
      num_tps = tp_i + 1
      num_fps = fp_i

      x << num_tps
      y << num_tps.to_f/(num_tps+num_fps)
    end
    return x, y
  end

  # takes previously sorted doublets [value, boolean]
  # returns [num_hits_ar, ppv_ar] with one entry per distinct value, where
  # ppv is the fraction of true doublets among the hits seen so far
  def numhits_and_ppv(doublets)
    x = []
    y = []
    tps = 0
    fps = 0
    last_i = doublets.size - 1
    doublets.each_with_index do |(val, is_tp), i|
      if is_tp
        tps += 1
      else
        fps += 1
      end

      # wait for the last doublet of a run of equal values
      next unless i == last_i || val != doublets[i+1][0]
      num_hits = tps + fps
      x << num_hits
      y << tps.to_f/num_hits
    end
    [x, y]
  end

end
|
|
119
|
-
|
|
120
|
-
# For calculating precision given lists of hits and decoy hits. The hits are
|
|
121
|
-
# assumed to have false positives within them that can be estimated from the
|
|
122
|
-
# number of decoy hits at the same rate
|
|
123
|
-
# NOTE: this class assumes that lower scores are better. Negate your scores
|
|
124
|
-
# if this is not the case.
|
|
125
|
-
class DecoyROC < ROC

  # returns the [num_hits, num_tps, precision] as a function of true
  # positives.  Method will return precisely what is calculated (meaning some
  # answers may seem bizarre if you have better decoy hits than real).
  #
  # hits and decoy_hits are arrays of scores; the merge below relies on both
  # being sorted ascending.  One entry is produced per distinct hit score.
  def pred_and_tps_and_ppv(hits, decoy_hits)
    decoy_i = 0

    num_hits_ar = []
    num_tps_ar = []
    ppv_ar = []

    hits.each_with_index do |score, hits_i|
      # count the decoy hits scoring at or below this hit
      decoy_i += 1 while decoy_i < decoy_hits.size && score >= decoy_hits[decoy_i]
      # only report once per run of equal scores (at its last member)
      next if score == hits[hits_i+1]

      ## determine the number of false positives
      tot_num_hits = hits_i+1
      # every decoy hit seen so far stands for one false hit among the hits
      num_tps = tot_num_hits - decoy_i

      num_hits_ar << tot_num_hits
      num_tps_ar << num_tps
      ppv_ar << ( num_tps.to_f/tot_num_hits )
    end
    [num_hits_ar, num_tps_ar, ppv_ar]
  end

  # returns [num_hits, precision] as a function of num hits.  decoy hits are
  # seen merely as indicators of the number of false hits in the dataset.
  # This is the same algorithm as pred_and_tps_and_ppv (which it calls); the
  # true positive counts are simply left out of the result.
  def pred_and_ppv(hits, decoy_hits)
    num_hits_ar, _num_tps_ar, ppv_ar = pred_and_tps_and_ppv(hits, decoy_hits)
    [num_hits_ar, ppv_ar]
  end

end
|