mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/pi_zero.rb DELETED
@@ -1,244 +0,0 @@
1
- require 'rsruby'
2
- require 'vec'
3
- require 'vec/r'
4
- require 'enumerator'
5
-
6
-
7
- module PiZero
8
- class << self
9
- # takes a sorted array of p-values (floats between 0 and 1 inclusive)
10
- # returns [thresholds_ar, instantaneous pi_0 calculations_ar]
11
- # evenly incremented values will be used by default:
12
- # :start=>0.0, :stop=>0.9, :step=>0.01
13
- def pi_zero_hats(sorted_pvals, args={})
14
- defaults = {:start => 0.0, :stop=>0.9, :step=>0.05 }
15
- margs = defaults.merge( args )
16
- (start, stop, step) = margs.values_at(:start, :stop, :step)
17
-
18
- # From Storey et al. PNAS 2003:
19
- lambdas = [] # lambda
20
- pi_zeros = [] # pi_0
21
- total = sorted_pvals.size # m
22
-
23
- # totally inefficient implementation (with correct logic):
24
- # TODO: implement this efficiently
25
- start.step(stop, step) do |lam|
26
- lambdas << lam
27
- (greater, less) = sorted_pvals.partition {|pval| pval > lam }
28
- pi_zeros.push( greater.size.to_f / ( total * (1.0 - lam) ) )
29
- end
30
- [lambdas, pi_zeros]
31
- end
32
-
33
- =begin
34
- def plateau_height_with_gsl(x, y)
35
- require 'gsl'
36
- x_deltas = (0...(x.size-1)).to_a.map do |i|
37
- x[i+1] - x[i]
38
- end
39
- y_deltas = (0...(y.size-1)).to_a.map do |i|
40
- y[i+1] - y[i]
41
- end
42
- new_xs = x.dup
43
- new_ys = y.dup
44
- x_deltas.reverse.each do |delt|
45
- new_xs.push( new_xs.last + delt )
46
- end
47
-
48
- y_cnt = y.size
49
- y_deltas.reverse.each do |delt|
50
- y_cnt -= 1
51
- new_ys.push( y[y_cnt] - delt )
52
- end
53
-
54
- x_vec = GSL::Vector.alloc(new_xs)
55
- y_vec = GSL::Vector.alloc(new_ys)
56
- coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
57
- coef.eval(x.last)
58
- #x2 = GSL::Vector::linspace(0,2.4,20)
59
- #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
60
- end
61
- =end
62
-
63
- # expecting x and y to make a scatter plot descending to a plateau on the
64
- # right side (which is assumed to be of increasing noise as it goes to the
65
- # right)
66
- # returns the height of the plateau at the right edge
67
- #
68
- # *
69
- # *
70
- # *
71
- # **
72
- # ** *** * *
73
- # ***** **** ***
74
- def plateau_height(x, y)
75
- r = RSRuby.instance
76
- answ = r.smooth_spline(x,y, :df => 3)
77
- ## to plot it!
78
- r.plot(x,y, :ylab=>"pi_zeros or frit")
79
- r.lines(answ['x'], answ['y'])
80
- r.points(answ['x'], answ['y'])
81
- sleep(4)
82
-
83
- answ['y'].last
84
- end
85
-
86
- def plateau_exponential(x,y)
87
- require 'gsl'
88
- xvec = GSL::Vector.alloc(x)
89
- yvec = GSL::Vector.alloc(y)
90
- a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec))
91
- x2 = GSL::Vector.linspace(0, 1.2, 20)
92
- exp_a = GSL::Sf::exp(a2)
93
- out_y = exp_a*GSL::Sf::exp(b2*x2)
94
- raise NotImplementedError, "need to grab out the answer"
95
- #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4")
96
-
97
- end
98
-
99
- # returns a conservative (but close) estimate of pi_0 given p-values
100
- # following Storey et al. 2003, PNAS.
101
- def pi_zero(pvals)
102
- sorted_pvals = pvals.sort
103
- plateau_height( *(pi_zero_hats(sorted_pvals)) )
104
- end
105
-
106
- # returns an array where the left values have been filled in using the
107
- # similar values on the right side of the distribution. These values are
108
- # pushed onto the end of the array in no guaranteed order.
109
- # extends a distribution on the left side where it is missing since
110
- # xcorr values <= 0.0 are not reported
111
- # **
112
- # * *
113
- # * *
114
- # *
115
- # *
116
- # *
117
- # Grabs the right tail from above and inverts it to the left side (less
118
- # than zero), creating a more full distribution. raises an ArgumentError
119
- # if values_chopped_at_zero.size == 0
120
- # this method would be more robust with some smoothing.
121
- # Method currently only meant for large amounts of data.
122
- # input data does not need to be sorted
123
- def extend_distribution_left_of_zero(values_chopped_at_zero)
124
- sz = values_chopped_at_zero.size
125
- raise ArgumentError, "array.size must be > 0" if sz == 0
126
- num_bins = (Math.log10(sz) * 100).round
127
- vec = VecD.new(values_chopped_at_zero)
128
- (bins, freqs) = vec.histogram(num_bins)
129
- start_i = 0
130
- freqs.each_with_index do |f,i|
131
- if f.is_a?(Numeric) && f > 0
132
- start_i = i
133
- break
134
- end
135
- end
136
- match_it = freqs[start_i]
137
- # get the index of the first frequency value less than the zero frequency
138
- index_to_chop_at = -1
139
- rev_freqs = freqs.reverse
140
- rev_freqs.each_with_index do |freq,rev_i|
141
- if match_it - rev_freqs[rev_i+1] <= 0
142
- index_to_chop_at = freqs.size - 1 - rev_i
143
- break
144
- end
145
- end
146
- cut_point = bins[index_to_chop_at]
147
- values_chopped_at_zero + values_chopped_at_zero.select {|v| v >= cut_point }.map {|v| cut_point - v }
148
- end
149
-
150
- # assumes the decoy_vals follows a normal distribution
151
- def p_values(target_vals, decoy_vals)
152
- (mean, stdev) = VecD.new(decoy_vals).sample_stats
153
- r = RSRuby.instance
154
- vec = VecD.new(target_vals)
155
- right_tailed = true
156
- vec.p_value_normal(mean, stdev, right_tailed)
157
- end
158
-
159
- def p_values_for_sequest(target_hits, decoy_hits)
160
- dh_vals = decoy_hits.map {|v| v.xcorr }
161
- new_decoy_vals = PiZero.extend_distribution_left_of_zero(dh_vals)
162
- #File.open("target.yml", 'w') {|out| out.puts new_decoy_vals.join(" ") }
163
- #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") }
164
- #abort 'checking'
165
- p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals )
166
- end
167
-
168
- #### NEED TO VERIFY if this is PIT or PI_ZERO!
169
- =begin
170
- # takes a list of booleans with true being a target hit and false being a
171
- # decoy hit and returns the pi_zero using the smooth method
172
- # Should be ordered from best to worst (i.e., one expects more true values
173
- # at the beginning of the list)
174
- def pi_zero_from_booleans(booleans)
175
- targets = 0
176
- decoys = 0
177
- xs = []
178
- ys = []
179
- booleans.reverse.each_with_index do |v,index|
180
- if v
181
- targets += 1
182
- else
183
- decoys += 1
184
- end
185
- if decoys > 0
186
- xs << index
187
- ys << targets.to_f / decoys
188
- end
189
- end
190
- ys.reverse!
191
- plateau_height(xs, ys)
192
- end
193
- =end
194
-
195
- # returns fraction of incorrect target hits (frit) (this is the percent
196
- # incorrect targets [PIT] expressed as a fraction rather than percent)
197
- # takes two parallel arrays consisting of the total number of hits (this
198
- # will typically be the total # target hits) at that point and the
199
- # precision (ranging from: [0,1]) (typically determined by counting the
200
- # number of decoy hits). Expects the number of total hits to be
201
- # monotonically increasing and the precision to roughly start high and
202
- # decrease as more hits (of lesser quality) are added.
203
- def frit_from_precision(total_num_hits_ar, precision_ar)
204
- instant_pi_zeros = []
205
- total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0|
206
- (x1, y1) = dp1
207
- (x0, y0) = dp0
208
- instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0)
209
- end
210
- instant_pi_zeros.reverse!
211
- plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros)
212
- end
213
-
214
- # Takes an array of doublets ([[int, int], [int, int]...]) where the first
215
- # value is the number of target hits and the second is the number of decoy
216
- # hits. Expects that best hits are at the beginning of the list. Assumes
217
- # that each sum is a subset of the following group (shown as actual hits
218
- # rather than number of hits):
219
- #
220
- # [[target, target, target, decoy], [target, target, target, decoy,
221
- # target, decoy, target], [target, target, target, decoy, target,
222
- # decoy, target, decoy, target, target]]
223
- #
224
- # This assumption may be relaxed somewhat and should still give good
225
- # results.
226
- def frit_from_groups(array_of_doublets)
227
- frits = []
228
- array_of_doublets.reverse.each_cons(2) do |two_doublets|
229
- bigger, smaller = two_doublets
230
- num_targets = bigger[0] - smaller[0]
231
- num_decoy = bigger[1] - smaller[1]
232
- num_targets = 0 if num_targets < 0
233
- num_decoy = 0 if num_targets < 0
234
- if num_decoy > 0
235
- frits << (num_targets.to_f / num_decoy)
236
- end
237
- end
238
- frits.reverse!
239
- xs = (0...(frits.size)).to_a
240
- plateau_height(xs, frits)
241
- end
242
-
243
- end
244
- end
data/lib/qvalue.rb DELETED
@@ -1,161 +0,0 @@
1
-
2
- begin
3
- require 'rsruby'
4
- rescue LoadError
5
- puts "You must have the rsruby gem installed to use the qvalue module"
6
- puts $!
7
- raise LoadError
8
- end
9
- require 'vec'
10
-
11
- # Adapted from qvalue.R by Alan Dabney and John Storey which was LGPL licensed
12
-
13
- class VecD
14
- Default_lambdas = []
15
- 0.0.step(0.9,0.05) {|v| Default_lambdas << v }
16
-
17
- Default_smooth_df = 3
18
-
19
- # returns the pi_zero estimate by taking the fraction of all p-values above
20
- # lambd and dividing by (1-lambd) and gauranteed to be <= 1
21
- def pi_zero_at_lambda(lambd)
22
- v = (self.select{|v| v >= lambd}.size.to_f/self.size) / (1 - lambd)
23
- [v, 1].min
24
- end
25
-
26
- # returns a parallel array (VecI) of how many are <= in the array
27
- # roughly: VecD[1,8,10,8,9,10].num_le => VecI[1, 3, 6, 3, 4, 6]
28
- def num_le
29
- hash = Hash.new {|h,k| h[k] = [] }
30
- self.each_with_index do |v,i|
31
- hash[v] << i
32
- end
33
- num_le_ar = []
34
- sorted = self.sort
35
- count = 0
36
- sorted.each_with_index do |v,i|
37
- back = 1
38
- count += 1
39
- if v == sorted[i-back]
40
- while (sorted[i-back] == v)
41
- num_le_ar[i-back] = count
42
- back -= 1
43
- end
44
- else
45
- num_le_ar[i] = count
46
- end
47
- end
48
- ret = VecI.new(self.size)
49
- num_le_ar.zip(sorted) do |n,v|
50
- indices = hash[v]
51
- indices.each do |i|
52
- ret[i] = n
53
- end
54
- end
55
- ret
56
- end
57
-
58
- Default_pi_zero_args = {:lambda_vals => Default_lambdas, :method => :smooth, :log_transform => false }
59
-
60
- # returns the Pi_0 for given p-values (the values in self)
61
- # lambda_vals = Float or Array of floats of size >= 4. value(s) within (0,1)
62
- # A single value given then the pi_zero is calculated at that point,
63
- # superceding the method or log_transform arguments
64
- # method = :smooth or :bootstrap
65
- # log_transform = true or false
66
- def pi_zero(lambda_vals=Default_pi_zero_args[:lambda_vals], method=Default_pi_zero_args[:method], log_transform=Default_pi_zero_args[:log_transform])
67
- if self.min < 0 || self.max > 1
68
- raise ArgumentError, "p-values must be within [0,1)"
69
- end
70
-
71
- if lambda_vals.is_a? Numeric
72
- lambda_vals = [lambda_vals]
73
- end
74
- if lambda_vals.size != 1 && lambda_vals.size < 4
75
- raise ArgumentError, "#{tun_arg} must have 1 or 4 or more values"
76
- end
77
- if lambda_vals.any? {|v| v < 0 || v >= 1}
78
- raise ArgumentError, "#{tun_arg} vals must be within [0,1)"
79
- end
80
-
81
- pi_zeros = lambda_vals.map {|val| self.pi_zero_at_lambda(val) }
82
-
83
- r = RSRuby.instance
84
- r.plot(lambda_vals,pi_zeros, :ylab=>"instantaneous pi_zeros")
85
- answ = r.smooth_spline(lambda_vals, pi_zeros, :df => Default_smooth_df)
86
- r.lines(answ['x'], answ['y'])
87
- r.points(answ['x'], answ['y'])
88
- sleep(20)
89
-
90
- answer =
91
- if lambda_vals.size == 1
92
- pi_zeros.first
93
- else
94
- case method
95
- when :smooth
96
- r = RSRuby.instance
97
- calc_pi_zero = lambda do |_pi_zeros|
98
- hash = r.smooth_spline(lambda_vals, _pi_zeros, :df => Default_smooth_df)
99
- hash['y'][VecD.new(lambda_vals).max_indices.max]
100
- end
101
- if log_transform
102
- pi_zeros.log_space {|log_vals| calc_pi_zero.call(log_vals) }
103
- else
104
- calc_pi_zero.call(pi_zeros)
105
- end
106
- when :bootstrap
107
- min_pi0 = pi_zeros.min
108
- lsz = lambda_vals.size
109
- mse = VecD.new(lsz, 0)
110
- pi0_boot = VecD.new(lsz, 0)
111
- sz = self.size
112
- 100.times do # for(i in 1:100) {
113
- p_boot = self.shuffle
114
- (0...lsz).each do |i|
115
- pi0_boot[i] = ( p_boot.select{|v| v > lambda_vals[i] }.size.to_f/p_boot.size ) / (1-lambda_vals[i])
116
- end
117
- mse = mse + ( (pi0_boot-min_pi0)**2 )
118
- end
119
- # pi0 <- min(pi0[mse==min(mse)])
120
- pi_zero = pi_zeros.values_at(*(mse.min_indices)).min
121
- [pi_zero,1].min
122
- else
123
- raise ArgumentError, ":pi_zero_method must be :smooth or :bootstrap!"
124
- end
125
- end
126
- end
127
-
128
- # Returns a VecD filled with parallel q-values
129
- # assumes that vec is filled with p values
130
- # see pi_zero method for arguments, these should be named as symbols in the
131
- # pi_zero_args hash.
132
- # robust = true or false an indicator of whether it is desired to make
133
- # the estimate more robust for small p-values and
134
- # a direct finite sample estimate of pFDR
135
- # A q-value can be thought of as the global positive false discovery rate
136
- # at a particular p-value
137
- def qvalues(robust=false, pi_zero_args={})
138
- sz = self.size
139
- pi0_args = Default_pi_zero_args.merge(pi_zero_args)
140
- self.pi_zero(*(pi0_args.values_at(:lambda_vals, :method, :log_transform)))
141
- raise RuntimeError, "pi0 <= 0 ... check your p-values!!" if pi_zero <= 0
142
- num_le_ar = self.num_le
143
- qvalues =
144
- if robust
145
- den = self.map {|val| 1 - ((1 - val)**(sz)) }
146
- self * (pi_zero * sz) / ( num_le_ar * den)
147
- else
148
- self * (pi_zero * sz) / num_le_ar
149
- end
150
-
151
- u_ar = self.order
152
-
153
- qvalues[u_ar[sz-1]] = [qvalues[u_ar[sz-1]],1].min
154
- (0...sz-1).each do |i|
155
- qvalues[u_ar[i]] = [qvalues[u_ar[i]],qvalues[u_ar[i+1]],1].min
156
- end
157
- qvalues
158
- end
159
- end
160
-
161
-
data/lib/roc.rb DELETED
@@ -1,187 +0,0 @@
1
-
2
-
3
-
4
-
5
- # Class for all types of classification analysis:
6
- # receiver-operator-characteristics, precision-recall, etc.. Some definitions
7
- # from (Davis & Goadrich. Proceedings of the 23rd
8
- # International Conference on Machine Learning, Pittsburgh, PA, 2006):
9
- # Recall = TP/(TP+FN) [aka, Sensitivity]
10
- # Precision = TP/(TP+FP) [aka, Positive Predictive Value]
11
- # True Positive Rate = TP/(TP+FN)
12
- # False Positive Rate = FP/(FP+TN)
13
- #
14
- # Keys to some abbreviations used in this class:
15
- # pred = number predicted to be correct
16
- # tps = number of true positives
17
- # ppv = positive predictive value
18
- # om_ppv = one minus positive predictive value = FP/(TP+FP)
19
- #
20
- # NOTE: this class assumes that lower scores are better. Negate your scores
21
- # if this is not the case.
22
- #
23
- # For estimation of false positive rates using a decoy database strategy, see
24
- # the DecoyROC class.
25
- class ROC
26
-
27
-
28
- # returns area under the curve found by trapezoids
29
- # x and y specify the coordinates to use
30
- # x should be monotonic increasing
31
- def area_under_curve(x,y)
32
- area = 0.0
33
- (0...(x.size-1)).each do |i|
34
- # determine which is larger
35
- if y[i+1] >= y[i]
36
- y1 = y[i+1]; y0 = y[i]
37
- else
38
- y0 = y[i+1]; y1 = y[i]
39
- end
40
- area += (x[i+1]-x[i]).to_f * ( y0.to_f + (y1-y0).to_f/2 )
41
- end
42
- area
43
- end
44
-
45
- # takes two lists of values and makes doublets [[val, boolean],...]
46
- def separate_to_doublets(tps, fps)
47
- true_doublets = tps.map {|v| [v, 0] }
48
- false_doublets = fps.map {|v| [v, 1] }
49
- all_doublets = true_doublets + false_doublets
50
- all_doublets.sort!
51
- all_doublets.map {|v| ((v[1] == 0) ? [v[0], true] : [v[0], false]) }
52
- end
53
-
54
- # given an array of doublets where each doublet is a value and a boolean,
55
- # sorts the list and divides it into two arrays (tps, fps) of the values.
56
- # The output can then be fed into many of the other routines.
57
- def doublets_to_separate(list)
58
- tp = []; fp = []
59
- list.each do |dbl|
60
- if dbl[1]
61
- tp << dbl
62
- else
63
- fp << dbl
64
- end
65
- end
66
- [tp,fp].collect do |arr|
67
- arr.collect! {|dbl| dbl[0] }
68
- arr.sort
69
- end
70
- end
71
-
72
- # Base function for tps calculations
73
- def tps_and_ppv(tp, fp)
74
- tp_i = 0
75
- fp_i = 0
76
- x = []
77
- y = []
78
- num_tps = 0
79
-
80
- while tp_i < tp.size
81
- while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
82
- fp_i += 1
83
- end
84
- unless tp[tp_i] == tp[tp_i+1]
85
- # get the correct number of each
86
- num_tps = tp_i + 1
87
- num_fps = fp_i
88
-
89
- x << num_tps
90
- y << num_tps.to_f/(num_tps+num_fps)
91
-
92
- end
93
- tp_i += 1
94
- end
95
- return x, y
96
- end
97
-
98
- # takes previously sorted doublets [value, boolean]
99
- def numhits_and_ppv(doublets)
100
- x = []
101
- y = []
102
- tps = 0
103
- fps = 0
104
- doublets.each_with_index do |d,i|
105
- if d[1] ; tps += 1
106
- else ; fps += 1 end
107
-
108
- if (i+1 == doublets.size) || (d[0] != doublets[i+1][0])
109
- num_hits = tps + fps
110
- x << num_hits
111
- y << tps.to_f/num_hits
112
- end
113
- end
114
- [x, y]
115
- end
116
-
117
-
118
- end
119
-
120
- # For calculating precision given lists of hits and decoy hits. The hits are
121
- # assumed to have false positives within them that can be estimated from the
122
- # number of decoy hits at the same rate
123
- # NOTE: this class assumes that lower scores are better. Negate your scores
124
- # if this is not the case.
125
- class DecoyROC < ROC
126
-
127
- # returns the [num_hits, num_tps, precision] as a function of true
128
- # positives. Method will return precisely what is calculated (meaning some
129
- # answers may seem bizarre if you have better decoy hits than real).
130
- def pred_and_tps_and_ppv(hits, decoy_hits)
131
- hits_i = 0
132
- decoy_i = 0
133
-
134
- num_hits_ar = []
135
- num_tps_ar = []
136
- ppv_ar = []
137
-
138
- while hits_i < hits.size
139
- while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
140
- decoy_i += 1
141
- end
142
- unless hits[hits_i] == hits[hits_i+1]
143
- ## determine the number of false positives
144
- tot_num_hits = hits_i+1
145
- num_tps = tot_num_hits - decoy_i
146
-
147
- num_hits_ar << tot_num_hits
148
- num_tps_ar << num_tps
149
- ppv_ar << ( num_tps.to_f/tot_num_hits )
150
-
151
- end
152
- hits_i += 1
153
- end
154
- [num_hits_ar, num_tps_ar, ppv_ar]
155
- end
156
-
157
- # returns [num_hits, precision] as a function of num hits. decoy hits are
158
- # seen merely as indicators of the number of false hits in the dataset.
159
- # This is the same algorithm as pred_and_tps_and_ppv, just eliminates
160
- # uneeded calcs
161
- def pred_and_ppv(hits, decoy_hits)
162
- hits_i = 0
163
- decoy_i = 0
164
-
165
- num_hits_ar = []
166
- ppv_ar = []
167
-
168
- while hits_i < hits.size
169
- while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
170
- decoy_i += 1
171
- end
172
- unless hits[hits_i] == hits[hits_i+1]
173
- ## determine the number of false positives
174
- tot_num_hits = hits_i+1
175
- num_tps = tot_num_hits - decoy_i
176
-
177
- num_hits_ar << tot_num_hits
178
- ppv_ar << ( num_tps.to_f/tot_num_hits )
179
-
180
- end
181
- hits_i += 1
182
- end
183
- [num_hits_ar, ppv_ar]
184
-
185
- end
186
-
187
- end