mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/pi_zero.rb DELETED
@@ -1,244 +0,0 @@
1
- require 'rsruby'
2
- require 'vec'
3
- require 'vec/r'
4
- require 'enumerator'
5
-
6
-
7
- module PiZero
8
- class << self
9
- # takes a sorted array of p-values (floats between 0 and 1 inclusive)
10
- # returns [thresholds_ar, instantaneous pi_0 calculations_ar]
11
- # evenly incremented values will be used by default:
12
- # :start=>0.0, :stop=>0.9, :step=>0.01
13
- def pi_zero_hats(sorted_pvals, args={})
14
- defaults = {:start => 0.0, :stop=>0.9, :step=>0.05 }
15
- margs = defaults.merge( args )
16
- (start, stop, step) = margs.values_at(:start, :stop, :step)
17
-
18
- # From Storey et al. PNAS 2003:
19
- lambdas = [] # lambda
20
- pi_zeros = [] # pi_0
21
- total = sorted_pvals.size # m
22
-
23
- # totally inefficient implementation (with correct logic):
24
- # TODO: implement this efficiently
25
- start.step(stop, step) do |lam|
26
- lambdas << lam
27
- (greater, less) = sorted_pvals.partition {|pval| pval > lam }
28
- pi_zeros.push( greater.size.to_f / ( total * (1.0 - lam) ) )
29
- end
30
- [lambdas, pi_zeros]
31
- end
32
-
33
- =begin
34
- def plateau_height_with_gsl(x, y)
35
- require 'gsl'
36
- x_deltas = (0...(x.size-1)).to_a.map do |i|
37
- x[i+1] - x[i]
38
- end
39
- y_deltas = (0...(y.size-1)).to_a.map do |i|
40
- y[i+1] - y[i]
41
- end
42
- new_xs = x.dup
43
- new_ys = y.dup
44
- x_deltas.reverse.each do |delt|
45
- new_xs.push( new_xs.last + delt )
46
- end
47
-
48
- y_cnt = y.size
49
- y_deltas.reverse.each do |delt|
50
- y_cnt -= 1
51
- new_ys.push( y[y_cnt] - delt )
52
- end
53
-
54
- x_vec = GSL::Vector.alloc(new_xs)
55
- y_vec = GSL::Vector.alloc(new_ys)
56
- coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
57
- coef.eval(x.last)
58
- #x2 = GSL::Vector::linspace(0,2.4,20)
59
- #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
60
- end
61
- =end
62
-
63
- # expecting x and y to make a scatter plot descending to a plateau on the
64
- # right side (which is assumed to be of increasing noise as it goes to the
65
- # right)
66
- # returns the height of the plateau at the right edge
67
- #
68
- # *
69
- # *
70
- # *
71
- # **
72
- # ** *** * *
73
- # ***** **** ***
74
- def plateau_height(x, y)
75
- r = RSRuby.instance
76
- answ = r.smooth_spline(x,y, :df => 3)
77
- ## to plot it!
78
- r.plot(x,y, :ylab=>"pi_zeros or frit")
79
- r.lines(answ['x'], answ['y'])
80
- r.points(answ['x'], answ['y'])
81
- sleep(4)
82
-
83
- answ['y'].last
84
- end
85
-
86
- def plateau_exponential(x,y)
87
- require 'gsl'
88
- xvec = GSL::Vector.alloc(x)
89
- yvec = GSL::Vector.alloc(y)
90
- a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec))
91
- x2 = GSL::Vector.linspace(0, 1.2, 20)
92
- exp_a = GSL::Sf::exp(a2)
93
- out_y = exp_a*GSL::Sf::exp(b2*x2)
94
- raise NotImplementedError, "need to grab out the answer"
95
- #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4")
96
-
97
- end
98
-
99
- # returns a conservative (but close) estimate of pi_0 given p-values
100
- # following Storey et al. 2003, PNAS.
101
- def pi_zero(pvals)
102
- sorted_pvals = pvals.sort
103
- plateau_height( *(pi_zero_hats(sorted_pvals)) )
104
- end
105
-
106
- # returns an array where the left values have been filled in using the
107
- # similar values on the right side of the distribution. These values are
108
- # pushed onto the end of the array in no guaranteed order.
109
- # extends a distribution on the left side where it is missing since
110
- # xcorr values <= 0.0 are not reported
111
- # **
112
- # * *
113
- # * *
114
- # *
115
- # *
116
- # *
117
- # Grabs the right tail from above and inverts it to the left side (less
118
- # than zero), creating a more full distribution. raises an ArgumentError
119
- # if values_chopped_at_zero.size == 0
120
- # this method would be more robust with some smoothing.
121
- # Method currently only meant for large amounts of data.
122
- # input data does not need to be sorted
123
- def extend_distribution_left_of_zero(values_chopped_at_zero)
124
- sz = values_chopped_at_zero.size
125
- raise ArgumentError, "array.size must be > 0" if sz == 0
126
- num_bins = (Math.log10(sz) * 100).round
127
- vec = VecD.new(values_chopped_at_zero)
128
- (bins, freqs) = vec.histogram(num_bins)
129
- start_i = 0
130
- freqs.each_with_index do |f,i|
131
- if f.is_a?(Numeric) && f > 0
132
- start_i = i
133
- break
134
- end
135
- end
136
- match_it = freqs[start_i]
137
- # get the index of the first frequency value less than the zero frequency
138
- index_to_chop_at = -1
139
- rev_freqs = freqs.reverse
140
- rev_freqs.each_with_index do |freq,rev_i|
141
- if match_it - rev_freqs[rev_i+1] <= 0
142
- index_to_chop_at = freqs.size - 1 - rev_i
143
- break
144
- end
145
- end
146
- cut_point = bins[index_to_chop_at]
147
- values_chopped_at_zero + values_chopped_at_zero.select {|v| v >= cut_point }.map {|v| cut_point - v }
148
- end
149
-
150
- # assumes the decoy_vals follows a normal distribution
151
- def p_values(target_vals, decoy_vals)
152
- (mean, stdev) = VecD.new(decoy_vals).sample_stats
153
- r = RSRuby.instance
154
- vec = VecD.new(target_vals)
155
- right_tailed = true
156
- vec.p_value_normal(mean, stdev, right_tailed)
157
- end
158
-
159
- def p_values_for_sequest(target_hits, decoy_hits)
160
- dh_vals = decoy_hits.map {|v| v.xcorr }
161
- new_decoy_vals = PiZero.extend_distribution_left_of_zero(dh_vals)
162
- #File.open("target.yml", 'w') {|out| out.puts new_decoy_vals.join(" ") }
163
- #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") }
164
- #abort 'checking'
165
- p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals )
166
- end
167
-
168
- #### NEED TO VERIFY if this is PIT or PI_ZERO!
169
- =begin
170
- # takes a list of booleans with true being a target hit and false being a
171
- # decoy hit and returns the pi_zero using the smooth method
172
- # Should be ordered from best to worst (i.e., one expects more true values
173
- # at the beginning of the list)
174
- def pi_zero_from_booleans(booleans)
175
- targets = 0
176
- decoys = 0
177
- xs = []
178
- ys = []
179
- booleans.reverse.each_with_index do |v,index|
180
- if v
181
- targets += 1
182
- else
183
- decoys += 1
184
- end
185
- if decoys > 0
186
- xs << index
187
- ys << targets.to_f / decoys
188
- end
189
- end
190
- ys.reverse!
191
- plateau_height(xs, ys)
192
- end
193
- =end
194
-
195
- # returns fraction of incorrect target hits (frit) (this is the percent
196
- # incorrect targets [PIT] expressed as a fraction rather than percent)
197
- # takes two parallel arrays consisting of the total number of hits (this
198
- # will typically be the total # target hits) at that point and the
199
- # precision (ranging from: [0,1]) (typically determined by counting the
200
- # number of decoy hits). Expects the number of total hits to be
201
- # monotonically increasing and the precision to roughly start high and
202
- # decrease as more hits (of lesser quality) are added.
203
- def frit_from_precision(total_num_hits_ar, precision_ar)
204
- instant_pi_zeros = []
205
- total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0|
206
- (x1, y1) = dp1
207
- (x0, y0) = dp0
208
- instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0)
209
- end
210
- instant_pi_zeros.reverse!
211
- plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros)
212
- end
213
-
214
- # Takes an array of doublets ([[int, int], [int, int]...]) where the first
215
- # value is the number of target hits and the second is the number of decoy
216
- # hits. Expects that best hits are at the beginning of the list. Assumes
217
- # that each sum is a subset of the following group (shown as actual hits
218
- # rather than number of hits):
219
- #
220
- # [[target, target, target, decoy], [target, target, target, decoy,
221
- # target, decoy, target], [target, target, target, decoy, target,
222
- # decoy, target, decoy, target, target]]
223
- #
224
- # This assumption may be relaxed somewhat and should still give good
225
- # results.
226
- def frit_from_groups(array_of_doublets)
227
- frits = []
228
- array_of_doublets.reverse.each_cons(2) do |two_doublets|
229
- bigger, smaller = two_doublets
230
- num_targets = bigger[0] - smaller[0]
231
- num_decoy = bigger[1] - smaller[1]
232
- num_targets = 0 if num_targets < 0
233
- num_decoy = 0 if num_targets < 0
234
- if num_decoy > 0
235
- frits << (num_targets.to_f / num_decoy)
236
- end
237
- end
238
- frits.reverse!
239
- xs = (0...(frits.size)).to_a
240
- plateau_height(xs, frits)
241
- end
242
-
243
- end
244
- end
data/lib/qvalue.rb DELETED
@@ -1,161 +0,0 @@
1
-
2
- begin
3
- require 'rsruby'
4
- rescue LoadError
5
- puts "You must have the rsruby gem installed to use the qvalue module"
6
- puts $!
7
- raise LoadError
8
- end
9
- require 'vec'
10
-
11
- # Adapted from qvalue.R by Alan Dabney and John Storey which was LGPL licensed
12
-
13
- class VecD
14
- Default_lambdas = []
15
- 0.0.step(0.9,0.05) {|v| Default_lambdas << v }
16
-
17
- Default_smooth_df = 3
18
-
19
- # returns the pi_zero estimate by taking the fraction of all p-values above
20
- # lambd and dividing by (1-lambd) and gauranteed to be <= 1
21
- def pi_zero_at_lambda(lambd)
22
- v = (self.select{|v| v >= lambd}.size.to_f/self.size) / (1 - lambd)
23
- [v, 1].min
24
- end
25
-
26
- # returns a parallel array (VecI) of how many are <= in the array
27
- # roughly: VecD[1,8,10,8,9,10].num_le => VecI[1, 3, 6, 3, 4, 6]
28
- def num_le
29
- hash = Hash.new {|h,k| h[k] = [] }
30
- self.each_with_index do |v,i|
31
- hash[v] << i
32
- end
33
- num_le_ar = []
34
- sorted = self.sort
35
- count = 0
36
- sorted.each_with_index do |v,i|
37
- back = 1
38
- count += 1
39
- if v == sorted[i-back]
40
- while (sorted[i-back] == v)
41
- num_le_ar[i-back] = count
42
- back -= 1
43
- end
44
- else
45
- num_le_ar[i] = count
46
- end
47
- end
48
- ret = VecI.new(self.size)
49
- num_le_ar.zip(sorted) do |n,v|
50
- indices = hash[v]
51
- indices.each do |i|
52
- ret[i] = n
53
- end
54
- end
55
- ret
56
- end
57
-
58
- Default_pi_zero_args = {:lambda_vals => Default_lambdas, :method => :smooth, :log_transform => false }
59
-
60
- # returns the Pi_0 for given p-values (the values in self)
61
- # lambda_vals = Float or Array of floats of size >= 4. value(s) within (0,1)
62
- # A single value given then the pi_zero is calculated at that point,
63
- # superceding the method or log_transform arguments
64
- # method = :smooth or :bootstrap
65
- # log_transform = true or false
66
- def pi_zero(lambda_vals=Default_pi_zero_args[:lambda_vals], method=Default_pi_zero_args[:method], log_transform=Default_pi_zero_args[:log_transform])
67
- if self.min < 0 || self.max > 1
68
- raise ArgumentError, "p-values must be within [0,1)"
69
- end
70
-
71
- if lambda_vals.is_a? Numeric
72
- lambda_vals = [lambda_vals]
73
- end
74
- if lambda_vals.size != 1 && lambda_vals.size < 4
75
- raise ArgumentError, "#{tun_arg} must have 1 or 4 or more values"
76
- end
77
- if lambda_vals.any? {|v| v < 0 || v >= 1}
78
- raise ArgumentError, "#{tun_arg} vals must be within [0,1)"
79
- end
80
-
81
- pi_zeros = lambda_vals.map {|val| self.pi_zero_at_lambda(val) }
82
-
83
- r = RSRuby.instance
84
- r.plot(lambda_vals,pi_zeros, :ylab=>"instantaneous pi_zeros")
85
- answ = r.smooth_spline(lambda_vals, pi_zeros, :df => Default_smooth_df)
86
- r.lines(answ['x'], answ['y'])
87
- r.points(answ['x'], answ['y'])
88
- sleep(20)
89
-
90
- answer =
91
- if lambda_vals.size == 1
92
- pi_zeros.first
93
- else
94
- case method
95
- when :smooth
96
- r = RSRuby.instance
97
- calc_pi_zero = lambda do |_pi_zeros|
98
- hash = r.smooth_spline(lambda_vals, _pi_zeros, :df => Default_smooth_df)
99
- hash['y'][VecD.new(lambda_vals).max_indices.max]
100
- end
101
- if log_transform
102
- pi_zeros.log_space {|log_vals| calc_pi_zero.call(log_vals) }
103
- else
104
- calc_pi_zero.call(pi_zeros)
105
- end
106
- when :bootstrap
107
- min_pi0 = pi_zeros.min
108
- lsz = lambda_vals.size
109
- mse = VecD.new(lsz, 0)
110
- pi0_boot = VecD.new(lsz, 0)
111
- sz = self.size
112
- 100.times do # for(i in 1:100) {
113
- p_boot = self.shuffle
114
- (0...lsz).each do |i|
115
- pi0_boot[i] = ( p_boot.select{|v| v > lambda_vals[i] }.size.to_f/p_boot.size ) / (1-lambda_vals[i])
116
- end
117
- mse = mse + ( (pi0_boot-min_pi0)**2 )
118
- end
119
- # pi0 <- min(pi0[mse==min(mse)])
120
- pi_zero = pi_zeros.values_at(*(mse.min_indices)).min
121
- [pi_zero,1].min
122
- else
123
- raise ArgumentError, ":pi_zero_method must be :smooth or :bootstrap!"
124
- end
125
- end
126
- end
127
-
128
- # Returns a VecD filled with parallel q-values
129
- # assumes that vec is filled with p values
130
- # see pi_zero method for arguments, these should be named as symbols in the
131
- # pi_zero_args hash.
132
- # robust = true or false an indicator of whether it is desired to make
133
- # the estimate more robust for small p-values and
134
- # a direct finite sample estimate of pFDR
135
- # A q-value can be thought of as the global positive false discovery rate
136
- # at a particular p-value
137
- def qvalues(robust=false, pi_zero_args={})
138
- sz = self.size
139
- pi0_args = Default_pi_zero_args.merge(pi_zero_args)
140
- self.pi_zero(*(pi0_args.values_at(:lambda_vals, :method, :log_transform)))
141
- raise RuntimeError, "pi0 <= 0 ... check your p-values!!" if pi_zero <= 0
142
- num_le_ar = self.num_le
143
- qvalues =
144
- if robust
145
- den = self.map {|val| 1 - ((1 - val)**(sz)) }
146
- self * (pi_zero * sz) / ( num_le_ar * den)
147
- else
148
- self * (pi_zero * sz) / num_le_ar
149
- end
150
-
151
- u_ar = self.order
152
-
153
- qvalues[u_ar[sz-1]] = [qvalues[u_ar[sz-1]],1].min
154
- (0...sz-1).each do |i|
155
- qvalues[u_ar[i]] = [qvalues[u_ar[i]],qvalues[u_ar[i+1]],1].min
156
- end
157
- qvalues
158
- end
159
- end
160
-
161
-
data/lib/roc.rb DELETED
@@ -1,187 +0,0 @@
1
-
2
-
3
-
4
-
5
- # Class for all types of classification analysis:
6
- # receiver-operator-characteristics, precision-recall, etc.. Some definitions
7
- # from (Davis & Goadrich. Proceedings of the 23rd
8
- # International Conference on Machine Learning, Pittsburgh, PA, 2006):
9
- # Recall = TP/(TP+FN) [aka, Sensitivity]
10
- # Precision = TP/(TP+FP) [aka, Positive Predictive Value]
11
- # True Positive Rate = TP/(TP+FN)
12
- # False Positive Rate = FP/(FP+TN)
13
- #
14
- # Keys to some abbreviations used in this class:
15
- # pred = number predicted to be correct
16
- # tps = number of true positives
17
- # ppv = positive predictive value
18
- # om_ppv = one minus positive predictive value = FP/(TP+FP)
19
- #
20
- # NOTE: this class assumes that lower scores are better. Negate your scores
21
- # if this is not the case.
22
- #
23
- # For estimation of false positive rates using a decoy database strategy, see
24
- # the DecoyROC class.
25
- class ROC
26
-
27
-
28
- # returns area under the curve found by trapezoids
29
- # x and y specify the coordinates to use
30
- # x should be monotonic increasing
31
- def area_under_curve(x,y)
32
- area = 0.0
33
- (0...(x.size-1)).each do |i|
34
- # determine which is larger
35
- if y[i+1] >= y[i]
36
- y1 = y[i+1]; y0 = y[i]
37
- else
38
- y0 = y[i+1]; y1 = y[i]
39
- end
40
- area += (x[i+1]-x[i]).to_f * ( y0.to_f + (y1-y0).to_f/2 )
41
- end
42
- area
43
- end
44
-
45
- # takes two lists of values and makes doublets [[val, boolean],...]
46
- def separate_to_doublets(tps, fps)
47
- true_doublets = tps.map {|v| [v, 0] }
48
- false_doublets = fps.map {|v| [v, 1] }
49
- all_doublets = true_doublets + false_doublets
50
- all_doublets.sort!
51
- all_doublets.map {|v| ((v[1] == 0) ? [v[0], true] : [v[0], false]) }
52
- end
53
-
54
- # given an array of doublets where each doublet is a value and a boolean,
55
- # sorts the list and divides it into two arrays (tps, fps) of the values.
56
- # The output can then be fed into many of the other routines.
57
- def doublets_to_separate(list)
58
- tp = []; fp = []
59
- list.each do |dbl|
60
- if dbl[1]
61
- tp << dbl
62
- else
63
- fp << dbl
64
- end
65
- end
66
- [tp,fp].collect do |arr|
67
- arr.collect! {|dbl| dbl[0] }
68
- arr.sort
69
- end
70
- end
71
-
72
- # Base function for tps calculations
73
- def tps_and_ppv(tp, fp)
74
- tp_i = 0
75
- fp_i = 0
76
- x = []
77
- y = []
78
- num_tps = 0
79
-
80
- while tp_i < tp.size
81
- while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
82
- fp_i += 1
83
- end
84
- unless tp[tp_i] == tp[tp_i+1]
85
- # get the correct number of each
86
- num_tps = tp_i + 1
87
- num_fps = fp_i
88
-
89
- x << num_tps
90
- y << num_tps.to_f/(num_tps+num_fps)
91
-
92
- end
93
- tp_i += 1
94
- end
95
- return x, y
96
- end
97
-
98
- # takes previously sorted doublets [value, boolean]
99
- def numhits_and_ppv(doublets)
100
- x = []
101
- y = []
102
- tps = 0
103
- fps = 0
104
- doublets.each_with_index do |d,i|
105
- if d[1] ; tps += 1
106
- else ; fps += 1 end
107
-
108
- if (i+1 == doublets.size) || (d[0] != doublets[i+1][0])
109
- num_hits = tps + fps
110
- x << num_hits
111
- y << tps.to_f/num_hits
112
- end
113
- end
114
- [x, y]
115
- end
116
-
117
-
118
- end
119
-
120
- # For calculating precision given lists of hits and decoy hits. The hits are
121
- # assumed to have false positives within them that can be estimated from the
122
- # number of decoy hits at the same rate
123
- # NOTE: this class assumes that lower scores are better. Negate your scores
124
- # if this is not the case.
125
- class DecoyROC < ROC
126
-
127
- # returns the [num_hits, num_tps, precision] as a function of true
128
- # positives. Method will return precisely what is calculated (meaning some
129
- # answers may seem bizarre if you have better decoy hits than real).
130
- def pred_and_tps_and_ppv(hits, decoy_hits)
131
- hits_i = 0
132
- decoy_i = 0
133
-
134
- num_hits_ar = []
135
- num_tps_ar = []
136
- ppv_ar = []
137
-
138
- while hits_i < hits.size
139
- while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
140
- decoy_i += 1
141
- end
142
- unless hits[hits_i] == hits[hits_i+1]
143
- ## determine the number of false positives
144
- tot_num_hits = hits_i+1
145
- num_tps = tot_num_hits - decoy_i
146
-
147
- num_hits_ar << tot_num_hits
148
- num_tps_ar << num_tps
149
- ppv_ar << ( num_tps.to_f/tot_num_hits )
150
-
151
- end
152
- hits_i += 1
153
- end
154
- [num_hits_ar, num_tps_ar, ppv_ar]
155
- end
156
-
157
- # returns [num_hits, precision] as a function of num hits. decoy hits are
158
- # seen merely as indicators of the number of false hits in the dataset.
159
- # This is the same algorithm as pred_and_tps_and_ppv, just eliminates
160
- # uneeded calcs
161
- def pred_and_ppv(hits, decoy_hits)
162
- hits_i = 0
163
- decoy_i = 0
164
-
165
- num_hits_ar = []
166
- ppv_ar = []
167
-
168
- while hits_i < hits.size
169
- while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
170
- decoy_i += 1
171
- end
172
- unless hits[hits_i] == hits[hits_i+1]
173
- ## determine the number of false positives
174
- tot_num_hits = hits_i+1
175
- num_tps = tot_num_hits - decoy_i
176
-
177
- num_hits_ar << tot_num_hits
178
- ppv_ar << ( num_tps.to_f/tot_num_hits )
179
-
180
- end
181
- hits_i += 1
182
- end
183
- [num_hits_ar, ppv_ar]
184
-
185
- end
186
-
187
- end