mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,226 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- ## The yeast Scal db mean background is: 0.00984
4
- ## The yeast Cysteine background freq is: 0.0131986582396467
5
- pep_seq_re = /<search_hit .* peptide="(\w+)"/o
6
- pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
7
-
8
- if ARGV.size != 3
9
- puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
10
- puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
11
- abort
12
- end
13
-
14
- def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
15
- File.open(base_toplot, "w") do |fh|
16
- fh.puts 'XYData'
17
- fh.puts base
18
- fh.puts title
19
- fh.puts xaxis
20
- fh.puts yaxis
21
- cats.each do |ar|
22
- fh.puts ar.join(" & ")
23
- ar.each do |a|
24
- fh.puts hash[a].join(" ")
25
- end
26
- end
27
- end
28
- end
29
-
30
- ############################################################################
31
- #### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
32
- #### CHANGE HIM THERE (eventually we need to put him in a lib file)
33
- # (actual # with cys, expected # with cys, total#peptides,
34
- # mean_fraction_of_cysteines_true, std)
35
- # PepHit(C) = Peptide containing cysteine
36
- # # Total PepHit(C) # Observed Bad Pep (C)
37
- # ------------------ proportional_to ----------------------
38
- # # Total PepHit # Total Bad PepHit (X)
39
- def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
40
-
41
- # the number of bona fide BAD cysteine hits
42
- # (some of the cysteine hits (~5%) are true positives)
43
-
44
- ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
45
- if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
46
- total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
47
- fpr = total_number_false / total_peptides
48
- [fpr, total_number_false]
49
- end
50
- ############################################################################
51
-
52
-
53
-
54
-
55
- (cysteine_background_freq, background_freq, file) = ARGV
56
- cysteine_background_freq = cysteine_background_freq.to_f
57
- background_freq = background_freq.to_f
58
-
59
- seq_probs = []
60
- last_seq_prob = nil
61
- File.open(file) do |fh|
62
- fh.each do |line|
63
- if line =~ pep_seq_re
64
- ar = Array.new(2)
65
- ar[0] = $1
66
- seq_probs << ar
67
- last_seq_prob = ar
68
- elsif line =~ pep_prob_re
69
- last_seq_prob[1] = $1.to_f
70
- end
71
- end
72
- end
73
-
74
- #seq_probs.each do |seq|
75
- # if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
76
- # abort "BAD PARSING!!"
77
- # end
78
- #end
79
- amino_acid_as_st = 'C'
80
-
81
- sorted = seq_probs.sort_by {|v| v[1] }.reverse
82
-
83
- ## traverse the peptides
84
- actual_cys_containing_peps = 0
85
- expected_cys_containing_peps = 0.0
86
- current_sum_one_minus_prob = 0.0
87
- prob_estimated_fpr = 0.0
88
- pep_cnt = 0
89
- one_minus_freq = 1.0 - cysteine_background_freq
90
-
91
- ## tabulate:
92
- pep_cnts = []
93
- probs = []
94
- prob_fprs = []
95
- prob_tps = []
96
- cys_fprs = []
97
- cys_tps = []
98
- fpr_diff = []
99
-
100
-
101
- sorted.each do |ar|
102
- pep_cnt += 1
103
-
104
- pep = ar[0]
105
- prob = ar[1]
106
-
107
- ## Cysteine FPR: ##
108
- # Expected:
109
- expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
110
- # Actual:
111
- if pep.include?(amino_acid_as_st)
112
- actual_cys_containing_peps += 1
113
- end
114
- (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
115
- cys_tp = pep_cnt.to_f - total_num_false_by_cys
116
-
117
-
118
- ## FPR by prob: ##
119
- # SUM(1-probX)/#peps
120
- current_sum_one_minus_prob += 1.0 - prob
121
- prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
122
- prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
123
-
124
- ## GRAB or report the data:
125
- pep_cnts << pep_cnt
126
- probs << prob
127
- prob_fprs << prob_estimated_fpr
128
- prob_tps << prob_tp
129
- cys_fprs << cys_fpr
130
- cys_tps << cys_tp
131
- fpr_diff << prob_estimated_fpr - cys_fpr
132
-
133
- #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
134
- end
135
-
136
- hash = {
137
- 'pep_cnts' => pep_cnts,
138
- 'probs' => probs,
139
- 'prob_fprs' => prob_fprs,
140
- 'prob_tps' => prob_tps,
141
- 'cys_fprs' => cys_fprs,
142
- 'cys_tps' => cys_tps,
143
- 'fpr_diff' => fpr_diff,
144
- }
145
-
146
-
147
- real_base = file.sub(/\.xml/,'')
148
-
149
-
150
-
151
- ## TPS vs FPR
152
- base = real_base.dup
153
- base << "." << "tps_vs_fpr"
154
- base_toplot = base + '.to_plot'
155
- title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
156
- xaxis = "TPs"
157
- yaxis = "FPR"
158
- cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
159
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
160
-
161
- ## PEPHITS vs FPR
162
- base = real_base.dup
163
- base << "." << "num_pep_hits_vs_fpr"
164
- base_toplot = base + '.to_plot'
165
- title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
166
- xaxis = "num peptide hits"
167
- yaxis = "FPR"
168
- cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
169
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
170
-
171
- ## PEPHITS VS FPR DIFF
172
- base = real_base.dup
173
- base << "." << "num_pep_hits_vs_fpr_diff"
174
- base_toplot = base + '.to_plot'
175
- title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
176
- xaxis = "num peptide hits"
177
- yaxis = "FPR diff (prob - cysteine)"
178
- cats = [['pep_cnts', 'fpr_diff']]
179
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
180
-
181
- ## PROB VS FPR DIFF
182
- base = real_base.dup
183
- base << "." << "prob_vs_fpr_diff"
184
- base_toplot = base + '.to_plot'
185
- title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
186
- xaxis = "peptide probability"
187
- yaxis = "FPR diff (prob - cysteine)"
188
- cats = [['probs', 'fpr_diff']]
189
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
190
-
191
-
192
-
193
- =begin
194
-
195
- returns [number_of_prots, actual_fpr]
196
- def num_prots_above_fpr(prots, desired_fpr)
197
- current_fpr_rate_percent = 0.0
198
- previous_fpr_rate_percent = 0.0
199
- current_sum_one_minus_prob = 0.0
200
- proteins_within_fpr = 0
201
- actual_fpr = nil
202
- already_found = false
203
- prot_cnt = 0
204
- prots.each do |prot|
205
- prot_cnt += 1
206
- # SUM(1-probX)/#prots
207
- current_sum_one_minus_prob += 1.0 - prot._probability.to_f
208
- current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
209
-
210
- if current_fpr_rate_percent > desired_fpr && !already_found
211
- actual_fpr = previous_fpr_rate_percent
212
- proteins_within_fpr = prot_cnt
213
- already_found = true
214
- end
215
- previous_fpr_rate_percent = current_fpr_rate_percent
216
- end
217
- [proteins_within_fpr, actual_fpr]
218
- end
219
-
220
- =end
221
-
222
-
223
-
224
-
225
-
226
-
@@ -1,56 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'optparse'
4
- require 'table'
5
-
6
- require 'ms/gradient_program'
7
-
8
- delimiter = "\t"
9
- table_format = false
10
- opts = OptionParser.new do |op|
11
- op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
12
- op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
13
- if v == 'space'
14
- delimiter = " "
15
- elsif v == 'tab'
16
- delimiter = "\t"
17
- elsif v == 'format'
18
- table_format = true
19
- else
20
- abort "don't recognize #{v}"
21
- end
22
- end
23
- end
24
-
25
- opts.parse!
26
-
27
- if ARGV.size == 0
28
- puts opts
29
- exit
30
- end
31
-
32
-
33
- sets_of_tables = {}
34
- ARGV.each do |file|
35
- File.open(file) do |fh|
36
- sets_of_tables[file] = GradientProgram.all_from_handle(fh)
37
- end
38
- end
39
-
40
- sets_of_tables.each do |file, tables|
41
- puts "FILE: #{file}"
42
- tables.each do |gp|
43
- puts "PUMP_TYPE: #{gp.pump_type}"
44
- col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]
45
- data = gp.time_points.map do |tp|
46
- line = [tp.time, *(tp.percentages)]
47
- line << tp.flow_rate
48
- end
49
- table = Table.new(data, nil, col_labels)
50
- if table_format
51
- puts table.to_formatted_string
52
- else
53
- puts table.to_s(delimiter)
54
- end
55
- end
56
- end
@@ -1,137 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'vec'
4
-
5
- # FOR SCer yeast db the and orbi mudpit7 the mean_actual_vs_expected fraction
6
- # is 0.0101409563168847
7
-
8
- # <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
9
-
10
- def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
11
- File.open(base_toplot, "w") do |fh|
12
- fh.puts 'XYData'
13
- fh.puts base
14
- fh.puts title
15
- fh.puts xaxis
16
- fh.puts yaxis
17
- cats.each do |ar|
18
- fh.puts ar.join(" & ")
19
- ar.each do |a|
20
- fh.puts hash[a].join(" ")
21
- end
22
- end
23
- end
24
- system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
25
- end
26
-
27
- peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
28
-
29
- unless ARGV.size == 2
30
- abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
31
- end
32
-
33
- (cysteine_background_freq, file) = ARGV
34
-
35
- # each pep = [nsp_prob, init_prob, SEQUENCE]
36
- peps = []
37
- File.open(file) do |fh|
38
- fh.each do |line|
39
- if line =~ peptide_re
40
- peps << [$3.to_f,$2.to_f,$1]
41
- end
42
- end
43
- end
44
-
45
-
46
- amino_acid_as_st = 'C'
47
- one_minus_freq = 1.0 - cysteine_background_freq.to_f
48
- actual_cys_containing_peps = 0
49
- expected_cys_containing_peps = 0.0
50
- current_sum_one_minus_prob = 0.0
51
- prob_estimated_fpr = 0.0
52
- pep_cnt = 0
53
-
54
- the_probs = []
55
- the_fractions = []
56
- special_probs = []
57
-
58
-
59
-
60
-
61
- #peps.sort.reverse.each do |ar|
62
- #peps.sort.each do |ar|
63
- peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
64
- (nsp_prob, init_prob, pep) = ar
65
- ## Cysteine FPR: ##
66
- # Expected:
67
- expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
68
- # Actual:
69
- if pep.include?(amino_acid_as_st)
70
- actual_cys_containing_peps += 1
71
- end
72
- fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
73
-
74
- special_prob = (3.0 * nsp_prob) + init_prob
75
-
76
- ## Get the final fraction
77
- #if special_prob < 4.0
78
- # #puts the_fractions.join(" ")
79
- # puts the_fractions.last
80
- # abort
81
- #end
82
-
83
- # gather data to plot
84
- the_probs << nsp_prob
85
- special_probs << special_prob
86
- the_fractions << fraction_ac_exp
87
-
88
- end
89
-
90
-
91
-
92
- hash = {
93
- 'probs' => the_probs,
94
- 'fractions' => the_fractions,
95
- 'special_probs' => special_probs,
96
- }
97
-
98
- real_base = file.sub(/\.xml/,'')
99
-
100
-
101
- =begin
102
- ## PROB VS FPR DIFF
103
- base = real_base.dup
104
- base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
105
- base_toplot = base + '.to_plot'
106
- title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
107
- xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
108
- yaxis = "fraction with cysteines (actual/expected)"
109
- cats = [['probs', 'fractions']]
110
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
111
- =end
112
-
113
-
114
- =begin
115
- ## PROB VS FPR DIFF
116
- base = real_base.dup
117
- base << "." << "prob_vs_actual_expected_fraction"
118
- base_toplot = base + '.to_plot'
119
- title = "peptide prob vs fraction with cysteines (actual/expected)"
120
- xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
121
- yaxis = "fraction with cysteines (actual/expected)"
122
- cats = [['probs', 'fractions']]
123
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
124
- =end
125
-
126
- ## SPECIAL PROB VS FPR DIFF
127
- base = real_base.dup
128
- base << "." << "special_prob_vs_actual_expected_fraction"
129
- base_toplot = base + '.to_plot'
130
- title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
131
- xaxis = "(3 * nsp_prob) + init_prob"
132
- yaxis = "fraction with cysteines (actual/expected)"
133
- cats = [['special_probs', 'fractions']]
134
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
135
-
136
-
137
-
@@ -1,136 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- # Here is what I plotted: take each id'd pep-prot id'd on the tophit scans -- you will likely have the same pep-prot id'd on multiple scans -- plot the top probability of each such pep-prot.
4
- # There are 43 such id'd peptides for Sashimi, whereas SEQUEST id's about 66. So you'll have 66 (1-p-values) to plot, I had 43. Similarly for OMICS.
5
-
6
- require 'spec_id'
7
- require 'fasta'
8
- require 'optparse'
9
- require 'ostruct'
10
-
11
- # returns an accession number if available, or the entire reference (less the
12
- # starting '>'
13
- def get_fasta_accession(fasta_prot)
14
- head = fasta_prot.header
15
- if head =~ ACC_REGEX
16
- $1.dup
17
- else
18
- head.sub(/^>/, '').rstrip
19
- end
20
- end
21
-
22
- # returns the accession number from a reference, or the complete reference
23
- def accession_from_ref(pep)
24
- ref = pep.prot.reference
25
- if ref =~ ACC_REGEX
26
- $1.dup
27
- else
28
- ref.rstrip
29
- end
30
- end
31
-
32
- def get_pep_prot_accession(pep)
33
- acc = pep.prot.accession
34
- if !acc || acc == '0' || acc == 0
35
- accession_from_ref(pep)
36
- else
37
- acc
38
- end
39
- end
40
-
41
- #####################################################################
42
- # MAIN
43
- #####################################################################
44
-
45
- opt = OpenStruct.new
46
- opt.p = 'prob'
47
- opts = OptionParser.new do |op|
48
- op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
49
- op.separator " [prints to stdout tab delimited table]"
50
- op.on('-t', '--ties', 'allow ties on best hit') {|v| opt.t = v }
51
- op.on('-p', '--param <s>', 'param: (xcorr | prob)') {|v| opt.p = v}
52
- end
53
- opts.parse!
54
-
55
- if ARGV.size < 2
56
- puts opts
57
- exit
58
- end
59
-
60
- case opt.p
61
- when 'prob'
62
- param = :peptide_probability
63
- best = :first
64
- when 'xcorr'
65
- param = :xcorr
66
- best = :last
67
- else
68
- abort "incorrect param: #{opt.p}"
69
- end
70
-
71
- ############################
72
- # GLOBALS
73
- DELIM = "\t"
74
- ACC_REGEX = /\|(.*?)\|/o
75
- ############################
76
-
77
- bioworks = ARGV[0]
78
- fasta_file = ARGV[1]
79
-
80
- fprots = Fasta.new.read_file(fasta_file).prots
81
- gi_nums = fprots.map {|prot| get_fasta_accession(prot) }
82
-
83
- peptides = SpecID.new(bioworks).peps
84
-
85
-
86
- ## Get the best peptide(s) per scan
87
- top_peps_per_scan = []
88
-
89
- peptides.hash_by(:base_name, :first_scan).each do |bn_scan, pep_array|
90
- sorted_list = pep_array.sort_by {|pep| pep.send(param).to_f }
91
-
92
- top_peps = if best == :first ; [sorted_list.shift] ; else [sorted_list.pop] end
93
- found_another = false
94
- sorted_list.each do |pep|
95
- if pep.send(param).to_f == top_peps.send(best).send(param).to_f
96
- if opt.t
97
- top_peps << pep
98
- else
99
- found_another = true
100
- end
101
- end
102
- end
103
- unless found_another
104
- top_peps_per_scan.push( *top_peps )
105
- end
106
- end
107
-
108
-
109
- ## Get the best scoring peptide per peptide/prot from list of best
110
- ## peptides/scan
111
- top_pep_seq_prots = top_peps_per_scan.hash_by {|pep| [pep.sequence, get_pep_prot_accession(pep)] }.map do |k,pep_array|
112
- pep_array.sort_by {|pep| pep.send(param).to_f }.send(best)
113
- end
114
-
115
- ## sort the peptides by best score
116
- sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by {|pep| pep.send(param).to_f }
117
- if best == :last ; sorted_top_pep_seq_prots.reverse! end
118
-
119
- ## plot the probability vs. the number of tps
120
- puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
121
- tps = 0
122
- sorted_top_pep_seq_prots.each do |pep|
123
- if gi_nums.include?( get_pep_prot_accession(pep) )
124
- tps += 1
125
- puts [tps.to_s, pep.send(param), pep.sequence, get_pep_prot_accession(pep), pep.xcorr].join(DELIM)
126
- end
127
- end
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
-
136
-
@@ -1,44 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'rexml/document'
4
-
5
- if ARGV.size == 0
6
- puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
7
- puts "outputs a .csv file"
8
- exit
9
- end
10
-
11
- class Protein
12
- attr_accessor :name, :pi, :ni
13
- def initialize(name, pi, ni)
14
- @name, @pi, @ni = name, pi, ni
15
- end
16
- end
17
-
18
- class Listener
19
- attr_accessor :proteins
20
-
21
- def initialize
22
- @proteins = []
23
- end
24
-
25
- def tag_start(name, attrs)
26
- if name == "protein"
27
- protein = Protein.new( attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
28
- @proteins.push( protein )
29
- end
30
- end
31
-
32
- def method_missing(*args) ; end
33
-
34
- end
35
-
36
- ARGV.each do |file|
37
- File.open("output.csv", 'w') do |out|
38
- listener = Listener.new
39
- REXML::Document.parse_stream(File.new(file), listener)
40
- listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
41
- out.puts [protein.name, protein.pi, protein.ni].join("\t")
42
- end
43
- end
44
- end
@@ -1,61 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'vec'
4
- require 'spec_id'
5
- require 'optparse'
6
- require 'ostruct'
7
- require 'set'
8
-
9
-
10
- opt = OpenStruct.new
11
- opt.p = ["INV_"]
12
- opt.b = 50
13
- opts = OptionParser.new do |opts|
14
- opts.banner = "usage: #{File.basename(__FILE__)} [-d -b bins -p prefix[,...]] file ..."
15
- opts.on_head "\noutputs 'histogram.toplot'\n(then) % plot.rb -w lp --yrange n1: --noenhanced histogram.toplot\n"
16
- opts.on("-p", "--prefix PREFIX", "(comma sep list) FP protein header prefix (def: #{opt.p})") {|v| opt.p = v.split(',')}
17
- opts.on("-b", "--bins NUM_BINS", "number of histogram bins (def: #{opt.b})") {|v| opt.b = v.to_i}
18
- opts.on("-d", "--diff", "plots TP - FP") {|v| opt.b = v.to_i}
19
- end
20
- opts.parse!
21
-
22
- if ARGV.size < 1
23
- puts opts
24
- end
25
-
26
- outfile = 'histogram.toplot'
27
- dtype = 'XYData'
28
- outfile_base = 'histogram'
29
- title = 'histogram of protein probabilities'
30
- xaxis = 'probability'
31
- yaxis = 'frequency'
32
- out = File.open(outfile, "w")
33
- [dtype, outfile_base, title, xaxis, yaxis].each do |it|
34
- out.puts it
35
- end
36
-
37
- files = ARGV.to_a
38
- files.each_with_index do |file,i|
39
- fp = VecD.new; tp = VecD.new
40
- bio = SpecID.new(file)
41
- re = /^#{opt.p[i]}/
42
- bio.prots.each do |prot|
43
- if prot.reference =~ re
44
- fp << Math.log10(prot.probability)
45
- else
46
- tp << Math.log10(prot.probability)
47
- end
48
- end
49
- if fp.size == 0 then puts "NO FALSE POSITIVES FOUND! Your prefix is probably wrong ;)" end
50
- label = file
51
- t_bin, t_freq = tp.histogram(opt.b)
52
- f_bin, f_freq = fp.histogram(opt.b)
53
- out.puts 'TP ' + label
54
- out.puts t_bin.to_s
55
- out.puts t_freq.to_s
56
- out.puts 'FP ' + label
57
- out.puts f_bin.to_s
58
- out.puts f_freq.to_s
59
- end
60
-
61
- out.close