mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,226 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- ## The yeast Scal db mean background is: 0.00984
4
- ## The yeast Cysteine background freq is: 0.0131986582396467
5
- pep_seq_re = /<search_hit .* peptide="(\w+)"/o
6
- pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
7
-
8
- if ARGV.size != 3
9
- puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
10
- puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
11
- abort
12
- end
13
-
14
- def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
15
- File.open(base_toplot, "w") do |fh|
16
- fh.puts 'XYData'
17
- fh.puts base
18
- fh.puts title
19
- fh.puts xaxis
20
- fh.puts yaxis
21
- cats.each do |ar|
22
- fh.puts ar.join(" & ")
23
- ar.each do |a|
24
- fh.puts hash[a].join(" ")
25
- end
26
- end
27
- end
28
- end
29
-
30
- ############################################################################
31
- #### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
32
- #### CHANGE HIM THERE (eventually we need to put him in a lib file)
33
- # (actual # with cys, expected # with cys, total#peptides,
34
- # mean_fraction_of_cysteines_true, std)
35
- # PepHit(C) = Peptide containing cysteine
36
- # # Total PepHit(C) # Observed Bad Pep (C)
37
- # ------------------ proportional_to ----------------------
38
- # # Total PepHit # Total Bad PepHit (X)
39
- def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
40
-
41
- # the number of bona fide BAD cysteine hits
42
- # (some of the cysteine hits (~5%) are true positives)
43
-
44
- ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
45
- if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
46
- total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
47
- fpr = total_number_false / total_peptides
48
- [fpr, total_number_false]
49
- end
50
- ############################################################################
51
-
52
-
53
-
54
-
55
- (cysteine_background_freq, background_freq, file) = ARGV
56
- cysteine_background_freq = cysteine_background_freq.to_f
57
- background_freq = background_freq.to_f
58
-
59
- seq_probs = []
60
- last_seq_prob = nil
61
- File.open(file) do |fh|
62
- fh.each do |line|
63
- if line =~ pep_seq_re
64
- ar = Array.new(2)
65
- ar[0] = $1
66
- seq_probs << ar
67
- last_seq_prob = ar
68
- elsif line =~ pep_prob_re
69
- last_seq_prob[1] = $1.to_f
70
- end
71
- end
72
- end
73
-
74
- #seq_probs.each do |seq|
75
- # if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
76
- # abort "BAD PARSING!!"
77
- # end
78
- #end
79
- amino_acid_as_st = 'C'
80
-
81
- sorted = seq_probs.sort_by {|v| v[1] }.reverse
82
-
83
- ## traverse the peptides
84
- actual_cys_containing_peps = 0
85
- expected_cys_containing_peps = 0.0
86
- current_sum_one_minus_prob = 0.0
87
- prob_estimated_fpr = 0.0
88
- pep_cnt = 0
89
- one_minus_freq = 1.0 - cysteine_background_freq
90
-
91
- ## tabulate:
92
- pep_cnts = []
93
- probs = []
94
- prob_fprs = []
95
- prob_tps = []
96
- cys_fprs = []
97
- cys_tps = []
98
- fpr_diff = []
99
-
100
-
101
- sorted.each do |ar|
102
- pep_cnt += 1
103
-
104
- pep = ar[0]
105
- prob = ar[1]
106
-
107
- ## Cysteine FPR: ##
108
- # Expected:
109
- expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
110
- # Actual:
111
- if pep.include?(amino_acid_as_st)
112
- actual_cys_containing_peps += 1
113
- end
114
- (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
115
- cys_tp = pep_cnt.to_f - total_num_false_by_cys
116
-
117
-
118
- ## FPR by prob: ##
119
- # SUM(1-probX)/#peps
120
- current_sum_one_minus_prob += 1.0 - prob
121
- prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
122
- prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
123
-
124
- ## GRAB or report the data:
125
- pep_cnts << pep_cnt
126
- probs << prob
127
- prob_fprs << prob_estimated_fpr
128
- prob_tps << prob_tp
129
- cys_fprs << cys_fpr
130
- cys_tps << cys_tp
131
- fpr_diff << prob_estimated_fpr - cys_fpr
132
-
133
- #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
134
- end
135
-
136
- hash = {
137
- 'pep_cnts' => pep_cnts,
138
- 'probs' => probs,
139
- 'prob_fprs' => prob_fprs,
140
- 'prob_tps' => prob_tps,
141
- 'cys_fprs' => cys_fprs,
142
- 'cys_tps' => cys_tps,
143
- 'fpr_diff' => fpr_diff,
144
- }
145
-
146
-
147
- real_base = file.sub(/\.xml/,'')
148
-
149
-
150
-
151
- ## TPS vs FPR
152
- base = real_base.dup
153
- base << "." << "tps_vs_fpr"
154
- base_toplot = base + '.to_plot'
155
- title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
156
- xaxis = "TPs"
157
- yaxis = "FPR"
158
- cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
159
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
160
-
161
- ## PEPHITS vs FPR
162
- base = real_base.dup
163
- base << "." << "num_pep_hits_vs_fpr"
164
- base_toplot = base + '.to_plot'
165
- title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
166
- xaxis = "num peptide hits"
167
- yaxis = "FPR"
168
- cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
169
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
170
-
171
- ## PEPHITS VS FPR DIFF
172
- base = real_base.dup
173
- base << "." << "num_pep_hits_vs_fpr_diff"
174
- base_toplot = base + '.to_plot'
175
- title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
176
- xaxis = "num peptide hits"
177
- yaxis = "FPR diff (prob - cysteine)"
178
- cats = [['pep_cnts', 'fpr_diff']]
179
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
180
-
181
- ## PROB VS FPR DIFF
182
- base = real_base.dup
183
- base << "." << "prob_vs_fpr_diff"
184
- base_toplot = base + '.to_plot'
185
- title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
186
- xaxis = "peptide probability"
187
- yaxis = "FPR diff (prob - cysteine)"
188
- cats = [['probs', 'fpr_diff']]
189
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
190
-
191
-
192
-
193
- =begin
194
-
195
- returns [number_of_prots, actual_fpr]
196
- def num_prots_above_fpr(prots, desired_fpr)
197
- current_fpr_rate_percent = 0.0
198
- previous_fpr_rate_percent = 0.0
199
- current_sum_one_minus_prob = 0.0
200
- proteins_within_fpr = 0
201
- actual_fpr = nil
202
- already_found = false
203
- prot_cnt = 0
204
- prots.each do |prot|
205
- prot_cnt += 1
206
- # SUM(1-probX)/#prots
207
- current_sum_one_minus_prob += 1.0 - prot._probability.to_f
208
- current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
209
-
210
- if current_fpr_rate_percent > desired_fpr && !already_found
211
- actual_fpr = previous_fpr_rate_percent
212
- proteins_within_fpr = prot_cnt
213
- already_found = true
214
- end
215
- previous_fpr_rate_percent = current_fpr_rate_percent
216
- end
217
- [proteins_within_fpr, actual_fpr]
218
- end
219
-
220
- =end
221
-
222
-
223
-
224
-
225
-
226
-
@@ -1,56 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'optparse'
4
- require 'table'
5
-
6
- require 'ms/gradient_program'
7
-
8
- delimiter = "\t"
9
- table_format = false
10
- opts = OptionParser.new do |op|
11
- op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
12
- op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
13
- if v == 'space'
14
- delimiter = " "
15
- elsif v == 'tab'
16
- delimiter = "\t"
17
- elsif v == 'format'
18
- table_format = true
19
- else
20
- abort "don't recognize #{v}"
21
- end
22
- end
23
- end
24
-
25
- opts.parse!
26
-
27
- if ARGV.size == 0
28
- puts opts
29
- exit
30
- end
31
-
32
-
33
- sets_of_tables = {}
34
- ARGV.each do |file|
35
- File.open(file) do |fh|
36
- sets_of_tables[file] = GradientProgram.all_from_handle(fh)
37
- end
38
- end
39
-
40
- sets_of_tables.each do |file, tables|
41
- puts "FILE: #{file}"
42
- tables.each do |gp|
43
- puts "PUMP_TYPE: #{gp.pump_type}"
44
- col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]
45
- data = gp.time_points.map do |tp|
46
- line = [tp.time, *(tp.percentages)]
47
- line << tp.flow_rate
48
- end
49
- table = Table.new(data, nil, col_labels)
50
- if table_format
51
- puts table.to_formatted_string
52
- else
53
- puts table.to_s(delimiter)
54
- end
55
- end
56
- end
@@ -1,137 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'vec'
4
-
5
- # FOR SCer yeast db the and orbi mudpit7 the mean_actual_vs_expected fraction
6
- # is 0.0101409563168847
7
-
8
- # <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
9
-
10
- def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
11
- File.open(base_toplot, "w") do |fh|
12
- fh.puts 'XYData'
13
- fh.puts base
14
- fh.puts title
15
- fh.puts xaxis
16
- fh.puts yaxis
17
- cats.each do |ar|
18
- fh.puts ar.join(" & ")
19
- ar.each do |a|
20
- fh.puts hash[a].join(" ")
21
- end
22
- end
23
- end
24
- system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
25
- end
26
-
27
- peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
28
-
29
- unless ARGV.size == 2
30
- abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
31
- end
32
-
33
- (cysteine_background_freq, file) = ARGV
34
-
35
- # each pep = [nsp_prob, init_prob, SEQUENCE]
36
- peps = []
37
- File.open(file) do |fh|
38
- fh.each do |line|
39
- if line =~ peptide_re
40
- peps << [$3.to_f,$2.to_f,$1]
41
- end
42
- end
43
- end
44
-
45
-
46
- amino_acid_as_st = 'C'
47
- one_minus_freq = 1.0 - cysteine_background_freq.to_f
48
- actual_cys_containing_peps = 0
49
- expected_cys_containing_peps = 0.0
50
- current_sum_one_minus_prob = 0.0
51
- prob_estimated_fpr = 0.0
52
- pep_cnt = 0
53
-
54
- the_probs = []
55
- the_fractions = []
56
- special_probs = []
57
-
58
-
59
-
60
-
61
- #peps.sort.reverse.each do |ar|
62
- #peps.sort.each do |ar|
63
- peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
64
- (nsp_prob, init_prob, pep) = ar
65
- ## Cysteine FPR: ##
66
- # Expected:
67
- expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
68
- # Actual:
69
- if pep.include?(amino_acid_as_st)
70
- actual_cys_containing_peps += 1
71
- end
72
- fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
73
-
74
- special_prob = (3.0 * nsp_prob) + init_prob
75
-
76
- ## Get the final fraction
77
- #if special_prob < 4.0
78
- # #puts the_fractions.join(" ")
79
- # puts the_fractions.last
80
- # abort
81
- #end
82
-
83
- # gather data to plot
84
- the_probs << nsp_prob
85
- special_probs << special_prob
86
- the_fractions << fraction_ac_exp
87
-
88
- end
89
-
90
-
91
-
92
- hash = {
93
- 'probs' => the_probs,
94
- 'fractions' => the_fractions,
95
- 'special_probs' => special_probs,
96
- }
97
-
98
- real_base = file.sub(/\.xml/,'')
99
-
100
-
101
- =begin
102
- ## PROB VS FPR DIFF
103
- base = real_base.dup
104
- base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
105
- base_toplot = base + '.to_plot'
106
- title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
107
- xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
108
- yaxis = "fraction with cysteines (actual/expected)"
109
- cats = [['probs', 'fractions']]
110
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
111
- =end
112
-
113
-
114
- =begin
115
- ## PROB VS FPR DIFF
116
- base = real_base.dup
117
- base << "." << "prob_vs_actual_expected_fraction"
118
- base_toplot = base + '.to_plot'
119
- title = "peptide prob vs fraction with cysteines (actual/expected)"
120
- xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
121
- yaxis = "fraction with cysteines (actual/expected)"
122
- cats = [['probs', 'fractions']]
123
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
124
- =end
125
-
126
- ## SPECIAL PROB VS FPR DIFF
127
- base = real_base.dup
128
- base << "." << "special_prob_vs_actual_expected_fraction"
129
- base_toplot = base + '.to_plot'
130
- title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
131
- xaxis = "(3 * nsp_prob) + init_prob"
132
- yaxis = "fraction with cysteines (actual/expected)"
133
- cats = [['special_probs', 'fractions']]
134
- plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
135
-
136
-
137
-
@@ -1,136 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- # Here is what I plotted: take each id'd pep-prot id'd on the tophit scans -- you will likely have the same pep-prot id'd on multiple scans -- plot the top probability of each such pep-prot.
4
- # There are 43 such id'd peptides for Sashimi, whereas SEQUEST id's about 66. So you'll have 66 (1-p-values) to plot, I had 43. Similarly for OMICS.
5
-
6
- require 'spec_id'
7
- require 'fasta'
8
- require 'optparse'
9
- require 'ostruct'
10
-
11
- # returns an accession number if available, or the entire reference (less the
12
- # starting '>'
13
- def get_fasta_accession(fasta_prot)
14
- head = fasta_prot.header
15
- if head =~ ACC_REGEX
16
- $1.dup
17
- else
18
- head.sub(/^>/, '').rstrip
19
- end
20
- end
21
-
22
- # returns the accession number from a reference, or the complete reference
23
- def accession_from_ref(pep)
24
- ref = pep.prot.reference
25
- if ref =~ ACC_REGEX
26
- $1.dup
27
- else
28
- ref.rstrip
29
- end
30
- end
31
-
32
- def get_pep_prot_accession(pep)
33
- acc = pep.prot.accession
34
- if !acc || acc == '0' || acc == 0
35
- accession_from_ref(pep)
36
- else
37
- acc
38
- end
39
- end
40
-
41
- #####################################################################
42
- # MAIN
43
- #####################################################################
44
-
45
- opt = OpenStruct.new
46
- opt.p = 'prob'
47
- opts = OptionParser.new do |op|
48
- op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
49
- op.separator " [prints to stdout tab delimited table]"
50
- op.on('-t', '--ties', 'allow ties on best hit') {|v| opt.t = v }
51
- op.on('-p', '--param <s>', 'param: (xcorr | prob)') {|v| opt.p = v}
52
- end
53
- opts.parse!
54
-
55
- if ARGV.size < 2
56
- puts opts
57
- exit
58
- end
59
-
60
- case opt.p
61
- when 'prob'
62
- param = :peptide_probability
63
- best = :first
64
- when 'xcorr'
65
- param = :xcorr
66
- best = :last
67
- else
68
- abort "incorrect param: #{opt.p}"
69
- end
70
-
71
- ############################
72
- # GLOBALS
73
- DELIM = "\t"
74
- ACC_REGEX = /\|(.*?)\|/o
75
- ############################
76
-
77
- bioworks = ARGV[0]
78
- fasta_file = ARGV[1]
79
-
80
- fprots = Fasta.new.read_file(fasta_file).prots
81
- gi_nums = fprots.map {|prot| get_fasta_accession(prot) }
82
-
83
- peptides = SpecID.new(bioworks).peps
84
-
85
-
86
- ## Get the best peptide(s) per scan
87
- top_peps_per_scan = []
88
-
89
- peptides.hash_by(:base_name, :first_scan).each do |bn_scan, pep_array|
90
- sorted_list = pep_array.sort_by {|pep| pep.send(param).to_f }
91
-
92
- top_peps = if best == :first ; [sorted_list.shift] ; else [sorted_list.pop] end
93
- found_another = false
94
- sorted_list.each do |pep|
95
- if pep.send(param).to_f == top_peps.send(best).send(param).to_f
96
- if opt.t
97
- top_peps << pep
98
- else
99
- found_another = true
100
- end
101
- end
102
- end
103
- unless found_another
104
- top_peps_per_scan.push( *top_peps )
105
- end
106
- end
107
-
108
-
109
- ## Get the best scoring peptide per peptide/prot from list of best
110
- ## peptides/scan
111
- top_pep_seq_prots = top_peps_per_scan.hash_by {|pep| [pep.sequence, get_pep_prot_accession(pep)] }.map do |k,pep_array|
112
- pep_array.sort_by {|pep| pep.send(param).to_f }.send(best)
113
- end
114
-
115
- ## sort the peptides by best score
116
- sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by {|pep| pep.send(param).to_f }
117
- if best == :last ; sorted_top_pep_seq_prots.reverse! end
118
-
119
- ## plot the probability vs. the number of tps
120
- puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
121
- tps = 0
122
- sorted_top_pep_seq_prots.each do |pep|
123
- if gi_nums.include?( get_pep_prot_accession(pep) )
124
- tps += 1
125
- puts [tps.to_s, pep.send(param), pep.sequence, get_pep_prot_accession(pep), pep.xcorr].join(DELIM)
126
- end
127
- end
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
-
136
-
@@ -1,44 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'rexml/document'
4
-
5
- if ARGV.size == 0
6
- puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
7
- puts "outputs a .csv file"
8
- exit
9
- end
10
-
11
- class Protein
12
- attr_accessor :name, :pi, :ni
13
- def initialize(name, pi, ni)
14
- @name, @pi, @ni = name, pi, ni
15
- end
16
- end
17
-
18
- class Listener
19
- attr_accessor :proteins
20
-
21
- def initialize
22
- @proteins = []
23
- end
24
-
25
- def tag_start(name, attrs)
26
- if name == "protein"
27
- protein = Protein.new( attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
28
- @proteins.push( protein )
29
- end
30
- end
31
-
32
- def method_missing(*args) ; end
33
-
34
- end
35
-
36
- ARGV.each do |file|
37
- File.open("output.csv", 'w') do |out|
38
- listener = Listener.new
39
- REXML::Document.parse_stream(File.new(file), listener)
40
- listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
41
- out.puts [protein.name, protein.pi, protein.ni].join("\t")
42
- end
43
- end
44
- end
@@ -1,61 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'vec'
4
- require 'spec_id'
5
- require 'optparse'
6
- require 'ostruct'
7
- require 'set'
8
-
9
-
10
- opt = OpenStruct.new
11
- opt.p = ["INV_"]
12
- opt.b = 50
13
- opts = OptionParser.new do |opts|
14
- opts.banner = "usage: #{File.basename(__FILE__)} [-d -b bins -p prefix[,...]] file ..."
15
- opts.on_head "\noutputs 'histogram.toplot'\n(then) % plot.rb -w lp --yrange n1: --noenhanced histogram.toplot\n"
16
- opts.on("-p", "--prefix PREFIX", "(comma sep list) FP protein header prefix (def: #{opt.p})") {|v| opt.p = v.split(',')}
17
- opts.on("-b", "--bins NUM_BINS", "number of histogram bins (def: #{opt.b})") {|v| opt.b = v.to_i}
18
- opts.on("-d", "--diff", "plots TP - FP") {|v| opt.b = v.to_i}
19
- end
20
- opts.parse!
21
-
22
- if ARGV.size < 1
23
- puts opts
24
- end
25
-
26
- outfile = 'histogram.toplot'
27
- dtype = 'XYData'
28
- outfile_base = 'histogram'
29
- title = 'histogram of protein probabilities'
30
- xaxis = 'probability'
31
- yaxis = 'frequency'
32
- out = File.open(outfile, "w")
33
- [dtype, outfile_base, title, xaxis, yaxis].each do |it|
34
- out.puts it
35
- end
36
-
37
- files = ARGV.to_a
38
- files.each_with_index do |file,i|
39
- fp = VecD.new; tp = VecD.new
40
- bio = SpecID.new(file)
41
- re = /^#{opt.p[i]}/
42
- bio.prots.each do |prot|
43
- if prot.reference =~ re
44
- fp << Math.log10(prot.probability)
45
- else
46
- tp << Math.log10(prot.probability)
47
- end
48
- end
49
- if fp.size == 0 then puts "NO FALSE POSITIVES FOUND! Your prefix is probably wrong ;)" end
50
- label = file
51
- t_bin, t_freq = tp.histogram(opt.b)
52
- f_bin, f_freq = fp.histogram(opt.b)
53
- out.puts 'TP ' + label
54
- out.puts t_bin.to_s
55
- out.puts t_freq.to_s
56
- out.puts 'FP ' + label
57
- out.puts f_bin.to_s
58
- out.puts f_freq.to_s
59
- end
60
-
61
- out.close