mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -1,797 +0,0 @@
1
-
2
- require 'spec_id'
3
- require 'optparse'
4
- require 'ostruct'
5
- require 'spec_id/aa_freqs'
6
- require 'shuffle'
7
- require 'vec'
8
- require 'table'
9
-
10
-
11
- ########################################################
12
- WRITE_CYS_FIND = false
13
- ########################################################
14
-
15
-
16
- module SpecID
17
- attr_accessor :orig_peps, :passed_peps, :passed_prots
18
- # The filename passed in for filtering
19
- attr_accessor :passed_in_filename
20
-
21
- # returns the top peptide hits per file dta (first_scan + charge)
22
- # all hits with same score as top score are returned
23
- # assumes that all fields are strings...
24
- # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
25
- # deletes the protein array (but not relevant proteins)
26
- # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
27
- # sets the @orig_peps attribute to those passing
28
- def top_peps_prefilter!
29
- ## Bioworks peps are text based and need to be transformed first
30
- if peps.first.is_a? Bioworks::Pep
31
- peps.each do |pep|
32
- pep.xcorr = pep.xcorr.to_f
33
- pep.deltacn = pep.deltacn.to_f
34
- pep.deltamass = pep.deltamass.to_f
35
- pep.mass = pep.mass.to_f
36
- pep.charge = pep.charge.to_i
37
- pep.first_scan = pep.first_scan.to_i
38
- end
39
- end
40
- ## Srf Peps need no transformation!
41
-
42
- # get the top peptide by firstscan/charge (equivalent to .out files)
43
- top_peps = []
44
- self.peps.hash_by {|pep| [pep.base_name, pep.first_scan, pep.charge]}.values.map do |v|
45
- #self.peps.hash_by {|pep| [pep.aaseq, pep.charge]}.values.map do |v|
46
- best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
47
- top_score = best_to_worst.first.xcorr
48
- best_to_worst.each do |pep|
49
- if pep.xcorr == top_score
50
- top_peps << pep
51
- else ; break
52
- end
53
- end
54
- end
55
- @orig_peps = top_peps
56
- end
57
-
58
- # (xcorr1, xcorr2, xcorr3, deltacn, ppm)
59
- # interface very unstable. For now, keeping it very loose...
60
- # assumed that peptide xcorr, deltacn, deltamass, mass, ppm are Floats
61
- # assumed that peptide charge is Integer
62
- # returns peps_passed
63
- # must respond to 'peps'
64
- # DOES NOT UPDATE the prot.peps attribute!!
65
- def filter_sequest(args, include_deltacnstar=false)
66
- (x1, x2, x3, deltacn, ppm) = args
67
- self.peps.select do |pep|
68
- # have to add the upper limit to deltacn because the lowest score is often
69
- # assigned a 1.10 in bioworks!
70
- pep_deltacn = pep.deltacn
71
- pep_charge = pep.charge
72
-
73
- ## The outer parentheses are critical to getting the correct answer!
74
- passing = ( (pep_deltacn >= deltacn) and ((pep_charge == 1 && pep.xcorr >= x1) or (pep_charge == 2 && pep.xcorr >= x2) or (pep_charge == 3 && pep.xcorr >= x3)) and ( pep.ppm <= ppm ))
75
-
76
- if passing
77
- if !include_deltacnstar && pep_deltacn > 1.0
78
- false
79
- else
80
- true
81
- end
82
- else
83
- false
84
- end
85
- end
86
- end
87
-
88
-
89
- # given some list of SpecID::Pep based objects, finds the list of proteins
90
- # associated with those peptides
91
- # update_prot_peps => when true, updates prot.peps attribute given the list
92
- # of pephits
93
- # kind =
94
- # :no_update (current proteins are returned, but their peps attribute
95
- # is not updated)
96
- # :update (current proteins returned with peps attribute updated)
97
- # :new (new proteins are created complete with peps attribute)
98
- def self.passing_proteins(pephits, kind=:no_update)
99
-
100
- orig_pephits_prts = []
101
- if kind == :new
102
- new_prots = {}
103
- pephits.each_with_index do |pep,i|
104
- orig_pephits_prts[i] = pep.prots
105
- peps_new_prts = pep.prots.map do |prt|
106
- if new_prots.key? prt.reference
107
- already_exists = new_prots[prt.reference]
108
- else
109
- np = prt.dup
110
- np.peps = []
111
- new_prots[np.reference] = np
112
- np
113
- end
114
- end
115
- pep.prots = peps_new_prts
116
- end
117
- end
118
-
119
- if kind == :update
120
- pephits.each do |pep|
121
- pep.prots.each do |prt|
122
- prt.peps = []
123
- end
124
- end
125
- end
126
-
127
- prot_set = {}
128
- pephits.each do |pep|
129
- prts = pep.prots
130
- prts.each do |prt|
131
- prot_set[ prt.reference ] = prt
132
- end
133
- if (kind == :update || kind == :new)
134
- prts.each do |prt|
135
- prt.peps << pep
136
- end
137
- end
138
- end
139
-
140
- ## Reset the original protein hits
141
- if kind == :new
142
- pephits.each_with_index do |pep,i|
143
- pep.prots = orig_pephits_prts[i]
144
- end
145
- end
146
-
147
- prot_set.values
148
- end
149
- end
150
-
151
-
152
- class SpecID::Filter
153
-
154
- NUM_PROT_FPPR_ITERATIONS = 10
155
-
156
- def self.run_from_argv(argv)
157
- obj = self.new
158
- obj.run_from_argv(argv)
159
- end
160
-
161
- def run_from_argv(argv)
162
- reply = get_options(argv)
163
- return unless reply
164
- files, opt = reply
165
-
166
- #files = ARGV.map {|file| file }
167
- #ARGV.clear
168
-
169
- $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE
170
- spec_ids = files.map do |file|
171
- spec_id = file_to_prefiltered_spec_id(file, opt)
172
- spec_id
173
- end
174
-
175
- ## the options hash
176
- hash = {}
177
- if opt.cys
178
- if opt.cys[1]
179
- opt.cys[1] = opt.cys[1].to_f
180
- else
181
- opt.cys[1] = 0.0
182
- end
183
- hash[:cys] = opt.cys
184
- end
185
-
186
-
187
- hash[:tps] =
188
- if opt.tps
189
- Fasta.new.read_file(opt.tps).prots.map do |prot|
190
- prot.aaseq.chomp
191
- end
192
- end
193
-
194
- hash[:dcy] =
195
- if opt.false
196
- new_spec_ids = []
197
- prefixes_or_files = SpecID.extend_args(opt.false, files.size)
198
- false_spec_ids = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file|
199
- if File.exist? prefix_or_file
200
- new_spec_ids << spec_id
201
- file_to_prefiltered_spec_id(prefix_or_file, opt)
202
- else
203
- (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
204
- fps_specid = spec_id.class.new
205
- tps_specid = spec_id.class.new
206
-
207
- fps_specid.peps = fps
208
- tps_specid.peps = tps
209
- new_spec_ids << tps_specid
210
- fps_specid
211
- end
212
- end
213
- spec_ids = new_spec_ids
214
- false_spec_ids
215
- end
216
-
217
- defaults = {
218
- :dcy => nil, # { spec_id => false_spec_id }
219
- :cys => nil, # [cys_background_freq, cys_containing_freq]
220
- :tps => nil,
221
- :tmm => nil,
222
- :occams_razor => opt.occams_razor,
223
- }
224
- args = defaults.merge hash
225
-
226
-
227
- base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm]
228
-
229
- #################################################### <--
230
- @fppr_methods = [:tmm, :tps, :cys, :dcy].select do |x|
231
- args[x]
232
- end
233
- @groups_reporting = [:pephits, :aaseq, :prothits]
234
- @groups_reporting.push( :occams_razor ) if args[:occams_razor]
235
-
236
- @cat_labels = {
237
- :pephits => 'pep_hits',
238
- :prothits => 'prot_hits',
239
- :aaseq => 'uniq_aa_hits',
240
- :occams_razor => 'occams_prot_hits',
241
- }
242
- #################################################### <--
243
-
244
- if opt.log
245
- @logfh = File.open(opt.log, 'w')
246
- else
247
- @logfh = nil
248
- end
249
- #########################################
250
- # PRINT FILTER LEGEND
251
- out filter_legend(@fppr_methods)
252
- #########################################
253
-
254
- if opt.filters_file
255
- lines = IO.readlines(opt.filters_file)
256
- lines.each do |line|
257
- line.chomp!
258
- answer = prep_reply(line, base_args)
259
- next if answer == false
260
- base_args = answer
261
- filter_round(spec_ids, base_args, args)
262
- end
263
- elsif opt.i
264
- ## CLEAR ARGV (since otherwise, gets reads it!)
265
- ARGV.clear
266
- out interactive_help
267
- reply = "nil"
268
- loop do
269
- b = base_args
270
- out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}"
271
- loop do
272
- reply = gets.chomp
273
- answer = prep_reply(reply, base_args)
274
- if answer == false
275
- out interactive_help
276
- else
277
- base_args = answer
278
- filter_round(spec_ids, base_args, args)
279
- break
280
- end
281
- end
282
- end
283
- else
284
- filter_round(spec_ids, base_args, args)
285
- end
286
-
287
- if opt.log
288
- @logfh.close
289
- end
290
-
291
- end
292
-
293
- def out(string)
294
- puts string
295
- if @logfh
296
- @logfh.puts string
297
- end
298
- end
299
-
300
- # takes a fasta file or a string ( to be cast as a float )
301
- def get_cys_freq(arg)
302
- if File.exist? arg
303
- SpecID::AAFreqs.new(arg).aafreqs[:C]
304
- else
305
- arg.to_f
306
- end
307
- end
308
-
309
- # prints shortened number for display
310
- def short(num)
311
- sprintf( "%.3f",num)
312
- end
313
-
314
- # if good arguments, returns [files_array, options]
315
- # else prints an error argument and returns nil
316
- def get_options(argv)
317
- dup_argv = argv.dup
318
-
319
- opt = OpenStruct.new
320
- opt.x1 = 1.0
321
- opt.x2 = 1.5
322
- opt.x3 = 2.0
323
- opt.c = 0.1
324
- opt.ppm = 1000.0
325
- opt.false = false
326
-
327
- opts = OptionParser.new do |op|
328
- op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <bioworks.xml | bioworks.srg>"
329
- op.separator("prints number of peptides/proteins ID'd at given thresholds")
330
- op.separator "only top hit (by xcorr) per scan+charge is considered"
331
-
332
- #op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
333
- #op.separator(" (these are peptides who are the only hit with xcorr > 0)")
334
- op.separator ""
335
- op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v}
336
- op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v}
337
- op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v}
338
- op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") {|v| opt.c = v}
339
- op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
340
- op.separator " if bioworks.xml, = 10^6deltamass/mass"
341
- op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
342
- op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
343
- op.separator(" e.g., for Bioworks: 'REVERSE'")
344
- op.separator(" (last given will apply to remaining files)")
345
- op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
346
- op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
347
- v[0] = get_cys_freq(v[0])
348
- opt.cys = v
349
- end
350
- op.separator(" freq = freq of cysteine as amino acid")
351
- op.separator(" [bkg] = freq of cys containing peps d: 0.0")
352
- op.on("--filters_file <file>", "(no -i) file with list of interactive input") {|v| opt.filters_file = v}
353
- op.on("-t", "--tps <fasta>", "fasta file containing true hits") {|v| opt.tps = v }
354
- #op.on("--tmm <toppred.out>", "toppred.out file with transmembr. topology") {|v| opt.tps = v }
355
- op.on("--yaml", "spits out yaml-ized data") {|v| opt.tabulate = v }
356
- op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
357
- op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
358
- op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
359
- ## NEED TO IMPLEMENT THIS:
360
- #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
361
- op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
362
- end
363
-
364
- opts.parse!(dup_argv)
365
-
366
- if dup_argv.size < 1
367
- puts opts
368
- return nil
369
- end
370
-
371
- [dup_argv, opt]
372
- end
373
-
374
- # (actual # with cys, expected # with cys, total#peptides,
375
- # mean_fraction_of_cysteines_true, std)
376
- # PepHit(C) = Peptide containing cysteine
377
- # # Total PepHit(C) # Observed Bad Pep (C)
378
- # ------------------ proportional_to ----------------------
379
- # # Total PepHit # Total Bad PepHit (X)
380
- # returns the fppr and the total number false
381
- def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
382
-
383
- # the number of bona fide BAD cysteine hits
384
- # (some of the cysteine hits (~5%) are true positives)
385
-
386
- ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
387
- if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
388
- total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
389
- fppr = total_number_false / total_peptides
390
- [fppr, total_number_false]
391
- end
392
-
393
- # num_peps_per_protein is an array of the number of peptides per protein hit
394
- # (these are the true hits)
395
- # assumes that the number follows a gaussian distribution (binomial
396
- # distributions tend toward gaussians, I believe, at large N)
397
- # returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr] fppr
398
- def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10)
399
-
400
- ## Check for more false peptides than peptides in our proteins:
401
- total_protein_peps = 0
402
- contained = num_peps_per_protein.each do |num|
403
- total_protein_peps += num
404
- end
405
- ## All peptides will be wrong every time!
406
- ## which means all proteins will be wrong every time!
407
- if number_false_peptides >= total_protein_peps
408
- # [all proteins wrong, fppr=1.0
409
- return [num_peps_per_protein.size, 1.0, 0.0, 0.0]
410
- end
411
-
412
-
413
- num_prots = num_peps_per_protein.size
414
- sample = VecD.new(num_iterations)
415
- # indexed by peptide_number, pointing to a protein's peptide_count
416
- # we shuffle the indices and then walk along until we are finished
417
- # then we count how many proteins still have peptides
418
-
419
- # we create an array to hold the peptide number for each protein, then we
420
- # can reference the same entity when subtracting the peptides in the
421
- # algorithm
422
- cont_pep_num_per_prot_ars = (0...num_iterations).map do |i|
423
- total_protein_peps = 0
424
- contained = num_peps_per_protein.map do |num|
425
- [num]
426
- end
427
- end
428
-
429
- cont_num_by_pep_index_ars = cont_pep_num_per_prot_ars.map do |ar|
430
- index_count = 0
431
- pc_ar = []
432
- ar.each do |contained_num|
433
- contained_num.first.times do
434
- pc_ar[index_count] = contained_num
435
- index_count += 1
436
- end
437
- end
438
- pc_ar
439
- end
440
-
441
- indices = (0...(cont_num_by_pep_index_ars.first.size)).map {|x| x }
442
-
443
-
444
- (0...num_iterations).each do |i|
445
- num_false = 0
446
- indices.shuffle!
447
- pc = cont_num_by_pep_index_ars[i]
448
- number_false_peptides.times do |shuffle_index|
449
- #big_i = indices[shuffle_index]
450
- pc[indices[shuffle_index]][0] -= 1
451
- end
452
- cont_pep_num_per_prot_ars[i].each do |contained_pep_count|
453
- if contained_pep_count.first == 0
454
- num_false += 1
455
- end
456
- end
457
- sample[i] = num_false
458
- end
459
- (mean_num_wrong, stdev) = sample.sample_stats
460
- mean_fppr = mean_num_wrong / num_prots
461
- stdev_fppr = stdev / num_prots
462
- [mean_num_wrong, mean_fppr, stdev, stdev_fppr]
463
- end
464
-
465
- # returns [total_number_false, fppr, fraction_expected]
466
- # also takes a hash of pephits keyed on :aaseq
467
- def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
468
- (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq)
469
- fraction_of_expected = ac.to_f/exp
470
-
471
- (cys_fprate, total_num_false) = fppr_by_cysteines(ac, exp, pephits.size, cys_containing_freq)
472
- [total_num_false, cys_fprate, fraction_of_expected]
473
- end
474
-
475
- def report_cysteines
476
- #### UNDERWAY:::
477
- cys_tps = pep_nums[i] - total_num_false
478
-
479
- puts "CYSTEINE FPR: "
480
- puts " (# peps containing >= 1 cysteines)"
481
- puts " actual: #{ac}"
482
- puts "fraction of expected: #{short(fraction_of_expected)}"
483
- puts " expected # FP's: " + short(total_num_false)
484
- puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
485
-
486
- puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)"
487
- puts "Combined Score & FPR"
488
- puts "#{combined_score}\t#{cys_fprate}"
489
- puts "Combined Score & fraction of expected"
490
- #puts "#{combined_score} #{fraction_of_expected}"
491
- to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
492
- puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
493
- puts(['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate
494
-
495
- end
496
-
497
- def filter_legend(fppr_methods)
498
- lines = []
499
- lines << "Note: protein FPPR values are probably optimistic"
500
- lines << "[this implementation assumes an equal likelihood that a false peptide"
501
- lines << " comes from a protein with more hits as one with less (which is probably"
502
- lines << " not the case)]"
503
- lines << "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)"
504
- if fppr_methods.size > 0
505
- lines << "Following are methods for determining false identification rate:"
506
- lines << ['dcy=decoy', 'cys=cysteine', 'tps=known_true_positives'].join(" ")
507
- ## when tmm is implemented:
508
- #lines << ['dcy=decoy', 'cys=cysteine', 'tmm=transmembrane', 'tps=known_true_positives'].join(" ")
509
- end
510
- lines.join("\n")
511
- end
512
-
513
- # does this give aafreq from a fasta file?
514
- # freq = cysteines.aafreqs[:C]
515
-
516
- # returns [total_number_false, fppr]
517
- # pephits can be an array or a hash of peptides keyed on :aaseq
518
- def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
519
- if pephits.is_a? Hash
520
- seqs = pephits.keys
521
- else
522
- seqs = pephits.map do |v|
523
- v.aaseq
524
- end
525
- end
526
- real_tps = 0
527
- real_fps = 0
528
- # could also do with partition
529
- seqs.each do |pep_aaseq|
530
- if true_pos_aaseqs_ar.any? {|prot_aaseq| prot_aaseq.include? pep_aaseq}
531
- real_tps += 1
532
- else
533
- real_fps += 1
534
- end
535
- end
536
- real_fppr = real_fps.to_f/pephits.size
537
- [real_fps, real_fppr]
538
- end
539
-
540
- def filter_spec_id(spec_id, filter_args, args)
541
- results_hash = {}
542
- # that second argument is to update protein peptides
543
- pephits = spec_id.filter_sequest(filter_args)
544
-
545
- results_hash[:prothits] = SpecID.passing_proteins(pephits, :no_update)
546
- results_hash[:pephits] = pephits
547
- results_hash[:dcn_cnt] = pephits.select{|v| v.deltacn > 1.0}.size
548
- # be aware that this is a hash keyed by aaseq and values of arrays of
549
- # peptides sharing the same aaseq!
550
- results_hash[:aaseq] = pephits.hash_by(:aaseq)
551
- results_hash
552
- end
553
-
554
- # returns [#FP, FPPR]
555
- def dcy_fppr(pephits, false_pephits)
556
- fps = false_pephits.size
557
- [fps, fps.to_f/pephits.size]
558
- end
559
-
560
- def tmm_fppr(pephits)
561
- abort "NEED TO IMPLEMENT"
562
- end
563
-
564
- # returns [#FP, FPPR]
565
- def cys_fppr(pephits, cys_bg_freq, cys_containing_freq)
566
- (total_num_false, cys_fprate, fraction_of_expected) = fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
567
- [total_num_false, cys_fprate]
568
- end
569
-
570
- def tps_fppr(pephits, true_pos_aaseqs_ar)
571
- fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
572
- end
573
-
574
- ## methods should be passed in like this 'cysteine' for cysteine_fppr
575
- ## all methods should return [number_false, fppr]
576
- ## returns a hash (by method) for each set of pephits
577
- ## if :dcy is given as a method, then expects the false pephits array
578
- def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil)
579
- cnt = 0
580
- pephits_ar.map do |ph|
581
- hash = {}
582
- methods.each do |mth|
583
- case mth
584
- when :dcy
585
- hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, false_pephits_ar[cnt])
586
- when :cys
587
- hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, *(args[:cys]) )
588
- when :tps
589
- hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, (args[:tps]) )
590
- else
591
- hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph)
592
- end
593
- end
594
- cnt += 1
595
- hash
596
- end
597
- end
598
-
599
- # fpr is a SpecID obj that is the false positives
600
- # cysteines holds an aafreqs object or nil
601
- def filter_round(spec_ids, filter_args, args)
602
-
603
- # push fpr on the end for the calculations
604
- ## FILTER the NORMAL spec_id objects
605
- little_tables = []
606
- spec_ids.each_with_index do |spec_id, i|
607
- normal_results = filter_spec_id(spec_id, filter_args, args)
608
-
609
- ## FILTER the FALSE objects (if given)
610
- false_results =
611
- if args[:dcy]
612
- little_args_hash = args.dup
613
- false_results = filter_spec_id(args[:dcy][i], filter_args, little_args_hash)
614
- end
615
-
616
- ## HOW TO CALCULATE FPPR FOR EVERYTHING:
617
- # pephits Fpephits C/Tpephits TPpephits
618
- # uniqaa Funiqaa C/Tuniqaa TPuniqaa
619
- # prothits ProtFPR(Fpephits, prothits) ProtFPR(C/Tpephits, prothits) ProtFPR(total-TPpephits, prothits)
620
- # OccProthits ProtFPR(Funiqaa, OccProthits) ProtFPR(C/Tuniqaa, OccProthits) ProtFPR(total-TPuniqaa, OccProthits)
621
- # C/T = cystein or Transmembrane method
622
-
623
- ## set up false results array
624
- if args[:dcy]
625
- fr_ar = [false_results[:pephits], false_results[:aaseq]]
626
- else
627
- fr_ar = nil
628
- end
629
- (pephits_fppr_results, aaseq_fppr_results) = calculate_pep_fppr([normal_results[:pephits], normal_results[:aaseq]], @fppr_methods, args, fr_ar)
630
-
631
- ## NORMAL prothits
632
- ## update prothits peptides
633
- updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update)
634
- pep_cnt_arr = updated_proteins.map {|v| v.peps.size }
635
-
636
- ## update occams prothits
637
- if args[:occams_razor]
638
- updated_occams_protein_triplets = SpecID::occams_razor(updated_proteins, true)
639
- occams_pep_cnt_arr = updated_occams_protein_triplets.map {|v| v[1].size }
640
- occams_prots = updated_occams_protein_triplets.map {|v| v[0] }
641
- normal_results[:occams_razor] = occams_prots
642
- end
643
-
644
- ## note that the original prot.peps arrays are obliterated by this.
645
- ## we would need to re-update if someone wanted these
646
-
647
- prothits_fppr_results = {}
648
- occams_results = {}
649
- @fppr_methods.each do |mth|
650
- prothits_fppr_results[mth] = protein_fppr(pep_cnt_arr, pephits_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
651
- occams_results[mth] = protein_fppr(occams_pep_cnt_arr, aaseq_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS) if args[:occams_razor]
652
- end
653
-
654
- fppr_results = {
655
- :pephits => pephits_fppr_results,
656
- :aaseq => aaseq_fppr_results,
657
- :prothits => prothits_fppr_results,
658
- }
659
- fppr_results[:occams_razor] = occams_results if args[:occams_razor]
660
-
661
- ## CHANGE ALL RESULTS INTO PERCENTAGES:
662
- fppr_results.each do |bk,hash|
663
- hash.each do |k,val|
664
- hash[k][1] = 100.0 * val[1]
665
- end
666
- end
667
- little_tables[i] = to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels)
668
- end
669
-
670
- out filter_params_string(filter_args, @fppr_methods)
671
- little_tables.each do |tbl|
672
- out tbl.to_formatted_string(nil, ' ')
673
- out "-----------------------------------------------\n"
674
- end
675
- #big_table(spec_ids, filter_args, args, normal_results, groups_reporting, fppr_results, cat_labels)
676
-
677
- end
678
-
679
-
680
-
681
- def filter_params_string(filter_args, fppr_methods)
682
- (x1, x2, x3, deltacn, ppm) = filter_args
683
- st = []
684
- st << "=========================================================================="
685
- st << " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} "
686
- st << ''
687
- st.join("\n")
688
- #st = []
689
- #st << ["xcorr(1,2,3) >= #{x1},#{x2},#{x3}", "deltacn >= #{deltacn}", "ppm <= #{ppm}"].join("\t")
690
- #st
691
- end
692
-
693
- def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels)
694
- #table is in the form: { column heading => [ values ] }
695
-
696
- title = spec_id.passed_in_filename
697
- col_labels = ['num', *(fppr_methods.map{|v| "#{v}%" })]
698
-
699
- row_labels = groups_reporting.map {|grp| cat_labels[grp]}
700
- dt = groups_reporting.map do |grp|
701
- line = [normal_results[grp].size]
702
- fppr_methods.each do |mth|
703
- line << fppr_results[grp][mth][1]
704
- end
705
- line
706
- end
707
-
708
- Table.new(dt, row_labels, col_labels, title)
709
- #puts(['TABULATE:', combined_score, pep_tps, pep_fppr, real_tps, real_fppr, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate
710
- end
711
-
712
- def combined_score(filter_args)
713
- (x1, x2, x3, deltacn, ppm) = filter_args
714
- combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
715
- end
716
-
717
- # assumes its already chomped
718
- # updates the 5 globals
719
- def prep_reply(reply, base)
720
- if reply == 'q' ; exit ; end
721
- if reply =~ /^\s*$/
722
- base
723
- elsif reply
724
- arr = reply.split(/\s+/)
725
- to_change = []
726
- to_change_hash = {}
727
- arr.each do |it|
728
- if it.include? ':'
729
- (k,v) = it.split(':')
730
- to_change_hash[k] = v
731
- else
732
- to_change << it
733
- end
734
- end
735
- to_change.each_with_index do |tc,i|
736
- begin
737
- base[i] = tc.to_f
738
- rescue NoMethodError
739
- out "BAD ARG: #{tc}"
740
- return false
741
- end
742
- end
743
- to_change_hash.each do |k,v|
744
- case k
745
- when 'x1' ; base[0] = v
746
- when 'x2' ; base[1] = v
747
- when 'x3' ; base[2] = v
748
- when 'dcn' ; base[3] = v
749
- when 'ppm' ; base[4] = v
750
- else
751
- out "BAD ARG: #{k}:#{v}"
752
- end
753
- end
754
- base.map {|v| v.to_f }
755
- else
756
- false
757
- end
758
- end
759
-
760
- def file_to_prefiltered_spec_id(file, opt)
761
- spec_id = nil
762
- marshal_file = file + ".prefiltered.msh"
763
- if File.exist?(marshal_file)
764
- File.open(marshal_file) do |fh|
765
- spec_id = Marshal.load(fh)
766
- end
767
- else
768
- spec_id = SpecID.new(file)
769
- spec_id.passed_in_filename = file
770
- spec_id.top_peps_prefilter!
771
- ## marshal it!
772
- if opt.marshal
773
- File.open(marshal_file, "w") do |fh|
774
- Marshal.dump(spec_id,fh)
775
- end
776
- end
777
- end
778
- spec_id
779
- end
780
-
781
- def interactive_help
782
- string = []
783
- string << "********************************************************"
784
- string << "INTERACTIVE FILTERING HELP:"
785
- string << "enter: <x1> <x2> <x3> <dcn> <ppm>"
786
- string << "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> ppm:<ppm>"
787
- string << "or : dcn:<dcn>"
788
- string << "or : <x1> <x2> ppm:<ppm>"
789
- string << "etc..."
790
- string << "<enter> to (re)run current values"
791
- string << "'q' to quit"
792
- string << "********************************************************"
793
- string.join("\n")
794
- end
795
-
796
-
797
- end