mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -1,797 +0,0 @@
1
-
2
- require 'spec_id'
3
- require 'optparse'
4
- require 'ostruct'
5
- require 'spec_id/aa_freqs'
6
- require 'shuffle'
7
- require 'vec'
8
- require 'table'
9
-
10
-
11
- ########################################################
12
- WRITE_CYS_FIND = false
13
- ########################################################
14
-
15
-
16
# Mixin providing SEQUEST-style peptide filtering helpers.
# The including object must respond to +peps+ (a collection of peptide hits
# exposing xcorr, deltacn, charge, ppm, ...).
module SpecID
  attr_accessor :orig_peps, :passed_peps, :passed_prots
  # The filename passed in for filtering
  attr_accessor :passed_in_filename

  # Keeps only the top peptide hit(s) per spectrum (equivalent to .out files).
  #
  # * If the first peptide is a Bioworks::Pep (text based), xcorr, deltacn,
  #   deltamass and mass are converted to Float and charge and first_scan to
  #   Integer, in place.  Other peptide types are left untouched.
  # * Peptides are grouped on [pep.base_name, pep.first_scan, pep.charge];
  #   within each group every hit tied for the best xcorr is kept.
  #
  # Stores the surviving peptides in @orig_peps and returns them.
  def top_peps_prefilter!
    if peps.first.is_a? Bioworks::Pep
      peps.each do |pep|
        pep.xcorr = pep.xcorr.to_f
        pep.deltacn = pep.deltacn.to_f
        pep.deltamass = pep.deltamass.to_f
        pep.mass = pep.mass.to_f
        pep.charge = pep.charge.to_i
        pep.first_scan = pep.first_scan.to_i
      end
    end

    top_peps = []
    groups = peps.hash_by { |pep| [pep.base_name, pep.first_scan, pep.charge] }.values
    groups.each do |group|
      best_to_worst = group.sort_by { |pep| pep.xcorr }.reverse
      top_score = best_to_worst.first.xcorr
      # exact equality is intended: only true ties with the best score survive
      top_peps.concat(best_to_worst.take_while { |pep| pep.xcorr == top_score })
    end
    @orig_peps = top_peps
  end

  # Selects the peptides passing the SEQUEST thresholds.
  #
  # args:: [xcorr1, xcorr2, xcorr3, deltacn, ppm] -- minimum xcorr for charge
  #        1, 2 and 3, minimum deltacn and maximum ppm
  # include_deltacnstar:: when false (default) peptides with deltacn > 1.0 are
  #                       rejected even if they pass everything else (the
  #                       lowest hit is often assigned 1.10 in bioworks)
  #
  # Peptides whose charge is not 1, 2 or 3 never pass.  Peptide xcorr, deltacn
  # and ppm are assumed to be numeric already.
  # Returns the passing peptides; DOES NOT UPDATE the prot.peps attribute!
  def filter_sequest(args, include_deltacnstar=false)
    (x1, x2, x3, deltacn, ppm) = args
    peps.select do |pep|
      pep_deltacn = pep.deltacn
      pep_charge = pep.charge

      # &&/|| bind tighter than assignment, so no protective outer
      # parentheses are needed (unlike and/or)
      pep_deltacn >= deltacn &&
        ((pep_charge == 1 && pep.xcorr >= x1) ||
         (pep_charge == 2 && pep.xcorr >= x2) ||
         (pep_charge == 3 && pep.xcorr >= x3)) &&
        pep.ppm <= ppm &&
        (include_deltacnstar || !(pep_deltacn > 1.0))
    end
  end

  # Given a list of peptide hits, returns the proteins (unique by
  # prot.reference) associated with them.
  #
  # kind =
  #   :no_update  current proteins are returned, their peps attribute is
  #               left alone
  #   :update     current proteins are returned with peps reset to exactly
  #               the given hits that reference them
  #   :new        duplicated proteins are returned (peps filled from the
  #               given hits); the originals and each pep.prots are left
  #               as they were
  def self.passing_proteins(pephits, kind=:no_update)
    orig_pephits_prts = []
    if kind == :new
      new_prots = {}
      pephits.each_with_index do |pep, i|
        orig_pephits_prts[i] = pep.prots
        # temporarily point the peptide at fresh copies of its proteins
        pep.prots = pep.prots.map do |prt|
          new_prots.fetch(prt.reference) do
            np = prt.dup
            np.peps = []
            new_prots[np.reference] = np
          end
        end
      end
    end

    if kind == :update
      pephits.each do |pep|
        pep.prots.each { |prt| prt.peps = [] }
      end
    end

    collect_peps = [:update, :new].include?(kind)
    prot_set = {}
    pephits.each do |pep|
      prts = pep.prots
      prts.each { |prt| prot_set[prt.reference] = prt }
      prts.each { |prt| prt.peps << pep } if collect_peps
    end

    ## Reset the original protein hits
    if kind == :new
      pephits.each_with_index do |pep, i|
        pep.prots = orig_pephits_prts[i]
      end
    end

    prot_set.values
  end
end
150
-
151
-
152
- class SpecID::Filter
153
-
154
- NUM_PROT_FPPR_ITERATIONS = 10
155
-
156
# Class-level convenience entry point: builds a fresh instance and passes
# +argv+ to its #run_from_argv, returning whatever that call returns.
def self.run_from_argv(argv)
  new.run_from_argv(argv)
end
160
-
161
# Drives a complete filtering session from command-line arguments.
#
# Parses +argv+ with get_options (returning nil immediately when that yields
# nothing), loads one prefiltered spec_id object per input file, assembles
# the false-positive estimation inputs (:cys, :tps, :dcy) and then calls
# filter_round either once, once per line of the --filters_file, or
# repeatedly in interactive mode.
#
# Everything printed through #out is mirrored to the --log file when given;
# that file is closed even if filtering raises.  In interactive mode the
# session ends cleanly when standard input reaches EOF.
# Returns nil.
def run_from_argv(argv)
  reply = get_options(argv)
  return unless reply
  files, opt = reply

  $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE
  spec_ids = files.map { |file| file_to_prefiltered_spec_id(file, opt) }

  ## the options hash
  hash = {}
  if opt.cys
    # second element = freq of cys containing peps (defaults to 0.0)
    opt.cys[1] = opt.cys[1] ? opt.cys[1].to_f : 0.0
    hash[:cys] = opt.cys
  end

  hash[:tps] =
    if opt.tps
      Fasta.new.read_file(opt.tps).prots.map { |prot| prot.aaseq.chomp }
    end

  hash[:dcy] =
    if opt.false
      new_spec_ids = []
      prefixes_or_files = SpecID.extend_args(opt.false, files.size)
      false_spec_ids = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file|
        if File.exist? prefix_or_file
          # a separate decoy file: the normal spec_id is kept as is
          new_spec_ids << spec_id
          file_to_prefiltered_spec_id(prefix_or_file, opt)
        else
          # a flag: split the peptides of this spec_id into true and false
          (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
          fps_specid = spec_id.class.new
          tps_specid = spec_id.class.new

          fps_specid.peps = fps
          tps_specid.peps = tps
          new_spec_ids << tps_specid
          fps_specid
        end
      end
      spec_ids = new_spec_ids
      false_spec_ids
    end

  defaults = {
    :dcy => nil, # { spec_id => false_spec_id }
    :cys => nil, # [cys_background_freq, cys_containing_freq]
    :tps => nil,
    :tmm => nil,
    :occams_razor => opt.occams_razor,
  }
  args = defaults.merge hash

  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm]

  # only the estimation methods that actually received input are reported
  @fppr_methods = [:tmm, :tps, :cys, :dcy].select { |x| args[x] }
  @groups_reporting = [:pephits, :aaseq, :prothits]
  @groups_reporting.push( :occams_razor ) if args[:occams_razor]

  @cat_labels = {
    :pephits => 'pep_hits',
    :prothits => 'prot_hits',
    :aaseq => 'uniq_aa_hits',
    :occams_razor => 'occams_prot_hits',
  }

  @logfh = opt.log ? File.open(opt.log, 'w') : nil
  begin
    #########################################
    # PRINT FILTER LEGEND
    out filter_legend(@fppr_methods)
    #########################################

    if opt.filters_file
      IO.readlines(opt.filters_file).each do |line|
        line.chomp!
        answer = prep_reply(line, base_args)
        next if answer == false
        base_args = answer
        filter_round(spec_ids, base_args, args)
      end
    elsif opt.i
      ## CLEAR ARGV (since otherwise, gets reads it!)
      ARGV.clear
      out interactive_help
      loop do
        b = base_args
        out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}"
        loop do
          line = gets
          # EOF on stdin: stop the session (the ensure below closes the log)
          return nil if line.nil?
          answer = prep_reply(line.chomp, base_args)
          if answer == false
            out interactive_help
          else
            base_args = answer
            filter_round(spec_ids, base_args, args)
            break
          end
        end
      end
    else
      filter_round(spec_ids, base_args, args)
    end
  ensure
    @logfh.close if @logfh
  end

  nil
end
292
-
293
# Prints +string+ to standard output and, when a log file handle (@logfh)
# is set, writes the same line to it as well.  Returns nil.
def out(string)
  [$stdout, @logfh].compact.each { |io| io.puts string }
  nil
end
299
-
300
# Resolves the cysteine frequency from a command-line argument.
# When +arg+ names an existing file it is handed to SpecID::AAFreqs and the
# frequency stored under :C is returned; otherwise +arg+ itself is cast to
# a Float.
def get_cys_freq(arg)
  return arg.to_f unless File.exist?(arg)

  SpecID::AAFreqs.new(arg).aafreqs[:C]
end
308
-
309
# Formats +num+ with exactly three decimal places for display and returns
# the resulting String (nothing is printed).
def short(num)
  format("%.3f", num)
end
313
-
314
# Parses the command line without modifying +argv+ itself.
#
# Returns [files_array, options] where options is an OpenStruct seeded with
# the defaults below and files_array holds the non-option arguments.
# When no file argument remains, prints the usage text and returns nil.
def get_options(argv)
  remaining = argv.dup

  opt = OpenStruct.new(
    :x1 => 1.0,
    :x2 => 1.5,
    :x3 => 2.0,
    :c => 0.1,
    :ppm => 1000.0,
    :false => false
  )

  parser = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <bioworks.xml | bioworks.srg>"
    op.separator("prints number of peptides/proteins ID'd at given thresholds")
    op.separator "only top hit (by xcorr) per scan+charge is considered"

    #op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
    #op.separator(" (these are peptides who are the only hit with xcorr > 0)")
    op.separator ""
    op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") { |val| opt.x1 = val }
    op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") { |val| opt.x2 = val }
    op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") { |val| opt.x3 = val }
    op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") { |val| opt.c = val }
    op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") { |val| opt.ppm = val }
    op.separator " if bioworks.xml, = 10^6deltamass/mass"
    op.on("-i", "--interactive", "interactive filtering") { |val| opt.i = val }
    op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") { |val| opt.false = val }
    op.separator(" e.g., for Bioworks: 'REVERSE'")
    op.separator(" (last given will apply to remaining files)")
    op.on("--prefix", "match false flag for prefixes only") { |val| opt.prefix = val }
    op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |val|
      # first element is either a fasta file or a literal frequency
      val[0] = get_cys_freq(val[0])
      opt.cys = val
    end
    op.separator(" freq = freq of cysteine as amino acid")
    op.separator(" [bkg] = freq of cys containing peps d: 0.0")
    op.on("--filters_file <file>", "(no -i) file with list of interactive input") { |val| opt.filters_file = val }
    op.on("-t", "--tps <fasta>", "fasta file containing true hits") { |val| opt.tps = val }
    #op.on("--tmm <toppred.out>", "toppred.out file with transmembr. topology") {|v| opt.tps = v }
    op.on("--yaml", "spits out yaml-ized data") { |val| opt.tabulate = val }
    op.on("--combined_score", "shows the combined score") { |val| opt.combined_score = val }
    op.on("--marshal", "will write marshaled data or read existing") { |val| opt.marshal = val }
    op.on("--log <file>", "also writes all output to file") { |val| opt.log = val }
    ## NEED TO IMPLEMENT THIS:
    #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
    op.on("-z", "--occams_razor", "will show minimal set of proteins") { |val| opt.occams_razor = val }
  end

  parser.parse!(remaining)

  if remaining.size < 1
    puts parser
    return nil
  end

  [remaining, opt]
end
373
-
374
# Estimates the false positive rate from cysteine containing peptide hits.
#
# Arguments: (actual # with cys, expected # with cys, total # peptides,
# mean_fraction_of_cysteines_true, std)
# PepHit(C) = Peptide containing cysteine
#
#   # Total PepHit(C)                     # Observed Bad Pep (C)
#   ------------------  proportional_to  ----------------------
#   # Total PepHit                        # Total Bad PepHit (X)
#
# When mean_fraction_true_cys is given, that share of the expected cysteine
# hits is treated as true positives and subtracted from the observed count
# (never going below zero).  std_fraction_true_cys is accepted but unused.
# Zero denominators are not guarded (the result is then Infinity or NaN).
# Returns [fppr, total_number_false].
def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  # the number of bona fide BAD cysteine hits
  bad_cys_hits = ac_num_with_cys
  bad_cys_hits -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
  bad_cys_hits = 0.0 if bad_cys_hits < 0.0

  total_number_false = (bad_cys_hits * total_peptides).to_f / exp_num_with_cys
  [total_number_false / total_peptides, total_number_false]
end
392
-
393
# Estimates how many protein hits are wrong, given how many of their
# peptides are false, by random simulation.
#
# num_peps_per_protein:: array with the number of peptides per protein hit
# number_false_peptides:: Integer count of false peptides to distribute
# num_iterations:: number of random trials
#
# In each trial the false peptides are removed from randomly chosen peptide
# slots (every slot equally likely); a protein left with zero peptides
# counts as a false protein.  The per-trial counts are summarized with
# VecD#sample_stats (the original author assumes they are roughly gaussian).
#
# Returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr].
def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10)
  num_prots = num_peps_per_protein.size
  total_protein_peps = num_peps_per_protein.inject(0) { |sum, num| sum + num }

  ## At least as many false peptides as peptides: all peptides will be wrong
  ## every time, which means all proteins will be wrong every time!
  if number_false_peptides >= total_protein_peps
    return [num_prots, 1.0, 0.0, 0.0]
  end

  # owner_of_slot[k] = index of the protein that peptide slot k belongs to
  owner_of_slot = []
  num_peps_per_protein.each_with_index do |num, prot_index|
    num.times { owner_of_slot << prot_index }
  end
  indices = (0...owner_of_slot.size).to_a

  sample = VecD.new(num_iterations)
  num_iterations.times do |i|
    # fresh per-trial copy so the caller's array is never modified
    remaining = num_peps_per_protein.dup
    indices.shuffle!
    indices.first(number_false_peptides).each do |slot|
      remaining[owner_of_slot[slot]] -= 1
    end
    sample[i] = remaining.count(0)
  end

  (mean_num_wrong, stdev) = sample.sample_stats
  mean_fppr = mean_num_wrong / num_prots
  stdev_fppr = stdev / num_prots
  [mean_num_wrong, mean_fppr, stdev, stdev_fppr]
end
464
-
465
# Cysteine based false positive estimate for a set of peptide hits.
# pephits may also be a hash of pephits keyed on :aaseq.
# Asks SpecID::AAFreqs for the actual and expected number of cysteine
# containing hits and feeds them to fppr_by_cysteines.
# Returns [total_number_false, fppr, fraction_expected], where
# fraction_expected is actual / expected.
def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
  freqs = SpecID::AAFreqs.new
  actual, expected = freqs.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq)
  fraction_of_expected = actual.to_f / expected

  cys_fprate, total_num_false = fppr_by_cysteines(actual, expected, pephits.size, cys_containing_freq)
  [total_num_false, cys_fprate, fraction_of_expected]
end
474
-
475
# Prints a cysteine based FPR report.
# FIXME: unfinished ("UNDERWAY" in the original).  The body reads locals
# that are never defined in this method (pep_nums, i, total_num_false, ac,
# fraction_of_expected, cys_fprate, combined_score, pep_tps, pep_fpr, x1,
# x2, x3, deltacn, ppm, opt), so calling it as is raises NameError.
def report_cysteines
  #### UNDERWAY:::
  cys_tps = pep_nums[i] - total_num_false

  puts "CYSTEINE FPR: "
  puts " (# peps containing >= 1 cysteines)"
  puts " actual: #{ac}"
  puts "fraction of expected: #{short(fraction_of_expected)}"
  puts " expected # FP's: " + short(total_num_false)
  puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "

  puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)"
  puts "Combined Score & FPR"
  puts "#{combined_score}\t#{cys_fprate}"
  puts "Combined Score & fraction of expected"
  #puts "#{combined_score} #{fraction_of_expected}"
  if WRITE_CYS_FIND
    puts ["WRITE_CYS_FIND:", combined_score, fraction_of_expected].join("\t")
  end
  if opt.tabulate
    tab_row = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, ppm]
    puts(tab_row.join("\t"))
  end
end
496
-
497
# Builds the legend text shown before any filtering output and returns it
# as one newline-joined String.  The lines describing the false
# identification rate methods are appended only when +fppr_methods+ is
# non-empty.
def filter_legend(fppr_methods)
  legend = [
    "Note: protein FPPR values are probably optimistic",
    "[this implementation assumes an equal likelihood that a false peptide",
    " comes from a protein with more hits as one with less (which is probably",
    " not the case)]",
    "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)",
  ]
  unless fppr_methods.size.zero?
    legend.push("Following are methods for determining false identification rate:")
    ## when tmm is implemented, add 'tmm=transmembrane' to this list
    legend.push(%w[dcy=decoy cys=cysteine tps=known_true_positives].join(" "))
  end
  legend.join("\n")
end
512
-
513
- # does this give aafreq from a fasta file?
514
- # freq = cysteines.aafreqs[:C]
515
-
516
# Counts the hits that cannot be found in any known true-positive protein.
#
# pephits:: an array of peptides (responding to aaseq) or a hash of
#           peptides keyed on aaseq (only the keys are inspected)
# true_pos_aaseqs_ar:: array of protein amino acid sequences; a peptide
#                      counts as true when its sequence is a substring of
#                      at least one of them
#
# Returns [total_number_false, fppr].  With no hits at all the fppr is 0.0
# (rather than NaN).
def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
  seqs =
    if pephits.is_a? Hash
      pephits.keys
    else
      pephits.map { |pep| pep.aaseq }
    end

  real_fps = seqs.count do |pep_aaseq|
    true_pos_aaseqs_ar.none? { |prot_aaseq| prot_aaseq.include? pep_aaseq }
  end

  real_fppr = seqs.empty? ? 0.0 : real_fps.to_f / seqs.size
  [real_fps, real_fppr]
end
539
-
540
# Filters one spec_id object with +filter_args+ and returns a results hash:
#   :prothits  proteins of the passing peptides (prot.peps not updated)
#   :pephits   the passing peptides
#   :dcn_cnt   how many passing peptides have deltacn > 1.0
#   :aaseq     hash keyed by aaseq whose values are arrays of the peptides
#              sharing that aaseq
# +args+ is accepted but not used here.
def filter_spec_id(spec_id, filter_args, args)
  pephits = spec_id.filter_sequest(filter_args)
  {
    :prothits => SpecID.passing_proteins(pephits, :no_update),
    :pephits => pephits,
    :dcn_cnt => pephits.count { |pep| pep.deltacn > 1.0 },
    :aaseq => pephits.hash_by(:aaseq),
  }
end
553
-
554
# Decoy based estimate: every false (decoy) hit counts as one false
# positive among the normal hits.
# returns [#FP, FPPR] where FPPR = #FP / number of pephits
def dcy_fppr(pephits, false_pephits)
  num_false = false_pephits.size
  [num_false, num_false.fdiv(pephits.size)]
end
559
-
560
# Placeholder for a transmembrane based estimate: not implemented yet, so
# calling it aborts the program with a message.
def tmm_fppr(pephits)
  Kernel.abort("NEED TO IMPLEMENT")
end
563
-
564
# Cysteine based estimate; thin wrapper around fraction_false_by_cysteines
# that drops its third value (the fraction of expected).
# returns [#FP, FPPR]
def cys_fppr(pephits, cys_bg_freq, cys_containing_freq)
  fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq).values_at(0, 1)
end
569
-
570
# Known-true-positives based estimate; delegates unchanged to
# fraction_false_by_true_pos and returns its result.
def tps_fppr(pephits, true_pos_aaseqs_ar)
  fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
end
573
-
574
## Runs every requested estimation method on every set of pephits.
## methods are given by short name, e.g. :cys dispatches to cys_fppr;
## all of those methods should return [number_false, fppr]
## returns one hash (keyed by method) per set of pephits
## extra arguments per method:
##   :dcy  the entry of false_pephits_ar at the same position as the pephits
##   :cys  the elements of args[:cys]
##   :tps  args[:tps]
##   any other method receives the pephits only
def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil)
  pephits_ar.each_with_index.map do |ph, position|
    methods.each_with_object({}) do |mth, results|
      extra =
        case mth
        when :dcy then [false_pephits_ar[position]]
        when :cys then [*args[:cys]]
        when :tps then [args[:tps]]
        else []
        end
      results[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, *extra)
    end
  end
end
598
-
599
- # fpr is a SpecID obj that is the false positives
600
- # cysteines holds an aafreqs object or nil
601
- def filter_round(spec_ids, filter_args, args)
602
-
603
- # push fpr on the end for the calculations
604
- ## FILTER the NORMAL spec_id objects
605
- little_tables = []
606
- spec_ids.each_with_index do |spec_id, i|
607
- normal_results = filter_spec_id(spec_id, filter_args, args)
608
-
609
- ## FILTER the FALSE objects (if given)
610
- false_results =
611
- if args[:dcy]
612
- little_args_hash = args.dup
613
- false_results = filter_spec_id(args[:dcy][i], filter_args, little_args_hash)
614
- end
615
-
616
- ## HOW TO CALCULATE FPPR FOR EVERYTHING:
617
- # pephits Fpephits C/Tpephits TPpephits
618
- # uniqaa Funiqaa C/Tuniqaa TPuniqaa
619
- # prothits ProtFPR(Fpephits, prothits) ProtFPR(C/Tpephits, prothits) ProtFPR(total-TPpephits, prothits)
620
- # OccProthits ProtFPR(Funiqaa, OccProthits) ProtFPR(C/Tuniqaa, OccProthits) ProtFPR(total-TPuniqaa, OccProthits)
621
- # C/T = cystein or Transmembrane method
622
-
623
- ## set up false results array
624
- if args[:dcy]
625
- fr_ar = [false_results[:pephits], false_results[:aaseq]]
626
- else
627
- fr_ar = nil
628
- end
629
- (pephits_fppr_results, aaseq_fppr_results) = calculate_pep_fppr([normal_results[:pephits], normal_results[:aaseq]], @fppr_methods, args, fr_ar)
630
-
631
- ## NORMAL prothits
632
- ## update prothits peptides
633
- updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update)
634
- pep_cnt_arr = updated_proteins.map {|v| v.peps.size }
635
-
636
- ## update occams prothits
637
- if args[:occams_razor]
638
- updated_occams_protein_triplets = SpecID::occams_razor(updated_proteins, true)
639
- occams_pep_cnt_arr = updated_occams_protein_triplets.map {|v| v[1].size }
640
- occams_prots = updated_occams_protein_triplets.map {|v| v[0] }
641
- normal_results[:occams_razor] = occams_prots
642
- end
643
-
644
- ## note that the original prot.peps arrays are obliterated by this.
645
- ## we would need to re-update if someone wanted these
646
-
647
- prothits_fppr_results = {}
648
- occams_results = {}
649
- @fppr_methods.each do |mth|
650
- prothits_fppr_results[mth] = protein_fppr(pep_cnt_arr, pephits_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
651
- occams_results[mth] = protein_fppr(occams_pep_cnt_arr, aaseq_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS) if args[:occams_razor]
652
- end
653
-
654
- fppr_results = {
655
- :pephits => pephits_fppr_results,
656
- :aaseq => aaseq_fppr_results,
657
- :prothits => prothits_fppr_results,
658
- }
659
- fppr_results[:occams_razor] = occams_results if args[:occams_razor]
660
-
661
- ## CHANGE ALL RESULTS INTO PERCENTAGES:
662
- fppr_results.each do |bk,hash|
663
- hash.each do |k,val|
664
- hash[k][1] = 100.0 * val[1]
665
- end
666
- end
667
- little_tables[i] = to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels)
668
- end
669
-
670
- out filter_params_string(filter_args, @fppr_methods)
671
- little_tables.each do |tbl|
672
- out tbl.to_formatted_string(nil, ' ')
673
- out "-----------------------------------------------\n"
674
- end
675
- #big_table(spec_ids, filter_args, args, normal_results, groups_reporting, fppr_results, cat_labels)
676
-
677
- end
678
-
679
-
680
-
681
- def filter_params_string(filter_args, fppr_methods)
682
- (x1, x2, x3, deltacn, ppm) = filter_args
683
- st = []
684
- st << "=========================================================================="
685
- st << " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} "
686
- st << ''
687
- st.join("\n")
688
- #st = []
689
- #st << ["xcorr(1,2,3) >= #{x1},#{x2},#{x3}", "deltacn >= #{deltacn}", "ppm <= #{ppm}"].join("\t")
690
- #st
691
- end
692
-
693
- def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels)
694
- #table is in the form: { column heading => [ values ] }
695
-
696
- title = spec_id.passed_in_filename
697
- col_labels = ['num', *(fppr_methods.map{|v| "#{v}%" })]
698
-
699
- row_labels = groups_reporting.map {|grp| cat_labels[grp]}
700
- dt = groups_reporting.map do |grp|
701
- line = [normal_results[grp].size]
702
- fppr_methods.each do |mth|
703
- line << fppr_results[grp][mth][1]
704
- end
705
- line
706
- end
707
-
708
- Table.new(dt, row_labels, col_labels, title)
709
- #puts(['TABULATE:', combined_score, pep_tps, pep_fppr, real_tps, real_fppr, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate
710
- end
711
-
712
- def combined_score(filter_args)
713
- (x1, x2, x3, deltacn, ppm) = filter_args
714
- combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
715
- end
716
-
717
- # assumes its already chomped
718
- # updates the 5 globals
719
- def prep_reply(reply, base)
720
- if reply == 'q' ; exit ; end
721
- if reply =~ /^\s*$/
722
- base
723
- elsif reply
724
- arr = reply.split(/\s+/)
725
- to_change = []
726
- to_change_hash = {}
727
- arr.each do |it|
728
- if it.include? ':'
729
- (k,v) = it.split(':')
730
- to_change_hash[k] = v
731
- else
732
- to_change << it
733
- end
734
- end
735
- to_change.each_with_index do |tc,i|
736
- begin
737
- base[i] = tc.to_f
738
- rescue NoMethodError
739
- out "BAD ARG: #{tc}"
740
- return false
741
- end
742
- end
743
- to_change_hash.each do |k,v|
744
- case k
745
- when 'x1' ; base[0] = v
746
- when 'x2' ; base[1] = v
747
- when 'x3' ; base[2] = v
748
- when 'dcn' ; base[3] = v
749
- when 'ppm' ; base[4] = v
750
- else
751
- out "BAD ARG: #{k}:#{v}"
752
- end
753
- end
754
- base.map {|v| v.to_f }
755
- else
756
- false
757
- end
758
- end
759
-
760
- def file_to_prefiltered_spec_id(file, opt)
761
- spec_id = nil
762
- marshal_file = file + ".prefiltered.msh"
763
- if File.exist?(marshal_file)
764
- File.open(marshal_file) do |fh|
765
- spec_id = Marshal.load(fh)
766
- end
767
- else
768
- spec_id = SpecID.new(file)
769
- spec_id.passed_in_filename = file
770
- spec_id.top_peps_prefilter!
771
- ## marshal it!
772
- if opt.marshal
773
- File.open(marshal_file, "w") do |fh|
774
- Marshal.dump(spec_id,fh)
775
- end
776
- end
777
- end
778
- spec_id
779
- end
780
-
781
- def interactive_help
782
- string = []
783
- string << "********************************************************"
784
- string << "INTERACTIVE FILTERING HELP:"
785
- string << "enter: <x1> <x2> <x3> <dcn> <ppm>"
786
- string << "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> ppm:<ppm>"
787
- string << "or : dcn:<dcn>"
788
- string << "or : <x1> <x2> ppm:<ppm>"
789
- string << "etc..."
790
- string << "<enter> to (re)run current values"
791
- string << "'q' to quit"
792
- string << "********************************************************"
793
- string.join("\n")
794
- end
795
-
796
-
797
- end