mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,623 @@
1
+ require 'sort_by_attributes'
2
+ require 'validator'
3
+ require 'spec_id'
4
+ require 'merge_deep'
5
+ require 'spec_id/precision/filter/interactive'
6
+ require 'spec_id/precision/filter/output'
7
+
8
+
9
# Minimal dispatching base class for filters.  It reads @method (the name of
# the filtering method to call) and @opts (extra arguments for that method,
# or nil); neither is assigned here, so subclasses must set them.
class Filter

  # Calls the configured filtering method on +group+ and returns its result.
  # Stored options, when present, are passed after +group+.
  def filter(group)
    call_args = @opts ? [group, *@opts] : [group]
    send(@method, *call_args)
  end

  # In-place variant: the contents of +group+ are replaced with what passed.
  def filter!(group)
    passing = filter(group)
    group.replace(passing)
  end
end
25
+
26
+
27
+
28
+ # we have to require this after we setup our defaults hash
29
+ # require 'filter/spec_id/cmdline'
30
+
31
+ class SpecID::Precision::Filter
32
+ FV_DEFAULTS = {
33
+ :sequest =>
34
+ {
35
+ :xcorr1 => 1.0,
36
+ :xcorr2 => 1.5,
37
+ :xcorr3 => 2.0,
38
+ :deltacn => 0.1,
39
+ :ppm => 1000,
40
+ :include_deltacnstar => true,
41
+ },
42
+
43
+
44
+ # output
45
+ :proteins => false,
46
+ :output => [],
47
+
48
+ # general
49
+ :top_hit_by => :xcorr,
50
+ :postfilter => :top_per_scan,
51
+ :prefilter => false,
52
+ :hits_together => true,
53
+
54
+ # These are also defaulted in the commandline because they are necessary
55
+ # for the validators... could this introduce conflicts somehow?
56
+ :decoy_on_match => true,
57
+ :ties => true,
58
+
59
+ # UNLISTED FOR NOW:
60
+ :include_ties_in_top_hit_prefilter => true,
61
+ :include_ties_in_top_hit_postfilter => false,
62
+ }
63
+
64
+ require 'spec_id/precision/filter/cmdline'
65
+
66
# Command-line entry point.  Parses +args+ with CmdlineParser, which replies
# with [spec_id_obj, options, option_parser].
# When no spec_id object comes back, the option parser (usage text) is
# printed and nil is returned; otherwise returns the answer of
# filter_and_validate run on a fresh SpecID::Precision::Filter.
def filter_and_validate_cmdline(args)
  (spec_id_obj, options, option_parser) = CmdlineParser.new.parse(args)
  if spec_id_obj.nil?
    puts option_parser
    return
  end
  SpecID::Precision::Filter.new.filter_and_validate(spec_id_obj, options)
end
74
+
75
+ # # output_array has doublets of [format, handle]
76
+ # # answer is the answer one gets out of filter_and_validate
77
+ # def output(answer, output_array)
78
+ # output_array.each do |format, handle|
79
+ # SpecID::Precision::Filter::Output.new(format, handle)
80
+ # end
81
+ # end
82
+
83
+ # Very high level method that takes simple parameters.
84
+ # spec_id may be a filename or a SpecID object (containing peps)
85
+ # Default values may be queried from SpecID::Precision::Filter::FV_DEFAULTS
86
+ # Returns a structured hash:
87
+ # Fl = Float ; Ar = Array
88
+ # { :params => <Hash of filtering params>,
89
+ # :pephits => <Ar of pephits>,
90
+ # :pephits_precision => [<array of precision>]
91
+ # # if :proteins => true
92
+ # :prothits => <Array of prothits>,
93
+ # :prothits_precision => [ Array of hashes where each hash =
94
+ # { :worst => Fl, :normal => Fl,
95
+ # :normal_stdev => Fl } ]
96
+ # }
97
+ #
98
+ # NOTE: Brackets [] indicate an Array! The Bar '|' indicates another option.
99
+ # The asterisk '*' marks the default option.
100
+ #
101
+ # :sequest => {
102
+ # :xcorr1 -> >= (xcorr +1 charge state)
103
+ # :xcorr2 -> >= (xcorr +2 charge state)
104
+ # :xcorr3 -> >= (xcorr +3 charge state)
105
+ # :deltacn -> >= (delta cn)
106
+ # :ppm -> <= parts per million (Float)
107
+ # :include_deltacnstar => *true | false include deltacn (given at 1.1) of
108
+ # top hit with no 2nd hit
109
+ #
110
+ # }
111
+ # OUTPUT:
112
+ # :proteins => true | *false gives proteins (and validation)
113
+ # :output => [[format, FILENAME=nil],...] formats to output filtering results.
114
+ # can be used multiple times
115
+ # FILENAME is the filename to use
116
+ # if nil, then outputs to $stdout
117
+ # valid formats are:
118
+ # :text_table (default)
119
+ # :yaml (need to implement)
120
+ # :protein_summary (need to implement)
121
+ # :html_table (need to implement)
122
+ # default value =>
123
+ # [[:text_table,nil]]
124
+ #
125
+ # VALIDATION:
126
+ # :validators => [Array] objects that respond to pephit_precision
127
+ # usually of base class Validator
128
+ # NOTE: if you have decoy peptides, you MUST have
129
+ # a Validator::Decoy object to separate them out.
130
+ # NOTE: if transmem validator passed in, the
131
+ # proteins in spec_id must already be granted
132
+ # transmem status!
133
+ #
134
+ #
135
+ # OTHER:
136
+ # :top_hit_by -> *:xcorr | :probability
137
+ # probabilities only in bioworks.xml files right now (if
138
+ # they were calculated).
139
+ # :postfilter -> *:top_per_scan | :top_per_aaseq | :top_per_aaseq_charge
140
+ # :top_per_scan hashes by filename + scan
141
+ # :top_per_aaseq hashes by top_per_scan + aaseq
142
+ # :top_per_aaseq_charge hashes by top_per_aaseq + charge
143
+ # :prefilter -> true | *false Takes top hit per file+scan+charge
144
+ # :interactive => interactive_object
145
+ # # should behave like this:
146
+ # # interactive_object.filter_args(currentopts) -> args_for_filtering | nil (done)
147
+ #
148
+ # # interactive_object.passing(final_answer)
149
+
150
+ # The defaults for filter_and_validate
151
+
152
def filter_and_validate(spec_id_obj, options={})
  # NOTE:
  # This is a fairly complicated method.  The complication comes from doing
  # top hit filters on separate/cat searches, where hits may be wanted either
  # together or separate.  I opt for fewer conversions between the two, but
  # that means keeping track of more things...

  # Declared first so the ensure clause can always close whatever was opened.
  outputs = []

  opts = FV_DEFAULTS.merge_deep(options)

  spec_id = spec_id_obj
  peps = spec_id.peps

  #######################################
  # DEFAULTS:
  interactive_changing_keys = [:xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar, :postfilter]
  interactive_shortcut_map = {
    :xcorr1 => 'x1',
    :xcorr2 => 'x2',
    :xcorr3 => 'x3',
    :deltacn => 'dcn',
    :ppm => 'ppm',
    :include_deltacnstar => 'dcns',
    :postfilter => 'pf',
  }
  to_float = proc {|x| x.to_f}
  to_bool = proc do |x|
    case x
    when /^t/io
      true
    when /^f/io
      false
    when true
      true
    when false
      false
    else
      nil
    end
  end
  to_postfilter = proc do |x|
    case x
    when 's'
      :top_per_scan
    when 'a'
      :top_per_aaseq
    when 'ac'
      :top_per_aaseq_charge
    when Symbol
      x
    end
  end
  casting_map = {
    :xcorr1 => to_float,
    :xcorr2 => to_float,
    :xcorr3 => to_float,
    :deltacn => to_float,
    :ppm => to_float,
    :include_deltacnstar => to_bool,
    :postfilter => to_postfilter,
  }

  # output:
  # NOTE: BOOLEANS that are by default false do not need a default!!
  # They will yield false on key lookup if no key or false!
  # BOOLEANS that by default are true should be queried like this
  #   !(opts[:<option>] == false)

  # open up each of the files for writing (a nil destination means $stdout).
  # Built one at a time so that handles already opened still get closed if a
  # later one fails to open.  A nil :output simply means no outputs.
  (opts[:output] || []).each do |format, where|
    outputs << SpecID::Precision::Filter::Output.new(format, where.nil? ? $stdout : where)
  end

  postfilters_per_hash = {
    :top_per_scan => [:base_name, :first_scan],
    :top_per_aaseq => [:aaseq], # first by top_per_scan, then this guy
    :top_per_aaseq_charge => [:aaseq, :charge], # first by top_per_scan, then this one
  }

  top_hit_by__to_sort_by = {
    :xcorr => [:xcorr, {:down=> [:xcorr]}],
    :probability => [:probability, (spec_id.hi_prob_best ? {:down=> [:probability]} : {})],
  }
  sort_by_att_opts = top_hit_by__to_sort_by[opts[:top_hit_by]]
  opts_for_top_hit_prefilter = {
    :per => [:base_name, :first_scan, :charge],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_prefilter]
  }
  # PRIVATE DEFAULTS:
  merge_prefix = 'DECOY_'
  unmerge_regexp = /^DECOY_/

  #######################################

  # if we have a Validator::Decoy object, we will use its defaults to split
  # peptides.
  decoy_validator =
    if opts[:validators]
      decoy_vals = opts[:validators].select {|v| v.class == Validator::Decoy }
      case decoy_vals.size
      when 0
        nil
      when 1
        decoy_vals.first
      else
        raise ArgumentError, "can only have one Validator::Decoy object"
      end
    else
      nil
    end

  # only set when target and decoy hits are filtered together
  decoy_validator_to_split_with = nil

  pep_sets =
    if decoy_validator
      if decoy_validator.constraint.is_a?(Regexp)
        if opts[:hits_together]
          decoy_validator_to_split_with = decoy_validator
          [peps]
        else
          (target, decoy) = decoy_validator.partition(peps)
          [target, decoy]
        end
      elsif decoy_validator.constraint.is_a?(String) ## a Filename
        decoy_peps = SpecID.new(decoy_validator.constraint).peps

        if opts[:hits_together]
          # we fake that the protein sets are together
          decoy_validator_to_split_with = Validator::Decoy.new(unmerge_regexp)
          decoy_peps.each do |pep|
            pep.prots.each {|prt| prt.reference = merge_prefix + prt.reference }
          end
          [peps + decoy_peps] # wrap them so we get the target out
        else
          [peps, decoy_peps]
        end
      else
        raise ArgumentError, "Decoy::Validator#constraint must be a Regexp or valid SpecID file"
      end
    else
      [peps] # no decoy
    end

  protein_validator = Validator::ProtFromPep.new if opts[:proteins]

  ### TOP HITS PREFILTER  < < TOP_HITS_TOGETHER > >
  ###########################
  # TOP HITS FILTER:
  ###########################
  # REALLY, this guy only exists for speed and memory consumption
  # If we prefilter, we don't have to filter as many hits in every
  # interactive round.  I'd leave this guy out if I were doing only a
  # sequest filter. (I should compare results with this filter and w/o)
  # This guy is very tricky since we need to consider whether they are to be
  # run together or separately and not do more work than we need
  if opts[:prefilter]
    top_hit_prefilter = SpecID::Precision::Filter::Peps.new(:top_hit, opts_for_top_hit_prefilter)
    pep_sets.map! do |pep_set|
      top_hit_prefilter.filter(pep_set)
    end
  end

  # prepare our top hit (per scan) filter
  top_per_scan_postfilter = SpecID::Precision::Filter::Peps.new(:top_hit,
    :per => postfilters_per_hash[:top_per_scan],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_postfilter])

  # Prepare to loop
  # Give interactive help once here if necessary
  interactive = opts[:interactive]
  if interactive
    ARGV.clear
    interactive.out(interactive.interactive_help(interactive_changing_keys, interactive_shortcut_map)) if interactive.verbose
  end

  # the loop is for if we are interactive
  final_answer = nil
  loop do

    if interactive
      # a bit of a hack, but we shove on the postfilter param to modulate
      opts[:sequest][:postfilter] = opts[:postfilter]
      response = interactive.filter_args(opts[:sequest], interactive_changing_keys, interactive_shortcut_map, casting_map)
      opts[:postfilter] = opts[:sequest].delete(:postfilter)
      break if response == nil
    end

    # prepare our sub postfilter:
    # since we are now modulating this guy, we need to create it fresh every
    # time
    sub_postfilter =
      if opts[:postfilter] == :top_per_scan
        nil
      else
        SpecID::Precision::Filter::Peps.new(:top_hit,
          :per => postfilters_per_hash[opts[:postfilter]],
          :by => sort_by_att_opts,
          :include_ties => opts[:include_ties_in_top_hit_postfilter]
        )
      end

    ### SEQUEST  < EITHER >
    ###########################
    # SEQUEST FILTER:
    ###########################
    # This guy is immune to the trickiness of top hits, so we just filter
    # separately since validation is best done without decoys (except decoy)
    sequest_args = opts[:sequest].values_at( :xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar )
    sequest_filter = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *sequest_args)

    # pep_sets itself is left untouched so every interactive round starts
    # from the same (pre)filtered hits
    pep_sets_filtered = pep_sets.map do |pep_set|
      sequest_filter.filter(pep_set)
    end

    ### FINAL HIT PER SCAN  < < TOP_HITS_TOGETHER > >
    ##########################
    # FINAL HIT PER SCAN
    ##########################
    # Why not just do the top hit filter in the top hits pre filter before?
    # Good question.  Answer: We may have instances when the top hit (by
    # xcorr) has some other poorer attribute than the hit at the other charge.
    # In this case, we'd end up with no passing peptide.
    # Also, the xcorr filter is per charge, so we may filter out the higher
    # scoring peptide hit even though the other would pass based on its charge
    # state, etc., etc....
    # ###################################################
    # NOTE THIS WELL:
    # IF IT IS SUPPOSED TO be separate it's *ALREADY* separate, if together
    # it's *ALREADY* together!!!!
    # the implication is that we don't need to do any merging or
    # separating before we do this last filter!!!!
    # ###################################################

    # TODO: We need to add a :uniq_aa filter in here!

    pep_sets_filtered.map! do |pep_set|
      top_per_scan_postfilter.filter!(pep_set)
      if sub_postfilter
        sub_postfilter.filter!(pep_set)
      else
        pep_set
      end
    end

    normal_post_filtered_peps = pep_sets_filtered.first

    # separate the decoys out if they are together
    if decoy_validator_to_split_with  # only set if opts[:hits_together]!!
      (target, decoy) = decoy_validator_to_split_with.partition(normal_post_filtered_peps)
      pep_sets_filtered = [target, decoy]
    end

    ### VALIDATION  < SEPARATE >
    pephit_precision_array = get_pephit_precision(opts[:validators], *pep_sets_filtered) if opts[:validators]

    final_answer = {
      :params => opts,
      :pephits => pep_sets_filtered.first,
    }
    if pephit_precision_array
      final_answer[:pephits_precision] = pephit_precision_array
    end

    if opts[:proteins]
      # with no validators there are no peptide precisions to convert, so an
      # empty array is handed on instead of nil
      protein_precision_array = peptide_precision_to_protein_precision(protein_validator, normal_post_filtered_peps, pephit_precision_array || [])
      # this could be factored out (since we do it in protein_precision)

      # merge the final prots into a unique set:
      final_answer[:prothits] = normal_post_filtered_peps.inject(Set.new) do |protset, pep|
        protset.merge(pep.prots)
      end
      final_answer[:prothits_precision] = protein_precision_array
    end

    ## output the output
    outputs.each {|output| output.print(final_answer) }

    if interactive
      interactive.passing(opts, final_answer)
    else
      break
    end
  end
  final_answer
ensure
  # Close the filehandles, even when filtering raised part way through
  outputs.each {|output| output.close } if outputs
end
478
+
479
# Converts peptide-level precision values into protein-level estimates.
# protein_validator:: object answering prothit_precision(peps, num_false);
#                     its reply is read positionally as
#                     [worst, normal, normal_stdev]
# peps:: the peptide hits the precisions refer to
# peptide_precision_array:: Array of peptide precisions (nil is treated as
#                           an empty Array)
# round_num_false:: name of the rounding method applied to the estimated
#                   number of false hits (e.g. :ceil, :floor, :round)
# Returns an Array, parallel to peptide_precision_array, of hashes with keys:
#   :worst => worstcase protein precision
#   :normal => estimation by binomial/gaussian method (optimistic)
#   :normal_stdev => the stdev of the normal method
def peptide_precision_to_protein_precision(protein_validator, peps, peptide_precision_array, round_num_false=:ceil)
  (peptide_precision_array || []).map do |precision|
    # honor the requested rounding instead of always rounding up
    num_false = ((1.0 - precision) * peps.size).send(round_num_false)
    reply = protein_validator.prothit_precision(peps, num_false)
    hash = {}
    [:worst, :normal, :normal_stdev].zip(reply) do |label, answer|
      hash[label] = answer
    end
    hash
  end
end
495
+
496
# Asks each validator for its peptide-hit precision.
# validators:: Array of objects answering pephit_precision
# peps:: the hits to validate (decoys already separated out)
# decoy_peps:: handed along only to validators whose class is exactly
#              Validator::Decoy
# grant_transmem_status:: accepted but not used in this method
# Returns an Array with one result per validator, in the order given.
def get_pephit_precision(validators, peps, decoy_peps=nil, grant_transmem_status=false)
  validators.map do |val|
    extra_args = (val.class == Validator::Decoy) ? [decoy_peps] : []
    val.pephit_precision(peps, *extra_args)
  end
end
508
+ end
509
+
510
+ class SpecID::Precision::Filter::Peps < Filter
511
+
512
# meth:: name of the filtering method that filter will dispatch to
# opts:: static arguments for that method, kept for reuse on every call
#        (stored as nil when none are given)
# BEWARE: arguments stored here are the ones used at filter time.  If you
# need different arguments, make a new filter with those arguments.
def initialize(meth=nil, *opts)
  @method = meth
  @opts = opts.empty? ? nil : opts
end
525
+
526
# Picks the top peptide hit(s) out of every group of hits that share the
# +:per+ attributes.  Assumes attributes are already cast properly (Float,
# Integer, etc.).
# opts
#   :per => Array of attributes to group on, e.g. [:first_scan, :charge] # TODO: allow lambda
#   :by => an array for sort_by_attributes
#          e.g. [:xcorr, :deltacn, :ppm, {:down => [:xcorr, :deltacn]}]
#   :ties => *false | true | :as_array
#       false - one top hit per group (whichever sorts first)
#       true - all hits tying with the top hit are included in the answer
#       :as_array - tying hits are included together as one nested Array
#                   (a top hit with no ties is still pushed bare)
#   :include_ties => same meaning as :ties; consulted only when :ties is
#                    not given
# Ties are judged on the :by attributes (minus a trailing options Hash).
# Returns a new Array of top hits.
def top_hit(peps, opts = {})
  ties = opts.key?(:ties) ? opts[:ties] : opts[:include_ties]
  as_array = (ties == :as_array)
  sort_args = opts[:by]
  atts = nil  # tie attributes; worked out once, on first need

  top_peps = []
  # equivalent to .out files when hashed by firstscan/charge
  peps.hash_by(*opts[:per]).values.each do |group|
    best_to_worst = group.sort_by_attributes(*sort_args)
    unless ties
      top_peps << best_to_worst.first
      next
    end

    atts ||= sort_args.last.is_a?(Hash) ? sort_args[0...-1] : sort_args.dup
    best_hit = best_to_worst.first
    top_hit_vals = atts.map {|att| best_hit.send(att) }

    # the list is sorted, so ties with the best hit form a leading run
    tying_peps = []
    best_to_worst.each do |pep|
      break unless atts.zip(top_hit_vals).all? {|att, val| pep.send(att) == val }
      tying_peps << pep
    end

    if as_array && tying_peps.size > 1
      top_peps.push( tying_peps )
    else
      top_peps.push( *tying_peps )
    end
  end
  top_peps
end
600
+
601
# Selects the peps that pass the standard sequest cutoffs and returns them
# as a new Array.  A pep passes when
#   deltacn >= +deltacn+, and
#   xcorr >= the cutoff for its charge (+x1+, +x2+, +x3+ for charge 1, 2, 3;
#   any other charge fails), and
#   ppm <= +ppm+
# Unless +include_deltacnstar+, hits with deltacn > 1.0 are rejected as well.
def standard_sequest_filter(peps, x1,x2,x3,deltacn,ppm,include_deltacnstar=true)
  peps.select do |pep|
    dcn = pep.deltacn
    charge = pep.charge

    next false unless dcn >= deltacn

    xcorr_ok = (charge == 1 && pep.xcorr >= x1) ||
               (charge == 2 && pep.xcorr >= x2) ||
               (charge == 3 && pep.xcorr >= x3)
    next false unless xcorr_ok
    next false unless pep.ppm <= ppm

    # everything else passed; only the deltacn* rule can still reject
    include_deltacnstar || !(dcn > 1.0)
  end
end
621
+
622
+ end
623
+
@@ -0,0 +1,60 @@
1
+
2
+
3
module SpecID ; end
module SpecID::Precision ; end

# Output behavior for mixing in: holds a writable handle (or opens a file
# itself) and dispatches printing to the method named by the format symbol.
# The format methods themselves are not defined in this module.
module SpecID::Precision::Output

  # format:: Symbol naming the method that writes; print calls it as
  #          format(handle, answer)
  # handle_or_file:: a String is taken as a filename and opened for writing
  #                  (and closed on calling close); anything else is used as
  #                  the handle directly and is never closed here
  def initialize(format, handle_or_file)
    @need_to_close = handle_or_file.is_a?(String)
    @handle = @need_to_close ? File.open(handle_or_file, 'w') : handle_or_file
    @format = format
  end

  # writes +answer+ with the configured format method; returns self
  def print(answer)
    send(@format, @handle, answer)
    self
  end

  # returns a copy of +hash+ in which every Symbol key has become a String
  # (recursing into values that are *Hashes*; other values are kept as is)
  def self.symbol_keys_to_string(hash)
    hash.inject({}) do |converted, (key, value)|
      string_key = key.is_a?(Symbol) ? key.to_s : key
      converted[string_key] = value.is_a?(Hash) ? symbol_keys_to_string(value) : value
      converted
    end
  end

  # TODO: implement recursively, this has just grown and grown terribly
  # currently just the inspect string of +hash+
  def hash_as_string(hash)
    hash.inspect
  end

  # closes the handle only if this object opened it (i.e. was given a filename)
  def close
    @handle.close if @need_to_close
  end

end