mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,623 @@
1
+ require 'sort_by_attributes'
2
+ require 'validator'
3
+ require 'spec_id'
4
+ require 'merge_deep'
5
+ require 'spec_id/precision/filter/interactive'
6
+ require 'spec_id/precision/filter/output'
7
+
8
+
9
# Minimal base class for filter objects.
#
# Subclasses are expected to set @method (the name of the instance method
# that does the filtering) and, optionally, @opts (an Array of extra
# arguments handed to that method after the group).
class Filter

  # Runs the configured filter method on +group+ and returns whatever that
  # method returns. Any stored @opts are appended to the argument list.
  def filter(group)
    extra_args = @opts ? @opts : []
    send(@method, group, *extra_args)
  end

  # Same as #filter, but replaces the contents of +group+ (in place) with
  # the hits that passed. Returns +group+.
  def filter!(group)
    passing = filter(group)
    group.replace(passing)
  end
end
25
+
26
+
27
+
28
+ # we have to require this after we setup our defaults hash
29
+ # require 'filter/spec_id/cmdline'
30
+
31
+ class SpecID::Precision::Filter
32
# Default options for filter_and_validate. Caller-supplied options are
# deep-merged over this hash.
FV_DEFAULTS = {
  # sequest score thresholds (handed to standard_sequest_filter)
  :sequest => {
    :xcorr1              => 1.0,
    :xcorr2              => 1.5,
    :xcorr3              => 2.0,
    :deltacn             => 0.1,
    :ppm                 => 1000,
    :include_deltacnstar => true
  },

  # output
  :proteins => false,
  :output   => [],

  # general
  :top_hit_by    => :xcorr,
  :postfilter    => :top_per_scan,
  :prefilter     => false,
  :hits_together => true,

  # These are also defaulted in the commandline because they are necessary
  # for the validators... could this introduce conflicts somehow?
  :decoy_on_match => true,
  :ties           => true,

  # UNLISTED FOR NOW:
  :include_ties_in_top_hit_prefilter  => true,
  :include_ties_in_top_hit_postfilter => false
}
63
+
64
+ require 'spec_id/precision/filter/cmdline'
65
+
66
# Command-line entry point.
#
# Parses +args+ with CmdlineParser, which replies with
# [spec_id_obj, options, option_parser]. If no spec_id object comes back,
# the option parser (usage text) is printed with puts and nil is returned.
# Otherwise returns the answer of
# SpecID::Precision::Filter#filter_and_validate.
def filter_and_validate_cmdline(args)
  spec_id_obj, options, option_parser = CmdlineParser.new.parse(args)
  if spec_id_obj.nil?
    puts option_parser
    return
  end
  SpecID::Precision::Filter.new.filter_and_validate(spec_id_obj, options)
end
74
+
75
+ # # output_array has doublets of [format, handle]
76
+ # # answer is the answer one gets out of filter_and_validate
77
+ # def output(answer, output_array)
78
+ # output_array.each do |format, handle|
79
+ # SpecID::Precision::Filter::Output.new(format, handle)
80
+ # end
81
+ # end
82
+
83
# Very high level method that filters peptide hits and (optionally)
# validates them.
#
# spec_id_obj must respond to #peps and #hi_prob_best (e.g. a SpecID object
# that already contains its peptide hits).
# options are deep-merged over SpecID::Precision::Filter::FV_DEFAULTS.
#
# Returns a structured hash:
#     Fl = Float ; Ar = Array
#     { :params => <Hash of the options that were used>,
#       :pephits => <Ar of pephits>,
#       # only if :validators were given
#       :pephits_precision => [<one precision per validator>],
#       # only if :proteins => true
#       :prothits => <Set of prothits>,
#       # only if :proteins => true and :validators were given
#       :prothits_precision => [ Array of hashes where each hash =
#                                { :worst => Fl, :normal => Fl,
#                                  :normal_stdev => Fl } ]
#     }
# Returns nil if an interactive session ends before any filtering round.
# Raises ArgumentError if more than one Validator::Decoy is given or if its
# constraint is neither a Regexp nor a String.
#
# NOTE: Brackets [] indicate an Array! The Bar '|' indicates another option.
# The asterisk '*' is the default option.
#
# :sequest => {
#     :xcorr1 ->  >= (xcorr +1 charge state)
#     :xcorr2 ->  >= (xcorr +2 charge state)
#     :xcorr3 ->  >= (xcorr +3 charge state)
#     :deltacn -> >= (delta cn)
#     :ppm ->     <= parts per million (Float)
#     :include_deltacnstar => *true | false   include deltacn (given at 1.1)
#                                             of top hit with no 2nd hit
# }
# OUTPUT:
#     :proteins => true | *false   gives proteins (and validation)
#     :output => [[format, FILENAME=nil],...]  formats to output filtering
#                  results. nil/false means "no output".
#                  FILENAME is the filename to use
#                  if nil, then outputs to $stdout
#                  valid formats are:
#                      :text_table
#                      :yaml (need to implement)
#                      :protein_summary (need to implement)
#                      :html_table (need to implement)
#
# VALIDATION:
#     :validators => [Array] objects that respond to pephit_precision
#                    usually of base class Validator
#                    NOTE: if you have decoy peptides, you MUST have
#                    a Validator::Decoy object to separate them out.
#                    NOTE: if transmem validator passed in, the
#                    proteins in spec_id must already be granted
#                    transmem status!
#
# OTHER:
#     :top_hit_by -> *:xcorr | :probability
#         probabilities only in bioworks.xml files right now (if
#         they were calculated).
#     :postfilter -> *:top_per_scan | :top_per_aaseq | :top_per_aaseq_charge
#         :top_per_scan hashes by filename + scan
#         :top_per_aaseq hashes by top_per_scan + aaseq
#         :top_per_aaseq_charge hashes by top_per_aaseq + charge
#     :prefilter -> true | *false   Takes top hit per file+scan+charge
#     :hits_together -> *true | false   run top-hit filters on target and
#         decoy hits as one set (true) or as two separate sets (false)
#     :interactive => interactive_object
#         the object is asked (see the loop below) for:
#           #verbose, #out, #interactive_help
#           #filter_args(sequest_opts, keys, shortcuts, casts) -> nil when done
#           #passing(opts, final_answer)
def filter_and_validate(spec_id_obj, options={})
  # NOTE:
  # This is a fairly complicated method. The complication comes in doing
  # top hit filters on separate/cat searches wanted them to be either
  # together or separate. I opt for fewer conversions between the two, but
  # that means keeping track of more things...

  # Declared before anything can fail so that the ensure clause at the
  # bottom can always close whatever has been opened.
  outputs = []

  opts = FV_DEFAULTS.merge_deep(options)
  peps = spec_id_obj.peps

  #######################################
  # DEFAULTS:
  interactive_changing_keys = [:xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar, :postfilter]
  interactive_shortcut_map = {
    :xcorr1 => 'x1',
    :xcorr2 => 'x2',
    :xcorr3 => 'x3',
    :deltacn => 'dcn',
    :ppm => 'ppm',
    :include_deltacnstar => 'dcns',
    :postfilter => 'pf',
  }
  # casts applied to values typed in interactively
  to_float = proc {|x| x.to_f }
  to_bool = proc do |x|
    case x
    when /^t/io then true
    when /^f/io then false
    when true then true
    when false then false
    else nil
    end
  end
  to_postfilter = proc do |x|
    case x
    when 's' then :top_per_scan
    when 'a' then :top_per_aaseq
    when 'ac' then :top_per_aaseq_charge
    when Symbol then x
    end
  end
  casting_map = {
    :xcorr1 => to_float,
    :xcorr2 => to_float,
    :xcorr3 => to_float,
    :deltacn => to_float,
    :ppm => to_float,
    :include_deltacnstar => to_bool,
    :postfilter => to_postfilter,
  }

  # output:
  # NOTE: BOOLEANS that are by default false do not need a default!!
  # They will yield false on key lookup if no key or false!
  # BOOLEANS that by default are true should be queried like this
  # !(opts[:<option>] == false)

  # open up each of the files for writing (none if :output is nil/false)
  (opts[:output] || []).each do |format, where|
    outputs << SpecID::Precision::Filter::Output.new(format, where.nil? ? $stdout : where)
  end

  postfilters_per_hash = {
    :top_per_scan => [:base_name, :first_scan],
    :top_per_aaseq => [:aaseq], # first by top_per_scan, then this guy
    :top_per_aaseq_charge => [:aaseq, :charge], # first by top_per_scan, then this one
  }

  top_hit_by__to_sort_by = {
    :xcorr => [:xcorr, {:down=> [:xcorr]}],
    :probability => [:probability, (spec_id_obj.hi_prob_best ? {:down=> [:probability]} : {})],
  }
  sort_by_att_opts = top_hit_by__to_sort_by[opts[:top_hit_by]]
  opts_for_top_hit_prefilter = {
    :per => [:base_name, :first_scan, :charge],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_prefilter]
  }
  # PRIVATE DEFAULTS:
  merge_prefix = 'DECOY_'
  unmerge_regexp = /^DECOY_/

  #######################################

  # if we have a Validator::Decoy object, we will use its defaults to split
  # peptides.
  decoy_validator = nil
  if opts[:validators]
    decoy_vals = opts[:validators].select {|v| v.is_a?(Validator::Decoy) }
    if decoy_vals.size > 1
      raise ArgumentError, "can only have one Validator::Decoy object"
    end
    decoy_validator = decoy_vals.first
  end

  # only set when decoys travel through the filters mixed in with targets
  decoy_validator_to_split_with = nil

  pep_sets =
    if decoy_validator
      constraint = decoy_validator.constraint
      if constraint.is_a?(Regexp)
        if opts[:hits_together]
          decoy_validator_to_split_with = decoy_validator
          [peps]
        else
          (target, decoy) = decoy_validator.partition(peps)
          [target, decoy]
        end
      elsif constraint.is_a?(String)  ## a Filename
        decoy_peps = SpecID.new(constraint).peps

        if opts[:hits_together]
          # we fake that the protein sets are together: tag every decoy
          # protein reference so it can be split back out after filtering
          decoy_validator_to_split_with = Validator::Decoy.new(unmerge_regexp)
          decoy_peps.each do |pep|
            pep.prots.each {|prt| prt.reference = merge_prefix + prt.reference }
          end
          [peps + decoy_peps]  # wrap them so we get the target out
        else
          [peps, decoy_peps]
        end
      else
        raise ArgumentError, "Validator::Decoy#constraint must be a Regexp or valid SpecID file"
      end
    else
      [peps]  # no decoy
    end

  protein_validator = Validator::ProtFromPep.new if opts[:proteins]

  ### TOP HITS PREFILTER    < < TOP_HITS_TOGETHER > >
  ###########################
  # TOP HITS FILTER:
  ###########################
  # REALLY, this guy only exists for speed and memory consumption
  # If we prefilter, we don't have to filter as many hits in every
  # interactive round.  I'd leave this guy out if I were doing only a
  # sequest filter.  (I should compare results with this filter and w/o)
  # This guy is very tricky since we need to consider whether they are to be
  # run together or separately and not do more work than we need
  if opts[:prefilter]
    top_hit_prefilter = SpecID::Precision::Filter::Peps.new(:top_hit, opts_for_top_hit_prefilter)
    pep_sets.map! {|pep_set| top_hit_prefilter.filter(pep_set) }
  end

  top_per_scan_postfilter = SpecID::Precision::Filter::Peps.new(:top_hit,
    :per => postfilters_per_hash[:top_per_scan],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_postfilter])

  # Prepare to loop
  # Give interactive help once here if necessary
  interactive = opts[:interactive]
  if interactive
    ARGV.clear
    interactive.out(interactive.interactive_help(interactive_changing_keys, interactive_shortcut_map)) if interactive.verbose
  end

  # the loop is for if we are interactive (a single pass otherwise)
  final_answer = nil
  loop do

    if interactive
      # a bit of a hack, but we shove on the postfilter param to modulate
      opts[:sequest][:postfilter] = opts[:postfilter]
      response = interactive.filter_args(opts[:sequest], interactive_changing_keys, interactive_shortcut_map, casting_map)
      opts[:postfilter] = opts[:sequest].delete(:postfilter)
      break if response.nil?
    end

    # prepare our secondary top hit filter:
    # since :postfilter may change between interactive rounds, we need to
    # create it fresh every time
    sub_postfilter =
      if opts[:postfilter] == :top_per_scan
        nil
      else
        SpecID::Precision::Filter::Peps.new(:top_hit,
          :per => postfilters_per_hash[opts[:postfilter]],
          :by => sort_by_att_opts,
          :include_ties => opts[:include_ties_in_top_hit_postfilter])
      end

    ### SEQUEST    < EITHER >
    ###########################
    # SEQUEST FILTER:
    ###########################
    # This guy is immune to the trickiness of top hits, so we just filter
    # separately since validation is best done without decoys (except decoy)
    sequest_args = opts[:sequest].values_at( :xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar )
    sequest_filter = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *sequest_args)

    pep_sets_filtered = pep_sets.map {|pep_set| sequest_filter.filter(pep_set) }

    ### FINAL HIT PER SCAN    < < TOP_HITS_TOGETHER > >
    ##########################
    # FINAL HIT PER SCAN
    ##########################
    # Why not just do the top hit filter in the top hits pre filter before?
    # Good question.  Answer: We may have instances when the top hit (by
    # xcorr) has some other poorer attribute than the hit at the other charge.
    # In this case, we'd end up with no passing peptide.
    # Also, the xcorr filter is per charge, so we may filter out the higher
    # scoring peptide hit even though the other would pass based on its charge
    # state, etc., etc....
    # ###################################################
    # NOTE THIS WELL:
    # IF IT IS SUPPOSED TO be separate it's *ALREADY* separate, if together
    # it's *ALREADY* together!!!!
    # the implication is that we don't need to do any merging or
    # separating before we do this last filter!!!!
    # ###################################################

    # TODO: a :uniq_aa filter still needs to be added here

    pep_sets_filtered.map! do |pep_set|
      top_per_scan_postfilter.filter!(pep_set)
      sub_postfilter ? sub_postfilter.filter!(pep_set) : pep_set
    end

    normal_post_filtered_peps = pep_sets_filtered.first

    # separate the decoys out if they are together
    if decoy_validator_to_split_with  # only set if opts[:hits_together]!!
      (target, decoy) = decoy_validator_to_split_with.partition(normal_post_filtered_peps)
      pep_sets_filtered = [target, decoy]
    end

    ### VALIDATION    < SEPARATE >
    pephit_precision_array = nil
    if opts[:validators]
      pephit_precision_array = get_pephit_precision(opts[:validators], *pep_sets_filtered)
    end

    final_answer = {
      :params => opts,
      :pephits => pep_sets_filtered.first,
    }
    if pephit_precision_array
      final_answer[:pephits_precision] = pephit_precision_array
    end

    if opts[:proteins]
      # NOTE(review): normal_post_filtered_peps was captured before the
      # decoy split above, so in hits-together mode it presumably still
      # contains decoy hits -- confirm whether `target` was intended here.
      #
      # merge the final prots into a unique set:
      # NOTE(review): Set is used without a visible require 'set' in this
      # file; presumably one of the required files loads it -- confirm.
      final_answer[:prothits] = normal_post_filtered_peps.inject(Set.new) do |protset, pep|
        protset.merge(pep.prots)
      end
      # protein precision is derived from peptide precision, so it can only
      # be reported when validators were given
      if pephit_precision_array
        final_answer[:prothits_precision] = peptide_precision_to_protein_precision(protein_validator, normal_post_filtered_peps, pephit_precision_array)
      end
    end

    ## output the output
    outputs.each {|output| output.print(final_answer) }

    interactive.passing(opts, final_answer) if interactive

    break unless interactive
  end

  final_answer
ensure
  # Close the filehandles, even if filtering or validation raised
  outputs.each {|output| output.close } if outputs
end
478
+
479
# Converts peptide-level precisions into protein-level precisions.
#
# protein_validator::       must respond to prothit_precision(peps, num_false)
# peps::                    the passing peptide hits
# peptide_precision_array:: one peptide precision (0.0..1.0) per validator
# round_num_false::         name of the Float method used to turn the
#                           estimated number of false peptide hits into a
#                           whole number (default :ceil; e.g. :floor, :round)
#
# Returns an Array (same order as peptide_precision_array) of hashes with
# these keys, filled in order from the validator's reply:
#     :worst        => worstcase protein precision
#     :normal       => estimation by binomial/gaussian method (optimistic)
#     :normal_stdev => the stdev of the normal method
def peptide_precision_to_protein_precision(protein_validator, peps, peptide_precision_array, round_num_false=:ceil)
  labels = [:worst, :normal, :normal_stdev]
  peptide_precision_array.map do |precision|
    num_false = ((1.0 - precision) * peps.size).send(round_num_false)
    reply = protein_validator.prothit_precision(peps, num_false)
    hash = {}
    labels.zip(reply) {|label, answer| hash[label] = answer }
    hash
  end
end
495
+
496
# Asks each validator for the precision of the given peptide hits.
#
# validators:: Array of objects responding to pephit_precision
# peps::       the hits (already separated out from the decoys)
# decoy_peps:: the decoy hits; only handed to Validator::Decoy validators
#              (including subclasses), which are called as
#              pephit_precision(peps, decoy_peps)
# grant_transmem_status:: accepted for interface compatibility; not used
#                         by this method
#
# Returns an Array of results, one per validator, in the same order.
def get_pephit_precision(validators, peps, decoy_peps=nil, grant_transmem_status=false)
  validators.map do |validator|
    if validator.is_a?(Validator::Decoy)
      validator.pephit_precision(peps, decoy_peps)
    else
      validator.pephit_precision(peps)
    end
  end
end
508
+ end
509
+
510
+ class SpecID::Precision::Filter::Peps < Filter
511
+
512
# meth is the name of the filtering method that #filter will call.
# Any further arguments are stored and handed to that method (after the
# group) on every call -- handy when a filter with static options is
# reused. With no further arguments nothing is stored (@opts is nil).
# BEWARE: stored options override any passed into the method at filter
# time. If you need to do that, make a new, blank filter and pass in your
# args at filter time.
def initialize(meth=nil, *opts)
  @method = meth
  @opts = opts.empty? ? nil : opts
end
525
+
526
# Passes the top peptide hit(s) out of every group of hits.
#
# The hits are grouped with peps.hash_by(*opts[:per]); each group is
# ordered with sort_by_attributes(*opts[:by]) and its first hit is taken
# as the best one.
# Returns a new Array of the passing hits (peps itself is not modified).
#
# opts
#     :per => Array of attributes e.g. [:first_scan, :charge] # TODO: allow lambda
#     :by => an array for sort_by_attributes
#            e.g. [:xcorr, :deltacn, :ppm, {:down => [:xcorr, :deltacn]}]
#            (a trailing Hash is treated as sort options, not attributes)
#     :ties => *false | true | :as_array
#         false - one top hit is selected (whichever sorts first)
#         true - all ties are included in final answer
#         :as_array - ties are included as an array (a lone top hit is
#                     still included directly)
#     :include_ties => alias for :ties, consulted only when :ties is not
#                      given at all
#
# A hit ties with the best hit when every :by attribute is == to the best
# hit's value.
def top_hit(peps, opts = {})
  by = opts[:by]
  # :ties wins whenever it is present, even if it is false/nil
  ties = opts.key?(:ties) ? opts[:ties] : opts[:include_ties]
  as_array = (ties == :as_array)
  atts = nil  # attributes compared for ties; worked out on first use

  top_peps = []
  peps.hash_by(*opts[:per]).values.each do |group|
    best_to_worst = group.sort_by_attributes(*by)
    best_hit = best_to_worst.first

    unless ties
      top_peps << best_hit
      next
    end

    # here get the attributes we are considering
    atts ||= (by.last.is_a?(Hash) ? by[0...-1] : by.dup)
    # find the best hit's values
    top_hit_vals = atts.map {|att| best_hit.send(att) }

    # the group is sorted, so the ties are the leading run of equal hits
    tying_peps = []
    best_to_worst.each do |pep|
      break unless atts.zip(top_hit_vals).all? {|att, val| pep.send(att) == val }
      tying_peps << pep
    end

    if as_array && tying_peps.size != 1
      top_peps.push( tying_peps )
    else
      top_peps.push( *tying_peps )
    end
  end
  top_peps
end
600
+
601
# Selects the hits that pass the standard sequest thresholds.
# Returns a new Array of the passing hits (via peps.select).
#
# A hit passes when all of these hold:
#     deltacn >= deltacn threshold
#     xcorr   >= x1, x2 or x3 for a charge of 1, 2 or 3 respectively
#     ppm     <= ppm threshold
# and, if include_deltacnstar is false, its deltacn is not > 1.0
# (deltacn* hits -- top hits with no second hit -- are reported above 1.0).
#
# NOTE(review): a hit whose charge is not 1, 2 or 3 never passes --
# confirm that higher charge states are meant to be dropped.
def standard_sequest_filter(peps, x1,x2,x3,deltacn,ppm,include_deltacnstar=true)
  peps.select do |pep|
    pep_deltacn = pep.deltacn
    pep_charge = pep.charge

    passing = (pep_deltacn >= deltacn) &&
      ((pep_charge == 1 && pep.xcorr >= x1) ||
       (pep_charge == 2 && pep.xcorr >= x2) ||
       (pep_charge == 3 && pep.xcorr >= x3)) &&
      (pep.ppm <= ppm)

    # deltacn* hits only survive when the caller asked to include them
    passing && (include_deltacnstar || !(pep_deltacn > 1.0))
  end
end
621
+
622
+ end
623
+
@@ -0,0 +1,60 @@
1
+
2
+
3
module SpecID ; end
module SpecID::Precision ; end

# Mixin for objects that write a filtering/validation answer somewhere.
# The including class supplies one method per supported format, each
# taking (handle, answer).
module SpecID::Precision::Output

  # format is the format type (as a Symbol) naming the method #print calls.
  # handle_or_file is where to write:
  #   a String is opened as a file for writing (and closed by #close)
  #   anything else is used as the handle directly (and never closed here)
  def initialize(format, handle_or_file)
    @format = format
    @need_to_close = handle_or_file.is_a?(String)
    @handle = @need_to_close ? File.open(handle_or_file, 'w') : handle_or_file
  end

  # Writes answer by calling the format method with (handle, answer).
  # returns self
  def print(answer)
    send(@format, @handle, answer)
    self
  end

  # Returns a copy of hash in which every Symbol key is a String
  # (recursing into values that are *Hashes*, and only those).
  def self.symbol_keys_to_string(hash)
    hash.inject({}) do |converted, (key, value)|
      string_key = key.is_a?(Symbol) ? key.to_s : key
      converted[string_key] = value.is_a?(Hash) ? symbol_keys_to_string(value) : value
      converted
    end
  end

  # TODO: implement recursively, this has just grown and grown terribly
  def hash_as_string(hash)
    hash.inspect
  end

  # Closes the handle, but only if #initialize opened it from a filename.
  def close
    @handle.close if @need_to_close
  end

end