mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,1513 @@
1
+
2
+ require 'sample_enzyme'
3
+ require 'ms/parser/mzxml'
4
+ require 'hash_by'
5
+ require 'set_from_hash'
6
+ require 'spec_id/bioworks'
7
+ require 'instance_var_set_from_hash'
8
+ require 'ms/msrun'
9
+ require 'spec_id/srf'
10
+ require 'spec_id/sequest/params'
11
+ require 'fileutils'
12
+
13
class Numeric
  # Renders the number as a string that always carries an explicit sign:
  # a leading '+' for zero and positive values, the usual '-' otherwise.
  def to_plus_minus_string
    self >= 0 ? "+#{self}" : to_s
  end
end
23
+
24
+
25
# Forward declarations: the nested classes defined next are named
# Sequest::PepXML::*, so both constants must exist before the full
# Sequest::PepXML class body further down.
module Sequest
end

class Sequest::PepXML
end
27
+
28
# The outermost <msms_pipeline_analysis> element of a pepXML document.
#
# Carries the date, namespace/schema attributes and summary_xml attribute,
# and wraps a single msms_run_summary whose to_pepxml output becomes the
# element body.
class Sequest::PepXML::MSMSPipelineAnalysis
  include SpecIDXML

  # Version 1.2.3
  attr_writer :date
  # legacy camel-case writer (kept so existing callers keep working)
  attr_writer :xsi_schemaLocation
  attr_accessor :summary_xml
  # Version 2.3.4
  attr_writer :xmlns, :xmlns_xsi, :xsi_schema_location
  attr_accessor :pepxml_version
  attr_accessor :msms_run_summary

  # hash - optional attribute values, applied through set_from_hash.
  # If a block is given, msms_run_summary is set to the block's return value.
  def initialize(hash=nil)
    @xmlns = nil
    @xmlns_xsi = nil
    @xsi_schema_location = nil
    set_from_hash(hash) if hash
    @msms_run_summary = yield if block_given?
  end

  # Returns the date string that was set, or (for pepxml version 18) the
  # current local time formatted as a zero-padded xs:dateTime
  # (YYYY-MM-DDTHH:MM:SS).  Returns nil for any other version when no date
  # was set.
  def date
    return @date if @date
    case Sequest::PepXML.pepxml_version
    when 18 then Time.now.strftime('%Y-%m-%dT%H:%M:%S')
    end
  end

  # The xmlns attribute value (explicit value if set, else the pepXML default).
  def xmlns
    @xmlns || "http://regis-web.systemsbiology.net/pepXML"
  end

  # The xmlns:xsi attribute value (explicit value if set, else the W3C default).
  def xmlns_xsi
    @xmlns_xsi || "http://www.w3.org/2001/XMLSchema-instance"
  end

  # The xsi:schemaLocation attribute value (explicit value if set, else the
  # v18 schema location).
  def xsi_schema_location
    @xsi_schema_location || "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd"
  end

  # Serializes this element with the msms_run_summary as its content.
  # Only pepxml version 18 is handled; any other version aborts the program.
  # (element_xml_and_att_string is not defined in this class -- presumably
  # it comes from SpecIDXML.)
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
        @msms_run_summary.to_pepxml
      end
    else
      abort "Don't know how to deal with version: #{Sequest::PepXML.pepxml_version}"
    end
  end

end
92
+
93
# The <msms_run_summary> element of a pepXML document: instrument metadata
# plus the sample enzyme, the search summary and the spectrum queries of a
# single run.
class Sequest::PepXML::MSMSRunSummary
  include SpecID
  include SpecIDXML

  # The name of the pep xml file (without extension) (but this is a long
  # filename!!!)
  attr_accessor :base_name
  # The name of the mass spec manufacturer
  attr_accessor :ms_manufacturer
  attr_accessor :ms_model
  attr_accessor :ms_mass_analyzer
  attr_accessor :ms_detector
  attr_accessor :raw_data_type
  attr_accessor :raw_data
  attr_accessor :ms_ionization
  # the version of TPP you are using (determines xml output)
  attr_accessor :pepxml_version

  # A SampleEnzyme object (responds to: name, cut, no_cut, sense)
  attr_accessor :sample_enzyme
  # A SearchSummary object
  attr_accessor :search_summary
  # An array of spectrum_queries
  attr_accessor :spectrum_queries

  # takes a hash of name, value pairs
  # if block given, spectrum_queries (should be array of spectrum queries) is
  # set to the return value of the block
  def initialize(hash=nil)
    @spectrum_queries = []
    instance_var_set_from_hash(hash) if hash
    @spectrum_queries = yield if block_given?
  end

  # Serializes the run summary (sample enzyme, search summary, then every
  # spectrum query, in that order).  Only pepxml version 18 is handled; for
  # any other version this returns nil.
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml_and_att_string(:msms_run_summary, "base_name=\"#{base_name}\" msManufacturer=\"#{ms_manufacturer}\" msModel=\"#{ms_model}\" msIonization=\"#{ms_ionization}\" msMassAnalyzer=\"#{ms_mass_analyzer}\" msDetector=\"#{ms_detector}\" raw_data_type=\"#{raw_data_type}\" raw_data=\"#{raw_data}\"") do
        sample_enzyme.to_pepxml +
          search_summary.to_pepxml +
          spectrum_queries.map {|sq| sq.to_pepxml }.join
      end
    end
  end

  # The class used for search hits belonging to this run summary.
  def search_hit_class
    Sequest::PepXML::SearchHit
  end

  # Builds a new run summary from an <msms_run_summary> xml node.
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Populates this object from an <msms_run_summary> xml node and returns
  # self.  peps correspond to search_results: @peps collects the search hits
  # of every search result of every spectrum query.
  def from_pepxml_node(node)
    @base_name = node['base_name']
    @ms_manufacturer = node['msManufacturer']
    @ms_model = node['msModel']
    # was assigned to @ms_manufacturer, clobbering the manufacturer and
    # leaving the ionization unset
    @ms_ionization = node['msIonization']
    @ms_mass_analyzer = node['msMassAnalyzer']
    @ms_detector = node['msDetector']
    @raw_data_type = node['raw_data_type']
    @raw_data = node['raw_data']

    sample_enzyme_n = node.find_first("child::sample_enzyme")
    @sample_enzyme = SampleEnzyme.from_pepxml_node(sample_enzyme_n)

    search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
    spectrum_query_nodes = search_summary_n.find("following-sibling::spectrum_query")
    @spectrum_queries = spectrum_query_nodes.map do |sq_n|
      Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n, self)
    end

    ## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
    # into a better search summary object (to eventually deprecate the params object)
    @search_summary = node ## in future call SearchSummary.from_pepxml_node
    @peps = []
    @spectrum_queries.each do |sq|
      sq.search_results.each do |sr|
        @peps.push( *(sr.search_hits) )
      end
    end
    self
  end
end
180
+
181
+
182
+
183
+ class Sequest::PepXML
184
+ include SpecIDXML
185
+
186
+ ## CREATE a default version for the entire class
187
+ class << self
188
+ attr_accessor :pepxml_version
189
+ end
190
+ DEF_VERSION = 18
191
+ self.pepxml_version = DEF_VERSION # default version
192
+
193
+ attr_accessor :pepxml_version, :msms_pipeline_analysis
194
+ ## the full path name (no extension)
195
+ attr_accessor :base_name
196
+ attr_accessor :h_plus
197
+ attr_accessor :avg_parent
198
+
199
+ #attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
200
+
201
# The spectrum queries held by the run summary of this object's
# msms_pipeline_analysis.
def spectrum_queries
  run_summary = msms_pipeline_analysis.msms_run_summary
  run_summary.spectrum_queries
end
205
+
206
# pepxml_version is stored on the class (it is shared by every instance).
# When a sequest params object is supplied, set_mono_or_avg is called with it.
# When a block is supplied, its return value becomes msms_pipeline_analysis
# and base_name is copied from that pipeline's msms_run_summary.
def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
  self.class.pepxml_version = pepxml_version
  set_mono_or_avg(sequest_params_obj) if sequest_params_obj
  return unless block_given?

  @msms_pipeline_analysis = yield
  @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
end
218
+
219
# Derives @avg_parent and @h_plus from the params object's
# precursor_mass_type: "monoisotopic" selects SpecID::MONO, anything else
# selects SpecID::AVG.  Returns the chosen h_plus value.
def set_mono_or_avg(sequest_params_obj)
  monoisotopic = ("monoisotopic" == sequest_params_obj.precursor_mass_type)
  @avg_parent = !monoisotopic
  mass_table = @avg_parent ? SpecID::AVG : SpecID::MONO
  @h_plus = mass_table[:h_plus]
end
231
+
232
# The current time as a string (Time#to_s format).
def date
  Time.now.to_s
end
235
+
236
# The XML declaration line, newline-terminated.
def xml_version
  %Q{<?xml version="1.0" encoding="UTF-8"?>\n}
end
239
+
240
# The DOCTYPE line (newline-terminated); meant for pepxml_version == 0.
def doctype
  %Q{<!DOCTYPE msms_pipeline_analysis SYSTEM "/usr/bin/msms_analysis3.dtd">\n}
end
244
+
245
# The xml-stylesheet processing instruction for pepxml version 18 (no
# trailing newline); nil for any other version.
def style_sheet
  return unless 18 == self.class.pepxml_version

  '<?xml-stylesheet type="text/xsl" href="/tools/bin/TPP/tpp/schema/pepXML_std.xsl"?>'
end
251
+
252
# The document header for pepxml version 18: the XML declaration followed
# by the stylesheet instruction.  nil for any other version.
def header
  return unless 18 == self.class.pepxml_version

  xml_version + style_sheet
end
257
+
258
# Groups the peps by aaseq and, for each group, writes two private attrs on
# every pep in it: _num_prots (number of distinct proteins across the group)
# and _first_prot (the first of those proteins).  Ideally these attributes
# would live elsewhere, but keeping them on the pep objects is the cheapest
# option memory-wise for now.
def self._prot_num_and_first_prot_by_pep(pep_array)
  pep_array.hash_by(:aaseq).each do |_aaseq, same_seq_peps|
    proteins = []
    same_seq_peps.each { |pep| proteins.push(*pep.prots) }
    proteins.uniq!

    num_prots = proteins.size
    first_prot = proteins.first
    same_seq_peps.each do |pep|
      pep._num_prots = num_prots
      pep._first_prot = first_prot
    end
  end
end
273
+
274
+
275
# Option defaults; the class-level constructors in this class start from
# Default_Options.merge(opts), so any key passed by the caller wins.
Default_Options = {
  # output directory
  :out_path         => '.',
  #:backup_db_path  => '.',

  ## PepXML option:
  :pepxml_version   => DEF_VERSION,

  ## MSMSRunSummary options:
  # (sample enzyme: string must be recognized in sample_enzyme.rb or create
  # your own SampleEnzyme object -- NOTE(review): no enzyme key is actually
  # present in this hash)
  :ms_manufacturer  => 'ThermoFinnigan',
  :ms_model         => 'LCQ Deca XP Plus',
  :ms_ionization    => 'ESI',
  :ms_mass_analyzer => 'Ion Trap',
  :ms_detector      => 'UNKNOWN',
  # path to ms data files (raw or mzxml)
  :ms_data          => '.',
  :raw_data_type    => "raw",
  # even if you don't have it?
  :raw_data         => ".mzXML",

  ## SearchSummary options:
  # may be srf?? don't think pepxml recognizes this yet
  :out_data_type    => "out",
  # may be srf??
  :out_data         => ".tgz",

  # copy the mzxml file to the out_path (create it if necessary)
  :copy_mzxml       => false,
  # print the objects to file
  :print            => false,
}
297
+
298
# Builds a Sequest::PepXML object from one srf file.
#
# srf  - an SRF object, or a String filename (handed to SRF.new).
# opts - overrides for Sequest::PepXML::Default_Options.  :out_path defaults
#        to '.' like every other option; :backup_db_path, when given, is
#        used as the database path if params.database does not exist.
#
# :ms_model is always taken from the srf header (ignoring the default or
# anything passed in), and :ms_mass_analyzer is overridden for LTQ Orbitrap
# and LCQ Deca XP models.
#
# Only out files with at least one hit produce a spectrum query, and only
# the top hit of each is written.  When :copy_mzxml is set and no mzXML file
# can be located, the program aborts with a non-zero exit status.
#
# Returns the populated Sequest::PepXML object.
def self.new_from_srf(srf, opts={})
  opts = Default_Options.merge(opts)

  ## read the srf file
  srf = SRF.new(srf) if srf.is_a? String

  ## set the outpath
  out_path = opts.delete(:out_path)

  params = srf.params

  ## check to see if we need backup_db
  backup_db_path = opts.delete(:backup_db_path)
  if !File.exist?(params.database) && backup_db_path
    params.database_path = backup_db_path
  end

  #######################################################################
  # PREPARE THE OPTIONS:
  #######################################################################
  ## remove items from the options hash that don't belong to the
  ## MSMSRunSummary (opts is handed to MSMSRunSummary.new below)
  ppxml_version = opts.delete(:pepxml_version)
  out_data_type = opts.delete(:out_data_type)
  out_data = opts.delete(:out_data)

  ## Extract meta info from srf
  bn_noext = base_name_noext(srf.header.raw_filename)
  opts[:ms_model] = srf.header.model
  case opts[:ms_model]
  when /Orbitrap/
    opts[:ms_mass_analyzer] = 'Orbitrap'
  when /LCQ Deca XP/
    opts[:ms_mass_analyzer] = 'Ion Trap'
  end

  ## Create the base name
  full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
  opts[:base_name] = full_base_name_no_ext

  ## Create the search summary:
  search_summary_options = {
    :search_database => Sequest::PepXML::SearchDatabase.new(params),
    :base_name => full_base_name_no_ext,
    :out_data_type => out_data_type,
    :out_data => out_data
  }
  modifications_string = srf.header.modifications
  search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)

  # create the sample enzyme from the params object:
  opts[:sample_enzyme] = params.sample_enzyme

  ## Create the pepxml obj and top level objects
  pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
  pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'})
  pepxml_obj.msms_pipeline_analysis = pipeline
  pipeline.msms_run_summary = Sequest::PepXML::MSMSRunSummary.new(opts)
  pipeline.msms_run_summary.search_summary = search_summary
  modifications_obj = search_summary.modifications

  ## name some common variables we'll need
  h_plus = pepxml_obj.h_plus

  ## COPY MZXML FILES IF NECESSARY
  if opts[:copy_mzxml]
    mzxml_pathname_noext = File.join(opts[:ms_data], bn_noext)
    to_copy = MS::Converter::MzXML.file_to_mzxml(mzxml_pathname_noext)
    if to_copy
      FileUtils.cp to_copy, out_path
    else
      # abort (stderr + non-zero status) rather than puts + bare exit, which
      # reported success to the calling shell
      abort "Couldn't find mzXML file with base: #{mzxml_pathname_noext}\n" \
            "Perhaps you need to specify the location of the raw data\n" \
            "or need an mzXML converter (readw or t2x)"
    end
  end

  #######################################################################
  # CREATE the spectrum_queries_ar
  #######################################################################
  srf_index = srf.index
  out_files = srf.out_files
  spectrum_queries_arr = []
  files_with_hits_index = 0 ## will end up being 1 indexed
  srf.dta_files.each_with_index do |dta_file,i|
    out_file = out_files[i]
    next if out_file.num_hits == 0
    files_with_hits_index += 1

    # We don't need to sort the hits by xcorr since it comes pre-sorted in
    # srf files!
    #arr = hits.sort_by{|v| v.xcorr }

    # Get proper deltacn and deltacnstar
    # under new srf, deltacn is already corrected for what prophet wants,
    # deltacn_orig is how to access the old one
    # Prophet deltacn is not the same as the native Sequest deltacn
    # It is the deltacn of the second best hit!

    hits = out_file.hits
    top_hit = hits[0]
    second_hit = hits[1]
    deltacnstar = second_hit ? '0' : '1'

    ## mass calculations:
    precursor_neutral_mass = dta_file.mh - h_plus
    calc_neutral_pep_mass = top_hit[0] - h_plus

    (start_scan, end_scan, charge) = srf_index[i]

    sq_hash = {
      :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
      :start_scan => start_scan,
      :end_scan => end_scan,
      :precursor_neutral_mass => precursor_neutral_mass,
      :assumed_charge => charge.to_i,
      :pepxml_version => ppxml_version,
      :index => files_with_hits_index,
    }

    spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)

    sequence = top_hit.sequence

    # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
    ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
    (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
    # 0=mh 1=deltacn 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn

    sh_hash = {
      :hit_rank => 1,
      :peptide => pepseq,
      :peptide_prev_aa => prevaa,
      :peptide_next_aa => nextaa,
      :protein => top_hit[10].first.reference.split(" ").first,
      :num_tot_proteins => top_hit[10].size,
      :num_matched_ions => top_hit[7],
      :tot_num_ions => top_hit[8],
      :calc_neutral_pep_mass => calc_neutral_pep_mass,
      :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
      :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, sequence),
      :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, sequence),
      :is_rejected => 0,
      # These are search score attributes:
      :xcorr => top_hit[3],
      :deltacn => top_hit[19],
      :deltacnstar => deltacnstar,
      :spscore => top_hit[2],
      :sprank => top_hit[6],
      :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
    }
    search_hit = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits

    search_result = Sequest::PepXML::SearchResult.new
    search_result.search_hits = [search_hit]
    spectrum_query.search_results = [search_result]
    # appended in dta-file order; files without hits were skipped above
    spectrum_queries_arr << spectrum_query
  end

  pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
  pepxml_obj.base_name = pipeline.msms_run_summary.base_name

  pepxml_obj
end
479
+
480
# Takes an .srg or bioworks.xml file and returns the PepXML objects built
# from it (via set_from_bioworks_xml for a Bioworks object -- which requires
# opts[:params] -- or via new_from_srf for each srf of an SRFGroup).
# The :out_path directory is created when missing; the program aborts when
# it exists but is not a directory.
# If possible, ensures that an mzXML file is present for each pepxml file.
# :print => true will print each object to "<base_name>.xml".
def self.set_from_bioworks(bioworks_file, opts={})
  opts = Default_Options.merge(opts)
  out_dir = opts[:out_path]

  ## Create the out_path directory if necessary
  FileUtils.mkpath(out_dir) unless File.exist?(out_dir)
  abort "#{out_dir} must be a directory!" unless File.directory?(out_dir)

  spec_id = SpecID.new(bioworks_file)
  pepxml_objs =
    case spec_id
    when Bioworks
      abort("must have opts[:params] set!") unless opts[:params]
      set_from_bioworks_xml(bioworks_file, opts[:params], opts)
    when SRFGroup
      spec_id.srfs.map { |srf| new_from_srf(srf, opts) }
    else
      abort "invalid object"
    end

  if opts[:print]
    pepxml_objs.each { |obj| obj.to_pepxml(obj.base_name + ".xml") }
  end
  pepxml_objs
end
514
+
515
+
516
+ # Takes bioworks 3.2/3.3 xml output (with no filters)
517
+ # Returns a list of PepXML objects
518
+ # params = sequest.params file
519
+ # bioworks = bioworks.xml exported multi-consensus view file
520
+ # pepxml_version = 0 for tpp 1.2.3
521
+ # pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
522
+ def self.set_from_bioworks_xml(bioworks, params, opts={})
523
+ opts = Default_Options.merge(opts)
524
+ pepxml_version, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)
525
+
526
+ unless out_path
527
+ out_path = '.'
528
+ end
529
+
530
+ supported_versions = [18]
531
+
532
+ unless supported_versions.include?(opts[:pepxml_version])
533
+ abort "pepxml_version: #{pepxml_version} not currently supported. Current support is for versions #{supported_versions.join(', ')}"
534
+ end
535
+
536
+ ## Turn params and bioworks_obj into objects if necessary:
537
+ # Params:
538
+ if params.class == Sequest::Params # OK!
539
+ elsif params.class == String ; params = Sequest::Params.new(params)
540
+ else ; abort "Don't recognize #{params} as object or string!"
541
+ end
542
+ # Bioworks:
543
+ if bioworks.class == Bioworks # OK!
544
+ elsif bioworks.class == String ; bioworks = SpecID.new(bioworks)
545
+ else ; abort "Don't recognize #{bioworks} as object or string!"
546
+ end
547
+
548
+ #puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
549
+
550
+ ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
551
+ ## bioworks
552
+ #bioworks.peps.each { |pep| if pep.class != Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end }
553
+
554
+ ## check to see if we need backup_db
555
+
556
+ backup_db_path = opts.delete(:backup_db_path)
557
+ if !File.exist?(params.database) && backup_db_path
558
+ params.database_path = backup_db_path
559
+ end
560
+
561
+ ## Start
562
+ split_bio_objs = []
563
+
564
+ ## (num_prots_by_pep, prot_by_pep) =
565
+ #num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE"
566
+
567
+ modifications_string = bioworks.modifications
568
+
569
+ ## Create a hash of spectrum_query arrays by filename (this very big block):
570
+ spectrum_queries_by_base_name = {}
571
+ # Hash by the filenames to split into filenames:
572
+ pepxml_objects = bioworks.peps.hash_by(:base_name).map do |base_name, pep_arr|
573
+
574
+ search_summary = Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => Sequest::PepXML::SearchDatabase.new(params), :out_data_type => out_data_type, :out_data => out_data})
575
+ modifications_obj = search_summary.modifications
576
+
577
+ pepxml_obj = Sequest::PepXML.new(pepxml_version, params)
578
+ full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
579
+
580
+ case pepxml_version
581
+ when 18
582
+ pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'})
583
+ msms_run_summary = Sequest::PepXML::MSMSRunSummary.new({
584
+ :base_name => full_base_name_no_ext,
585
+ :ms_manufacturer => ms_manufacturer,
586
+ :ms_model => ms_model,
587
+ :ms_ionization => ms_ionization,
588
+ :ms_mass_analyzer => ms_mass_analyzer,
589
+ :ms_detector => ms_detector,
590
+ :raw_data_type => raw_data_type,
591
+ :raw_data => raw_data,
592
+ :sample_enzyme => params.sample_enzyme,
593
+ :search_summary => search_summary,
594
+ })
595
+ pipeline.msms_run_summary = msms_run_summary
596
+ pepxml_obj.msms_pipeline_analysis = pipeline
597
+ pepxml_obj.msms_pipeline_analysis.msms_run_summary.search_summary.base_name = full_base_name_no_ext
598
+ pepxml_obj.base_name = full_base_name_no_ext
599
+ pepxml_obj
600
+ end
601
+
602
+ # Create a hash by pep object containing num_tot_proteins
603
+ # This is only valid if all hits are present (no previous thresholding)
604
+ # Since out2summary only acts on one folder at a time,
605
+ # we should only do it for one folder at a time! (that's why we do this
606
+ # here instead of globally)
607
+ self._prot_num_and_first_prot_by_pep(pep_arr)
608
+ prec_mz_arr = nil
609
+ case x = bioworks.version
610
+ when /3.2/
611
+ calc_prec_by = :prec_mz_arr
612
+ # get the precursor_mz array for this filename
613
+ mzxml_file = MS::Converter::MzXML.file_to_mzxml(File.join(ms_data, base_name))
614
+ prec_mz_arr = MS::MSRun.precursor_mz_by_scan_num(mzxml_file)
615
+ when /3.3/
616
+ calc_prec_by = :deltamass
617
+ else
618
+ abort "invalid BioworksBrowser version: #{x}"
619
+ end
620
+
621
+ if opts[:copy_mzxml]
622
+ to_copy = MS::Converter::MzXML.file_to_mzxml(File.join(ms_data, base_name))
623
+ if to_copy
624
+ FileUtils.cp to_copy, out_path
625
+ end
626
+ end
627
+
628
+
629
+ spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
630
+
631
+
632
+ # Sort_by_rank and take the top hit (to mimick out2summary):
633
+ arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
634
+ top_pep = arr.pop
635
+ second_hit = arr.last # needed for deltacnstar
636
+
637
+
638
+ case calc_prec_by
639
+ when :prec_mz_arr
640
+ precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge, pepxml_obj.avg_parent)
641
+ when :deltamass
642
+ precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
643
+ end
644
+
645
+ calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
646
+
647
+ # deltacn & star:
648
+ # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
649
+ if second_hit
650
+ #top_pep.deltacn = second_hit.deltacn
651
+ deltacnstar = '0'
652
+ else
653
+ top_pep.deltacn = '1.0'
654
+ deltacnstar = '1'
655
+ end
656
+ # Create the nested structure of queries{results{hits}}
657
+ # (Ruby's blocks work beautifully for things like this)
658
+ spec_query = Sequest::PepXML::SpectrumQuery.new({
659
+ :spectrum => [top_pep.base_name, top_pep.first_scan, top_pep.last_scan, top_pep.charge].join("."),
660
+ :start_scan => top_pep.first_scan,
661
+ :end_scan => top_pep.last_scan,
662
+ :precursor_neutral_mass => precursor_neutral_mass,
663
+ :assumed_charge => top_pep.charge,
664
+ :pepxml_version => pepxml_version,
665
+ })
666
+
667
+
668
+ search_result = Sequest::PepXML::SearchResult.new
669
+ #puts "set MASSDIFF: "
670
+ #p precursor_neutral_mass - calc_neutral_pep_mass
671
+ ## Calculate some interdependent values;
672
+ # NOTE: the bioworks mass is reallyf M+H if two or more scans went
673
+ # into the search_hit; calc_neutral_pep_mass is simply the avg of
674
+ # precursor masses adjusted to be neutral
675
+ (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(top_pep.sequence)
676
+ (num_matched_ions, tot_num_ions) = Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
677
+ search_hit = Sequest::PepXML::SearchHit.new({
678
+ :hit_rank => 1,
679
+ :peptide => pepseq,
680
+ :peptide_prev_aa => prevaa,
681
+ :peptide_next_aa => nextaa,
682
+ :protein => top_pep._first_prot.reference.split(" ").first,
683
+ :num_tot_proteins => top_pep._num_prots,
684
+ :num_matched_ions => num_matched_ions,
685
+ :tot_num_ions => tot_num_ions,
686
+ :calc_neutral_pep_mass => calc_neutral_pep_mass,
687
+ :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
688
+ :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_pep.sequence),
689
+ :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_pep.sequence),
690
+ :is_rejected => 0,
691
+ # These are search score attributes:
692
+ :xcorr => top_pep.xcorr,
693
+ :deltacn => top_pep.deltacn,
694
+ :deltacnstar => deltacnstar,
695
+ :spscore => top_pep.sp,
696
+ :sprank => top_pep.rsp,
697
+ :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
698
+ :spectrum_query => spec_query,
699
+ })
700
+ search_result.search_hits = [search_hit] # there can be multiple search hits
701
+ spec_query.search_results = [search_result] # can be multiple search_results
702
+ spec_query
703
+ end
704
+
705
+ # create an index by spectrum as results end up typically in out2summary
706
+ # (I really dislike this order, however)
707
+ spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
708
+ spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
709
+ pipeline.msms_run_summary.spectrum_queries = spectrum_queries_ar
710
+ pepxml_obj
711
+ end ## collects pepxml_objs
712
+ # summary_xml is the short basename of the pepxml file (e.g., "020.xml")
713
+ pepxml_objects.sort_by {|obj| obj.summary_xml }
714
+ end
715
+
716
# Short name of the pepxml file for this object: base_name with ".xml"
# appended (e.g., "020.xml").
def summary_xml
  xml_extension = ".xml"
  base_name + xml_extension
end
719
+
720
# Delegates to the stored params object (@params) and returns whatever its
# precursor_mass_type method returns.
def precursor_mass_type
  sequest_params = @params
  sequest_params.precursor_mass_type
end
723
+
724
# Delegates to the stored params object (@params) and returns whatever its
# fragment_mass_type method returns.
def fragment_mass_type
  sequest_params = @params
  sequest_params.fragment_mass_type
end
727
+
728
# Combines +path+ and the basename of +filename+ in a manner consistent with
# the separator style used by +path+.
#
# path::     directory part; may use '/' or '\' separators and may or may not
#            already end with a separator
# filename:: file name; any leading directory part is dropped (File.basename)
#
# A backslash is used as the separator only when splitting +path+ on "\\"
# yields more pieces than splitting it on '/'; otherwise '/' is used.
# Returns a new String; neither argument is modified.
def self.make_base_name(path, filename)
  sep = path.split('/').size < path.split("\\").size ? "\\" : '/'
  # look at the final character directly rather than splitting the whole
  # path into single characters just to read the last one
  prefix = (path[-1, 1] == sep) ? path : path + sep
  prefix + File.basename(filename)
end
740
+
741
# Builds the pepxml text: the result of +header+ followed by the pepxml of
# @msms_pipeline_analysis (appended in place with <<).
#
# file:: optional path; when given, the text is also written to that file
#        (opened in "w" mode)
#
# Returns the pepxml string in either case.
def to_pepxml(file=nil)
  xml = header
  xml << @msms_pipeline_analysis.to_pepxml
  File.open(file, "w") {|out| out.print xml } if file
  xml
end
751
+
752
# Given any kind of filename (from windows or whatever), returns the base of
# the filename with no file extension.
#
# file:: path using either '/' or '\' separators
#
# Backslashes are converted to '/' on a copy, so the caller's string is left
# untouched and frozen strings are accepted.  The extension pattern's
# character class contains word characters, '^' and '.', so everything from
# the first '.' of the basename is removed when only those characters follow
# (e.g. "file.tar.gz" gives "file").
def self.base_name_noext(file)
  normalized = file.gsub("\\", '/')
  File.basename(normalized).sub(/\.[\w^\.]+$/, '')
end
758
+
759
+
760
+ end # PepXML
761
+
762
+
763
# Holds the search hits that belong to one <search_result> element.
class Sequest::PepXML::SearchResult
  include SpecIDXML
  # an array of search_hits
  attr_accessor :search_hits

  # if block given, then search_hits set to return value
  # (otherwise search_hits starts out as an empty array)
  def initialize
    @search_hits = block_given? ? yield : []
  end

  # Wraps the pepxml of every search hit in an attribute-less search_result
  # element.
  def to_pepxml
    element_xml_no_atts(:search_result) do
      @search_hits.map {|sh| sh.to_pepxml }.join
    end
  end

  # Creates a new object and fills it from +node+ (see the instance method).
  # msmsrun_obj is optional here; previously this method forwarded only two
  # arguments to an instance method requiring three and so always raised
  # ArgumentError.
  def self.from_pepxml_node(node, spec_query, msmsrun_obj=nil)
    self.new.from_pepxml_node(node, spec_query, msmsrun_obj)
  end

  # Sets search_hits from the children of +node+ and returns self.
  #
  # node::        xml node whose children are each turned into a search hit
  # spec_query::  handed through to every search hit that is created
  # msmsrun_obj:: object answering search_hit_class, the class used to build
  #               each hit.  When omitted, Sequest::PepXML::SearchHit is used
  #               (NOTE(review): fallback chosen because that class offers
  #               from_pepxml_node(node, spec_query); confirm it is the
  #               desired default).
  def from_pepxml_node(node, spec_query, msmsrun_obj=nil)
    sh_klass = msmsrun_obj ? msmsrun_obj.search_hit_class : Sequest::PepXML::SearchHit
    @search_hits = node.children.map do |sh_n|
      sh_klass.from_pepxml_node(sh_n, spec_query)
    end
    self
  end
end
792
+
793
# The search_summary element.  Methods this class does not define are
# forwarded to the sequest params object (see method_missing).
class Sequest::PepXML::SearchSummary
  include SpecIDXML
  attr_accessor :params
  attr_accessor :base_name
  attr_accessor :out_data_type
  attr_accessor :out_data
  # by default, "1"
  attr_accessor :search_id
  attr_accessor :modifications
  # A SearchDatabase object (responds to :local_path and :type)
  attr_accessor :search_database

  # prms::                 sequest params object; when given, it is stored
  #                        and a Modifications object is built from it
  # modifications_string:: -> See Modifications
  # args::                 hash of attributes applied with set_from_hash
  def initialize(prms=nil, modifications_string='', args=nil)
    @search_id = "1"
    if prms
      @params = prms
      @modifications = Sequest::PepXML::Modifications.new(prms, modifications_string)
    end
    set_from_hash(args) if args
  end

  # Forwards unknown messages to @params.  With no params object set, the
  # call quietly returns nil (kept as is for backward compatibility).
  def method_missing(symbol, *args)
    if @params ; @params.send(symbol, *args) end
  end

  # Keeps respond_to? in line with method_missing: a forwarded method is
  # reported as available when the params object responds to it.
  def respond_to_missing?(symbol, include_private=false)
    (@params && @params.respond_to?(symbol, include_private)) || super
  end

  # The search_summary element containing, in order: the search database,
  # the enzymatic search constraint, the modifications and the parameters.
  def to_pepxml
    element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
      search_database.to_pepxml +
        short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
        @modifications.to_pepxml +
        Sequest::PepXML::Parameters.new(@params).to_pepxml
    end
  end

  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Not implemented; always raises NotImplementedError.
  def from_pepxml_node(node)
    raise NotImplementedError, "right now we just have the xml node at your disposal"
  end

end
838
+
839
# Writes the entries of a params object's opts as pepxml parameter elements.
class Sequest::PepXML::Parameters
  include SpecIDXML

  attr_accessor :params

  # obj:: the params object (must answer +opts+ by the time to_pepxml runs)
  def initialize(obj=nil)
    @params = obj
  end

  # (used to be called pepxml_parameters)
  # Returns xml in the form <parameter name="#{method_name}"
  # value="#{method_value}"/> for every key of @params.opts, taken in sorted
  # order and handed to params_xml as strings.
  def to_pepxml
    sorted_names = @params.opts.sort.map {|key, _value| key.to_s }
    params_xml(@params, *sorted_names)
  end
end
856
+
857
# Static and differential (variable) modifications of a sequest search, built
# from a sequest params object and the modification symbols string.
class Sequest::PepXML::Modifications
  include SpecIDXML

  # sequest params object
  attr_accessor :params
  # array holding AAModifications
  attr_accessor :aa_mods
  # array holding TerminalModifications
  attr_accessor :term_mods
  # a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
  # values are the special_symbols
  attr_accessor :mod_symbols_hash

  # a hash of all differential modifications present by aa_one_letter_symbol
  # and special_symbol. This is NOT the mass difference but the total mass {
  # 'M*' => 155.5, 'S@' => 190.3 }.  NOTE: Since the termini are dependent on
  # the amino acid sequence, they are given the *differential* mass.  The
  # termini are given the special symbol as in sequest e.g.
  #   '[' => 12.22, # cterminus
  #   ']' => 14.55  # nterminus
  #
  # The accessor keeps its historical name but reads and writes
  # @masses_by_diff_mod, the variable the rest of this class uses (the plain
  # attr_accessor pointed at a variable that was never set).
  def masses_by_diff_mod_hash
    @masses_by_diff_mod
  end

  def masses_by_diff_mod_hash=(hash)
    @masses_by_diff_mod = hash
  end

  # The modification symbols string looks like this:
  # (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
  # ct is cterminal peptide (differential)
  # nt is nterminal peptide (differential)
  # the C is just cysteine
  # When params is given, calls set_modifications (which also fills the
  # masses_by_diff_mod hash).
  def initialize(params=nil, modification_symbols_string='')
    @params = params
    if @params
      set_modifications(params, modification_symbols_string)
    end
  end

  # Sets @masses_by_diff_mod and @mod_symbols_hash from the modification
  # symbols string.  Both hashes are reset to empty first; a nil or empty
  # string leaves them empty and returns nil.
  def set_hashes(modification_symbols_string)
    @mod_symbols_hash = {}
    @masses_by_diff_mod = {}
    if (modification_symbols_string == nil || modification_symbols_string == '')
      return nil
    end
    table = @params.mass_table
    modification_symbols_string.split(/\)\s+\(/).each do |mod|
      next unless mod =~ /\(?(\w+)(.) (.[\d\.]+)\)?/
      # copy the captures right away so later calls cannot disturb them
      residues = $1
      symbol = $2.dup
      mass_diff = $3.to_f
      if residues == 'ct' || residues == 'nt'
        # termini: keyed by the bare symbol, value is the differential mass
        @masses_by_diff_mod[symbol] = mass_diff
        @mod_symbols_hash[[residues.to_sym, mass_diff]] = symbol
      else
        # amino acids: keyed by residue + symbol, value is the total mass
        residues.split('').each do |aa|
          aa_as_sym = aa.to_sym
          @masses_by_diff_mod[aa+symbol] = mass_diff + table[aa_as_sym]
          @mod_symbols_hash[[aa_as_sym, mass_diff]] = symbol
        end
      end
    end
  end

  # given a bare peptide (no end pieces) returns a ModificationInfo object
  # e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'
  # if there are no modifications, returns nil
  #
  # The peptide passed in is never modified.  (Previously a destructive
  # slice! in the c-terminal branch emptied the working string, so no amino
  # acid modifications were reported for such peptides and the caller's
  # string could be altered.)
  def modification_info(peptide)
    hsh = @masses_by_diff_mod
    # nil when the hashes were never built (object created without params)
    if hsh.nil? || hsh.size == 0
      return nil
    end
    hash = {}
    hash[:modified_peptide] = peptide.dup
    table = @params.mass_table
    h = table[:h]  # this? or h_plus ??
    oh = table[:o] + h
    ## only the termini can match a single char
    if hsh.key? peptide[0,1]
      # AA + H + differential_mod
      hash[:mod_nterm_mass] = table[peptide[1,1].to_sym] + h + hsh[peptide[0,1]]
      peptide = peptide[1...(peptide.size)]
    end
    if hsh.key? peptide[(peptide.size-1),1]
      # AA + OH + differential_mod
      hash[:mod_cterm_mass] = table[peptide[(peptide.size-2),1].to_sym] + oh + hsh[peptide[-1,1]]
      # drop only the trailing terminus symbol, on a fresh string
      peptide = peptide[0...(peptide.size-1)]
    end
    # NOTE(review): positions are 1-based indexes into the string that still
    # contains modification symbols, so a second modification is offset by
    # the symbols before it -- confirm this is what consumers expect.
    mod_array = []
    (0...peptide.size).each do |i|
      if hsh.key? peptide[i,2]
        mod_array << Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([ i+1 , hsh[peptide[i,2]] ])
      end
    end
    if mod_array.size > 0
      hash[:mod_aminoacid_masses] = mod_array
    end
    if hash.size > 1  # if there is more than just the modified peptide there
      Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
    else
      nil
    end
  end

  # returns an array of static mod objects and static terminal mod objects
  # ([aa_modifications, terminal_modifications]); only entries of params.mods
  # with a non-zero amount are used
  def create_static_mods(params)

    ####################################
    ## static mods
    ####################################

    static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
    static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

    params.mods.each do |k,v|
      v_to_f = v.to_f
      next if v_to_f == 0.0
      # a single word character between "add_" and "_" marks an amino acid;
      # every other key is treated as a terminal modification
      if k =~ /add_(\w)_/
        static_mods << [$1.to_sym, v_to_f]
      else
        static_terminal_mods << [k, v_to_f]
      end
    end
    aa_hash = params.mass_table

    ## Create the static_mods objects
    static_mods.map! do |mod|
      hash = {
        :aminoacid => mod[0].to_s,
        :massdiff => mod[1],
        :mass => aa_hash[mod[0]] + mod[1],
        :variable => 'N',
        :binary => 'Y',
      }
      Sequest::PepXML::AAModification.new(hash)
    end

    ## Create the static_terminal_mods objects
    static_terminal_mods.map! do |mod|
      terminus = if mod[0] =~ /Cterm/ ; 'c'
                 else ; 'n' # only two possible termini
                 end
      protein_terminus = case mod[0]
                         when /Nterm_protein/ ; 'n'
                         when /Cterm_protein/ ; 'c'
                         else nil
                         end

      # create the hash
      hash = {
        :terminus => terminus,
        :massdiff => mod[1],
        :variable => 'N',
        :description => mod[0],
      }
      hash[:protein_terminus] = protein_terminus if protein_terminus
      Sequest::PepXML::TerminalModification.new(hash)
    end
    [static_mods, static_terminal_mods]
  end

  # 1. sets aa_mods and term_mods from a sequest params object
  # 2. sets @params
  # 3. sets @masses_by_diff_mod
  def set_modifications(params, modification_symbols_string)
    @params = params

    set_hashes(modification_symbols_string)
    (static_mods, static_terminal_mods) = create_static_mods(params)

    aa_hash = params.mass_table
    #################################
    # Variable Mods:
    #################################
    # diff_search_options is read as alternating "<mass diff> <residues>"
    arr = params.diff_search_options.rstrip.split(/\s+/)
    # [aa.to_sym, diff.to_f]
    variable_mods = []
    (0...arr.size).step(2) do |i|
      if arr[i].to_f != 0.0
        variable_mods << [arr[i+1], arr[i].to_f]
      end
    end
    mod_objects = []
    variable_mods.each do |mod|
      mod[0].split('').each do |aa|
        hash = {
          :aminoacid => aa,
          :massdiff => mod[1],
          :mass => aa_hash[aa.to_sym] + mod[1],
          :variable => 'Y',
          :binary => 'N',
          :symbol => @mod_symbols_hash[[aa.to_sym, mod[1]]],
        }
        mod_objects << Sequest::PepXML::AAModification.new(hash)
      end
    end
    variable_mods = mod_objects
    #################################
    # TERMINAL Variable Mods:
    #################################
    # These are always peptide, not protein termini (for sequest)
    (nterm_diff, cterm_diff) = params.term_diff_search_options.rstrip.split(/\s+/).map{|v| v.to_f }

    # The symbol lookup key is the two-element array [:nt, diff] / [:ct, diff]
    # (Hash#[] takes exactly one argument; passing the two values separately
    # raised ArgumentError).  The mass difference is stored as a number, like
    # for every other modification built here, because
    # TerminalModification#to_pepxml applies to_plus_minus_string itself.
    to_add = []
    if nterm_diff != 0.0
      to_add << ['n', nterm_diff, @mod_symbols_hash[[:nt, nterm_diff]]]
    end
    if cterm_diff != 0.0
      to_add << ['c', cterm_diff, @mod_symbols_hash[[:ct, cterm_diff]]]
    end

    variable_terminal_mods = to_add.map do |term, mssdiff, symb|
      hash = {
        :terminus => term,
        :massdiff => mssdiff,
        :variable => 'Y',
        :symbol => symb,
      }
      Sequest::PepXML::TerminalModification.new(hash)
    end

    #########################
    # COLLECT THEM
    #########################
    @aa_mods = static_mods + variable_mods
    @term_mods = static_terminal_mods + variable_terminal_mods
  end

  ## Generates the pepxml for static and differential amino acid mods based on
  ## sequest object (amino acid modifications first, then terminal ones)
  def to_pepxml
    st = ''
    if @aa_mods
      st << @aa_mods.map {|v| v.to_pepxml }.join
    end
    if @term_mods
      st << @term_mods.map {|v| v.to_pepxml }.join
    end
    st
  end

end
1102
+
1103
# Modified aminoacid, static or variable
# unless otherwise stated, all attributes can be anything
class Sequest::PepXML::AAModification
  include SpecIDXML

  # The amino acid (one letter code)
  attr_accessor :aminoacid
  # Mass difference with respect to unmodified aminoacid.  In the pepxml it
  # must begin with either + (nonnegative) or - [e.g. +1.05446 or -2.3342];
  # to_pepxml calls to_plus_minus_string on this value to produce that form.
  attr_accessor :massdiff
  # Mass of modified aminoacid
  attr_accessor :mass
  # Y if both modified and unmodified aminoacid could be present in the
  # dataset, N if only modified aminoacid can be present
  attr_accessor :variable
  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus
  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol
  # Y if each peptide must have only modified or unmodified aminoacid, N if a
  # peptide may contain both modified and unmodified aminoacid
  attr_accessor :binary

  # hash:: optional attribute values, applied with instance_var_set_from_hash
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # The aminoacid_modification element; massdiff is written in its
  # plus/minus string form.
  def to_pepxml
    pairs = [
      ['aminoacid', aminoacid],
      ['massdiff', massdiff.to_plus_minus_string],
      ['mass', mass],
      ['variable', variable],
      ['peptide_terminus', peptide_terminus],
      ['symbol', symbol],
      ['binary', binary],
    ]
    att_string = pairs.map {|name, value| "#{name}=\"#{value}\"" }.join(' ')
    short_element_xml_and_att_string("aminoacid_modification", att_string)
  end

end
1139
+
1140
# Modified terminus, static or variable
class Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus
  # Mass difference with respect to unmodified terminus
  attr_accessor :massdiff
  # Mass of modified terminus
  attr_accessor :mass
  # Y if both modified and unmodified terminus could be present in the
  # dataset, N if only modified terminus can be present
  attr_accessor :variable
  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol
  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus
  attr_accessor :description

  # hash:: optional attribute values, applied with instance_var_set_from_hash
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  # The terminal_modification element; massdiff is written in its plus/minus
  # string form.
  def to_pepxml
    pairs = [
      ['terminus', terminus],
      ['massdiff', massdiff.to_plus_minus_string],
      ['mass', mass],
      ['variable', variable],
      ['symbol', symbol],
      ['protein_terminus', protein_terminus],
      ['description', description],
    ]
    att_string = pairs.map {|name, value| "#{name}=\"#{value}\"" }.join(' ')
    short_element_xml_and_att_string("terminal_modification", att_string)
  end
end
1169
+
1170
+
1171
# The search_database element: the local path of the database and its
# sequence type.
class Sequest::PepXML::SearchDatabase
  include SpecIDXML
  attr_accessor :local_path
  attr_writer :seq_type

  # Takes a SequestParams object
  # Sets :local_path from the params object attr :database
  # args:: optional hash of attributes applied with set_from_hash
  def initialize(params=nil, args=nil)
    @seq_type = nil
    @local_path = params.database if params
    set_from_hash(args) if args
  end

  # The explicitly assigned sequence type when there is one; otherwise 'AA'
  # when the local path contains ".fasta".  Any other path aborts the program.
  def seq_type
    return @seq_type if @seq_type
    unless @local_path =~ /\.fasta/
      abort "Don't recognize type from your database local path: #{@local_path}"
    end
    'AA'
  end

  def to_pepxml
    att_string = "local_path=\"#{local_path}\" type=\"#{seq_type}\""
    short_element_xml_and_att_string(:search_database, att_string)
  end

end
1201
+
1202
Sequest::PepXML::SpectrumQuery = ArrayClass.new(%w(spectrum start_scan end_scan precursor_neutral_mass index assumed_charge search_results pepxml_version))

# 0=spectrum 1=start_scan 2=end_scan 3=precursor_neutral_mass 4=index 5=assumed_charge 6=search_results 7=pepxml_version

class Sequest::PepXML::SpectrumQuery
  include SpecIDXML

  ############################################################
  # FOR PEPXML:
  ############################################################

  # The spectrum_query element wrapping the pepxml of every search result.
  # Output is only produced when Sequest::PepXML.pepxml_version is 18; for
  # any other version the case has no matching branch and nil is returned.
  def to_pepxml
    case Sequest::PepXML.pepxml_version
    when 18
      element_xml("spectrum_query", [:spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :assumed_charge, :index]) do
        search_results.collect { |sr| sr.to_pepxml }.join
      end
    end
  end

  def self.from_pepxml_node(node, msmsrun_obj)
    self.new.from_pepxml_node(node, msmsrun_obj)
  end

  # Fills this object from the attributes of +node+ and builds one
  # SearchResult per child node.  Returns self.
  def from_pepxml_node(node, msmsrun_obj)
    self[0] = node['spectrum']
    self[1] = node['start_scan'].to_i
    self[2] = node['end_scan'].to_i
    self[3] = node['precursor_neutral_mass'].to_f
    self[4] = node['index'].to_i
    self[5] = node['assumed_charge'].to_i
    self[6] = node.children.map do |v|
      sh = Sequest::PepXML::SearchResult.new
      sh.from_pepxml_node(v, self, msmsrun_obj)
    end
    self
  end

  # Returns the precursor_neutral based on the scans and an array indexed by
  # scan numbers.  first and last scan and charge should be integers.
  # This is the precursor_mz - h_plus!
  # by=:prec_mz_arr|:deltamass
  # if prec_mz_arr then the following arguments must be supplied:
  # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
  # m/z for each product scan, :charge = int
  # if deltamass then the following arguments must be supplied:
  # m_plus_h = float, deltamass = float
  # For both flavors, a final additional argument 'average_weights'
  # can be used.  If true (default), average weights will be used, if false,
  # monoisotopic weights (currently this is simply the mass of the proton)
  #
  # The default only applies when the final argument is left off; a value
  # that is passed explicitly (including nil or false) is used as given.
  # (Previously leaving it off produced nil and silently selected the
  # monoisotopic mass.)  Aborts when +by+ is not one of the two flavors.
  def self.calc_precursor_neutral_mass(by, *args)
    average_weights = true
    case by
    when :prec_mz_arr
      (first_scan, last_scan, prec_mz_arr, charge) = args
      average_weights = args[4] if args.size > 4
    when :deltamass
      (m_plus_h, deltamass) = args
      average_weights = args[2] if args.size > 2
    end

    if average_weights
      mass_h_plus = SpecID::AVG[:h_plus]
    else
      mass_h_plus = SpecID::MONO[:h_plus]
    end

    case by
    when :prec_mz_arr
      mz = nil
      if first_scan != last_scan
        # average the precursor m/z over the scans in the range
        sum = 0.0
        tot_num = 0
        (first_scan..last_scan).each do |scan|
          val = prec_mz_arr[scan]
          if val # scans without an entry are left out of the average
            sum += val
            tot_num += 1
          end
        end
        mz = sum/tot_num
      else
        mz = prec_mz_arr[first_scan]
      end
      charge * (mz - mass_h_plus)
    when :deltamass
      m_plus_h - mass_h_plus + deltamass
    else
      abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
    end
  end

end
1290
+
1291
+
1292
Sequest::PepXML::SearchHit = ArrayClass.new( %w( hit_rank peptide peptide_prev_aa peptide_next_aa protein num_tot_proteins num_matched_ions tot_num_ions calc_neutral_pep_mass massdiff num_tol_term num_missed_cleavages is_rejected deltacnstar xcorr deltacn spscore sprank modification_info spectrum_query) )

# 0=hit_rank 1=peptide 2=peptide_prev_aa 3=peptide_next_aa 4=protein 5=num_tot_proteins 6=num_matched_ions 7=tot_num_ions 8=calc_neutral_pep_mass 9=massdiff 10=num_tol_term 11=num_missed_cleavages 12=is_rejected 13=deltacnstar 14=xcorr 15=deltacn 16=spscore 17=sprank 18=modification_info 19=spectrum_query

class Sequest::PepXML::SearchHit
  include SpecID::Pep
  include SpecIDXML

  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  # xcorr, deltacn, deltacnstar, spscore and sprank are written out as
  # search_score elements.
  # deltacnstar: 1 if there is no second ranked hit, 0 otherwise

  # $VERBOSE is switched off while initialize is (re)defined and restored
  # right after.
  tmp_verb = $VERBOSE
  $VERBOSE = nil
  # hash:: optional values keyed by attribute name (as symbols); attributes
  #        missing from the hash are set to nil
  def initialize(hash=nil)
    super(@@arr_size)
    if hash
      self[0,20] = [
        hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa],
        hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins],
        hash[:num_matched_ions], hash[:tot_num_ions],
        hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term],
        hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar],
        hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank],
        hash[:modification_info], hash[:spectrum_query],
      ]
    end
    self
  end
  $VERBOSE = tmp_verb

  undef_method :inspect
  # Lists every attribute as name:value.
  def inspect
    var = @@attributes.map do |m| "#{m}:#{self.send(m)}" end.join(" ")
    "#<SearchHit #{var}>"
  end

  # requires Params object and full sequence (with heads and tails)
  #
  # Counts the cleavage sites inside the peptide itself (the middle part of
  # the sequence): a residue listed in split_after that is followed, within
  # the peptide, by a residue not listed in except_before.  The last residue
  # is never counted since nothing follows it inside the peptide.
  #
  # Every position is tested on its own, so neighbouring sites such as the
  # two in "KKA" are both counted (a non-overlapping two-character scan found
  # only one).  With no split_after residues the result is 0 (this used to
  # build the invalid regexp "[]").
  def self.calc_num_missed_cleavages(params, sequence)
    _offset, split_after, except_before = params.enzyme_specificity
    _first, middle, _last = SpecID::Pep.split_sequence(sequence)
    split_after = split_after.to_s
    except_before = except_before.to_s
    return 0 if split_after.empty?

    num = 0
    (0...(middle.size - 1)).each do |i|
      next unless split_after.include?(middle[i,1])
      next if except_before.include?(middle[i+1,1])
      num += 1
    end
    num
  end

  # requires Params object and full sequence (with heads and tails)
  #
  # Returns how many of the two peptide ends (0, 1 or 2) agree with the
  # enzyme specificity.  An end also counts when its flanking residue is '-'.
  def self.calc_num_tol_term(params, sequence)
    _offset, split_after, except_before = params.enzyme_specificity
    first, middle, last = SpecID::Pep.split_sequence(sequence)
    first_of_middle = middle[0,1]
    last_of_middle = middle[-1,1]

    nterm_ok = first == '-' ||
      ( split_after.include?(first) && !except_before.include?(first_of_middle) )
    cterm_ok = last == '-' ||
      ( split_after.include?(last_of_middle) && !except_before.include?(last) )

    num_tol = 0
    num_tol += 1 if nterm_ok
    num_tol += 1 if cterm_ok
    num_tol
  end

  # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
  def self.split_ions(ions)
    ions.split("/").map {|ion| ion.to_i }
  end

  # One search_score element for the attribute named by +symbol+.
  def search_score_xml(symbol)
    "#{tabs}<search_score name=\"#{symbol}\" value=\"#{send(symbol)}\"/>"
  end

  # The search_score elements for all given symbols, one per line, with a
  # trailing newline.
  def search_scores_xml(*symbol_list)
    symbol_list.collect do |sy|
      search_score_xml(sy)
    end.join("\n") + "\n"
  end

  # The search_hit element containing the modification info (when there is
  # one) followed by the search scores.
  def to_pepxml
    mod_pepxml = self[18] ? self[18].to_pepxml : ''

    # note the to_plus_minus_string
    element_xml_and_att_string("search_hit", "hit_rank=\"#{hit_rank}\" peptide=\"#{peptide}\" peptide_prev_aa=\"#{peptide_prev_aa}\" peptide_next_aa=\"#{peptide_next_aa}\" protein=\"#{protein}\" num_tot_proteins=\"#{num_tot_proteins}\" num_matched_ions=\"#{num_matched_ions}\" tot_num_ions=\"#{tot_num_ions}\" calc_neutral_pep_mass=\"#{calc_neutral_pep_mass}\" massdiff=\"#{massdiff.to_plus_minus_string}\" num_tol_term=\"#{num_tol_term}\" num_missed_cleavages=\"#{num_missed_cleavages}\" is_rejected=\"#{is_rejected}\"") do
      mod_pepxml +
        search_scores_xml(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank)
    end
  end

  def self.from_pepxml_node(node, spec_query)
    self.new.from_pepxml_node(node, spec_query)
  end

  # Fills this hit from the attributes of +node+, its modification_info child
  # (if any) and its search_score children.  Returns self.
  def from_pepxml_node(node, spec_query)
    self[0] = node['hit_rank'].to_i
    self[1] = node['peptide']
    self[2] = node['peptide_prev_aa']
    self[3] = node['peptide_next_aa']
    self[4] = node['protein']  ## will this be the string?? (yes, for now)
    self[5] = node['num_tot_proteins'].to_i
    self[6] = node['num_matched_ions'].to_i
    self[7] = node['tot_num_ions'].to_i
    self[8] = node['calc_neutral_pep_mass'].to_f
    self[9] = node['massdiff'].to_f
    self[10] = node['num_tol_term'].to_i
    self[11] = node['num_missed_cleavages'].to_i
    self[12] = node['is_rejected'].to_i
    if modinfo_node = node.find_first("child::modification_info")
      self[18] = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
    end
    node.find("child::search_score").each do |ss_n|
      case ss_n['name']
      when 'deltacnstar'
        self[13] = ss_n['value'].to_i
      when 'xcorr'
        self[14] = ss_n['value'].to_f
      when 'deltacn'
        self[15] = ss_n['value'].to_f
      when 'spscore'
        self[16] = ss_n['value'].to_f
      when 'sprank'
        self[17] = ss_n['value'].to_i
      end
    end
    self[19] = spec_query
    self
  end

end
1435
+
1436
+
1437
Sequest::PepXML::SearchHit::ModificationInfo = ArrayClass.new(%w(modified_peptide mod_aminoacid_masses mod_nterm_mass mod_cterm_mass))

# Positions and masses of modifications
#
# Should be something like this:
#
#   <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
#     <mod_aminoacid_mass position=" " mass=" "/>
#   </modification_info>
#
# For example:
#
#   <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
#     <mod_aminoacid_mass position="2" mass="545.7160"/>
#     <mod_aminoacid_mass position="3" mass="147.1926"/>
#   </modification_info>
#
# Members, in the slot order fixed by the ArrayClass definition above:
#
# 0. modified_peptide::     Peptide sequence (with indicated modifications).
#                           I'm assuming that the native sequest indicators
#                           are OK here.
# 1. mod_aminoacid_masses:: (aliased as +masses+) These are objects of type
#                           ...ModAminoacidMass; position ranges from 1 to
#                           peptide length.
# 2. mod_nterm_mass::       Mass of modified N terminus
# 3. mod_cterm_mass::       Mass of modified C terminus
class Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  alias_method :masses, :mod_aminoacid_masses
  alias_method :masses=, :mod_aminoacid_masses=

  # Serializes to a <modification_info> element holding one
  # <mod_aminoacid_mass/> child per entry in +masses+.
  #
  # Attributes whose member is nil/false are left out.
  # Will escape any xml special chars in modified_peptide (the mass
  # attributes are interpolated as-is).
  def to_pepxml
    ## Collect the modifications:
    mod_strings =
      if masses and masses.size > 0
        masses.map { |ar| "position=\"#{ar[0]}\" mass=\"#{ar[1]}\"" }
      else
        []
      end
    ## Create the attribute string:
    att_parts = []
    att_parts << "mod_nterm_mass=\"#{mod_nterm_mass}\"" if mod_nterm_mass
    att_parts << "mod_cterm_mass=\"#{mod_cterm_mass}\"" if mod_cterm_mass
    if modified_peptide
      att_parts << "modified_peptide=\"#{escape_special_chars(modified_peptide)}\""
    end
    element_xml_and_att_string('modification_info', att_parts.join(" ")) do
      mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
    end
  end

  # Builds a new ModificationInfo from a <modification_info> node.
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # Fills this object from a <modification_info> node.
  #
  # modified_peptide, mod_nterm_mass and mod_cterm_mass are stored exactly as
  # node['...'] returns them; each mod_aminoacid_mass child becomes a
  # ModAminoacidMass of [position.to_i, mass.to_f].
  #
  # returns self
  def from_pepxml_node(node)
    self[0] = node['modified_peptide']
    self[2] = node['mod_nterm_mass']
    self[3] = node['mod_cterm_mass']
    # This used to read `node.children do |mass_n| ... end`. A block handed to
    # a reader that never yields is silently dropped, so no mass was ever
    # collected. Walk the mod_aminoacid_mass children explicitly instead, with
    # the same find(...).each pattern used for search_score nodes; the XPath
    # also keeps non-element children (e.g. whitespace text) out of the list.
    parsed_masses = []
    node.find("child::mod_aminoacid_mass").each do |mass_n|
      parsed_masses << Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mass_n['position'].to_i, mass_n['mass'].to_f])
    end
    self[1] = parsed_masses
    self
  end
end
1512
+
1513
# One modified residue: slot 0 is +position+, slot 1 is +mass+.
Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass = ArrayClass.new(['position', 'mass'])