mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -1,1675 +1,5 @@
1
+ require 'spec_id/sequest/params'
1
2
 
2
- require 'sample_enzyme'
3
- require 'spec/mzxml/parser'
4
- require 'hash_by'
5
- require 'set_from_hash'
6
- require 'spec_id/bioworks'
7
- require 'instance_var_set_from_hash'
8
- require 'spec/msrun'
9
- require 'spec_id/srf'
10
- require 'fileutils'
11
-
12
- class Numeric
13
- # returns a string with a + or - on the front
14
- def to_plus_minus_string
15
- if self >= 0
16
- '+' << self.to_s
17
- else
18
- '-' << self.to_s
19
- end
20
- end
21
- end
22
-
23
- ##########################################
24
- # NEED TO ADD MODIFICATIONS and generally verify pepxml creation!!! :
25
- # HERE's an excerpt from an example file from tpp 2.9.2 that I'm going to follow:
26
- =begin
27
- <search_summary base_name="/regis/data3/search/akeller/LCQ/COMET/LIGHT/haloICAT2_41" search_engine="COMET" precursor_mass_type="average" fragment_mass_type="average">
28
- <sequence_search_constraint sequence="C"/>
29
- <aminoacid_modification aminoacid="C" massdiff="8.049" mass="553.765" variable="Y" binary="N"/>
30
- <aminoacid_modification aminoacid="C" massdiff="442.5772" mass="545.7160" variable="N"/>
31
- <aminoacid_modification aminoacid="M" massdiff="16.0000" mass="147.1926" variable="Y" binary="N" symbol="1"/>
32
- <parameter name="peptide_mass_tol" value="3.0000"/>
33
- <parameter name="peptide_mass_tol_units" value="DA"/>
34
- <parameter name="num_output_lines" value="10"/>
35
- <parameter name="remove_precursor_peak" value="0"/>
36
- <parameter name="num_dup_headers" value="1"/>
37
- <parameter name="email_address" value=""/>
38
- <parameter name="ion_series" value="010000010"/>
39
- <parameter name="max_num_var_mod_residues" value="3"/>
40
- <parameter name="md5_check_sum" value="2547286a77a35abe2af3f2e9825ab814"/>
41
- </search_summary>
42
- =end
43
-
44
- # and a guy with modifications:
45
- =begin
46
- <search_result spectrum="haloICAT2_41.1110.1110.2" start_scan="1110" end_scan="1110" precursor_neutral_mass="2000.6641" assumed_charge="2" index="28">
47
- <search_hit hit_rank="1" peptide="GCMPSKEVLSAGAHR" peptide_prev_aa="R" peptide_next_aa="Y" protein="Chr_ORF0132" num_tot_proteins="1" num_matched_ions="19" tot_num_ions="30" calc_neutral_pep_mass="2001.3685" massdiff="-0.704" num_tol_term="2" num_missed_cleavages="1" is_rejected="0">
48
- <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
49
- <mod_aminoacid_mass position="2" mass="545.7160"/>
50
- <mod_aminoacid_mass position="3" mass="147.1926"/>
51
- </modification_info>
52
- <search_score name="dotproduct" value="359"/>
53
- <search_score name="delta" value="0.296"/>
54
- <search_score name="deltastar" value="0"/>
55
- <search_score name="zscore" value="5.290"/>
56
- <search_score name="expect" value="0.000E+00"/>
57
- <peptideprophet_result probability="0.9994" all_ntt_prob="(0.3713,0.4360,0.9994)">
58
- <search_score_summary>
59
- <parameter name="fval" value="3.4002"/>
60
- <parameter name="ntt" value="2"/>
61
- <parameter name="nmc" value="1"/>
62
- <parameter name="massd" value="-0.704"/>
63
- </search_score_summary>
64
- </peptideprophet_result>
65
- =end
66
-
67
- # sequest.params option:
68
- # diff_search_options = 15.994910 M 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y
69
- # permanent mods are at the bottom: ...
70
- # add_A_Alanine = 0.0000 ; added to A
71
- # add_S_Serine = 0.0000 ; added to S
72
- # add_P_Proline = 0.0000 ; added to P
73
- # add_V_Valine = 0.0000 ; added to V
74
- # add_T_Threonine = 0.0000 ; added to T
75
- # ...
76
-
77
-
78
-
79
- module Sequest; end
80
- class Sequest::PepXML; end
81
-
82
- class Sequest::PepXML::MSMSPipelineAnalysis
83
- include SpecIDXML
84
- # Version 1.2.3
85
- attr_writer :date
86
- attr_writer :xmlns, :xmlns_xsi, :xsi_schemaLocation
87
- attr_accessor :summary_xml
88
- # Version 2.3.4
89
- attr_writer :xmlns, :xmlns_xsi, :xsi_schema_location
90
- attr_accessor :pepxml_version
91
- attr_accessor :msms_run_summary
92
-
93
- # if block given, sets msms_run_summary to block
94
- def initialize(hash=nil)
95
- @xmlns = nil
96
- @xmlns_xsi = nil
97
- @xsi_schema_location = nil
98
- if hash
99
- self.set_from_hash(hash)
100
- end
101
- if block_given?
102
- @msms_run_summary = yield
103
- end
104
- end
105
-
106
- # if no date string given, then it will set to Time.now
107
- def date
108
- if @date ; @date
109
- else
110
- case Sequest::PepXML.pepxml_version
111
- when 18 ; tarr = Time.now.to_a ; tarr[3..5].reverse.join('-') + "T#{tarr[0..2].reverse.join(':')}"
112
- when 0 ; Time.new.to_s
113
- end
114
- end
115
- end
116
-
117
- def xmlns
118
- if @xmlns ; @xmlns
119
- else ; "http://regis-web.systemsbiology.net/pepXML"
120
- end
121
- end
122
-
123
- def xmlns_xsi
124
- if @xmlns_xsi ; @xmlns_xsi
125
- else ; "http://www.w3.org/2001/XMLSchema-instance"
126
- end
127
- end
128
-
129
- def xsi_schema_location
130
- if @xsi_schema_location ; @xsi_schema_location
131
- else ; "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd"
132
- end
133
- end
134
-
135
- def to_pepxml
136
- case Sequest::PepXML.pepxml_version
137
- when 0
138
- element_xml(:msms_pipeline_analysis, [:date, :summary_xml]) do
139
- @msms_run_summary.to_pepxml
140
- end
141
- when 18
142
- element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
143
- @msms_run_summary.to_pepxml
144
- end
145
- else
146
- abort "Don't know how to deal with version: #{Sequest::PepXML.pepxml_version}"
147
- end
148
- end
149
-
150
- end
151
-
152
- class Sequest::PepXML::MSMSRunSummary
153
- include SpecIDXML
154
-
155
- # the version of TPP you are using (determines xml output)
156
- # The name of the pep xml file (without extension) (but this is a long
157
- # filename!!!)
158
- attr_accessor :base_name
159
- # The name of the mass spec manufacturer
160
- attr_accessor :ms_manufacturer
161
- attr_accessor :ms_model
162
- attr_accessor :ms_mass_analyzer
163
- attr_accessor :ms_detector
164
- attr_accessor :raw_data_type
165
- attr_accessor :raw_data
166
- attr_accessor :ms_ionization
167
- attr_accessor :pepxml_version
168
-
169
- # A SampleEnzyme object (responds to: name, cut, no_cut, sense)
170
- attr_accessor :sample_enzyme
171
- # A SearchSummary object
172
- attr_accessor :search_summary
173
- # An array of spectrum_queries
174
- attr_accessor :spectrum_queries
175
-
176
- # takes a hash of name, value pairs
177
- # if block given, spectrum_queries (should be array of spectrum queries) is
178
- # set to the return value of the block
179
- def initialize(hash=nil)
180
- @spectrum_queries = []
181
- if hash
182
- instance_var_set_from_hash(hash)
183
- end
184
- if block_given? ; @spectrum_queries = yield end
185
- end
186
-
187
- def to_pepxml
188
- case Sequest::PepXML.pepxml_version
189
- when 18
190
- element_xml_and_att_string(:msms_run_summary, "base_name=\"#{base_name}\" msManufacturer=\"#{ms_manufacturer}\" msModel=\"#{ms_model}\" msIonization=\"#{ms_ionization}\" msMassAnalyzer=\"#{ms_mass_analyzer}\" msDetector=\"#{ms_detector}\" raw_data_type=\"#{raw_data_type}\" raw_data=\"#{raw_data}\"") do
191
- sample_enzyme.to_pepxml +
192
- search_summary.to_pepxml +
193
- spectrum_queries.map {|sq| sq.to_pepxml }.join
194
- end
195
- when 0
196
- # element_xml(:msms_run_summary, [:base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme]) do
197
- # element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type]) do
198
- # [
199
- # @params.short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]),
200
- # @params.short_element_xml(:sequence_search_constraint, [:sequence]),
201
- # @params.short_element_xml(:sequence_search_constraint, [:sequence]),
202
- # @params.pepxml_parameters(:peptide_mass_tol, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
203
- # ].join("\n")
204
- # end + "\n" +
205
- # @spectrum_queries.collect {|result| result.to_pepxml }.join("\n")
206
- # end
207
- end
208
- end
209
-
210
- end
211
-
212
-
213
-
214
- class Sequest::PepXML
215
- include SpecIDXML
216
-
217
- ## CREATE a default version for the entire class
218
- class << self
219
- attr_accessor :pepxml_version
220
- end
221
- DEF_VERSION = 18
222
- self.pepxml_version = DEF_VERSION # default version
223
-
224
- attr_accessor :pepxml_version, :msms_pipeline_analysis
225
- ## the full path name (no extension)
226
- attr_accessor :base_name
227
- attr_accessor :h_plus
228
- attr_accessor :avg_parent
229
-
230
- #attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
231
-
232
- # returns an array of spectrum queries
233
- def spectrum_queries
234
- msms_pipeline_analysis.msms_run_summary.spectrum_queries
235
- end
236
-
237
- # msms_pipeline_analysis is set to the result of the yielded block
238
- # and set_mono_or_avg is called with params if given
239
- def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
240
- self.class.pepxml_version = pepxml_version
241
- if sequest_params_obj
242
- set_mono_or_avg(sequest_params_obj)
243
- end
244
- if block_given?
245
- @msms_pipeline_analysis = yield
246
- @base_name = @msms_pipeline_analysis.msms_run_summary.base_name
247
- end
248
- end
249
-
250
- # sets @h_plus and @avg_parent from the sequest params object
251
- def set_mono_or_avg(sequest_params_obj)
252
- case sequest_params_obj.precursor_mass_type
253
- when "monoisotopic" ; @avg_parent = false
254
- else ; @avg_parent = true
255
- end
256
-
257
- case @avg_parent
258
- when true ; @h_plus = SpecID::AVG[:h_plus]
259
- when false ; @h_plus = SpecID::MONO[:h_plus]
260
- end
261
- end
262
-
263
- def date
264
- Time.new.to_s
265
- end
266
-
267
- def xml_version
268
- '<?xml version="1.0" encoding="UTF-8"?>' + "\n"
269
- end
270
-
271
- # for pepxml_version == 0
272
- def doctype
273
- '<!DOCTYPE msms_pipeline_analysis SYSTEM "/usr/bin/msms_analysis3.dtd">' + "\n"
274
- end
275
-
276
- def style_sheet
277
- case self.class.pepxml_version
278
- when 0
279
- '<?xml-stylesheet type="text/xsl" href="/isb/std_xsl/pepXML_std.xsl"?>' + "\n"
280
- when 18
281
- '<?xml-stylesheet type="text/xsl" href="/tools/bin/TPP/tpp/schema/pepXML_std.xsl"?>'
282
- end
283
- end
284
-
285
- def header
286
- case self.class.pepxml_version
287
- when 0 ; xml_version + doctype + style_sheet
288
- when 18 ; xml_version + style_sheet
289
- end
290
- end
291
-
292
- # updates the private attrs _num_prots and _first_prot on bioworks pep
293
- # objects. Ideally, we'd like these attributes to reside elsewhere, but for
294
- # memory concerns, this is best for now.
295
- def self._prot_num_and_first_prot_by_pep(pep_array)
296
- pep_array.hash_by(:aaseq).each do |aasq, pep_arr|
297
- prts = []
298
- pep_arr.each { |pep| prts.push( *(pep.prots) ) }
299
- prts.uniq!
300
- _size = prts.size
301
- pep_arr.each do |pep|
302
- pep._num_prots = _size.to_s
303
- pep._first_prot = prts.first
304
- end
305
- end
306
- end
307
-
308
-
309
- Default_Options = {
310
- :out_path => '.',
311
- #:backup_db_path => '.',
312
- # a PepXML option
313
- :pepxml_version => DEF_VERSION,
314
- ## MSMSRunSummary options:
315
- # string must be recognized in sample_enzyme.rb
316
- # or create your own SampleEnzyme object
317
- :sample_enzyme => 'trypsin',
318
- :ms_manufacturer => 'ThermoFinnigan',
319
- :ms_model => 'LCQ Deca XP Plus',
320
- :ms_ionization => 'ESI',
321
- :ms_mass_analyzer => 'Ion Trap',
322
- :ms_detector => 'UNKNOWN',
323
- :ms_data => '.', # path to ms data files (raw or mzxml)
324
- :raw_data_type => "raw",
325
- :raw_data => ".mzXML", ## even if you don't have it?
326
- ## SearchSummary options:
327
- :out_data_type => "out", ## may be srf?? don't think pepxml recognizes this yet
328
- :out_data => ".tgz", ## may be srf??
329
- :copy_mzxml => false, # copy the mzxml file to the out_path (create it if necessary)
330
- :print => false, # print the objects to file
331
- }
332
-
333
- # will dynamically set :ms_model and :ms_mass_analyzer from srf info
334
- # (ignoring defaults or anything passed in) for LTQ Orbitrap
335
- # and LCQ Deca XP
336
- # See SRF::Sequest::PepXML::Default_Options hash for defaults
337
- # unless given, the out_path will be given as the path of the srf_file
338
- # srf may be an object or a filename
339
- def self.new_from_srf(srf, opts={})
340
- opts = Default_Options.merge(opts)
341
-
342
- ## read the srf file
343
- if srf.is_a? String
344
- srf = SRF.new(srf)
345
- end
346
-
347
- ## set the outpath
348
- out_path = opts.delete(:out_path)
349
-
350
- params = srf.params
351
-
352
- ## check to see if we need backup_db
353
- backup_db_path = opts.delete(:backup_db_path)
354
- if !File.exist?(params.database) && backup_db_path
355
- params.database_path = backup_db_path
356
- end
357
-
358
- #######################################################################
359
- # PREPARE THE OPTIONS:
360
- #######################################################################
361
- ## remove items from the options hash that don't belong to
362
- ppxml_version = opts.delete(:pepxml_version)
363
- out_data_type = opts.delete(:out_data_type)
364
- out_data = opts.delete(:out_data)
365
-
366
- ## Extract meta info from srf
367
- bn_noext = base_name_noext(srf.header.raw_filename)
368
- opts[:ms_model] = srf.header.model
369
- case opts[:ms_model]
370
- when /Orbitrap/
371
- opts[:ms_mass_analyzer] = 'Orbitrap'
372
- when /LCQ Deca XP/
373
- opts[:ms_mass_analyzer] = 'Ion Trap'
374
- end
375
-
376
- ## Create the base name
377
- full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
378
- opts[:base_name] = full_base_name_no_ext
379
-
380
- ## Create the search summary:
381
- search_summary_options = {
382
- :search_database => Sequest::PepXML::SearchDatabase.new(params),
383
- :base_name => full_base_name_no_ext,
384
- :out_data_type => out_data_type,
385
- :out_data => out_data
386
- }
387
- modifications_string = srf.header.modifications
388
- search_summary = Sequest::PepXML::SearchSummary.new( params, modifications_string, search_summary_options)
389
-
390
- ## Create the SampleEnzyme object if necessary
391
- unless opts[:sample_enzyme].is_a? SampleEnzyme
392
- opts[:sample_enzyme] = SampleEnzyme.new(opts[:sample_enzyme])
393
- end
394
-
395
- ## Create the pepxml obj and top level objects
396
- pepxml_obj = Sequest::PepXML.new(ppxml_version, params)
397
- pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'})
398
- pepxml_obj.msms_pipeline_analysis = pipeline
399
- pipeline.msms_run_summary = Sequest::PepXML::MSMSRunSummary.new(opts)
400
- pipeline.msms_run_summary.search_summary = search_summary
401
- modifications_obj = search_summary.modifications
402
-
403
- ## name some common variables we'll need
404
- h_plus = pepxml_obj.h_plus
405
- avg_parent = pepxml_obj.avg_parent
406
-
407
-
408
- ## COPY MZXML FILES IF NECESSARY
409
- if opts[:copy_mzxml]
410
- mzxml_pathname_noext = File.join(opts[:ms_data], bn_noext)
411
- to_copy = Spec::MzXML.file_to_mzxml(mzxml_pathname_noext)
412
- if to_copy
413
- FileUtils.cp to_copy, out_path
414
- else
415
- puts "Couldn't file mzXML file with base: #{mzxml_pathname_noext}"
416
- puts "Perhaps you need to specifiy the location of the raw data"
417
- puts "or need an mzXML converter (readw.exe or t2x)"
418
- exit
419
- end
420
- end
421
-
422
-
423
- #######################################################################
424
- # CREATE the spectrum_queries_ar
425
- #######################################################################
426
- srf_index = srf.index
427
- out_files = srf.out_files
428
- spectrum_queries_arr = Array.new(srf.dta_files.size)
429
- files_with_hits_index = 0 ## will end up being 1 indexed
430
- srf.dta_files.each_with_index do |dta_file,i|
431
- next if out_files[i].num_hits == 0
432
- files_with_hits_index += 1
433
-
434
- # Sort the hits
435
- hits = out_files[i].hits
436
- arr = hits.sort_by{|v| v.xcorr }
437
-
438
- # Get proper deltacn and deltacnstar
439
- # Prophet deltacn is not the same as the native Sequest deltacn
440
- # It is the deltacn of the second best hit!
441
- top_hit = arr.pop
442
- second_hit = arr.last
443
- if second_hit
444
- top_hit[1] = second_hit[1]
445
- deltacnstar = '0'
446
- else
447
- top_hit[1] = '1.0'
448
- deltacnstar = '1'
449
- end
450
-
451
-
452
-
453
- ## mass calculations:
454
- precursor_neutral_mass = dta_file.mh - h_plus
455
- calc_neutral_pep_mass = top_hit[0] - h_plus
456
- massdiff = precursor_neutral_mass - calc_neutral_pep_mass
457
- if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
458
- else ; massdiff = massdiff.to_s end
459
-
460
- (start_scan, end_scan, charge) = srf_index[i]
461
-
462
-
463
-
464
- sq_hash = {
465
- :spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
466
- :start_scan => start_scan,
467
- :end_scan => end_scan,
468
- :precursor_neutral_mass => precursor_neutral_mass,
469
- :assumed_charge => charge,
470
- :pepxml_version => ppxml_version,
471
- :index => files_with_hits_index,
472
- }
473
-
474
- spectrum_query = Sequest::PepXML::SpectrumQuery.new(sq_hash)
475
-
476
- sequence = top_hit[8]
477
-
478
- # NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
479
- ## THIS IS ALL INNER LOOP, so we make every effort at speed here:
480
- (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(sequence)
481
- # ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
482
-
483
- sh_hash = {
484
- :hit_rank => "1",
485
- :peptide => pepseq,
486
- :peptide_prev_aa => prevaa,
487
- :peptide_next_aa => nextaa,
488
- :protein => top_hit[9].first.reference.split(" ").first,
489
- :num_tot_proteins => top_hit[9].size,
490
- :num_matched_ions => top_hit[6],
491
- :tot_num_ions => top_hit[7],
492
- :calc_neutral_pep_mass => calc_neutral_pep_mass,
493
- :massdiff => massdiff,
494
- :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, sequence),
495
- :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, sequence),
496
- :is_rejected => '0',
497
- # These are search score attributes:
498
- :xcorr => top_hit[3],
499
- :deltacn => top_hit[1],
500
- :deltacnstar => deltacnstar,
501
- :spscore => top_hit[2],
502
- :sprank => top_hit[5],
503
- :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(sequence)[1]),
504
- }
505
- search_hit = Sequest::PepXML::SearchHit.new(sh_hash) # there can be multiple hits
506
-
507
- search_result = Sequest::PepXML::SearchResult.new
508
- search_result.search_hits = [search_hit]
509
- spectrum_query.search_results = [search_result]
510
- spectrum_queries_arr[files_with_hits_index] = spectrum_query
511
- end
512
- spectrum_queries_arr.compact!
513
-
514
- pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
515
- pepxml_obj.base_name = pipeline.msms_run_summary.base_name
516
- pipeline.msms_run_summary.spectrum_queries = spectrum_queries_arr
517
-
518
- pepxml_obj
519
- end
520
-
521
# Builds PepXML objects from an .srg or bioworks.xml file.
#
# bioworks_file:: path handed to SpecID.new
# opts:: merged over Default_Options; keys read here are
#        :out_path (created when missing, must be a directory),
#        :params   (required when the file parses to a Bioworks object),
#        :print    (when true, each object is written to "<base_name>.xml")
#
# Returns whatever the underlying builder returns: the result of
# set_from_bioworks_xml for Bioworks input, or one new_from_srf result per
# srf for an SRFGroup.  Aborts on any other kind of object.
def self.set_from_bioworks(bioworks_file, opts={})
  opts = Default_Options.merge(opts)

  # make sure the output location exists and really is a directory
  out_dir = opts[:out_path]
  FileUtils.mkpath(out_dir) unless File.exist?(out_dir)
  abort "#{out_dir} must be a directory!" unless File.directory?(out_dir)

  spec_id = SpecID.new(bioworks_file)
  pepxml_objs =
    case spec_id
    when Bioworks
      abort("must have opts[:params] set!") unless opts[:params]
      set_from_bioworks_xml(bioworks_file, opts[:params], opts)
    when SRFGroup
      spec_id.srfs.map { |srf| new_from_srf(srf, opts) }
    else
      abort "invalid object"
    end

  if opts[:print]
    pepxml_objs.each { |obj| obj.to_pepxml(obj.base_name + ".xml") }
  end
  pepxml_objs
end
555
-
556
-
557
# Takes bioworks 3.2/3.3 xml output (with no filters)
# Returns a list of PepXML objects (one per base_name found among the
# bioworks peptides; it is the value of the final #map)
# params = sequest.params file (a Sequest::Params object or a path String)
# bioworks = bioworks.xml exported multi-consensus view file (a Bioworks
#            object or a path String)
# pepxml_version = 0 for tpp 1.2.3
# pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
# opts is merged over Default_Options; besides the keys unpacked on the
# values_at line below, :backup_db_path and :copy_mzxml are consulted.
# Aborts on an unsupported pepxml_version, on unrecognized params/bioworks
# arguments and on an unrecognized bioworks version.
def self.set_from_bioworks_xml(bioworks, params, opts={})
  opts = Default_Options.merge(opts)
  pepxml_version, sample_enzyme, ms_manufacturer, ms_model, ms_ionization, ms_mass_analyzer, ms_detector, raw_data_type, raw_data, out_data_type, out_data, ms_data, out_path = opts.values_at(:pepxml_version, :sample_enzyme, :ms_manufacturer, :ms_model, :ms_ionization, :ms_mass_analyzer, :ms_detector, :raw_data_type, :raw_data, :out_data_type, :out_data, :ms_data, :out_path)

  # fall back to the current directory when no output path was configured
  unless out_path
    out_path = '.'
  end

  supported_versions = [0,18]

  unless supported_versions.include?(opts[:pepxml_version])
    abort "pepxml_version: #{pepxml_version} not currently supported. Current support is for versions #{supported_versions.join(', ')}"
  end

  ## Turn params and bioworks_obj into objects if necessary:
  # Params:
  # (exact class comparison: subclasses of Sequest::Params/String abort)
  if params.class == Sequest::Params # OK!
  elsif params.class == String ; params = Sequest::Params.new(params)
  else ; abort "Don't recognize #{params} as object or string!"
  end
  # Bioworks:
  if bioworks.class == Bioworks # OK!
  elsif bioworks.class == String ; bioworks = SpecID.new(bioworks)
  else ; abort "Don't recognize #{bioworks} as object or string!"
  end

  #puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"

  ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
  ## bioworks
  #bioworks.peps.each { |pep| if pep.class != Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end }

  ## check to see if we need backup_db

  # when the database named in params is not on disk, re-point it at the
  # backup directory (only if one was supplied)
  backup_db_path = opts.delete(:backup_db_path)
  if !File.exist?(params.database) && backup_db_path
    params.database_path = backup_db_path
  end

  ## Start
  # NOTE(review): split_bio_objs is never read again in this method
  split_bio_objs = []

  ## (num_prots_by_pep, prot_by_pep) =
  #num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE"

  # one search summary (and its modifications object) is shared by every
  # run summary created below
  modifications_string = bioworks.modifications
  search_summary = Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => Sequest::PepXML::SearchDatabase.new(params), :out_data_type => out_data_type, :out_data => out_data})
  modifications_obj = search_summary.modifications

  ## Create a hash of spectrum_query arrays by filename (this very big block):
  # NOTE(review): spectrum_queries_by_base_name is never read again in this method
  spectrum_queries_by_base_name = {}
  # Hash by the filenames to split into filenames:
  # (hash_by is defined elsewhere; presumably it groups the peps by the
  # given attribute -- confirm)
  bioworks.peps.hash_by(:base_name).map do |base_name, pep_arr|

    pepxml_obj = Sequest::PepXML.new(pepxml_version, params)
    full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)

    case pepxml_version
    when 18
      pipeline = Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'})
      msms_run_summary = Sequest::PepXML::MSMSRunSummary.new({
        :base_name => full_base_name_no_ext,
        :ms_manufacturer => ms_manufacturer,
        :ms_model => ms_model,
        :ms_ionization => ms_ionization,
        :ms_mass_analyzer => ms_mass_analyzer,
        :ms_detector => ms_detector,
        :raw_data_type => raw_data_type,
        :raw_data => raw_data,
        :sample_enzyme => SampleEnzyme.new(sample_enzyme),
        :search_summary => search_summary,
      })
      pipeline.msms_run_summary = msms_run_summary
      pepxml_obj.msms_pipeline_analysis = pipeline
      pepxml_obj.msms_pipeline_analysis.msms_run_summary.search_summary.base_name = full_base_name_no_ext
      pepxml_obj.base_name = full_base_name_no_ext
      pepxml_obj
    when 0
      # NOTE(review): nothing is built for version 0, so `pipeline` is still
      # nil when it is dereferenced at the bottom of this block -- version 0
      # looks unusable here despite being listed as supported; confirm.
      ## @TODO: NEED TO REVAMP THIS:
      # Sequest::PepXML.new(pepxml_version).set_from_hash({
      # :params => params,
      # :search_results => spectrum_queries_arr,
      # :base_name => self.make_base_name( File.expand_path(out_path), base_name),
      # :search_engine => params.search_engine,
      # :database => params.database,
      # :raw_data_type => "mzXML",
      # :raw_data => ".mzXML",
      # :out_data_type => "out",
      # :out_data => ".tgz",
      # :sample_enzyme => params.enzyme,
      # })
    end

    # Create a hash by pep object containing num_tot_proteins
    # This is only valid if all hits are present (no previous thresholding)
    # Since out2summary only acts on one folder at a time,
    # we should only do it for one folder at a time! (that's why we do this
    # here instead of globally)
    # (the return value is discarded; presumably the call annotates each pep
    # with the _num_prots/_first_prot values read further down -- confirm)
    self._prot_num_and_first_prot_by_pep(pep_arr)
    prec_mz_arr = nil
    # NOTE(review): the dots in /3.2/ and /3.3/ are unescaped, so they match
    # any character in that position
    case x = bioworks.version
    when /3.2/
      calc_prec_by = :prec_mz_arr
      # get the precursor_mz array for this filename
      prec_mz_arr = Spec::MSRun.precursor_mz_by_scan(File.join(ms_data, base_name))
    when /3.3/
      calc_prec_by = :deltamass
    else
      abort "invalid BioworksBrowser version: #{x}"
    end

    # optionally copy the mzXML file (when one is located) next to the output
    if opts[:copy_mzxml]
      to_copy = Spec::MzXML.file_to_mzxml(File.join(ms_data, base_name))
      if to_copy
        FileUtils.cp to_copy, out_path
      end
    end

    # one spectrum query per (first_scan, last_scan, charge) combination
    spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|

      # Sort_by_rank and take the top hit (to mimic out2summary):
      arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
      top_pep = arr.pop
      second_hit = arr.last # needed for deltacnstar

      case calc_prec_by
      when :prec_mz_arr
        precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, pepxml_obj.avg_parent)
      when :deltamass
        precursor_neutral_mass = Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
      end

      calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
      massdiff = precursor_neutral_mass - calc_neutral_pep_mass
      # massdiff becomes a String carrying an explicit sign
      if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
      else ; massdiff = massdiff.to_s end #already has a -
      # deltacn & star:
      # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
      # when there is no second hit the top pep's deltacn is overwritten
      # with '1.0' (this mutates the pep object) and deltacnstar is '1'
      if second_hit
        #top_pep.deltacn = second_hit.deltacn
        deltacnstar = '0'
      else
        top_pep.deltacn = '1.0'
        deltacnstar = '1'
      end
      # Create the nested structure of queries{results{hits}}
      # (Ruby's blocks work beautifully for things like this)
      spec_query = Sequest::PepXML::SpectrumQuery.new({
        :spectrum => [top_pep.base_name, top_pep.first_scan, top_pep.last_scan, top_pep.charge].join("."),
        :start_scan => top_pep.first_scan,
        :end_scan => top_pep.last_scan,
        :precursor_neutral_mass => precursor_neutral_mass.to_s,
        :assumed_charge => top_pep.charge,
        :pepxml_version => pepxml_version,
      })


      search_result = Sequest::PepXML::SearchResult.new

      ## Calculate some interdependent values;
      # NOTE: the bioworks mass is really M+H if two or more scans went
      # into the search_hit; calc_neutral_pep_mass is simply the avg of
      # precursor masses adjusted to be neutral
      (prevaa, pepseq, nextaa) = SpecID::Pep.prepare_sequence(top_pep.sequence)
      (num_matched_ions, tot_num_ions) = Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
      search_hit = Sequest::PepXML::SearchHit.new({
        :hit_rank => "1",
        :peptide => pepseq,
        :peptide_prev_aa => prevaa,
        :peptide_next_aa => nextaa,
        :protein => top_pep._first_prot.reference.split(" ").first,
        :num_tot_proteins => top_pep._num_prots,
        :num_matched_ions => num_matched_ions,
        :tot_num_ions => tot_num_ions,
        :calc_neutral_pep_mass => calc_neutral_pep_mass.to_s,
        :massdiff => massdiff,
        :num_tol_term => Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_pep.sequence).to_s,
        :num_missed_cleavages => Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_pep.sequence).to_s,
        :is_rejected => "0",
        # These are search score attributes:
        :xcorr => top_pep.xcorr,
        :deltacn => top_pep.deltacn,
        :deltacnstar => deltacnstar,
        :spscore => top_pep.sp,
        :sprank => top_pep.rsp,
        :modification_info => modifications_obj.modification_info(SpecID::Pep.split_sequence(top_pep.sequence)[1]),
      })
      search_result.search_hits = [search_hit] # there can be multiple search hits
      spec_query.search_results = [search_result] # can be multiple search_results
      spec_query
    end

    # create an index by spectrum as results end up typically in out2summary
    # (I really dislike this order, however)
    # the index is a 1-based position stored as a String
    spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
    spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
    pipeline.msms_run_summary.spectrum_queries = spectrum_queries_ar
    pepxml_obj
  end ## collects pepxml_objs
end
770
-
771
# File name of the summary xml: base_name followed by ".xml".
def summary_xml
  extension = ".xml"
  base_name + extension
end
774
-
775
# Precursor mass type, as answered by the sequest params object (@params).
def precursor_mass_type
  @params.precursor_mass_type
end
778
-
779
# Fragment mass type, as answered by the sequest params object (@params).
def fragment_mass_type
  @params.fragment_mass_type
end
782
-
783
# Combines the basename of filename with path, using the separator that
# path itself uses.
#
# path:: directory, with either "/" or "\" separators (a trailing
#        separator is allowed)
# filename:: file name; only its File.basename is kept
#
# The separator is "\" only when path contains more backslashes than forward
# slashes.  Occurrences are counted (rather than comparing String#split
# sizes) because split drops trailing empty fields, which made a path such
# as "/" look like a backslash path and produced "/\file".
def self.make_base_name(path, filename)
  sep = path.count("\\") > path.count("/") ? "\\" : "/"
  base = File.basename(filename)
  if path.end_with?(sep)
    path + base
  else
    path + sep + base
  end
end
795
-
796
# Returns the pepxml document as a String: the header followed by the
# pepxml of @msms_pipeline_analysis (appended in place to the string that
# header returns).  When file is given the same text is also written there.
def to_pepxml(file=nil)
  xml = header << @msms_pipeline_analysis.to_pepxml
  File.open(file, "w") { |out| out.print xml } if file
  xml
end
806
-
807
# Given any kind of filename (from windows or whatever), returns the base
# of the filename with no file extension.
#
# Backslashes are treated as directory separators.  The trailing run of
# dot-separated extensions is removed, e.g. "file.tar.gz" => "file".
# The argument is left untouched (so frozen strings are accepted); the
# previous implementation rewrote the caller's string in place.
def self.base_name_noext(file)
  unix_style = file.gsub("\\", '/')
  File.basename(unix_style).sub(/\.[\w^\.]+$/, '')
end
813
-
814
-
815
- end # PepXML
816
-
817
##
# Reads a sequest.params file and answers questions about its parameters.
# In the future, this guy should accept any version of bioworks params file
# and spit out any param queried.
#
# Any parameter without an explicit method can be read by name through
# method_missing (e.g. params.peptide_mass_tolerance); unknown names give nil.
class Sequest::Params
  include SpecIDXML

  # separates a parameter name from its value ("name = value")
  @@param_re = / = ?/o
  # separates a value from the comment that may trail it
  @@param_two_split = ';'

  # opts are the general options
  # mods are the weights added to amino acids
  attr_accessor :opts, :mods

  # all keys and values stored as strings!
  # parses file right away when one is given
  def initialize(file=nil)
    parse(file) if file
  end

  # returns hash of params for continuous lines of non-whitespace
  # Reading stops at the first blank line, at end of input, or right after
  # a line mentioning "added to U" or "digest_mass_range".
  # Lines without a " = " separator are skipped (they used to raise).
  def grab_params(fh)
    hash = {}
    while (line = fh.gets)
      break unless line =~ /[^\s]/
      name, value = line.split(@@param_re)
      if value
        # drop the trailing ";comment" and surrounding whitespace
        hash[name] = value.split(@@param_two_split).first.to_s.rstrip
      end
      # it is necessary to add this break so that params files inside srf
      # files can be read. This will terminate the reading at the end of
      # the file even though there are more lines
      # (there is no space around digest_mass_range in the srf params files)
      ## Will only work on bioworks 3.2 & 3.3 (bioworks 3.1 last line => Elastase/Tryp...)
      break if line =~ /added to U/ || line =~ /digest_mass_range/
    end
    hash
  end

  # Parses an open handle positioned at the "[SEQUEST]" header line.
  # returns self
  def parse_handle(fh)
    fh.gets # skip the [SEQUEST] header line
    @opts = grab_params(fh)
    @opts["search_engine"] = "SEQUEST"
    @mods = grab_params(fh)

    ## this gets rid of the .hdr postfix on indexed databases
    db_name = @opts["first_database_name"]
    @opts["first_database_name"] = db_name.sub(/\.hdr$/, '') if db_name
    self
  end

  ## parses file
  ## and drops the .hdr behind indexed fasta files
  ## returns self
  def parse(file)
    File.open(file) do |fh|
      parse_handle(fh)
    end
    self
  end

  # returns( split_after, except_before)
  # (taken from the 4th and 5th whitespace separated fields of enzyme_info;
  # returns nil unless version is "3.2")
  def enzyme_specificity
    if version == "3.2"
      arr = enzyme_info.split(/\s+/)[3,2]
      arr.collect! do |str|
        (str && str.class == String) ? str : ""
      end
      return *arr
    end
  end

  # Returns the version of the sequest.params file
  # Returns String "3.2" if contains "enzyme_info"
  # Returns String "3.1" if contains "enzyme_number"
  # Returns nil otherwise
  def version
    if @opts["enzyme_info"] ; return "3.2"
    elsif @opts["enzyme_number"] ; return "3.1"
    end
  end

  ####################################################
  # TO PEPXML
  ####################################################
  # In some ways, this is merely translating to the older Bioworks
  # sequest.params files

  # I'm not sure if this is the right mapping for sequence_search_constraint?
  # Returns partial_sequence, or "0" when it is missing or empty.
  def sequence
    pseq = @opts['partial_sequence']
    (!pseq || pseq == "") ? "0" : pseq
  end

  # Returns xml in the form <parameter name="#{method_name}"
  # value="#{method_value}"/> for list of symbols
  # (every key of opts, in sorted order, is handed to params_xml)
  def pepxml_parameters
    keys_as_symbols = @opts.sort.map do |k,v| k.to_s end
    params_xml(*keys_as_symbols)
    # (:peptide_mass_tol, :peptide_mass_units, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
  end

  # "average" or "monoisotopic" depending on mass_type_parent ('0'/'1');
  # aborts on any other value
  def precursor_mass_type
    case @opts['mass_type_parent']
    when '0' ; "average"
    when '1' ; "monoisotopic"
    else ; abort "error in mass_type_parent in sequest!"
    end
  end

  # "average" or "monoisotopic" depending on mass_type_fragment ('0'/'1');
  # aborts on any other value
  def fragment_mass_type
    case @opts['mass_type_fragment']
    when '0' ; "average"
    when '1' ; "monoisotopic"
    else ; abort "error in mass_type_fragment in sequest!"
    end
  end

  # Looks the method name up in opts, then in mods; nil when it is in
  # neither (also before anything has been parsed).
  def method_missing(name, *args)
    string = name.to_s
    [@opts, @mods].each do |table|
      return table[string] if table && table.key?(string)
    end
    nil
  end

  # Reports the parameter names that method_missing can answer from the
  # parsed hashes.
  def respond_to_missing?(name, include_private = false)
    string = name.to_s
    [@opts, @mods].any? { |table| table && table.key?(string) } || super
  end

  ## We only need to define values if they are different than sequest.params
  ## The method_missing will look them up in the hash!

  # Returns a system independent basename
  # Splits on "\" or "/"
  def _sys_ind_basename(file)
    return file.split(/[\\\/]/)[-1]
  end

  # changes the path of the database
  # (keeps the database file name, replaces its directory with newpath)
  def database_path=(newpath)
    db = @opts["first_database_name"]
    newpath = File.join(newpath, _sys_ind_basename(db))
    @opts["first_database_name"] = newpath
  end

  # the database file name (first_database_name)
  def database
    @opts["first_database_name"]
  end

  # returns the appropriate aminoacid mass lookup table (in spec_id.rb SpecID::MONO or
  # SpecID::AVG based on precursor_mass_type
  def mass_table
    case precursor_mass_type
    when 'average'
      SpecID::AVG
    when 'monoisotopic'
      SpecID::MONO
    end
  end

  # at least in Bioworks 3.2, the First number after the enzyme
  # is the indication of the enzymatic end stringency (required):
  # 1 = Fully enzymatic
  # 2 = Either end
  # 3 = N terminal only
  # 4 = C terminal only
  # So, to get min_number_termini we map like this:
  # 1 => 2
  # 2 => 1
  # (anything else prints a warning and gives "1")
  def min_number_termini
    termini_number = @opts["enzyme_info"].split(" ")[1]
    if termini_number == "1"
      return "2"
    elsif termini_number == "2"
      return "1"
    else
      puts "WARNING: Enzyme termini info might be imprecise!"
      return "1"
    end
  end

  # the part of enzyme_info that precedes the first "("
  def enzyme
    #if @opts["enzyme_info"] =~ /Trypsin/ ; return "tryptic"
    #else ; return @opts["enzyme_info"].split('(')[0] end
    return @opts["enzyme_info"].split('(')[0]
  end

  def max_num_internal_cleavages
    @opts["max_num_internal_cleavage_sites"]
  end

  # peptide_mass_tolerance; warns when peptide_mass_units is not "0"
  def peptide_mass_tol
    if @opts["peptide_mass_units"] != "0"
      puts "WARNING: peptide_mass_tol units need to be adjusted!"
    end
    @opts["peptide_mass_tolerance"]
  end

  def fragment_ion_tol
    @opts["fragment_ion_tolerance"]
  end

  def max_num_differential_AA_per_mod
    @opts["max_num_differential_per_peptide"]
  end

  ## @TODO: We could add some of the parameters not currently being asked for to be more complete
  ## @TODO: We could always add the Bioworks 3.2 specific params as params

  ####################################################
  ####################################################

end
1036
-
1037
class Sequest::PepXML::SearchResult
  include SpecIDXML

  # the search hits held by this result (an array)
  attr_accessor :search_hits

  # search_hits becomes the block's return value when a block is given,
  # otherwise an empty array
  def initialize
    @search_hits = block_given? ? yield : []
  end

  # hands the concatenated pepxml of every hit to
  # element_xml_no_atts(:search_result)
  def to_pepxml
    element_xml_no_atts(:search_result) do
      hits_xml = @search_hits.collect { |hit| hit.to_pepxml }
      hits_xml.join
    end
  end
end
1054
-
1055
class Sequest::PepXML::SearchSummary
  include SpecIDXML

  attr_accessor :params
  attr_accessor :base_name
  attr_accessor :out_data_type
  attr_accessor :out_data
  attr_accessor :modifications
  # A SearchDatabase object (responds to :local_path and :type)
  attr_accessor :search_database

  # params:: a sequest params object
  # modifications_string:: handed to Modifications.new (see Modifications)
  # args:: optional hash of attributes, applied with set_from_hash
  def initialize(params, modifications_string='', args=nil)
    @search_id = nil
    @params = params
    @modifications = Sequest::PepXML::Modifications.new(params, modifications_string)
    set_from_hash(args) if args
  end

  # anything this object does not define is forwarded to the params object
  # (nil when there is no params object)
  def method_missing(symbol, *args)
    @params.send(symbol, *args) if @params
  end

  # the search id, '1' unless one has been set
  def search_id
    @search_id || '1'
  end

  # Builds the search_summary element from the search database, the
  # enzymatic search constraint, the modifications and the parameters.
  def to_pepxml
    summary_atts = [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]
    constraint_atts = [:enzyme, :max_num_internal_cleavages, :min_number_termini]
    element_xml(:search_summary, summary_atts) do
      database_xml = search_database.to_pepxml
      constraint_xml = short_element_xml(:enzymatic_search_constraint, constraint_atts)
      database_xml + constraint_xml + @modifications.to_pepxml + @params.pepxml_parameters
    end
  end

end
1094
-
1095
- class Sequest::PepXML::Modifications
1096
- include SpecIDXML
1097
-
1098
- # sequest params object
1099
- attr_accessor :params
1100
- # array holding AAModifications
1101
- attr_accessor :aa_mods
1102
- # array holding TerminalModifications
1103
- attr_accessor :term_mods
1104
- # a hash of all differential modifications present by aa_one_letter_symbol
1105
- # and special_symbol. This is NOT the mass difference but the total mass {
1106
- # 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
1107
- # the amino acid sequence, they are given the *differential* mass. The
1108
- # termini are given the special symbol as in sequest e.g. '[' => 12.22, #
1109
- # cterminus ']' => 14.55 # nterminus
1110
- attr_accessor :masses_by_diff_mod_hash
1111
- # a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
1112
- # values are the special_symbols
1113
- attr_accessor :mod_symbols_hash
1114
-
1115
# The modification symbols string looks like this:
# (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
# ct is cterminal peptide (differential)
# nt is nterminal peptide (differential)
# the C is just cysteine
# will set_modifications and masses_by_diff_mod hash
#
# params:: sequest params object (stored in @params)
# modification_symbols_string:: string in the format shown above
#                               (defaults to '')
def initialize(params, modification_symbols_string='')
  @params = params
  # all of the real work is delegated to set_modifications
  set_modifications(params, modification_symbols_string)
end
1125
-
1126
# Sets @masses_by_diff_mod and @mod_symbols_hash from the modification
# symbols string, e.g. "(M* +15.90000) (ct[ +12.33000)".
#
# For amino acid entries, every residue letter before the symbol gets
#   @masses_by_diff_mod["M*"]         = residue mass (from @params.mass_table) + difference
#   @mod_symbols_hash[[:M, difference]] = "*"
# For the termini ('ct'/'nt') only the difference itself is stored:
#   @masses_by_diff_mod["["]              = difference
#   @mod_symbols_hash[["ct", difference]] = "["
#
# Both hashes are reset on every call; a nil or empty string leaves them
# empty and returns nil.
def set_hashes(modification_symbols_string)
  @mod_symbols_hash = {}
  @masses_by_diff_mod = {}
  # the public accessor is named masses_by_diff_mod_hash; point it at the
  # same hash so that it is no longer always nil
  @masses_by_diff_mod_hash = @masses_by_diff_mod
  if modification_symbols_string.nil? || modification_symbols_string == ''
    return nil
  end
  table = @params.mass_table
  modification_symbols_string.split(/\)\s+\(/).each do |mod|
    next unless mod =~ /\(?(\w+)(.) (.[\d\.]+)\)?/
    # copy the captures right away, before any other match can replace them
    residues, symbol, mass_diff = $1, $2, $3.to_f
    if residues == 'ct' || residues == 'nt'
      @masses_by_diff_mod[symbol] = mass_diff
      @mod_symbols_hash[[residues, mass_diff]] = symbol.dup
    else
      residues.split('').each do |aa|
        aa_as_sym = aa.to_sym
        @masses_by_diff_mod[aa + symbol] = mass_diff + table[aa_as_sym]
        @mod_symbols_hash[[aa_as_sym, mass_diff]] = symbol
      end
    end
  end
end
1153
-
1154
# given a bare peptide (no end pieces) returns a ModificationInfo object
# e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'
# if there are no modifications, returns nil
#
# The hash handed to ModificationInfo.new holds
#   :modified_peptide          a copy of the peptide as given
#   :mod_nterm_mass            first residue + H + terminal difference
#   :mod_cterm_mass            last residue + OH + terminal difference
#   :mod_aminoacid_mass_array  [[position, mass], ...] for modified residues
# (the last three only when present).  The peptide argument itself is never
# modified.
def modification_info(peptide)
  return nil if @masses_by_diff_mod.size == 0

  hash = {}
  hash[:modified_peptide] = peptide.dup
  hsh = @masses_by_diff_mod
  table = @params.mass_table
  h = table[:h] # this? or h_plus ??
  oh = table[:o] + h
  ## only the termini can match a single char
  if hsh.key? peptide[0,1]
    # AA + H + differential_mod
    hash[:mod_nterm_mass] = table[peptide[1,1].to_sym] + h + hsh[peptide[0,1]]
    peptide = peptide[1...(peptide.size)]
  end
  if hsh.key? peptide[(peptide.size-1),1]
    # AA + OH + differential_mod
    hash[:mod_cterm_mass] = table[peptide[(peptide.size-2),1].to_sym] + oh + hsh[peptide[-1,1]]
    # drop only the trailing terminal symbol; the old code also did
    # slice!(0..-2) first, which left an empty peptide (so no residue
    # modification could be found) and could alter the caller's string
    peptide = peptide[0...(peptide.size-1)]
  end
  mod_array = []
  # NOTE(review): positions are counted in the string that still contains
  # the modification symbols, so a residue after an earlier symbol gets a
  # position shifted by that symbol -- confirm that this is intended
  (0...peptide.size).each do |i|
    if hsh.key? peptide[i,2]
      mod_array << [ i+1 , hsh[peptide[i,2]] ]
    end
  end
  if mod_array.size > 0
    hash[:mod_aminoacid_mass_array] = mod_array
  end
  if hash.size > 1 # if there is more than just the modified peptide there
    Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
  else
    nil
  end
end
1194
-
1195
# 1. sets @params
# 2. sets @masses_by_diff_mod and @mod_symbols_hash (through set_hashes)
# 3. sets aa_mods and term_mods from a sequest params object:
#    * static mods: every entry of params.mods with a non-zero value
#      ("add_X_..." keys are amino acid mods, all others terminal mods)
#    * variable mods: params.diff_search_options, read as
#      "difference residues difference residues ..."
#    * variable terminal mods: params.term_diff_search_options, whose first
#      value is used as the N-terminal and second as the C-terminal difference
#
# Missing diff option strings are treated as "no variable modifications".
def set_modifications(params, modification_symbols_string)
  @params = params

  set_hashes(modification_symbols_string)

  ####################################
  ## static mods
  ####################################

  static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
  static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]

  params.mods.each do |k,v|
    v_to_f = v.to_f
    next if v_to_f == 0.0
    if k =~ /add_(\w)_/
      static_mods << [$1.to_sym, v_to_f]
    else
      static_terminal_mods << [k, v_to_f]
    end
  end
  aa_hash = params.mass_table

  ## Create the static_mods objects
  static_mods.map! do |mod|
    hash = {
      :aminoacid => mod[0].to_s,
      :massdiff => mod[1].to_plus_minus_string,
      :mass => aa_hash[mod[0]] + mod[1],
      :variable => 'N',
      :binary => 'Y',
    }
    Sequest::PepXML::AAModification.new(hash)
  end

  ## Create the static_terminal_mods objects
  static_terminal_mods.map! do |mod|
    terminus = if mod[0] =~ /Cterm/ ; 'c'
               else ; 'n' # only two possible termini
               end
    protein_terminus = case mod[0]
                       when /Nterm_protein/ ; 'n'
                       when /Cterm_protein/ ; 'c'
                       else nil
                       end

    # create the hash
    hash = {
      :terminus => terminus,
      :massdiff => mod[1].to_plus_minus_string,
      :variable => 'N',
      :description => mod[0],
    }
    hash[:protein_terminus] = protein_terminus if protein_terminus
    Sequest::PepXML::TerminalModification.new(hash)
  end

  #################################
  # Variable Mods:
  #################################
  arr = params.diff_search_options.to_s.rstrip.split(/\s+/)
  # [residues, diff.to_f]
  variable_mods = []
  (0...arr.size).step(2) do |i|
    diff = arr[i].to_f
    residues = arr[i+1]
    # a difference without residues (odd number of fields) is ignored
    variable_mods << [residues, diff] if diff != 0.0 && residues
  end
  mod_objects = []
  variable_mods.each do |mod|
    mod[0].split('').each do |aa|
      hash = {
        :aminoacid => aa,
        :massdiff => mod[1].to_plus_minus_string,
        :mass => aa_hash[aa.to_sym] + mod[1],
        :variable => 'Y',
        :binary => 'N',
        :symbol => @mod_symbols_hash[[aa.to_sym, mod[1]]],
      }
      mod_objects << Sequest::PepXML::AAModification.new(hash)
    end
  end
  variable_mods = mod_objects

  #################################
  # TERMINAL Variable Mods:
  #################################
  # These are always peptide, not protein termini (for sequest)
  (nterm_diff, cterm_diff) = params.term_diff_search_options.to_s.rstrip.split(/\s+/).map {|v| v.to_f }

  # set_hashes stores the terminal symbols under ['nt', diff] / ['ct', diff]
  # (String tags inside a single array key), so look them up the same way
  to_add = []
  if nterm_diff && nterm_diff != 0.0
    to_add << ['n', nterm_diff.to_plus_minus_string, @mod_symbols_hash[['nt', nterm_diff]]]
  end
  if cterm_diff && cterm_diff != 0.0
    to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[['ct', cterm_diff]]]
  end

  variable_terminal_mods = to_add.map do |term, mssdiff, symb|
    hash = {
      :terminus => term,
      :massdiff => mssdiff,
      :variable => 'Y',
      :symbol => symb,
    }
    Sequest::PepXML::TerminalModification.new(hash)
  end

  #########################
  # COLLECT THEM
  #########################
  @aa_mods = static_mods + variable_mods
  @term_mods = static_terminal_mods + variable_terminal_mods
end
1312
-
1313
## Generates the pepxml for the amino acid modifications followed by the
## terminal modifications (static and differential); either list is
## skipped when it has not been set
def to_pepxml
  xml = ''
  [@aa_mods, @term_mods].each do |mods|
    next unless mods
    xml << mods.map { |mod| mod.to_pepxml }.join
  end
  xml
end
1325
-
3
+ module Sequest
1326
4
  end
1327
5
 
1328
# A modified amino acid (static or variable).
# Unless stated otherwise, an attribute may hold anything.
class Sequest::PepXML::AAModification
  include SpecIDXML

  # One letter code of the amino acid
  attr_accessor :aminoacid
  # Must be a string!!!!
  # Difference in mass relative to the unmodified aminoacid; has to start
  # with + (nonnegative) or - [e.g. +1.05446 or -2.3342]
  # (Numeric#to_plus_minus_string produces this form)
  attr_accessor :massdiff
  # Mass of the aminoacid once modified
  attr_accessor :mass
  # Y when modified and unmodified aminoacid may both occur in the dataset,
  # N when only the modified aminoacid can occur
  attr_accessor :variable
  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus
  # Special symbol the search engine uses for this modification
  attr_accessor :symbol
  # Y when a peptide must contain either only modified or only unmodified
  # aminoacid, N when a peptide may contain both
  attr_accessor :binary

  # attributes are taken from hash (through instance_var_set_from_hash)
  # when one is given
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  def to_pepxml
    element_name = "aminoacid_modification"
    short_element_xml_from_instance_vars(element_name)
  end

end
1363
-
1364
# A modified peptide/protein terminus (static or variable).
class Sequest::PepXML::TerminalModification
  include SpecIDXML

  # n for N-terminus, c for C-terminus
  attr_accessor :terminus
  # Difference in mass relative to the unmodified terminus
  attr_accessor :massdiff
  # Mass of the terminus once modified
  attr_accessor :mass
  # Y when modified and unmodified terminus may both occur in the dataset,
  # N when only the modified terminus can occur
  attr_accessor :variable
  # Special symbol the search engine uses for this modification
  attr_accessor :symbol
  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus
  attr_accessor :description

  # attributes are taken from hash (through instance_var_set_from_hash)
  # when one is given
  def initialize(hash=nil)
    return unless hash
    instance_var_set_from_hash(hash) # can use unless there are weird methods
  end

  def to_pepxml
    element_name = "terminal_modification"
    short_element_xml_from_instance_vars(element_name)
  end
end
1392
-
1393
-
1394
class Sequest::PepXML::SearchDatabase
  include SpecIDXML

  attr_accessor :local_path
  attr_writer :seq_type

  # Takes a SequestParams object
  # Sets :local_path from the params object attr :database
  # args (optional hash) is applied afterwards with set_from_hash
  def initialize(params=nil, args=nil)
    @seq_type = nil
    @local_path = params.database if params
    set_from_hash(args) if args
  end

  # The explicitly assigned sequence type when there is one; otherwise 'AA'
  # for a local path containing ".fasta".  Aborts when the type cannot be
  # told from the path.
  def seq_type
    return @seq_type if @seq_type
    unless @local_path =~ /\.fasta/
      abort "Don't recognize type from your database local path: #{@local_path}"
    end
    'AA'
  end

  def to_pepxml
    att_string = "local_path=\"#{local_path}\" type=\"#{seq_type}\""
    short_element_xml_and_att_string(:search_database, att_string)
  end

end
1424
-
1425
- class Sequest::PepXML::SpectrumQuery
1426
- include SpecIDXML
1427
-
1428
- # basename_noext.first_scan.last_scan.charge
1429
- attr_accessor :spectrum
1430
- attr_accessor :start_scan
1431
- attr_accessor :end_scan
1432
- attr_accessor :precursor_neutral_mass
1433
- attr_accessor :index
1434
- attr_accessor :search_results
1435
-
1436
- # this is a string
1437
- attr_accessor :assumed_charge
1438
- attr_accessor :pepxml_version
1439
-
1440
# search_results starts as the block's return value when a block is given,
# otherwise as an empty array; afterwards the attributes in hash (if any)
# are applied with set_from_hash
def initialize(hash=nil)
  @search_results = block_given? ? yield : []
  set_from_hash(hash) if hash
end
1448
-
1449
- ############################################################
1450
- # FOR PEPXML:
1451
- ############################################################
1452
# Builds the spectrum_query element around the pepxml of every search
# result.  Only pepxml version 18 is implemented; for any other version
# (including 0, whose output is not written yet) the result is nil.
def to_pepxml
  return nil unless Sequest::PepXML.pepxml_version == 18

  atts = [:spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :assumed_charge, :index]
  element_xml("spectrum_query", atts) do
    results_xml = @search_results.collect { |sr| sr.to_pepxml }
    results_xml.join
  end
end
1466
-
1467
- # Returns the precursor_neutral based on the scans and an array indexed by
1468
- # scan numbers. first and last scan and charge should be integers.
1469
- # This is the precursor_mz - h_plus!
1470
- # by=:prec_mz_arr|:deltamass
1471
- # if prec_mz_arr then the following arguments must be supplied:
1472
- # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
1473
- # m/z for each product scan, :charge = int
1474
- # if deltamass then the following arguments must be supplied:
1475
- # m_plus_h = float, deltamass = float
1476
- # For both flavors, a final additional argument 'average_weights'
1477
- # can be used. If true (default), average weights will be used, if false,
1478
- # monoisotopic weights (currently this is simply the mass of the proton)
1479
- def self.calc_precursor_neutral_mass(by, *args)
1480
- average_weights = true
1481
- case by
1482
- when :prec_mz_arr
1483
- (first_scan, last_scan, prec_mz_arr, charge, average_weights) = args
1484
- when :deltamass
1485
- (m_plus_h, deltamass, average_weights) = args
1486
- end
1487
-
1488
- if average_weights
1489
- mass_h_plus = SpecID::AVG[:h_plus]
1490
- else
1491
- mass_h_plus = SpecID::MONO[:h_plus]
1492
- end
1493
-
1494
- case by
1495
- when :prec_mz_arr
1496
- mz = nil
1497
- if first_scan != last_scan
1498
- sum = 0.0
1499
- tot_num = 0
1500
- (first_scan..last_scan).each do |scan|
1501
- val = prec_mz_arr[scan]
1502
- if val # if the scan is not an mslevel 2
1503
- sum += val.to_f
1504
- tot_num += 1
1505
- end
1506
- end
1507
- mz = sum/tot_num.to_f
1508
- else
1509
- mz = prec_mz_arr[first_scan].to_f
1510
- end
1511
- charge * (mz - mass_h_plus)
1512
- when :deltamass
1513
- m_plus_h - mass_h_plus + deltamass
1514
- else
1515
- abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
1516
- end
1517
- end
1518
-
1519
- end
1520
-
1521
-
1522
-
1523
Sequest::PepXML::SearchHit = ArrayClass.new( %w( hit_rank peptide peptide_prev_aa peptide_next_aa protein num_tot_proteins num_matched_ions tot_num_ions calc_neutral_pep_mass massdiff num_tol_term num_missed_cleavages is_rejected deltacnstar xcorr deltacn spscore sprank modification_info) )

# Array index of each attribute:
# hit_rank=0 peptide=1 peptide_prev_aa=2 peptide_next_aa=3 protein=4 num_tot_proteins=5 num_matched_ions=6 tot_num_ions=7 calc_neutral_pep_mass=8 massdiff=9 num_tol_term=10 num_missed_cleavages=11 is_rejected=12 deltacnstar=13 xcorr=14 deltacn=15 spscore=16 sprank=17 modification_info=18

# A single search_hit element of a pepXML document.
#
# xcorr, deltacn, deltacnstar, spscore and sprank are written out as
# search_score elements (see to_pepxml).
# NOTE(review): the original comment "1 if there is no second ranked hit,
# 0 otherwise" presumably describes deltacnstar -- confirm.
class Sequest::PepXML::SearchHit
  include SpecIDXML

  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  # $VERBOSE is switched off while initialize is defined and restored
  # right after (presumably to silence a method-redefinition warning).
  tmp_verb = $VERBOSE
  $VERBOSE = nil
  # hash:: optional hash keyed by the attribute names (as symbols); its
  #        values fill the 19 slots in attribute order.
  def initialize(hash=nil)
    super(@@arr_size)
    if hash
      self[0,19] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank], hash[:modification_info]]
    end
    self
  end
  $VERBOSE = tmp_verb

  # Lists every attribute as name:value.
  def inspect
    var = @@attributes.map { |m| "#{m}:#{self.send(m)}" }.join(" ")
    "#<SearchHit #{var}>"
  end

  # requires Params object and full sequence (with heads and tails)
  #
  # params.enzyme_specificity must return [split_after, except_before]
  # (presumably strings of residue letters -- confirm against Params).
  # Counts every residue of the middle part of the sequence (as returned
  # by SpecID::Pep.split_sequence) that is listed in split_after and is
  # followed, still inside the middle part, by a residue not listed in
  # except_before.
  #
  # The following residue is checked with a lookahead instead of being
  # consumed, so runs of adjacent cleavage residues (e.g. "KK") are all
  # counted.  The residue lists are escaped and treated as literal
  # characters; an empty except_before means "no exception" and an empty
  # split_after yields 0.
  def self.calc_num_missed_cleavages(params, sequence)
    split_after, except_before = params.enzyme_specificity
    first, middle, last = SpecID::Pep.split_sequence(sequence)
    cleave_chars = Regexp.escape(split_after.to_s)
    return 0 if cleave_chars.empty?
    blocking_chars = Regexp.escape(except_before.to_s)
    # '.' with the m flag matches any character, like a negated class does
    following = blocking_chars.empty? ? '.' : "[^#{blocking_chars}]"
    middle.scan(/[#{cleave_chars}](?=#{following})/m).size
  end

  # requires Params object and full sequence (with heads and tails)
  #
  # Returns how many of the two peptide termini (0, 1 or 2) agree with
  # the enzyme specificity.  A terminus counts when the residue before
  # the cut is listed in split_after and the residue after the cut is not
  # listed in except_before, or when the flanking residue is '-'.
  def self.calc_num_tol_term(params, sequence)
    num_tol = 0
    split_after, except_before = params.enzyme_specificity
    first, middle, last = SpecID::Pep.split_sequence(sequence)
    last_of_middle = middle[-1,1]
    first_of_middle = middle[0,1]
    if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
      num_tol += 1
    end
    if ( split_after.include?(last_of_middle) && !except_before.include?(last) ) || last == '-'
      num_tol += 1
    end
    return num_tol
  end

  # Takes ions in the form XX/YY and returns XX, YY
  def self.split_ions(ions)
    return *(ions.split("/"))
  end

  # Returns one search_score element whose name is symbol and whose value
  # is the result of calling the method of that name on this hit.
  def search_score_xml(symbol)
    "#{tabs}<search_score name=\"#{symbol}\" value=\"#{send(symbol)}\"/>"
  end

  # Returns the search_score elements for all the given symbols, one per
  # line, with a trailing newline.
  def search_scores_xml(*symbol_list)
    symbol_list.collect { |sy| search_score_xml(sy) }.join("\n") + "\n"
  end

  # Returns the search_hit element.  It encloses the pepxml of
  # modification_info (when set) followed by the search_score elements.
  def to_pepxml
    mod_pepxml =
      if self[18]
        self[18].to_pepxml
      else
        ''
      end

    element_xml("search_hit", [:hit_rank, :peptide, :peptide_prev_aa, :peptide_next_aa, :protein, :num_tot_proteins, :num_matched_ions, :tot_num_ions, :calc_neutral_pep_mass, :massdiff, :num_tol_term, :num_missed_cleavages, :is_rejected]) do
      mod_pepxml +
        search_scores_xml(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank)
    end
  end

end
1608
-
1609
# Positions and masses of modifications
#
# The generated element looks like this:
#
#   <modification_info mod_nterm_mass=" " mod_cterm_mass=" " modified_peptide=" ">
#     <mod_aminoacid_mass position=" " mass=" "/>
#   </modification_info>
#
# for example:
#
#   <modification_info modified_peptide="GC[546]M[147]PSKEVLSAGAHR">
#     <mod_aminoacid_mass position="2" mass="545.7160"/>
#     <mod_aminoacid_mass position="3" mass="147.1926"/>
#   </modification_info>
class Sequest::PepXML::SearchHit::ModificationInfo
  include SpecIDXML

  # Mass of modified N terminus
  attr_accessor :mod_nterm_mass
  # Mass of modified C terminus
  attr_accessor :mod_cterm_mass
  # Peptide sequence (with indicated modifications) I'm assuming that the
  # native sequest indicators are OK here
  attr_accessor :modified_peptide

  # this should be an array of arrays: [[position, modified_mass], ...]
  # position ranges from 1 to peptide length
  attr_accessor :mod_aminoacid_mass_array

  # hash:: optional hash passed on to instance_var_set_from_hash
  def initialize(hash=nil)
    @mod_nterm_mass = nil
    @mod_cterm_mass = nil
    instance_var_set_from_hash(hash) if hash
  end

  # Returns the modification_info element.  Only the attributes that are
  # set are written; one mod_aminoacid_mass child is written per entry of
  # mod_aminoacid_mass_array.
  # Will escape any xml special chars in modified_peptide
  def to_pepxml
    ## One attribute string per modified residue:
    residue_atts = (@mod_aminoacid_mass_array || []).map do |pair|
      "position=\"#{pair[0]}\" mass=\"#{pair[1]}\""
    end

    ## Attributes of the enclosing element:
    element_atts = []
    element_atts << "mod_nterm_mass=\"#{@mod_nterm_mass}\"" if @mod_nterm_mass
    element_atts << "mod_cterm_mass=\"#{@mod_cterm_mass}\"" if @mod_cterm_mass
    element_atts << "modified_peptide=\"#{escape_special_chars(@modified_peptide)}\"" if @modified_peptide

    element_xml_and_att_string('modification_info', element_atts.join(" ")) do
      residue_atts.map { |atts| short_element_xml_and_att_string('mod_aminoacid_mass', atts) }.join
    end
  end

end
1674
-
1675
-