mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,452 @@
1
+
2
+ require File.expand_path( File.dirname(__FILE__) + '/../../spec_helper' )
3
+
4
+ require 'spec_id'
5
+ require 'spec_id/sequest/pepxml'
6
+ #require 'ms/mzxml'
7
+
8
+
9
+ NODELETE = false
10
+
11
+ describe Sequest::PepXML::SearchHit, 'making enzyme calculations on sequences' do
12
+
13
+ before(:each) do
14
+ @tf_params_fullKRP = Tfiles + "/bioworks32.params"
15
+ # The enzyme is: 1 KR P
16
+ @tf_params_justKR = Tfiles + "/bioworks33.params"
17
+ end
18
+
19
+ it 'calculates the number of tolerant termini' do
20
+ exp = [{
21
+ # full KR/P
22
+ 'K.EPTIDR.E' => 2,
23
+ 'K.PEPTIDR.E' => 1,
24
+ 'F.EEPTIDR.E' => 1,
25
+ 'F.PEPTIDW.R' => 0,
26
+ },
27
+ {
28
+ # just KR
29
+ 'K.EPTIDR.E' => 2,
30
+ 'K.PEPTIDR.E' => 2,
31
+ 'F.EEPTIDR.E' => 1,
32
+ 'F.PEPTIDW.R' => 0,
33
+ }
34
+ ]
35
+ scall = Sequest::PepXML::SearchHit
36
+ sym = :calc_num_tol_term
37
+ params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
38
+ params_ar.zip(exp) do |params,hash|
39
+ hash.each do |seq, val|
40
+ scall.send(sym, params, seq).should == val
41
+ end
42
+ end
43
+ end
44
+
45
+ it 'calculates number of missed cleavages' do
46
+ exp = [{
47
+ "K.EPTIDR.E" => 0,
48
+ "K.PEPTIDR.E" => 0,
49
+ "F.EEPTIDR.E" => 0,
50
+ "F.PEPTIDW.R" => 0,
51
+ "F.PERPTIDW.R" => 0,
52
+ "F.PEPKPTIDW.R" => 0,
53
+ "F.PEPKTIDW.R" => 1,
54
+ "K.RTTIDR.E" => 1,
55
+ "K.RTTIKK.E" => 2,
56
+ "F.PKEPRTIDW.R" => 2,
57
+ "F.PKEPRTIDKP.R" => 2,
58
+ "F.PKEPRAALKPEERPTIDKW.R" => 3,
59
+ },
60
+ {
61
+ "K.EPTIDR.E" => 0,
62
+ "K.PEPTIDR.E" => 0,
63
+ "F.EEPTIDR.E" => 0,
64
+ "F.PEPTIDW.R" => 0,
65
+ "F.PERPTIDW.R" => 1,
66
+ "F.PEPKPTIDW.R" => 1,
67
+ "F.PEPKTIDW.R" => 1,
68
+ "K.RTTIDR.E" => 1,
69
+ "K.RTTIKK.E" => 2,
70
+ "F.PKEPRTIDW.R" => 2,
71
+ "F.PKEPRTIDKP.R" => 3,
72
+ "F.PKEPRAALKPEERPTIDKW.R" => 5,
73
+ }
74
+ ]
75
+
76
+ params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
77
+ scall = Sequest::PepXML::SearchHit
78
+ sym = :calc_num_missed_cleavages
79
+ #params_ar[1] = params_ar[0]
80
+ params_ar.zip(exp) do |params, hash|
81
+ hash.each do |seq, val|
82
+ scall.send(sym, params, seq).should == val
83
+ end
84
+ end
85
+ end
86
+
87
+ end
88
+
89
+ describe Sequest::PepXML, " created from small bioworks.xml" do
90
+
91
+ spec_large do
92
+ before(:all) do
93
+ tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
94
+
95
+ tf_params = Tfiles + "/bioworks32.params"
96
+ tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
97
+ out_path = Tfiles
98
+ @pepxml_objs = Sequest::PepXML.set_from_bioworks(tf_bioworks_xml, :params => tf_params, :ms_data => tf_mzxml_path, :out_path => out_path)
99
+ end
100
+
101
+ it 'gets some spectrum queries' do
102
+ @pepxml_objs.each do |obj|
103
+ (obj.spectrum_queries.size > 2).should be_true
104
+ (obj.spectrum_queries.first.search_results.first.search_hits.size > 0).should be_true
105
+ end
106
+ #@pepxml_objs.each do |pep| puts pep.to_pepxml end
107
+ end
108
+ end
109
+ end
110
+
111
+
112
+
113
+ describe Sequest::PepXML, " created from large bioworks.xml" do
114
+ # assert_equal_pairs: checks each [expected, method_name] pair in arrs (really any old array) against obj
115
+ def assert_equal_pairs(obj, arrs)
116
+ arrs.each do |arr|
117
+ #if obj.send(arr[1]) != arr[0]
118
+ # puts "HELLO"
119
+ # puts "OBJ answer"
120
+ # p obj.send(arr[1])
121
+ # puts "ar0"
122
+ # p arr[0]
123
+ # puts "ar1"
124
+ # p arr[1]
125
+ #end
126
+ if arr[0].is_a? Float
127
+ obj.send(arr[1]).should be_close(arr[0], 0.0000000001)
128
+ else
129
+ obj.send(arr[1]).should == arr[0]
130
+ end
131
+ end
132
+ end
133
+
134
+ # swap the first two elements of each pair first
135
+ def assert_equal_pairs_swapped(obj, arrs)
136
+ arrs.each do |arr|
137
+ arr[0], arr[1] = arr[1], arr[0]
138
+ end
139
+ assert_equal_pairs(obj, arrs)
140
+ end
141
+
142
+ spec_large do
143
+ before(:all) do
144
+ st = Time.new
145
+ params = Tfiles + "/opd1/sequest.3.2.params"
146
+ bioworks_xml = Tfiles_l + "/opd1/bioworks.000.oldparams.xml"
147
+ mzxml_path = Tfiles_l + "/opd1"
148
+ out_path = Tfiles
149
+ @pepxml_version = 18
150
+ @pepxml_objs = Sequest::PepXML.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => @pepxml_version})
151
+ puts "- takes #{Time.new - st} secs"
152
+ end
153
+
154
+ it 'extracts MSMSPipelineAnalysis' do
155
+ ######## HMMMMM...
156
+ Sequest::PepXML.pepxml_version.should == @pepxml_version
157
+
158
+ # MSMSPipelineAnalysis
159
+ po = @pepxml_objs.first
160
+ msms_pipeline = po.msms_pipeline_analysis
161
+ msms_pipeline.xmlns.should == 'http://regis-web.systemsbiology.net/pepXML'
162
+ msms_pipeline.xmlns_xsi.should == 'http://www.w3.org/2001/XMLSchema-instance'
163
+ msms_pipeline.xsi_schema_location.should == 'http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd'
164
+ msms_pipeline.summary_xml.should == '000.xml'
165
+ end
166
+
167
+ it 'extracts MSmSRunSummary' do
168
+ # MSMSRunSummary
169
+ rs = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary
170
+ rs.base_name.should =~ /\/000/
171
+ assert_equal_pairs(rs, [ ['ThermoFinnigan', :ms_manufacturer], ['LCQ Deca XP Plus', :ms_model], ['ESI', :ms_ionization], ['Ion Trap', :ms_mass_analyzer], ['UNKNOWN', :ms_detector], ['raw', :raw_data_type], ['.mzXML', :raw_data], ])
172
+ end
173
+
174
+ it 'extracts SampleEnzyme' do
175
+ # SampleEnzyme
176
+ se = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.sample_enzyme
177
+ assert_equal_pairs(se, [ ['Trypsin', :name], ['KR', :cut], [nil, :no_cut], ['C', :sense], ])
178
+ end
179
+
180
+ it 'extracts SearchSummary' do
181
+ # SearchSummary
182
+ ss = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary
183
+ ss.is_a?(Sequest::PepXML::SearchSummary).should be_true
184
+ ss.base_name.should =~ /\/000/
185
+ ss.peptide_mass_tol.should =~ /1\.500/
186
+ assert_equal_pairs_swapped(ss, [ # normal attributes
187
+ [:search_engine, "SEQUEST"], [:precursor_mass_type, "average"], [:fragment_mass_type, "average"], [:out_data_type, "out"], [:out_data, ".tgz"], [:search_id, "1"],
188
+
189
+ # enzymatic_search_constraint
190
+ [:enzyme, 'Trypsin'], [:max_num_internal_cleavages, '2'], [:min_number_termini, '2'],
191
+
192
+ # parameters
193
+ [:fragment_ion_tol, "1.0000"], [:ion_series, "0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0"], [:max_num_differential_AA_per_mod, "3"], [:nucleotide_reading_frame, "0"], [:num_output_lines, "10"], [:remove_precursor_peak, "0"], [:ion_cutoff_percentage, "0.0000"], [:match_peak_count, "0"], [:match_peak_allowed_error, "1"], [:match_peak_tolerance, "1.0000"], [:protein_mass_filter, "0 0"],
194
+ ])
195
+
196
+ end
197
+ it 'extracts SearchDatabase' do
198
+ # SearchDatabase
199
+ sd = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary.search_database
200
+ sd.is_a?(Sequest::PepXML::SearchDatabase).should be_true
201
+ assert_equal_pairs_swapped(sd, [ [:local_path, "C:\\Xcalibur\\database\\ecoli_K12.fasta"], [:seq_type, 'AA'], ])
202
+ end
203
+
204
+ it 'returns SpectrumQueries' do
205
+ # SpectrumQueries
206
+ sq = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.spectrum_queries
207
+ spec = sq.first
208
+ assert_equal_pairs_swapped(spec, [
209
+ [:spectrum, "000.100.100.1"], [:start_scan, "100"], [:end_scan, "100"],
210
+ #[:precursor_neutral_mass, "1074.5920"], # out2summary
211
+ [:precursor_neutral_mass, 1074.666926], # mine
212
+ [:assumed_charge, 1], [:index, "1"],
213
+ ])
214
+ sh = spec.search_results.first.search_hits.first
215
+ assert_equal_pairs_swapped(sh, [
216
+ # normal attributes
217
+ [:hit_rank, 1],
218
+ [:peptide, "SIYFRNFK"],
219
+ [:peptide_prev_aa, "R"],
220
+ [:peptide_next_aa, "G"],
221
+ [:protein, "gi|16130084|ref|NP_416651.1|"],
222
+ [:num_tot_proteins, 1],
223
+ [:num_matched_ions, 4],
224
+ [:tot_num_ions, 14],
225
+ #[:calc_neutral_pep_mass, "1074.1920"], # out2summary
226
+ [:calc_neutral_pep_mass, 1074.23261], # mine
227
+ #[:massdiff, "+0.400000"], # out2summary
228
+ [:massdiff, 0.434316000000081], # mine
229
+ [:num_tol_term, 2], [:num_missed_cleavages, 1], [:is_rejected, 0],
230
+
231
+ # search_score
232
+ [:xcorr, 0.4], [:deltacn, 0.023], [:deltacnstar, "0"], [:spscore, 78.8], [:sprank, 1],
233
+ ])
234
+
235
+ spec = sq[1]
236
+ assert_equal_pairs_swapped(spec, [
237
+ [:spectrum, "000.1000.1000.1"], [:start_scan, "1000"], [:end_scan, "1000"], #[:precursor_neutral_mass, "663.1920"], # out2summary
238
+ [:precursor_neutral_mass, 663.206111], # mine
239
+ [:assumed_charge, 1], [:index, "2"],
240
+ ])
241
+
242
+ sh = spec.search_results.first.search_hits.first
243
+ assert_equal_pairs_swapped(sh, [
244
+ # normal attributes
245
+ [:hit_rank, 1], [:peptide, "ALADFK"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "S"], [:protein, "gi|16128765|ref|NP_415318.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 10],
246
+ [:num_tol_term, 2], [:num_missed_cleavages, 0], [:is_rejected, 0],
247
+ #[:massdiff, "-0.600000"], # out2summary
248
+ [:massdiff, -0.556499000000031], # mine
249
+ #[:calc_neutral_pep_mass, 663.7920], # out2summary
250
+ [:calc_neutral_pep_mass, 663.76261], # mine
251
+
252
+ # search_score
253
+ [:xcorr, 0.965], [:deltacn, 0.132], [:deltacnstar, "0"], [:spscore, 81.1], [:sprank, 1],
254
+ ])
255
+
256
+ spec = sq[9]
257
+ assert_equal_pairs_swapped(spec, [
258
+ [:spectrum, "000.1008.1008.2"], [:start_scan, "1008"], [:end_scan, "1008"], [:assumed_charge, 2],
259
+ #[:precursor_neutral_mass, "691.0920"], # out2summary
260
+ [:precursor_neutral_mass, 691.150992], # mine
261
+ ])
262
+
263
+ sh = spec.search_results.first.search_hits.first
264
+ assert_equal_pairs_swapped(sh, [
265
+ # normal attributes
266
+ [:hit_rank, 1], [:peptide, "RLFTR"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "A"], [:protein, "gi|16130457|ref|NP_417027.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 8], [:num_tol_term, 2],
267
+
268
+ #[:num_missed_cleavages, "0"], # out2summary misses this!
269
+ [:num_missed_cleavages, 1],
270
+ [:is_rejected, 0],
271
+ #[:calc_neutral_pep_mass, "691.7920"], # out2summary
272
+ [:calc_neutral_pep_mass, 691.82261], # mine
273
+ #[:massdiff, "-0.700000"], # out2summary
274
+ [:massdiff, -0.67161800000008], # mine
275
+
276
+ # search_score
277
+ [:xcorr, 0.903], [:deltacn, 0.333], [:deltacnstar, "0"], [:spscore, 172.8], [:sprank, 1],
278
+ ])
279
+ end
280
+
281
+ it 'can generate correct pepxml file' do
282
+
283
+ ## IF OUR OBJECT IS CORRECT, THEN WE GET THE OUTPUT:
284
+ string = @pepxml_objs.first.to_pepxml
285
+ ans_lines = IO.read(Tfiles + "/opd1/000.my_answer.100lines.xml").split("\n")
286
+ base_name_re = /base_name=".*?files\//o
287
+ date_re = /date=".*?"/
288
+ string.split("\n").each_with_index do |line,i|
289
+ if i > 99 ; break end
290
+ ans, exp =
291
+ if i == 1
292
+ [line.sub(date_re,''), ans_lines[i].sub(date_re,'')]
293
+ elsif i == 2
294
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t")]
295
+ elsif i == 6
296
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t\t")]
297
+ else
298
+ [line, ans_lines[i]]
299
+ end
300
+
301
+ #ans.split('').zip(exp.split('')) do |l,a|
302
+ # if l != a
303
+ # puts line
304
+ # puts ans_lines[i]
305
+ # puts l
306
+ # puts a
307
+ # end
308
+ #end
309
+ if ans != exp
310
+ puts ans
311
+ puts exp
312
+ end
313
+ ans.should == exp
314
+ #line.sub(base_name_re,'').should == ans_lines[i].sub(base_name_re,'')
315
+ end
316
+ end
317
+ end
318
+ end
319
+
320
+
321
+
322
+ describe Sequest::PepXML::Modifications do
323
+ before(:each) do
324
+ tf_params = Tfiles + "/bioworks32.params"
325
+ @params = Sequest::Params.new(tf_params)
326
+ # The params object here is completely unnecessary for this test, except
327
+ # that it sets up the mass table
328
+ @obj = Sequest::PepXML::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
329
+ end
330
+ it 'creates a mod_symbols_hash' do
331
+ answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
332
+ @obj.mod_symbols_hash.should == answ
333
+ ## need more here
334
+ end
335
+
336
+ it 'creates a ModificationInfo object given a special peptide sequence' do
337
+ mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
338
+ @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
339
+ @params.term_diff_search_options = "14.20000 12.33000"
340
+ mod = Sequest::PepXML::Modifications.new(@params, mod_string)
341
+ ## no mods
342
+ peptide = "PEPTIDE"
343
+ mod.modification_info(peptide).should be_nil
344
+ peptide = "]M*EC^S@IDM#M*EMSCM["
345
+ modinfo = mod.modification_info(peptide)
346
+ modinfo.modified_peptide.should == peptide
347
+ modinfo.mod_nterm_mass.should be_close(146.40054, 0.000001)
348
+ modinfo.mod_cterm_mass.should be_close(160.52994, 0.000001)
349
+ end
350
+
351
+ end
352
+
353
+ describe Sequest::PepXML::SearchHit::ModificationInfo do
354
+
355
+ before(:each) do
356
+ modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar|
357
+ Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new(ar)
358
+ end
359
+ hash = {
360
+ :mod_nterm_mass => 520.2,
361
+ :modified_peptide => "MOD*IFI^E&D",
362
+ :mod_aminoacid_masses => modaaobjs,
363
+ }
364
+ #answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
365
+ @obj = Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
366
+ end
367
+
368
+ def _re(st)
369
+ /#{Regexp.escape(st)}/
370
+ end
371
+
372
+ it 'can produce pepxml' do
373
+ answ = @obj.to_pepxml
374
+ answ.should =~ _re('<modification_info')
375
+ answ.should =~ _re(" mod_nterm_mass=\"520.2\"")
376
+ answ.should =~ _re(" modified_peptide=\"MOD*IFI^E&amp;D\"")
377
+ answ.should =~ _re("<mod_aminoacid_mass")
378
+ answ.should =~ _re(" position=\"3\"")
379
+ answ.should =~ _re(" mass=\"150.3\"")
380
+ answ.should =~ _re(" position=\"6\"")
381
+ answ.should =~ _re(" mass=\"345.2\"")
382
+ answ.should =~ _re("</modification_info>")
383
+ end
384
+ end
385
+
386
+ describe 'bioworks file with modifications transformed into pepxml' do
387
+
388
+ spec_large do
389
+ before(:all) do
390
+ modfiles_sequest_dir = Tfiles_l + '/opd1_2runs_2mods/sequest/'
391
+ modfiles_data_dir = Tfiles_l + '/opd1_2runs_2mods/data/'
392
+ @srgfile = modfiles_sequest_dir + 'tmp.srg'
393
+ @out_path = modfiles_sequest_dir + 'pepxml'
394
+ modfiles = %w(020 040).map do |file|
395
+ modfiles_sequest_dir + file + ".srf"
396
+ end
397
+ objs = Sequest::PepXML.set_from_bioworks( SRFGroup.new(modfiles).to_srg(@srgfile), {:ms_data => modfiles_data_dir, :out_path => @out_path, :print => true, :backup_db_path => '/project/marcotte/marcotte/ms/database'} )
398
+ @out_files = %w(020 040).map do |file|
399
+ @out_path + '/' + file + '.xml'
400
+ end
401
+ end
402
+
403
+ after(:all) do
404
+ File.unlink(@srgfile) unless NODELETE
405
+ FileUtils.rm_r(@out_path)
406
+ #@out_files.each do |fn|
407
+ # File.unlink(fn) unless NODELETE
408
+ #end
409
+ end
410
+
411
+ # splits string on ' ' and matches each piece against every line found by find_line_regexp in
412
+ # lines
413
+ def match_modline_pieces(lines, find_line_regexp, string)
414
+ pieces = string.split(' ').map {|v| /#{Regexp.escape(v)}/ }
415
+ lines.each do |line|
416
+ if line =~ find_line_regexp
417
+ pieces.each do |piece|
418
+ line.should =~ piece
419
+ end
420
+ end
421
+ end
422
+ end
423
+
424
+ it 'gets modifications right in real run' do
425
+ @out_files.each do |fn|
426
+ fn.should exist
427
+ beginning = IO.read(fn)
428
+ lines = beginning.split("\n")
429
+ [
430
+ [/aminoacid="M"/, '<aminoacid_modification symbol="*" massdiff="+15.9994" aminoacid="M" variable="Y" binary="N" mass="147.192"'],
431
+
432
+ [/aminoacid="S"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="S" variable="Y" binary="N" mass="167.0581"'],
433
+ [/aminoacid="T"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="T" variable="Y" binary="N" mass="181.085"'],
434
+ [/aminoacid="Y"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="Y" variable="Y" binary="N" mass="243.1559"'],
435
+ [/parameter name="diff_search_options"/, '<parameter name="diff_search_options" value="15.999400 M 79.979900 STY 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>'],
436
+ ].each do |a,b|
437
+ match_modline_pieces(lines, a, b)
438
+ end
439
+ [
440
+ '<modification_info modified_peptide="Y#RLGGS#T#K">',
441
+ '<mod_aminoacid_mass position="1" mass="243.1559"/>',
442
+ '<mod_aminoacid_mass position="7" mass="167.0581"/>',
443
+ '</modification_info>',
444
+ '<mod_aminoacid_mass position="9" mass="181.085"/>'
445
+ ].each do |line|
446
+ beginning.should =~ /#{Regexp.escape(line)}/ # "a modification info for a peptide")
447
+ end
448
+ end
449
+ end
450
+ end
451
+ end
452
+
@@ -0,0 +1,138 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
2
+
3
+ require 'spec_id/srf'
4
+
5
+ SpecHelperHeaderHash = {
6
+ 'SQTGenerator' => 'mspire',
7
+ 'SQTGeneratorVersion' => String,
8
+ 'Database' => 'C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta',
9
+ 'FragmentMasses' => 'AVG',
10
+ 'PrecursorMasses' => 'AVG',
11
+ 'StartTime' => nil,
12
+ 'Alg-MSModel' => 'LCQ Deca XP',
13
+ 'Alg-PreMassUnits' => 'amu',
14
+ 'DBLocusCount' => '4237',
15
+ 'Alg-FragMassTol' => '1.0000',
16
+ 'Alg-PreMassTol' => '1.4000',
17
+ 'Alg-IonSeries' => '0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0',
18
+ 'Alg-Enzyme' => 'Trypsin(KR/P) (2)',
19
+ 'Comment' => ['Created from Bioworks .srf file'],
20
+ 'StaticMod' => ['C=160.1901','Cterm=10.1230','E=161.4455'],
21
+ 'DynamicMod' => ['STY*=+79.97990', 'M#=+14.02660'],
22
+ }
23
+
24
+ SpecHelperOtherLines =<<END
25
+ S 2 2 1 0.0 VELA 391.04541015625 3021.5419921875 0.0 0
26
+ S 3 3 1 0.0 VELA 446.009033203125 1743.96911621094 0.0 122
27
+ M 1 1 445.5769264522 0.0 0.245620265603065 16.6666660308838 1 6 R.SNSK.S U
28
+ L gi|16128266|ref|NP_414815.1|
29
+ END
30
+
31
+ SpecHelperOtherLinesEnd =<<END
32
+ L gi|90111093|ref|NP_414704.4|
33
+ M 10 17 1298.5350544522 0.235343858599663 0.823222815990448 151.717300415039 12 54 K.LQKIITNSY*K U
34
+ L gi|90111124|ref|NP_414904.2|
35
+ END
36
+
37
+ describe 'converting a large srf to sqt' do
38
+ def del(file)
39
+ if File.exist?(file)
40
+ File.unlink(file)
41
+ end
42
+ end
43
+
44
+ # returns true or false
45
+ def header_hash_match(header_lines, hash)
46
+ header_lines.all? do |line|
47
+ (h, k, v) = line.chomp.split("\t")
48
+ if hash[k].is_a? Array
49
+ if hash[k].include?(v)
50
+ true
51
+ else
52
+ puts "FAILED: "
53
+ p k
54
+ p v
55
+ p hash[k]
56
+ false
57
+ end
58
+ elsif hash[k] == String
59
+ v.is_a?(String)
60
+ else
61
+ if v == hash[k]
62
+ true
63
+ else
64
+ puts "FAILED: "
65
+ p k
66
+ p v
67
+ p hash[k]
68
+ false
69
+ end
70
+ end
71
+ end
72
+ end
73
+
74
+ spec_large do
75
+ before(:all) do
76
+ @file = Tfiles_l + '/opd1_static_diff_mods/000.srf'
77
+ @output = Tfiles_l + '/opd1_static_diff_mods/000.sqt.tmp'
78
+ @srf = SRF.new(@file)
79
+ @original_db_filename = @srf.header.db_filename
80
+ end
81
+ it 'converts without bothering with the database' do
82
+ @srf.to_sqt(@output)
83
+ @output.should exist
84
+ lines = File.readlines(@output)
85
+ lines.size.should == 80910
86
+ header_lines = lines.grep(/^H/)
87
+ (header_lines.size > 10).should be_true
88
+ header_hash_match(header_lines, SpecHelperHeaderHash).should be_true
89
+ other_lines = lines.grep(/^[^H]/)
90
+ other_lines[0,4].join('').should == SpecHelperOtherLines
91
+ other_lines[-3,3].join('').should == SpecHelperOtherLinesEnd
92
+ del(@output)
93
+ end
94
+ it 'warns if the db path is incorrect and we want to update db info' do
95
+ # requires some knowledge of how the database file is extracted
96
+ # internally
97
+ wacky_path = '/not/a/real/path/wacky.fasta'
98
+ @srf.header.db_filename = wacky_path
99
+ my_error_string = ''
100
+ StringIO.open(my_error_string, 'w') do |strio|
101
+ $stderr = strio
102
+ @srf.to_sqt(@output, :db_info => true)
103
+ end
104
+ my_error_string.should include(wacky_path)
105
+ @srf.header.db_filename = @original_db_filename
106
+ $stderr = STDERR
107
+ @output.should exist
108
+ IO.readlines(@output).size.should == 80910
109
+ del(@output)
110
+ end
111
+ it 'can get db info with correct path' do
112
+ @srf.to_sqt(@output, :db_info => true, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest')
113
+ @output.should exist
114
+ lines = IO.readlines(@output)
115
+ has_md5 = lines.any? do |line|
116
+ line =~ /DBMD5Sum\s+202b1d95e91f2da30191174a7f13a04e/
117
+ end
118
+ has_md5.should be_true
119
+
120
+ has_seq_len = lines.any? do |line|
121
+ # frozen
122
+ line =~ /DBSeqLength\s+1342842/
123
+ end
124
+ has_seq_len.should be_true
125
+ lines.size.should == 80912
126
+ del(@output)
127
+ end
128
+ it 'can update the Database' do
129
+ @srf.to_sqt(@output, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest', :update_db_path => true)
130
+ regexp = Regexp.new("Database\t/.*/opd1_2runs_2mods/sequest/ecoli_K12_ncbi_20060321.fasta")
131
+ updated_db = IO.readlines(@output).any? do |line|
132
+ line =~ regexp
133
+ end
134
+ updated_db.should be_true
135
+ del(@output)
136
+ end
137
+ end
138
+ end