mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,452 @@
1
+
2
+ require File.expand_path( File.dirname(__FILE__) + '/../../spec_helper' )
3
+
4
+ require 'spec_id'
5
+ require 'spec_id/sequest/pepxml'
6
+ #require 'ms/mzxml'
7
+
8
+
9
+ NODELETE = false
10
+
11
+ describe Sequest::PepXML::SearchHit, 'making enzyme calculations on sequences' do
12
+
13
+ before(:each) do
14
+ @tf_params_fullKRP = Tfiles + "/bioworks32.params"
15
+ # The enzyme is: 1 KR P
16
+ @tf_params_justKR = Tfiles + "/bioworks33.params"
17
+ end
18
+
19
+ it 'calculates the number of tolerant termini' do
20
+ exp = [{
21
+ # full KR/P
22
+ 'K.EPTIDR.E' => 2,
23
+ 'K.PEPTIDR.E' => 1,
24
+ 'F.EEPTIDR.E' => 1,
25
+ 'F.PEPTIDW.R' => 0,
26
+ },
27
+ {
28
+ # just KR
29
+ 'K.EPTIDR.E' => 2,
30
+ 'K.PEPTIDR.E' => 2,
31
+ 'F.EEPTIDR.E' => 1,
32
+ 'F.PEPTIDW.R' => 0,
33
+ }
34
+ ]
35
+ scall = Sequest::PepXML::SearchHit
36
+ sym = :calc_num_tol_term
37
+ params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
38
+ params_ar.zip(exp) do |params,hash|
39
+ hash.each do |seq, val|
40
+ scall.send(sym, params, seq).should == val
41
+ end
42
+ end
43
+ end
44
+
45
+ it 'calculates number of missed cleavages' do
46
+ exp = [{
47
+ "K.EPTIDR.E" => 0,
48
+ "K.PEPTIDR.E" => 0,
49
+ "F.EEPTIDR.E" => 0,
50
+ "F.PEPTIDW.R" => 0,
51
+ "F.PERPTIDW.R" => 0,
52
+ "F.PEPKPTIDW.R" => 0,
53
+ "F.PEPKTIDW.R" => 1,
54
+ "K.RTTIDR.E" => 1,
55
+ "K.RTTIKK.E" => 2,
56
+ "F.PKEPRTIDW.R" => 2,
57
+ "F.PKEPRTIDKP.R" => 2,
58
+ "F.PKEPRAALKPEERPTIDKW.R" => 3,
59
+ },
60
+ {
61
+ "K.EPTIDR.E" => 0,
62
+ "K.PEPTIDR.E" => 0,
63
+ "F.EEPTIDR.E" => 0,
64
+ "F.PEPTIDW.R" => 0,
65
+ "F.PERPTIDW.R" => 1,
66
+ "F.PEPKPTIDW.R" => 1,
67
+ "F.PEPKTIDW.R" => 1,
68
+ "K.RTTIDR.E" => 1,
69
+ "K.RTTIKK.E" => 2,
70
+ "F.PKEPRTIDW.R" => 2,
71
+ "F.PKEPRTIDKP.R" => 3,
72
+ "F.PKEPRAALKPEERPTIDKW.R" => 5,
73
+ }
74
+ ]
75
+
76
+ params_ar = [Sequest::Params.new(@tf_params_fullKRP), Sequest::Params.new(@tf_params_justKR)]
77
+ scall = Sequest::PepXML::SearchHit
78
+ sym = :calc_num_missed_cleavages
79
+ #params_ar[1] = params_ar[0]
80
+ params_ar.zip(exp) do |params, hash|
81
+ hash.each do |seq, val|
82
+ scall.send(sym, params, seq).should == val
83
+ end
84
+ end
85
+ end
86
+
87
+ end
88
+
89
+ describe Sequest::PepXML, " created from small bioworks.xml" do
90
+
91
+ spec_large do
92
+ before(:all) do
93
+ tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
94
+
95
+ tf_params = Tfiles + "/bioworks32.params"
96
+ tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
97
+ out_path = Tfiles
98
+ @pepxml_objs = Sequest::PepXML.set_from_bioworks(tf_bioworks_xml, :params => tf_params, :ms_data => tf_mzxml_path, :out_path => out_path)
99
+ end
100
+
101
+ it 'gets some spectrum queries' do
102
+ @pepxml_objs.each do |obj|
103
+ (obj.spectrum_queries.size > 2).should be_true
104
+ (obj.spectrum_queries.first.search_results.first.search_hits.size > 0).should be_true
105
+ end
106
+ #@pepxml_objs.each do |pep| puts pep.to_pepxml end
107
+ end
108
+ end
109
+ end
110
+
111
+
112
+
113
+ describe Sequest::PepXML, " created from large bioworks.xml" do
114
# Checks a list of [expected, reader] pairs against obj (really any old
# array of two-element arrays will do).
#
# obj  - object that responds to every reader in arrs
# arrs - array of [expected_value, reader_name] pairs
#
# Each reader is called on obj with send. Float expectations are compared
# with be_close (tolerance 0.0000000001); everything else is compared with ==.
def assert_equal_pairs(obj, arrs)
  arrs.each do |expected, reader|
    actual = obj.send(reader)
    case expected
    when Float
      actual.should be_close(expected, 0.0000000001)
    else
      actual.should == expected
    end
  end
end
133
+
134
# Same check as assert_equal_pairs, but for pairs written the other way
# round ([reader, expected]): the first two elements of every pair are
# swapped before the pairs are handed to assert_equal_pairs.
#
# obj  - object whose readers are invoked
# arrs - array of [reader_name, expected_value] pairs
#
# The swap is performed on copies, so the caller's arrays are never
# modified. Swapping in place would un-swap the pairs if the same list were
# passed in a second time, and would raise on frozen pairs.
def assert_equal_pairs_swapped(obj, arrs)
  swapped = arrs.map do |arr|
    pair = arr.dup
    pair[0], pair[1] = pair[1], pair[0]
    pair
  end
  assert_equal_pairs(obj, swapped)
end
141
+
142
+ spec_large do
143
+ before(:all) do
144
+ st = Time.new
145
+ params = Tfiles + "/opd1/sequest.3.2.params"
146
+ bioworks_xml = Tfiles_l + "/opd1/bioworks.000.oldparams.xml"
147
+ mzxml_path = Tfiles_l + "/opd1"
148
+ out_path = Tfiles
149
+ @pepxml_version = 18
150
+ @pepxml_objs = Sequest::PepXML.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => @pepxml_version})
151
+ puts "- takes #{Time.new - st} secs"
152
+ end
153
+
154
+ it 'extracts MSMSPipelineAnalysis' do
155
+ ######## HMMMMM...
156
+ Sequest::PepXML.pepxml_version.should == @pepxml_version
157
+
158
+ # MSMSPipelineAnalysis
159
+ po = @pepxml_objs.first
160
+ msms_pipeline = po.msms_pipeline_analysis
161
+ msms_pipeline.xmlns.should == 'http://regis-web.systemsbiology.net/pepXML'
162
+ msms_pipeline.xmlns_xsi.should == 'http://www.w3.org/2001/XMLSchema-instance'
163
+ msms_pipeline.xsi_schema_location.should == 'http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd'
164
+ msms_pipeline.summary_xml.should == '000.xml'
165
+ end
166
+
167
+ it 'extracts MSmSRunSummary' do
168
+ # MSMSRunSummary
169
+ rs = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary
170
+ rs.base_name.should =~ /\/000/
171
+ assert_equal_pairs(rs, [ ['ThermoFinnigan', :ms_manufacturer], ['LCQ Deca XP Plus', :ms_model], ['ESI', :ms_ionization], ['Ion Trap', :ms_mass_analyzer], ['UNKNOWN', :ms_detector], ['raw', :raw_data_type], ['.mzXML', :raw_data], ])
172
+ end
173
+
174
+ it 'extracts SampleEnzyme' do
175
+ # SampleEnzyme
176
+ se = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.sample_enzyme
177
+ assert_equal_pairs(se, [ ['Trypsin', :name], ['KR', :cut], [nil, :no_cut], ['C', :sense], ])
178
+ end
179
+
180
+ it 'extracts SearchSummary' do
181
+ # SearchSummary
182
+ ss = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary
183
+ ss.is_a?(Sequest::PepXML::SearchSummary).should be_true
184
+ ss.base_name.should =~ /\/000/
185
+ ss.peptide_mass_tol.should =~ /1\.500/
186
+ assert_equal_pairs_swapped(ss, [ # normal attributes
187
+ [:search_engine, "SEQUEST"], [:precursor_mass_type, "average"], [:fragment_mass_type, "average"], [:out_data_type, "out"], [:out_data, ".tgz"], [:search_id, "1"],
188
+
189
+ # enzymatic_search_constraint
190
+ [:enzyme, 'Trypsin'], [:max_num_internal_cleavages, '2'], [:min_number_termini, '2'],
191
+
192
+ # parameters
193
+ [:fragment_ion_tol, "1.0000"], [:ion_series, "0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0"], [:max_num_differential_AA_per_mod, "3"], [:nucleotide_reading_frame, "0"], [:num_output_lines, "10"], [:remove_precursor_peak, "0"], [:ion_cutoff_percentage, "0.0000"], [:match_peak_count, "0"], [:match_peak_allowed_error, "1"], [:match_peak_tolerance, "1.0000"], [:protein_mass_filter, "0 0"],
194
+ ])
195
+
196
+ end
197
+ it 'extracts SearchDatabase' do
198
+ # SearchDatabase
199
+ sd = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary.search_database
200
+ sd.is_a?(Sequest::PepXML::SearchDatabase).should be_true
201
+ assert_equal_pairs_swapped(sd, [ [:local_path, "C:\\Xcalibur\\database\\ecoli_K12.fasta"], [:seq_type, 'AA'], ])
202
+ end
203
+
204
+ it 'returns SpectrumQueries' do
205
+ # SpectrumQueries
206
+ sq = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.spectrum_queries
207
+ spec = sq.first
208
+ assert_equal_pairs_swapped(spec, [
209
+ [:spectrum, "000.100.100.1"], [:start_scan, "100"], [:end_scan, "100"],
210
+ #[:precursor_neutral_mass, "1074.5920"], # out2summary
211
+ [:precursor_neutral_mass, 1074.666926], # mine
212
+ [:assumed_charge, 1], [:index, "1"],
213
+ ])
214
+ sh = spec.search_results.first.search_hits.first
215
+ assert_equal_pairs_swapped(sh, [
216
+ # normal attributes
217
+ [:hit_rank, 1],
218
+ [:peptide, "SIYFRNFK"],
219
+ [:peptide_prev_aa, "R"],
220
+ [:peptide_next_aa, "G"],
221
+ [:protein, "gi|16130084|ref|NP_416651.1|"],
222
+ [:num_tot_proteins, 1],
223
+ [:num_matched_ions, 4],
224
+ [:tot_num_ions, 14],
225
+ #[:calc_neutral_pep_mass, "1074.1920"], # out2summary
226
+ [:calc_neutral_pep_mass, 1074.23261], # mine
227
+ #[:massdiff, "+0.400000"], # out2summary
228
+ [:massdiff, 0.434316000000081], # mine
229
+ [:num_tol_term, 2], [:num_missed_cleavages, 1], [:is_rejected, 0],
230
+
231
+ # search_score
232
+ [:xcorr, 0.4], [:deltacn, 0.023], [:deltacnstar, "0"], [:spscore, 78.8], [:sprank, 1],
233
+ ])
234
+
235
+ spec = sq[1]
236
+ assert_equal_pairs_swapped(spec, [
237
+ [:spectrum, "000.1000.1000.1"], [:start_scan, "1000"], [:end_scan, "1000"], #[:precursor_neutral_mass, "663.1920"], # out2summary
238
+ [:precursor_neutral_mass, 663.206111], # mine
239
+ [:assumed_charge, 1], [:index, "2"],
240
+ ])
241
+
242
+ sh = spec.search_results.first.search_hits.first
243
+ assert_equal_pairs_swapped(sh, [
244
+ # normal attributes
245
+ [:hit_rank, 1], [:peptide, "ALADFK"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "S"], [:protein, "gi|16128765|ref|NP_415318.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 10],
246
+ [:num_tol_term, 2], [:num_missed_cleavages, 0], [:is_rejected, 0],
247
+ #[:massdiff, "-0.600000"], # out2summary
248
+ [:massdiff, -0.556499000000031], # mine
249
+ #[:calc_neutral_pep_mass, 663.7920], # out2summary
250
+ [:calc_neutral_pep_mass, 663.76261], # mine
251
+
252
+ # search_score
253
+ [:xcorr, 0.965], [:deltacn, 0.132], [:deltacnstar, "0"], [:spscore, 81.1], [:sprank, 1],
254
+ ])
255
+
256
+ spec = sq[9]
257
+ assert_equal_pairs_swapped(spec, [
258
+ [:spectrum, "000.1008.1008.2"], [:start_scan, "1008"], [:end_scan, "1008"], [:assumed_charge, 2],
259
+ #[:precursor_neutral_mass, "691.0920"], # out2summary
260
+ [:precursor_neutral_mass, 691.150992], # mine
261
+ ])
262
+
263
+ sh = spec.search_results.first.search_hits.first
264
+ assert_equal_pairs_swapped(sh, [
265
+ # normal attributes
266
+ [:hit_rank, 1], [:peptide, "RLFTR"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "A"], [:protein, "gi|16130457|ref|NP_417027.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 8], [:num_tol_term, 2],
267
+
268
+ #[:num_missed_cleavages, "0"], # out2summary misses this!
269
+ [:num_missed_cleavages, 1],
270
+ [:is_rejected, 0],
271
+ #[:calc_neutral_pep_mass, "691.7920"], # out2summary
272
+ [:calc_neutral_pep_mass, 691.82261], # mine
273
+ #[:massdiff, "-0.700000"], # out2summary
274
+ [:massdiff, -0.67161800000008], # mine
275
+
276
+ # search_score
277
+ [:xcorr, 0.903], [:deltacn, 0.333], [:deltacnstar, "0"], [:spscore, 172.8], [:sprank, 1],
278
+ ])
279
+ end
280
+
281
+ it 'can generate correct pepxml file' do
282
+
283
+ ## IF OUR OBJECT IS CORRECT, THEN WE GET THE OUTPUT:
284
+ string = @pepxml_objs.first.to_pepxml
285
+ ans_lines = IO.read(Tfiles + "/opd1/000.my_answer.100lines.xml").split("\n")
286
+ base_name_re = /base_name=".*?files\//o
287
+ date_re = /date=".*?"/
288
+ string.split("\n").each_with_index do |line,i|
289
+ if i > 99 ; break end
290
+ ans, exp =
291
+ if i == 1
292
+ [line.sub(date_re,''), ans_lines[i].sub(date_re,'')]
293
+ elsif i == 2
294
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t")]
295
+ elsif i == 6
296
+ [line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t\t")]
297
+ else
298
+ [line, ans_lines[i]]
299
+ end
300
+
301
+ #ans.split('').zip(exp.split('')) do |l,a|
302
+ # if l != a
303
+ # puts line
304
+ # puts ans_lines[i]
305
+ # puts l
306
+ # puts a
307
+ # end
308
+ #end
309
+ if ans != exp
310
+ puts ans
311
+ puts exp
312
+ end
313
+ ans.should == exp
314
+ #line.sub(base_name_re,'').should == ans_lines[i].sub(base_name_re,'')
315
+ end
316
+ end
317
+ end
318
+ end
319
+
320
+
321
+
322
+ describe Sequest::PepXML::Modifications do
323
+ before(:each) do
324
+ tf_params = Tfiles + "/bioworks32.params"
325
+ @params = Sequest::Params.new(tf_params)
326
+ # The params object here is completely unnecessary for this test, except
327
+ # that it sets up the mass table
328
+ @obj = Sequest::PepXML::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
329
+ end
330
+ it 'creates a mod_symbols_hash' do
331
+ answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
332
+ @obj.mod_symbols_hash.should == answ
333
+ ## need more here
334
+ end
335
+
336
+ it 'creates a ModificationInfo object given a special peptide sequence' do
337
+ mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
338
+ @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
339
+ @params.term_diff_search_options = "14.20000 12.33000"
340
+ mod = Sequest::PepXML::Modifications.new(@params, mod_string)
341
+ ## no mods
342
+ peptide = "PEPTIDE"
343
+ mod.modification_info(peptide).should be_nil
344
+ peptide = "]M*EC^S@IDM#M*EMSCM["
345
+ modinfo = mod.modification_info(peptide)
346
+ modinfo.modified_peptide.should == peptide
347
+ modinfo.mod_nterm_mass.should be_close(146.40054, 0.000001)
348
+ modinfo.mod_cterm_mass.should be_close(160.52994, 0.000001)
349
+ end
350
+
351
+ end
352
+
353
+ describe Sequest::PepXML::SearchHit::ModificationInfo do
354
+
355
+ before(:each) do
356
+ modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar|
357
+ Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new(ar)
358
+ end
359
+ hash = {
360
+ :mod_nterm_mass => 520.2,
361
+ :modified_peptide => "MOD*IFI^E&D",
362
+ :mod_aminoacid_masses => modaaobjs,
363
+ }
364
+ #answ = "<modification_info mod_nterm_mass=\"520.2\" modified_peptide=\"MOD*IFI^E&amp;D\">\n\t<mod_aminoacid_mass position=\"3\" mass=\"150.3\"/>\n\t<mod_aminoacid_mass position=\"6\" mass=\"345.2\"/>\n</modification_info>\n"
365
+ @obj = Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
366
+ end
367
+
368
# Builds a Regexp that matches the string st literally (every regexp
# metacharacter in st is escaped).
def _re(st)
  Regexp.new(Regexp.escape(st))
end
371
+
372
+ it 'can produce pepxml' do
373
+ answ = @obj.to_pepxml
374
+ answ.should =~ _re('<modification_info')
375
+ answ.should =~ _re(" mod_nterm_mass=\"520.2\"")
376
+ answ.should =~ _re(" modified_peptide=\"MOD*IFI^E&amp;D\"")
377
+ answ.should =~ _re("<mod_aminoacid_mass")
378
+ answ.should =~ _re(" position=\"3\"")
379
+ answ.should =~ _re(" mass=\"150.3\"")
380
+ answ.should =~ _re(" position=\"6\"")
381
+ answ.should =~ _re(" mass=\"345.2\"")
382
+ answ.should =~ _re("</modification_info>")
383
+ end
384
+ end
385
+
386
+ describe 'bioworks file with modifications transformed into pepxml' do
387
+
388
+ spec_large do
389
+ before(:all) do
390
+ modfiles_sequest_dir = Tfiles_l + '/opd1_2runs_2mods/sequest/'
391
+ modfiles_data_dir = Tfiles_l + '/opd1_2runs_2mods/data/'
392
+ @srgfile = modfiles_sequest_dir + 'tmp.srg'
393
+ @out_path = modfiles_sequest_dir + 'pepxml'
394
+ modfiles = %w(020 040).map do |file|
395
+ modfiles_sequest_dir + file + ".srf"
396
+ end
397
+ objs = Sequest::PepXML.set_from_bioworks( SRFGroup.new(modfiles).to_srg(@srgfile), {:ms_data => modfiles_data_dir, :out_path => @out_path, :print => true, :backup_db_path => '/project/marcotte/marcotte/ms/database'} )
398
+ @out_files = %w(020 040).map do |file|
399
+ @out_path + '/' + file + '.xml'
400
+ end
401
+ end
402
+
403
+ after(:all) do
404
+ File.unlink(@srgfile) unless NODELETE
405
+ FileUtils.rm_r(@out_path)
406
+ #@out_files.each do |fn|
407
+ # File.unlink(fn) unless NODELETE
408
+ #end
409
+ end
410
+
411
# Splits string on whitespace and expects every resulting piece to occur
# (literally) in each line of lines that matches find_line_regexp.
# Lines that do not match find_line_regexp are ignored.
#
# NOTE(review): if no line matches find_line_regexp nothing is checked at
# all, so the call passes vacuously -- confirm that is intended.
def match_modline_pieces(lines, find_line_regexp, string)
  fragments = string.split(' ').map { |token| Regexp.new(Regexp.escape(token)) }
  lines.each do |line|
    next unless line =~ find_line_regexp

    fragments.each { |fragment| line.should =~ fragment }
  end
end
423
+
424
+ it 'gets modifications right in real run' do
425
+ @out_files.each do |fn|
426
+ fn.should exist
427
+ beginning = IO.read(fn)
428
+ lines = beginning.split("\n")
429
+ [
430
+ [/aminoacid="M"/, '<aminoacid_modification symbol="*" massdiff="+15.9994" aminoacid="M" variable="Y" binary="N" mass="147.192"'],
431
+
432
+ [/aminoacid="S"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="S" variable="Y" binary="N" mass="167.0581"'],
433
+ [/aminoacid="T"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="T" variable="Y" binary="N" mass="181.085"'],
434
+ [/aminoacid="Y"/, '<aminoacid_modification symbol="#" massdiff="+79.9799" aminoacid="Y" variable="Y" binary="N" mass="243.1559"'],
435
+ [/parameter name="diff_search_options"/, '<parameter name="diff_search_options" value="15.999400 M 79.979900 STY 0.000000 M 0.000000 X 0.000000 T 0.000000 Y"/>'],
436
+ ].each do |a,b|
437
+ match_modline_pieces(lines, a, b)
438
+ end
439
+ [
440
+ '<modification_info modified_peptide="Y#RLGGS#T#K">',
441
+ '<mod_aminoacid_mass position="1" mass="243.1559"/>',
442
+ '<mod_aminoacid_mass position="7" mass="167.0581"/>',
443
+ '</modification_info>',
444
+ '<mod_aminoacid_mass position="9" mass="181.085"/>'
445
+ ].each do |line|
446
+ beginning.should =~ /#{Regexp.escape(line)}/ # "a modification info for a peptide")
447
+ end
448
+ end
449
+ end
450
+ end
451
+ end
452
+
@@ -0,0 +1,138 @@
1
+ require File.expand_path( File.dirname(__FILE__) + '/../spec_helper' )
2
+
3
+ require 'spec_id/srf'
4
+
5
+ SpecHelperHeaderHash = {
6
+ 'SQTGenerator' => 'mspire',
7
+ 'SQTGeneratorVersion' => String,
8
+ 'Database' => 'C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta',
9
+ 'FragmentMasses' => 'AVG',
10
+ 'PrecursorMasses' => 'AVG',
11
+ 'StartTime' => nil,
12
+ 'Alg-MSModel' => 'LCQ Deca XP',
13
+ 'Alg-PreMassUnits' => 'amu',
14
+ 'DBLocusCount' => '4237',
15
+ 'Alg-FragMassTol' => '1.0000',
16
+ 'Alg-PreMassTol' => '1.4000',
17
+ 'Alg-IonSeries' => '0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0',
18
+ 'Alg-Enzyme' => 'Trypsin(KR/P) (2)',
19
+ 'Comment' => ['Created from Bioworks .srf file'],
20
+ 'StaticMod' => ['C=160.1901','Cterm=10.1230','E=161.4455'],
21
+ 'DynamicMod' => ['STY*=+79.97990', 'M#=+14.02660'],
22
+ }
23
+
24
+ SpecHelperOtherLines =<<END
25
+ S 2 2 1 0.0 VELA 391.04541015625 3021.5419921875 0.0 0
26
+ S 3 3 1 0.0 VELA 446.009033203125 1743.96911621094 0.0 122
27
+ M 1 1 445.5769264522 0.0 0.245620265603065 16.6666660308838 1 6 R.SNSK.S U
28
+ L gi|16128266|ref|NP_414815.1|
29
+ END
30
+
31
+ SpecHelperOtherLinesEnd =<<END
32
+ L gi|90111093|ref|NP_414704.4|
33
+ M 10 17 1298.5350544522 0.235343858599663 0.823222815990448 151.717300415039 12 54 K.LQKIITNSY*K U
34
+ L gi|90111124|ref|NP_414904.2|
35
+ END
36
+
37
+ describe 'converting a large srf to sqt' do
38
# Removes file if it exists; does nothing when it is already absent.
def del(file)
  File.unlink(file) if File.exist?(file)
end
43
+
44
# Returns true when every header line agrees with hash, false otherwise.
#
# Each line is chomped and split on tabs; the second field is the key and
# the third field the value. The expectation stored in hash for that key is
# interpreted as follows:
#   * an Array         - the value must be one of its elements
#   * the String class - the value only has to be a String
#   * anything else    - the value must == the expectation
# A mismatch against an Array or a plain expectation prints "FAILED: "
# followed by the key, the value and the expectation. Checking stops at the
# first line that fails.
def header_hash_match(header_lines, hash)
  header_lines.all? do |line|
    _tag, key, value = line.chomp.split("\t")
    expected = hash[key]

    # The String class acts as a wildcard: any string value is accepted.
    next value.is_a?(String) if expected == String

    matched = expected.is_a?(Array) ? expected.include?(value) : value == expected
    unless matched
      puts "FAILED: "
      p key
      p value
      p expected
    end
    matched
  end
end
73
+
74
+ spec_large do
75
+ before(:all) do
76
+ @file = Tfiles_l + '/opd1_static_diff_mods/000.srf'
77
+ @output = Tfiles_l + '/opd1_static_diff_mods/000.sqt.tmp'
78
+ @srf = SRF.new(@file)
79
+ @original_db_filename = @srf.header.db_filename
80
+ end
81
+ it 'converts without bothering with the database' do
82
+ @srf.to_sqt(@output)
83
+ @output.should exist
84
+ lines = File.readlines(@output)
85
+ lines.size.should == 80910
86
+ header_lines = lines.grep(/^H/)
87
+ (header_lines.size > 10).should be_true
88
+ header_hash_match(header_lines, SpecHelperHeaderHash).should be_true
89
+ other_lines = lines.grep(/^[^H]/)
90
+ other_lines[0,4].join('').should == SpecHelperOtherLines
91
+ other_lines[-3,3].join('').should == SpecHelperOtherLinesEnd
92
+ del(@output)
93
+ end
94
+ it 'warns if the db path is incorrect and we want to update db info' do
95
+ # requires some knowledge of how the database file is extracted
96
+ # internally
97
+ wacky_path = '/not/a/real/path/wacky.fasta'
98
+ @srf.header.db_filename = wacky_path
99
+ my_error_string = ''
100
+ StringIO.open(my_error_string, 'w') do |strio|
101
+ $stderr = strio
102
+ @srf.to_sqt(@output, :db_info => true)
103
+ end
104
+ my_error_string.should include(wacky_path)
105
+ @srf.header.db_filename = @original_db_filename
106
+ $stderr = STDERR
107
+ @output.should exist
108
+ IO.readlines(@output).size.should == 80910
109
+ del(@output)
110
+ end
111
+ it 'can get db info with correct path' do
112
+ @srf.to_sqt(@output, :db_info => true, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest')
113
+ @output.should exist
114
+ lines = IO.readlines(@output)
115
+ has_md5 = lines.any? do |line|
116
+ line =~ /DBMD5Sum\s+202b1d95e91f2da30191174a7f13a04e/
117
+ end
118
+ has_md5.should be_true
119
+
120
+ has_seq_len = lines.any? do |line|
121
+ # frozen
122
+ line =~ /DBSeqLength\s+1342842/
123
+ end
124
+ has_seq_len.should be_true
125
+ lines.size.should == 80912
126
+ del(@output)
127
+ end
128
+ it 'can update the Database' do
129
+ @srf.to_sqt(@output, :new_db_path => Tfiles_l + '/opd1_2runs_2mods/sequest', :update_db_path => true)
130
+ regexp = Regexp.new("Database\t/.*/opd1_2runs_2mods/sequest/ecoli_K12_ncbi_20060321.fasta")
131
+ updated_db = IO.readlines(@output).any? do |line|
132
+ line =~ regexp
133
+ end
134
+ updated_db.should be_true
135
+ del(@output)
136
+ end
137
+ end
138
+ end