mspire 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+ require 'ms/cvlist'
3
+ require 'cv'
4
+
5
+ describe 'appending CV params objects to an MS::CVList' do
6
+ describe 'intelligently appending params with #param' do
7
+ before do
8
+ @cv = MS::CVList.new
9
+ end
10
+ it 'sends detailed descriptions to CV::Param.new' do
11
+ arglist = [
12
+ ['IMS', 'IMS:1000052', 'position z', 22],
13
+ ['IMS', 'IMS:1000030', 'continuous'],
14
+ ['IMS', 'IMS:1000052', 'position z', 22, 'UO:0000008'],
15
+ ['IMS', 'IMS:1000030', 'continuous', 'UO:0000008'],
16
+ ['IMS', 'IMS:1000052', 'position z', 22, MS::CV::Param.new('UO:0000008')],
17
+ ['IMS', 'IMS:1000030', 'continuous', MS::CV::Param.new('UO:0000008')],
18
+ ]
19
+ arglist.each do |args|
20
+ @cv.param *args
21
+ end
22
+ @cv.size.should == arglist.size
23
+ arglist.each_with_index do |args, i|
24
+ @cv[i].should == MS::CV::Param.new(*args)
25
+ end
26
+ end
27
+ it 'deciphers short accession descriptions' do
28
+ @cv.param 'MS:1000004' # sample mass
29
+ @cv.param 'IMS:1000042', 23 # max count of pixels x
30
+ {cv_ref: 'MS', accession: 'MS:1000004', name: 'sample mass', value: nil}.each do |key,val|
31
+ @cv[0].send(key).should == val
32
+ end
33
+ {cv_ref: 'IMS', accession: 'IMS:1000042', name: 'max count of pixels x', value: 23}.each do |key,val|
34
+ @cv[1].send(key).should == val
35
+ end
36
+ end
37
+ describe 'appending on initialization' do
38
+ it 'can be done with a block' do
39
+ cvlist = MS::CVList.new do
40
+ param 'MS:1000004' # sample mass
41
+ param 'IMS:1000042', 23 # max count of pixels x
42
+ end
43
+ cvlist.size.should == 2
44
+ end
45
+ end
46
+
47
+ it 'can be done with brackets' do
48
+ args = ['IMS', 'IMS:1000052', 'position z', 22]
49
+ param_obj = CV::Param.new(*args)
50
+ cvlist = MS::CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
51
+ cvlist.size.should == 5
52
+ cvlist[0].should == cvlist[1]
53
+ cvlist.each do |param|
54
+ param.accession.should_not be_nil
55
+ param.name.should_not be_nil
56
+ param.cv_ref.should_not be_nil
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,351 @@
1
+ require 'spec_helper.rb'
2
+
3
+ require 'ms/digester'
4
+ require 'pp'
5
+
6
+ describe 'a digester' do
7
+ before do
8
+ @digester = MS::Digester.new('arg', 'R')
9
+ end
10
+
11
# Pretty-prints +input+ on a single line using PP.singleline_pp.
#
# input - any object to render.
# str   - buffer the rendering is appended to. Defaults to a fresh, unfrozen
#         String so the helper keeps working even when string literals are
#         frozen (PP appends to the buffer in place).
#
# Returns the buffer that was written to.
def spp(input, str = ''.dup)
  PP.singleline_pp(input, str)
end
14
+
15
# Builds a synthetic sequence of n * 1000 residues for the speed specs:
# 'A' everywhere, except that every split-th residue is an 'R'.
#
# n     - length of the sequence, in thousands of residues.
# split - distance between consecutive 'R' residues; a value of 1 or less
#         yields a sequence made only of 'R'.
#
# Returns the sequence as a String.
def nk_string(n, split)
  since_cut = 0
  residues = (n * 1000).times.map do
    since_cut += 1
    next 'A' if since_cut < split
    # reached the split distance: emit the 'R' and start counting again
    since_cut = 0
    'R'
  end
  residues.join('')
end
31
+
32
+ it 'finds cleavage site indices' do
33
+ {
34
+ "" => [0,0],
35
+ "A" => [0,1],
36
+ "R" => [0,1],
37
+ "AAA" => [0,3],
38
+ "RAA" => [0,1,3],
39
+ "ARA" => [0,2,3],
40
+ "AAR" => [0,3],
41
+ "RRA" => [0,1,2,3],
42
+ "RAR" => [0,1,3],
43
+ "RRR" => [0,1,2,3],
44
+
45
+ "R\nR\nR" => [0,2,4,5],
46
+ "R\n\n\nR\nR\n\n" => [0,4,6,9]
47
+ }.each do |sequence, expected|
48
+ @digester.cleavage_sites(sequence).should == expected
49
+ end
50
+ end
51
+
52
# Checks cleavage_sites for a digester built with 'R' as the cut residue and
# 'P' as the exception residue. The expectations below show that no site is
# reported after an 'R' that is followed by 'P' (e.g. "RPR" => [0,3]).
it 'finds cleavage sites with exception' do
  @digester = MS::Digester.new('argp', 'R', 'P')
  {
    "" => [0,0],
    "A" => [0,1],
    "R" => [0,1],
    "AAA" => [0,3],
    "RAA" => [0,1,3],
    "ARA" => [0,2,3],
    "AAR" => [0,3],
    "RRA" => [0,1,2,3],
    "RAR" => [0,1,3],
    "RRR" => [0,1,2,3],

    # NOTE: each sequence may appear only once in this literal -- Ruby keeps
    # the last value for a duplicated hash key and silently drops the earlier
    # one, so a repeated key would leave an expectation that is never checked.
    "PR" => [0,2],
    "PRR" => [0,2,3],
    "RPR" => [0,3],
    "RRP" => [0,1,3],
    "APRA" => [0,3,4],
    "ARPA" => [0,4],
    "ARPARA" => [0,5,6],
    "R\nPR\nR" => [0,5,6],
    "RP\nR\nR" => [0,5,6],
    "RP\nR\nR\n" => [0,5,7]
  }.each do |sequence, expected|
    @digester.cleavage_sites(sequence).should == expected
  end
end
81
+
82
+
83
+
84
+ it 'finds cleavage sites with offset and limit' do
85
+ {
86
+ "RxxR" => [2,4],
87
+ "RxAxR" => [2,4],
88
+ "RxAAAxR" => [2,4],
89
+ "RxRRRxR" => [2,3,4]
90
+ }.each do |sequence, expected|
91
+ @digester.cleavage_sites(sequence, 2, 2).should == expected
92
+ end
93
+ end
94
+
95
+ it 'finds cleavage sites fast' do
96
+ str = nk_string(10, 1000)
97
+ @digester.cleavage_sites(str).length.should == 11
98
+ benchmark(20) do |x|
99
+ x.report("10kx - fragments") do
100
+ 10000.times { @digester.cleavage_sites(str) }
101
+ end
102
+ end
103
+ end
104
+
105
+ it 'digests proteins' do
106
+ {
107
+ "" => [''],
108
+ "A" => ["A"],
109
+ "R" => ["R"],
110
+ "AAA" => ["AAA"],
111
+ "RAA" => ["R", "AA"],
112
+ "ARA" => ["AR", "A"],
113
+ "AAR" => ["AAR"],
114
+ "RRA" => ["R", "R", "A"],
115
+ "RAR" => ["R", "AR"],
116
+ "RRR" => ["R", "R", "R"]
117
+ }.each do |sequence, expected|
118
+ # spp(sequence)
119
+ @digester.digest(sequence).should == expected
120
+ #@digester.digest(sequence) {|frag, s, e| frag}.should == expected
121
+ end
122
+ end
123
+
124
+ it 'digests with missed cleavages' do
125
+ {
126
+ "" => [''],
127
+ "A" => ["A"],
128
+ "R" => ["R"],
129
+ "AAA" => ["AAA"],
130
+ "RAA" => ["R", "RAA", "AA"],
131
+ "ARA" => ["AR", "ARA", "A"],
132
+ "AAR" => ["AAR"],
133
+ "RRA" => ["R", "RR", "R", "RA", "A"],
134
+ "RAR" => ["R", "RAR", "AR"],
135
+ "RRR" => ["R", "RR", "R", "RR", "R"]
136
+ }.each do |sequence, expected|
137
+ @digester.digest(sequence, 1).should == expected
138
+ #@digester.digest(sequence, 1) {|frag, s, e| frag}.should == expected
139
+ end
140
+ end
141
+
142
+ it 'digests with two missed cleavages' do
143
+ {
144
+ "" => [''],
145
+ "A" => ["A"],
146
+ "R" => ["R"],
147
+ "AAA" => ["AAA"],
148
+ "RAA" => ["R", "RAA", "AA"],
149
+ "ARA" => ["AR", "ARA", "A"],
150
+ "AAR" => ["AAR"],
151
+ "RRA" => ["R", "RR", "RRA", "R", "RA", "A"],
152
+ "RAR" => ["R", "RAR", "AR"],
153
+ "RRR" => ["R", "RR", "RRR", "R", "RR", "R"]
154
+ }.each do |sequence, expected|
155
+ @digester.digest(sequence, 2).should == expected
156
+ #@digester.digest(sequence, 2) {|frag, s, e| frag}.should == expected
157
+ end
158
+ end
159
+
160
+ it 'digests fast' do
161
+ str = nk_string(10, 1000)
162
+ @digester.digest(str).length.should == 10
163
+ benchmark(20) do |x|
164
+ x.report("10kx - fragments") do
165
+ 10000.times { @digester.digest(str) }
166
+ end
167
+ end
168
+ end
169
+
170
+ it 'finds sites to be digested' do
171
+ {
172
+ "" => [[0,0]],
173
+ "A" => [[0,1]],
174
+ "R" => [[0,1]],
175
+ "AAA" => [[0,3]],
176
+ "RAA" => [[0,1],[1,3]],
177
+ "ARA" => [[0,2],[2,3]],
178
+ "AAR" => [[0,3]],
179
+ "RRA" => [[0,1],[1,2],[2,3]],
180
+ "RAR" => [[0,1],[1,3]],
181
+ "RRR" => [[0,1],[1,2],[2,3]]
182
+ }.each do |sequence, expected|
183
+ @digester.site_digest(sequence).should == expected
184
+ end
185
+ end
186
+
187
+ it 'finds sites to be digested with missed cleavages' do
188
+ {
189
+ "" => [[0,0]],
190
+ "A" => [[0,1]],
191
+ "R" => [[0,1]],
192
+ "AAA" => [[0,3]],
193
+ "RAA" => [[0,1],[0,3],[1,3]],
194
+ "ARA" => [[0,2],[0,3],[2,3]],
195
+ "AAR" => [[0,3]],
196
+ "RRA" => [[0,1],[0,2],[1,2],[1,3],[2,3]],
197
+ "RAR" => [[0,1],[0,3],[1,3]],
198
+ "RRR" => [[0,1],[0,2],[1,2],[1,3],[2,3]]
199
+ }.each do |sequence, expected|
200
+ @digester.site_digest(sequence, 1).should == expected
201
+ end
202
+ end
203
+
204
+ it 'finds sites to be digested with two missed cleavages' do
205
+ {
206
+ "" => [[0,0]],
207
+ "A" => [[0,1]],
208
+ "R" => [[0,1]],
209
+ "AAA" => [[0,3]],
210
+ "RAA" => [[0,1],[0,3],[1,3]],
211
+ "ARA" => [[0,2],[0,3],[2,3]],
212
+ "AAR" => [[0,3]],
213
+ "RRA" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]],
214
+ "RAR" => [[0,1],[0,3],[1,3]],
215
+ "RRR" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
216
+ }.each do |sequence, expected|
217
+ @digester.site_digest(sequence, 2).should == expected
218
+ end
219
+ end
220
+
221
+ it 'does site digestion fast' do
222
+ str = nk_string(10, 1000)
223
+ @digester.site_digest(str).length.should == 10
224
+ benchmark(20) do |x|
225
+ x.report("10kx - fragments") do
226
+ 10000.times { @digester.site_digest(str) }
227
+ end
228
+ end
229
+ end
230
+ end
231
+
232
+
233
+ describe 'performs as documented in readme' do
234
+ it 'runs cleavage sites documentation' do
235
+ d = MS::Digester.new('Trypsin', 'KR', 'P')
236
+ seq = "AARGGR"
237
+ sites = d.cleavage_sites(seq)
238
+ sites.should == [0, 3, 6]
239
+
240
+ seq[sites[0], sites[0+1] - sites[0]].should == "AAR"
241
+ seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
242
+
243
+ seq = "AAR \n GGR"
244
+ sites = d.cleavage_sites(seq)
245
+ sites.should == [0, 8, 11]
246
+
247
+ seq[sites[0], sites[0+1] - sites[0]].should == "AAR \n "
248
+ seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
249
+ end
250
+ end
251
+
252
+ describe 'basic trypsin digestion' do
253
+ it 'performs digestion and can specify sites of digestion' do
254
+ trypsin = MS::Digester['Trypsin']
255
+
256
+ expected = [
257
+ 'MIVIGR',
258
+ 'SIVHPYITNEYEPFAAEK',
259
+ 'QQILSIMAG']
260
+ trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG').should == expected
261
+
262
+ expected = [
263
+ 'MIVIGR',
264
+ 'MIVIGRSIVHPYITNEYEPFAAEK',
265
+ 'SIVHPYITNEYEPFAAEK',
266
+ 'SIVHPYITNEYEPFAAEKQQILSIMAG',
267
+ 'QQILSIMAG']
268
+ trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
269
+
270
+ expected = [
271
+ [0,6],
272
+ [0,24],
273
+ [6,24],
274
+ [6,33],
275
+ [24,33]]
276
+ trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
277
+ end
278
+
279
+ it 'completely ignores whitespace inside protein sequences' do
280
+ expected = [
281
+ "\tMIVIGR",
282
+ "SIVHP\nYITNEYEPFAAE K",
283
+ "QQILSI\rMAG"]
284
+ MS::Digester['Trypsin'].digest("\tMIVIGRSIVHP\nYITNEYEPFAAE KQQILSI\rMAG").should == expected
285
+ end
286
+
287
+ it 'does a trypsin digest' do
288
+ trypsin = MS::Digester[:trypsin]
289
+ {
290
+ "" => [''],
291
+ "A" => ["A"],
292
+ "R" => ["R"],
293
+ "AAA" => ["AAA"],
294
+ "RAA" => ["R", "AA"],
295
+ "ARA" => ["AR", "A"],
296
+ "AAR" => ["AAR"],
297
+ "RRA" => ["R", "R", "A"],
298
+ "RAR" => ["R", "AR"],
299
+ "RRR" => ["R", "R", "R"],
300
+ "RKR" => ["R", "K", "R"],
301
+
302
+ "ARP" => ["ARP"],
303
+ "PRA" => ["PR","A"],
304
+ "ARPARAA" => ["ARPAR", "AA"],
305
+ "RPRRR" => ["RPR", "R", "R"]
306
+ }.each do |sequence, expected|
307
+ trypsin.digest(sequence).should == expected
308
+ end
309
+ end
310
+
311
+
312
+
313
+ end
314
+
315
+ describe 'digestion with other enzymes' do
316
+
317
+ # This is how to access the already created enzyme:
318
+ # MS::Digester['Arg-C'] (or :arg_c, 'ARG-C', :ARG_C)
319
+ {
320
+ ['Arg-C', :arg_c] => {
321
+ "AARC" => ["AAR", "C"],
322
+ "AARP" => ["AARP"]
323
+ },
324
+ ['Asp-N', :asp_n] => {
325
+ "AABDS" => ["AA", "B", "DS"],
326
+ "ADZBS" => ["A", "DZ", "BS"],
327
+ "B" => %w(B),
328
+ "A" => %w(A),
329
+ "ABD" => %w(A B D),
330
+ },
331
+ ['Asp-N_ambic', :asp_n_ambic] => {
332
+ "AAEDS" => ["AA", "E", "DS"],
333
+ "ADZES" => ["A", "DZ", "ES"],
334
+ "AED" => %w(A E D),
335
+ "GDE" => %w(G D E),
336
+ "AAECCDGG" => %w(AA ECC DGG),
337
+ }
338
+ }.each do |enzyme_names, test_hash|
339
+ it "digests with '#{enzyme_names.first}'" do
340
+ digester = MS::Digester[enzyme_names.first]
341
+ digester.should == MS::Digester[enzyme_names.last]
342
+ digester.name.should == enzyme_names.first
343
+ test_hash.each do |sequence, expected|
344
+ digester.digest(sequence).should == expected
345
+ end
346
+ end
347
+ end
348
+ end
349
+
350
+
351
+
@@ -0,0 +1,100 @@
1
+ require 'spec_helper'
2
+
3
+ require 'ms/fasta'
4
+
5
+ describe 'basic fasta operations' do
6
+ before do
7
+ @headers = [">gi|5524211 [hello]", ">another B", ">again C"]
8
+ @entries = ["LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV\nGLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX\nIENY", "ABCDEF\nGHIJK", "ABCD"]
9
+ @sequences = @entries.map {|v| v.gsub("\n", '') }
10
+ @data = {}
11
+ @data['newlines'] = @headers.zip(@entries).map do |header, data|
12
+ header + "\n" + data
13
+ end.join("\n")
14
+ @data['carriage_returns_and_newlines'] = @data['newlines'].gsub("\n", "\r\n")
15
+ file_key_to_filename_pairs = @data.map do |k,v|
16
+ file_key = k + '_file'
17
+ filename = k + '.tmp'
18
+ File.open(filename, 'w') {|out| out.print v }
19
+ [file_key, filename]
20
+ end
21
+ file_key_to_filename_pairs.each {|k,v| @data[k] = v }
22
+ end
23
+
24
+ after do
25
+ @data.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
+ index = filename.sub('.tmp', '.index')
27
+ [filename, index].each do |fn|
28
+ File.unlink(fn) if File.exist? fn
29
+ end
30
+ end
31
+ end
32
+
33
# Asserts that +fasta+ yields the fixture entries in order: each entry's
# header must equal the matching @headers string without its leading '>',
# and each entry's sequence must equal the matching element of @sequences
# (the fixture sequences with their newlines removed).
#
# fasta - an enumerable whose entries respond to #header and #sequence
#         (presumably an MS::Fasta reader, as passed by the specs below --
#         TODO confirm it includes Enumerable).
def fasta_correct?(fasta)
  @headers.zip(@sequences, fasta.to_a).each do |header, sequence, entry|
    entry.header.should_not be_nil
    entry.sequence.should_not be_nil
    entry.header.should == header[1..-1] # drop the leading '>'
    entry.sequence.should == sequence
  end
end
43
+
44
+ xit 'can deliver length and description hashes' do
45
+ # need to test
46
+ end
47
+
48
+ it 'can read a file' do
49
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
50
+ MS::Fasta.open(@data[file]) do |fasta|
51
+ fasta_correct? fasta
52
+ end
53
+ end
54
+ end
55
+
56
+ it 'can read an IO object' do
57
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
58
+ File.open(@data[file]) do |io|
59
+ fasta = MS::Fasta.new(io)
60
+ fasta_correct? fasta
61
+ end
62
+ end
63
+ end
64
+
65
+ it 'can read a string' do
66
+ %w(newlines carriage_returns_and_newlines).each do |key|
67
+ fasta = MS::Fasta.new @data[key]
68
+ fasta_correct? fasta
69
+ end
70
+ end
71
+
72
+ it 'iterates entries with foreach' do
73
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
74
+ MS::Fasta.foreach(@data[file]) do |entry|
75
+ entry.should be_an_instance_of Bio::FastaFormat
76
+ end
77
+ end
78
+ end
79
+
80
# Exercises the usage examples: collecting entry ids from a file opened with
# MS::Fasta.open, and partitioning the entries of a fasta given as a string.
it 'runs the documentation' do
  fasta_file = @data['newlines_file']
  ids = MS::Fasta.open(fasta_file) do |fasta|
    fasta.map(&:entry_id)
  end
  # the block's value is handed back by MS::Fasta.open, so ids is the mapped array
  ids.is_a?(Array).should == true
  ids.should == %w(gi|5524211 another again)

  # this code is already tested above
  # File.open(fasta_file) do |io|
  #   fasta = MS::Fasta.new(io)
  # end

  # taking a string
  string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
  fasta = MS::Fasta.new(string)
  (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
  simple.first.header.include?("simple").should == true
  not_simple.first.header.include?("simple").should == false
end
100
+ end
@@ -0,0 +1,108 @@
1
+ require 'spec_helper'
2
+
3
+ require 'yaml'
4
+ path = 'ms/ident/peptide/db'
5
+ require path
6
+
7
module Kernel

  # Runs the given block with $stdout redirected to an in-memory buffer.
  #
  # Whatever stream $stdout pointed at on entry is put back afterwards, even
  # when the block raises, so nested captures and an already-redirected
  # $stdout are left as they were.
  #
  # Returns everything the block wrote to $stdout, as a String.
  def capture_stdout
    previous = $stdout
    buffer = StringIO.new
    $stdout = buffer
    yield
    buffer.string
  ensure
    $stdout = previous
  end

end
20
+
21
+
22
+ describe 'a uniprot fasta file' do
23
+
24
+ before do
25
+ @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
26
+ end
27
+
28
+ describe 'amino acid expansion' do
29
+
30
+ it 'can expand out wildcard amino acid combinations' do
31
+ array = MS::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
32
+ array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
33
+ end
34
+
35
+ it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
36
+ # this is from real data
37
+ worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
38
+ MS::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
39
+ end
40
+
41
+ it 'returns the peptide in the array if no expansion' do
42
+ array = MS::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
43
+ array.should == ['ZZZZZ']
44
+ end
45
+
46
+ end
47
+
48
+ describe 'creating a peptide centric database' do
49
+ before do
50
+
51
+ #@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
52
+ @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
53
+ end
54
+
55
+ it 'converts a fasta file into peptide centric db' do
56
+ output_files = MS::Ident::Peptide::Db.cmdline([@fasta_file])
57
+ output_files.first.should == File.expand_path(@output_file)
58
+ File.exist?(@output_file).should == true
59
+ hash = {}
60
+ YAML.load_file(@output_file).each do |k,v|
61
+ hash[k] = v.split("\t")
62
+ end
63
+ sorted = hash.sort
64
+ # these are merely frozen, not perfectly defined
65
+ sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
66
+ sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
67
+ sorted.size.should == 728
68
+ File.unlink(@output_file)
69
+ end
70
+
71
# Runs the command line with --list-enzymes, expecting it to print the enzyme
# names (one per line) to stdout and then terminate via SystemExit.
it 'lists approved enzymes and exits' do
  exited = false
  output = capture_stdout do
    begin
      MS::Ident::Peptide::Db.cmdline(['--list-enzymes'])
    rescue SystemExit
      # record the exit so it can be asserted on; a bare pass-through here
      # would let the spec succeed even if cmdline simply returned
      exited = true
    end
  end
  exited.should == true
  lines = output.split("\n")
  lines.include?("trypsin").should == true
  lines.include?("chymotrypsin").should == true
end
83
+ end
84
+
85
+ describe 'reading a peptide centric database' do
86
+ before do
87
+ outfiles = MS::Ident::Peptide::Db.cmdline([@fasta_file])
88
+ @outfile = outfiles.first
89
+ end
90
+
91
+ it 'creates a hash that can retrieve peptides as an array' do
92
+ hash = MS::Ident::Peptide::Db.new(@outfile)
93
+ hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
94
+ hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
95
+ end
96
+
97
+ it 'reads the file on disk with random access or is enumerable' do
98
+ MS::Ident::Peptide::Db::IO.open(@outfile) do |io|
99
+ io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
100
+ io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
101
+ io.each_with_index do |key_prots, i|
102
+ key_prots.first.should be_an_instance_of String
103
+ key_prots.last.should be_a_kind_of Array
104
+ end
105
+ end
106
+ end
107
+ end
108
+ end