mspire 0.5.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+ require 'ms/cvlist'
3
+ require 'cv'
4
+
5
+ describe 'appending CV params objects to an MS::CVList' do
6
+ describe 'intelligently appending params with #param' do
7
+ before do
8
+ @cv = MS::CVList.new
9
+ end
10
+ it 'sends detailed descriptions to CV::Param.new' do
11
+ arglist = [
12
+ ['IMS', 'IMS:1000052', 'position z', 22],
13
+ ['IMS', 'IMS:1000030', 'continuous'],
14
+ ['IMS', 'IMS:1000052', 'position z', 22, 'UO:0000008'],
15
+ ['IMS', 'IMS:1000030', 'continuous', 'UO:0000008'],
16
+ ['IMS', 'IMS:1000052', 'position z', 22, MS::CV::Param.new('UO:0000008')],
17
+ ['IMS', 'IMS:1000030', 'continuous', MS::CV::Param.new('UO:0000008')],
18
+ ]
19
+ arglist.each do |args|
20
+ @cv.param *args
21
+ end
22
+ @cv.size.should == arglist.size
23
+ arglist.each_with_index do |args, i|
24
+ @cv[i].should == MS::CV::Param.new(*args)
25
+ end
26
+ end
27
+ it 'deciphers short accession descriptions' do
28
+ @cv.param 'MS:1000004' # sample mass
29
+ @cv.param 'IMS:1000042', 23 # max count of pixels x
30
+ {cv_ref: 'MS', accession: 'MS:1000004', name: 'sample mass', value: nil}.each do |key,val|
31
+ @cv[0].send(key).should == val
32
+ end
33
+ {cv_ref: 'IMS', accession: 'IMS:1000042', name: 'max count of pixels x', value: 23}.each do |key,val|
34
+ @cv[1].send(key).should == val
35
+ end
36
+ end
37
# Both examples below build a populated list in a single expression, so they
# live in the same group (the brackets example previously sat outside it).
describe 'appending on initialization' do
  it 'can be done with a block' do
    cvlist = MS::CVList.new do
      param 'MS:1000004' # sample mass
      param 'IMS:1000042', 23 # max count of pixels x
    end
    cvlist.size.should == 2
  end

  it 'can be done with brackets' do
    args = ['IMS', 'IMS:1000052', 'position z', 22]
    param_obj = CV::Param.new(*args)
    # the five elements mix the accepted forms: a bare accession, arrays of
    # arguments, and a ready-made param object
    cvlist = MS::CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
    cvlist.size.should == 5
    cvlist[0].should == cvlist[1]
    cvlist.each do |param|
      param.accession.should_not be_nil
      param.name.should_not be_nil
      param.cv_ref.should_not be_nil
    end
  end
end
59
+ end
60
+ end
@@ -0,0 +1,351 @@
1
+ require 'spec_helper.rb'
2
+
3
+ require 'ms/digester'
4
+ require 'pp'
5
+
6
+ describe 'a digester' do
7
+ before do
8
+ @digester = MS::Digester.new('arg', 'R')
9
+ end
10
+
11
# Pretty-prints +input+ on a single line (debugging aid for the examples).
#
# input - any object to render.
# str   - output buffer the rendering is appended to; a fresh empty string
#         by default.
#
# Returns whatever PP.singleline_pp returns (the buffer it wrote to).
def spp(input, str = '')
  PP.singleline_pp(input, str)
end
14
+
15
# Builds a synthetic sequence of n * 1000 residues for the speed examples.
#
# n     - Integer; the result is n * 1000 characters long.
# split - every +split+-th character is an 'R'; all others are 'A'.
#
# Returns the sequence as a String.
def nk_string(n, split)
  run = 0
  residues = (n * 1000).times.map do
    run += 1
    next 'A' if run < split

    # reached the split point: emit an 'R' and start a new run
    run = 0
    'R'
  end
  residues.join('')
end
31
+
32
+ it 'finds cleavage site indices' do
33
+ {
34
+ "" => [0,0],
35
+ "A" => [0,1],
36
+ "R" => [0,1],
37
+ "AAA" => [0,3],
38
+ "RAA" => [0,1,3],
39
+ "ARA" => [0,2,3],
40
+ "AAR" => [0,3],
41
+ "RRA" => [0,1,2,3],
42
+ "RAR" => [0,1,3],
43
+ "RRR" => [0,1,2,3],
44
+
45
+ "R\nR\nR" => [0,2,4,5],
46
+ "R\n\n\nR\nR\n\n" => [0,4,6,9]
47
+ }.each do |sequence, expected|
48
+ @digester.cleavage_sites(sequence).should == expected
49
+ end
50
+ end
51
+
52
it 'finds cleavage sites with exception' do
  # Digester built with a third argument ('P'); judging by the expectations
  # below, no site is reported after an 'R' that is directly followed by 'P'.
  @digester = MS::Digester.new('argp', 'R', 'P')
  {
    "" => [0,0],
    "A" => [0,1],
    "R" => [0,1],
    "AAA" => [0,3],
    "RAA" => [0,1,3],
    "ARA" => [0,2,3],
    "AAR" => [0,3],
    "RRA" => [0,1,2,3],
    "RAR" => [0,1,3],
    "RRR" => [0,1,2,3],

    # A duplicate "PR" => [0,1,2] entry used to precede this one; hash
    # literals keep the last value for a repeated key, so it was never
    # checked and only produced a duplicate-key warning.
    "PR" => [0,2],
    "PRR" => [0,2,3],
    "RPR" => [0,3],
    "RRP" => [0,1,3],
    "APRA" => [0,3,4],
    "ARPA" => [0,4],
    "ARPARA" => [0,5,6],
    "R\nPR\nR" => [0,5,6],
    "RP\nR\nR" => [0,5,6],
    "RP\nR\nR\n" => [0,5,7]
  }.each do |sequence, expected|
    @digester.cleavage_sites(sequence).should == expected
  end
end
81
+
82
+
83
+
84
+ it 'finds cleavage sites with offset and limit' do
85
+ {
86
+ "RxxR" => [2,4],
87
+ "RxAxR" => [2,4],
88
+ "RxAAAxR" => [2,4],
89
+ "RxRRRxR" => [2,3,4]
90
+ }.each do |sequence, expected|
91
+ @digester.cleavage_sites(sequence, 2, 2).should == expected
92
+ end
93
+ end
94
+
95
+ it 'finds cleavage sites fast' do
96
+ str = nk_string(10, 1000)
97
+ @digester.cleavage_sites(str).length.should == 11
98
+ benchmark(20) do |x|
99
+ x.report("10kx - fragments") do
100
+ 10000.times { @digester.cleavage_sites(str) }
101
+ end
102
+ end
103
+ end
104
+
105
+ it 'digests proteins' do
106
+ {
107
+ "" => [''],
108
+ "A" => ["A"],
109
+ "R" => ["R"],
110
+ "AAA" => ["AAA"],
111
+ "RAA" => ["R", "AA"],
112
+ "ARA" => ["AR", "A"],
113
+ "AAR" => ["AAR"],
114
+ "RRA" => ["R", "R", "A"],
115
+ "RAR" => ["R", "AR"],
116
+ "RRR" => ["R", "R", "R"]
117
+ }.each do |sequence, expected|
118
+ # spp(sequence)
119
+ @digester.digest(sequence).should == expected
120
+ #@digester.digest(sequence) {|frag, s, e| frag}.should == expected
121
+ end
122
+ end
123
+
124
+ it 'digests with missed cleavages' do
125
+ {
126
+ "" => [''],
127
+ "A" => ["A"],
128
+ "R" => ["R"],
129
+ "AAA" => ["AAA"],
130
+ "RAA" => ["R", "RAA", "AA"],
131
+ "ARA" => ["AR", "ARA", "A"],
132
+ "AAR" => ["AAR"],
133
+ "RRA" => ["R", "RR", "R", "RA", "A"],
134
+ "RAR" => ["R", "RAR", "AR"],
135
+ "RRR" => ["R", "RR", "R", "RR", "R"]
136
+ }.each do |sequence, expected|
137
+ @digester.digest(sequence, 1).should == expected
138
+ #@digester.digest(sequence, 1) {|frag, s, e| frag}.should == expected
139
+ end
140
+ end
141
+
142
+ it 'digests with two missed cleavages' do
143
+ {
144
+ "" => [''],
145
+ "A" => ["A"],
146
+ "R" => ["R"],
147
+ "AAA" => ["AAA"],
148
+ "RAA" => ["R", "RAA", "AA"],
149
+ "ARA" => ["AR", "ARA", "A"],
150
+ "AAR" => ["AAR"],
151
+ "RRA" => ["R", "RR", "RRA", "R", "RA", "A"],
152
+ "RAR" => ["R", "RAR", "AR"],
153
+ "RRR" => ["R", "RR", "RRR", "R", "RR", "R"]
154
+ }.each do |sequence, expected|
155
+ @digester.digest(sequence, 2).should == expected
156
+ #@digester.digest(sequence, 2) {|frag, s, e| frag}.should == expected
157
+ end
158
+ end
159
+
160
+ it 'digests fast' do
161
+ str = nk_string(10, 1000)
162
+ @digester.digest(str).length.should == 10
163
+ benchmark(20) do |x|
164
+ x.report("10kx - fragments") do
165
+ 10000.times { @digester.digest(str) }
166
+ end
167
+ end
168
+ end
169
+
170
+ it 'finds sites to be digested' do
171
+ {
172
+ "" => [[0,0]],
173
+ "A" => [[0,1]],
174
+ "R" => [[0,1]],
175
+ "AAA" => [[0,3]],
176
+ "RAA" => [[0,1],[1,3]],
177
+ "ARA" => [[0,2],[2,3]],
178
+ "AAR" => [[0,3]],
179
+ "RRA" => [[0,1],[1,2],[2,3]],
180
+ "RAR" => [[0,1],[1,3]],
181
+ "RRR" => [[0,1],[1,2],[2,3]]
182
+ }.each do |sequence, expected|
183
+ @digester.site_digest(sequence).should == expected
184
+ end
185
+ end
186
+
187
+ it 'finds sites to be digested with missed cleavages' do
188
+ {
189
+ "" => [[0,0]],
190
+ "A" => [[0,1]],
191
+ "R" => [[0,1]],
192
+ "AAA" => [[0,3]],
193
+ "RAA" => [[0,1],[0,3],[1,3]],
194
+ "ARA" => [[0,2],[0,3],[2,3]],
195
+ "AAR" => [[0,3]],
196
+ "RRA" => [[0,1],[0,2],[1,2],[1,3],[2,3]],
197
+ "RAR" => [[0,1],[0,3],[1,3]],
198
+ "RRR" => [[0,1],[0,2],[1,2],[1,3],[2,3]]
199
+ }.each do |sequence, expected|
200
+ @digester.site_digest(sequence, 1).should == expected
201
+ end
202
+ end
203
+
204
+ it 'finds sites to be digested with two missed cleavages' do
205
+ {
206
+ "" => [[0,0]],
207
+ "A" => [[0,1]],
208
+ "R" => [[0,1]],
209
+ "AAA" => [[0,3]],
210
+ "RAA" => [[0,1],[0,3],[1,3]],
211
+ "ARA" => [[0,2],[0,3],[2,3]],
212
+ "AAR" => [[0,3]],
213
+ "RRA" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]],
214
+ "RAR" => [[0,1],[0,3],[1,3]],
215
+ "RRR" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
216
+ }.each do |sequence, expected|
217
+ @digester.site_digest(sequence, 2).should == expected
218
+ end
219
+ end
220
+
221
+ it 'does site digestion fast' do
222
+ str = nk_string(10, 1000)
223
+ @digester.site_digest(str).length.should == 10
224
+ benchmark(20) do |x|
225
+ x.report("10kx - fragments") do
226
+ 10000.times { @digester.site_digest(str) }
227
+ end
228
+ end
229
+ end
230
+ end
231
+
232
+
233
+ describe 'performs as documented in readme' do
234
+ it 'runs cleavage sites documentation' do
235
+ d = MS::Digester.new('Trypsin', 'KR', 'P')
236
+ seq = "AARGGR"
237
+ sites = d.cleavage_sites(seq)
238
+ sites.should == [0, 3, 6]
239
+
240
+ seq[sites[0], sites[0+1] - sites[0]].should == "AAR"
241
+ seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
242
+
243
+ seq = "AAR \n GGR"
244
+ sites = d.cleavage_sites(seq)
245
+ sites.should == [0, 8, 11]
246
+
247
+ seq[sites[0], sites[0+1] - sites[0]].should == "AAR \n "
248
+ seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
249
+ end
250
+ end
251
+
252
+ describe 'basic trypsin digestion' do
253
+ it 'performs digestion and can specify sites of digestion' do
254
+ trypsin = MS::Digester['Trypsin']
255
+
256
+ expected = [
257
+ 'MIVIGR',
258
+ 'SIVHPYITNEYEPFAAEK',
259
+ 'QQILSIMAG']
260
+ trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG').should == expected
261
+
262
+ expected = [
263
+ 'MIVIGR',
264
+ 'MIVIGRSIVHPYITNEYEPFAAEK',
265
+ 'SIVHPYITNEYEPFAAEK',
266
+ 'SIVHPYITNEYEPFAAEKQQILSIMAG',
267
+ 'QQILSIMAG']
268
+ trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
269
+
270
+ expected = [
271
+ [0,6],
272
+ [0,24],
273
+ [6,24],
274
+ [6,33],
275
+ [24,33]]
276
+ trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
277
+ end
278
+
279
+ it 'completely ignores whitespace inside protein sequences' do
280
+ expected = [
281
+ "\tMIVIGR",
282
+ "SIVHP\nYITNEYEPFAAE K",
283
+ "QQILSI\rMAG"]
284
+ MS::Digester['Trypsin'].digest("\tMIVIGRSIVHP\nYITNEYEPFAAE KQQILSI\rMAG").should == expected
285
+ end
286
+
287
+ it 'does a trypsin digest' do
288
+ trypsin = MS::Digester[:trypsin]
289
+ {
290
+ "" => [''],
291
+ "A" => ["A"],
292
+ "R" => ["R"],
293
+ "AAA" => ["AAA"],
294
+ "RAA" => ["R", "AA"],
295
+ "ARA" => ["AR", "A"],
296
+ "AAR" => ["AAR"],
297
+ "RRA" => ["R", "R", "A"],
298
+ "RAR" => ["R", "AR"],
299
+ "RRR" => ["R", "R", "R"],
300
+ "RKR" => ["R", "K", "R"],
301
+
302
+ "ARP" => ["ARP"],
303
+ "PRA" => ["PR","A"],
304
+ "ARPARAA" => ["ARPAR", "AA"],
305
+ "RPRRR" => ["RPR", "R", "R"]
306
+ }.each do |sequence, expected|
307
+ trypsin.digest(sequence).should == expected
308
+ end
309
+ end
310
+
311
+
312
+
313
+ end
314
+
315
+ describe 'digestion with other enzymes' do
316
+
317
+ # This is how to access the already created enzyme:
318
+ # MS::Digester['Arg-C'] (or :arg_c, 'ARG-C', :ARG_C)
319
+ {
320
+ ['Arg-C', :arg_c] => {
321
+ "AARC" => ["AAR", "C"],
322
+ "AARP" => ["AARP"]
323
+ },
324
+ ['Asp-N', :asp_n] => {
325
+ "AABDS" => ["AA", "B", "DS"],
326
+ "ADZBS" => ["A", "DZ", "BS"],
327
+ "B" => %w(B),
328
+ "A" => %w(A),
329
+ "ABD" => %w(A B D),
330
+ },
331
+ ['Asp-N_ambic', :asp_n_ambic] => {
332
+ "AAEDS" => ["AA", "E", "DS"],
333
+ "ADZES" => ["A", "DZ", "ES"],
334
+ "AED" => %w(A E D),
335
+ "GDE" => %w(G D E),
336
+ "AAECCDGG" => %w(AA ECC DGG),
337
+ }
338
+ }.each do |enzyme_names, test_hash|
339
+ it "digests with '#{enzyme_names.first}'" do
340
+ digester = MS::Digester[enzyme_names.first]
341
+ digester.should == MS::Digester[enzyme_names.last]
342
+ digester.name.should == enzyme_names.first
343
+ test_hash.each do |sequence, expected|
344
+ digester.digest(sequence).should == expected
345
+ end
346
+ end
347
+ end
348
+ end
349
+
350
+
351
+
@@ -0,0 +1,100 @@
1
+ require 'spec_helper'
2
+
3
+ require 'ms/fasta'
4
+
5
+ describe 'basic fasta operations' do
6
+ before do
7
+ @headers = [">gi|5524211 [hello]", ">another B", ">again C"]
8
+ @entries = ["LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV\nGLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX\nIENY", "ABCDEF\nGHIJK", "ABCD"]
9
+ @sequences = @entries.map {|v| v.gsub("\n", '') }
10
+ @data = {}
11
+ @data['newlines'] = @headers.zip(@entries).map do |header, data|
12
+ header + "\n" + data
13
+ end.join("\n")
14
+ @data['carriage_returns_and_newlines'] = @data['newlines'].gsub("\n", "\r\n")
15
+ file_key_to_filename_pairs = @data.map do |k,v|
16
+ file_key = k + '_file'
17
+ filename = k + '.tmp'
18
+ File.open(filename, 'w') {|out| out.print v }
19
+ [file_key, filename]
20
+ end
21
+ file_key_to_filename_pairs.each {|k,v| @data[k] = v }
22
+ end
23
+
24
+ after do
25
+ @data.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
+ index = filename.sub('.tmp', '.index')
27
+ [filename, index].each do |fn|
28
+ File.unlink(fn) if File.exist? fn
29
+ end
30
+ end
31
+ end
32
+
33
# Asserts that +fasta+ yields exactly the entries described by @headers and
# @sequences, in that order.
#
# fasta - an Enumerable of entries responding to #header and #sequence
#         (the previous version already relied on it being enumerable).
#
# Each entry's header is compared against the expected header without its
# leading '>' and its sequence against the newline-free expected sequence.
# Returns nil; failures surface through the `should` expectations.
def fasta_correct?(fasta)
  entries = fasta.to_a
  # a pairwise walk alone would not notice surplus entries, so check the count
  entries.size.should == @headers.size
  @headers.each_with_index do |header, i|
    entry = entries[i]
    entry.header.should_not == nil
    entry.sequence.should_not == nil
    entry.header.should == header[1..-1]
    entry.sequence.should == @sequences[i]
  end
  nil
end
43
+
44
+ xit 'can deliver length and description hashes' do
45
+ # need to test
46
+ end
47
+
48
+ it 'can read a file' do
49
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
50
+ MS::Fasta.open(@data[file]) do |fasta|
51
+ fasta_correct? fasta
52
+ end
53
+ end
54
+ end
55
+
56
+ it 'can read an IO object' do
57
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
58
+ File.open(@data[file]) do |io|
59
+ fasta = MS::Fasta.new(io)
60
+ fasta_correct? fasta
61
+ end
62
+ end
63
+ end
64
+
65
+ it 'can read a string' do
66
+ %w(newlines carriage_returns_and_newlines).each do |key|
67
+ fasta = MS::Fasta.new @data[key]
68
+ fasta_correct? fasta
69
+ end
70
+ end
71
+
72
+ it 'iterates entries with foreach' do
73
+ %w(newlines_file carriage_returns_and_newlines_file).each do |file|
74
+ MS::Fasta.foreach(@data[file]) do |entry|
75
+ entry.should be_an_instance_of Bio::FastaFormat
76
+ end
77
+ end
78
+ end
79
+
80
# Mirrors the usage examples from the library documentation.
it 'runs the documentation' do
  fasta_file = @data['newlines_file']
  ids = MS::Fasta.open(fasta_file) do |fasta|
    fasta.map(&:entry_id)
  end
  # previously the result of is_a? was computed but never asserted
  ids.is_a?(Array).should == true
  ids.should == %w(gi|5524211 another again)

  # this code is already tested above
  # File.open(fasta_file) do |io|
  #   fasta = MS::Fasta.new(io)
  # end

  # taking a string
  string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
  fasta = MS::Fasta.new(string)
  (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
  simple.first.header.include?("simple").should == true
  not_simple.first.header.include?("simple").should == false
end
100
+ end
@@ -0,0 +1,108 @@
1
+ require 'spec_helper'
2
+
3
+ require 'yaml'
4
+ path = 'ms/ident/peptide/db'
5
+ require path
6
+
7
module Kernel

  # Runs the given block with $stdout pointed at an in-memory buffer.
  #
  # Yields with no arguments.
  #
  # Returns everything the block wrote to $stdout as a String.
  # The stream that was active on entry is put back afterwards, even when
  # the block raises (the specs use this around code that calls exit), so
  # an outer redirection is not clobbered.
  def capture_stdout
    previous = $stdout
    buffer = StringIO.new
    $stdout = buffer
    yield
    buffer.string
  ensure
    $stdout = previous
  end

end
20
+
21
+
22
+ describe 'a uniprot fasta file' do
23
+
24
+ before do
25
+ @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
26
+ end
27
+
28
+ describe 'amino acid expansion' do
29
+
30
+ it 'can expand out wildcard amino acid combinations' do
31
+ array = MS::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
32
+ array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
33
+ end
34
+
35
+ it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
36
+ # this is from real data
37
+ worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
38
+ MS::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
39
+ end
40
+
41
+ it 'returns the peptide in the array if no expansion' do
42
+ array = MS::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
43
+ array.should == ['ZZZZZ']
44
+ end
45
+
46
+ end
47
+
48
+ describe 'creating a peptide centric database' do
49
+ before do
50
+
51
+ #@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
52
+ @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
53
+ end
54
+
55
it 'converts a fasta file into peptide centric db' do
  begin
    output_files = MS::Ident::Peptide::Db.cmdline([@fasta_file])
    output_files.first.should == File.expand_path(@output_file)
    File.exist?(@output_file).should == true
    # each YAML value is a tab-separated string; split it into an array
    hash = {}
    YAML.load_file(@output_file).each do |k,v|
      hash[k] = v.split("\t")
    end
    sorted = hash.sort
    # these are merely frozen, not perfectly defined
    sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
    sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
    sorted.size.should == 728
  ensure
    # remove the generated file even when an expectation above fails
    File.unlink(@output_file) if File.exist?(@output_file)
  end
end
70
+
71
it 'lists approved enzymes and exits' do
  exited = false
  output = capture_stdout do
    begin
      MS::Ident::Peptide::Db.cmdline(['--list-enzymes'])
    rescue SystemExit
      exited = true
    end
  end
  # asserted outside the rescue so the example fails if cmdline never exits
  exited.should == true
  lines = output.split("\n")
  lines.include?("trypsin").should == true
  lines.include?("chymotrypsin").should == true
end
83
+ end
84
+
85
+ describe 'reading a peptide centric database' do
86
+ before do
87
+ outfiles = MS::Ident::Peptide::Db.cmdline([@fasta_file])
88
+ @outfile = outfiles.first
89
+ end
90
+
91
+ it 'creates a hash that can retrieve peptides as an array' do
92
+ hash = MS::Ident::Peptide::Db.new(@outfile)
93
+ hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
94
+ hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
95
+ end
96
+
97
+ it 'reads the file on disk with random access or is enumerable' do
98
+ MS::Ident::Peptide::Db::IO.open(@outfile) do |io|
99
+ io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
100
+ io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
101
+ io.each_with_index do |key_prots, i|
102
+ key_prots.first.should be_an_instance_of String
103
+ key_prots.last.should be_a_kind_of Array
104
+ end
105
+ end
106
+ end
107
+ end
108
+ end