mspire-sequest 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +30 -0
- data/.gitmodules +9 -0
- data/History +79 -0
- data/LICENSE +22 -0
- data/README.rdoc +85 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/srf_to_pepxml.rb +7 -0
- data/bin/srf_to_search.rb +7 -0
- data/bin/srf_to_sqt.rb +8 -0
- data/lib/mspire/sequest/params.rb +331 -0
- data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
- data/lib/mspire/sequest/pepxml/params.rb +32 -0
- data/lib/mspire/sequest/sqt.rb +393 -0
- data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
- data/lib/mspire/sequest/srf/pepxml.rb +333 -0
- data/lib/mspire/sequest/srf/search.rb +158 -0
- data/lib/mspire/sequest/srf/sqt.rb +218 -0
- data/lib/mspire/sequest/srf.rb +715 -0
- data/lib/mspire/sequest.rb +6 -0
- data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
- data/spec/mspire/sequest/params_spec.rb +135 -0
- data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
- data/spec/mspire/sequest/pepxml_spec.rb +311 -0
- data/spec/mspire/sequest/sqt_spec.rb +51 -0
- data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
- data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
- data/spec/mspire/sequest/srf/search_spec.rb +131 -0
- data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
- data/spec/mspire/sequest/srf_spec.rb +113 -0
- data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/spec/testfiles/bioworks31.params +77 -0
- data/spec/testfiles/bioworks32.params +62 -0
- data/spec/testfiles/bioworks33.params +63 -0
- data/spec/testfiles/corrupted_900.srf +0 -0
- data/spec/testfiles/small.sqt +87 -0
- data/spec/testfiles/small2.sqt +176 -0
- metadata +185 -0
@@ -0,0 +1,715 @@
|
|
1
|
+
|
2
|
+
# standard lib
|
3
|
+
require 'set'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'scanf'
|
6
|
+
|
7
|
+
# in library
|
8
|
+
require 'mspire/ident/search'
|
9
|
+
require 'mspire/ident/peptide'
|
10
|
+
require 'mspire/ident/protein'
|
11
|
+
require 'mspire/sequest/params'
|
12
|
+
|
13
|
+
|
14
|
+
module Mspire ; end
|
15
|
+
module Mspire::Sequest ; end
|
16
|
+
|
17
|
+
class Mspire::Sequest::Srf < Mspire::Ident::Search
|
18
|
+
# Raised by #from_file when peptide hits are requested but no sequest params
# information is available for the srf file.
class NoSequestParamsError < ArgumentError; end
|
20
|
+
|
21
|
+
# peptide_hits is inherited from Search

# version is a String: 3.5, 3.3 or 3.2
attr_accessor :version, :header, :dta_files, :out_files, :params

# a parallel array to dta_files and out_files where each entry is:
#   [first_scan, last_scan, charge]
attr_accessor :index

# the base name of the file with no extension
# (base_name_noext and base_name_noext= are aliases)
attr_accessor :base_name
alias_method :base_name_noext, :base_name
alias_method :base_name_noext=, :base_name=

# the directory the srf file was residing in when the filename was passed
# in.  May not be available.
attr_accessor :resident_dir

# a boolean to indicate if the results have been filtered by the
# sequest.params precursor mass tolerance
attr_accessor :filtered_by_precursor_mass_tolerance
|
47
|
+
|
48
|
+
# Returns the protein class defined for srf hits
# (Mspire::Sequest::Srf::Out::Protein).
def protein_class
  Mspire::Sequest::Srf::Out::Protein
end
|
51
|
+
|
52
|
+
# Locates the embedded sequest params section of an srf file and parses it.
#
# Only the second half of the file is read and searched for the last
# '[SEQUEST]' marker (the params section is expected to live there).
#
# Returns a two element Array, [params, finish_pos]:
#   params     - whatever Mspire::Sequest::Params#parse_io returned
#   finish_pos - the io position once parsing had finished
# Both elements are nil when no marker was found.
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    marker_offset = io.read.rindex('[SEQUEST]')
    next [nil, nil] if marker_offset.nil?

    # marker_offset is relative to the second half; make it absolute
    io.seek(marker_offset + midpoint)
    parsed = Mspire::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
|
74
|
+
|
75
|
+
# Byte offset at which the dta records begin, looked up by @version.
# Returns nil for a version other than '3.2', '3.3' or '3.5'.
def dta_start_byte
  { '3.2' => 3260, '3.3' => 3644, '3.5' => 3644 }[@version]
end
|
82
|
+
|
83
|
+
|
84
|
+
# Creates an empty Srf object and, when a filename is given, reads it with
# #from_file.
#
# opts:
#     :filter_by_precursor_mass_tolerance => true | false (default true)
#       this will filter by the sequest params prec tolerance as is
#       typically done by the Bioworks software.
#
#     :read_pephits => true | false (default true)
#       will attempt to read peptide hit information (equivalent to .out
#       files), otherwise, just reads the dta information.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
|
100
|
+
|
101
|
+
|
102
|
+
# Removes every hit whose precursor mass error exceeds the tolerance given
# in the sequest params, then:
#
# 1. updates the out_file's list of hits based on passing peptide_hits (but
#    not the original hit id; rank is implicit in array ordering)
# 2. recalculates deltacn values completely if number of hits changed (does
#    not touch deltacn orig)
#
# params.peptide_mass_units selects the comparison:
#   '0' - |deltamass| against the tolerance
#   '1' - |deltamass| against the tolerance / 1000
#   anything else (e.g. '2') - |ppm| against the tolerance
#
# Sets filtered_by_precursor_mass_tolerance to true and returns self.
#
# This can spoil proper protein -> peptide linkages.  Mspire::Id::Search.merge!
# should be run after this method to ensure correct protein -> peptide
# linkages.
def filter_by_precursor_mass_tolerance!
  pmt = params.peptide_mass_tolerance.to_f

  exceeds_tolerance =
    case params.peptide_mass_units
    when '0'
      lambda { |pep| pep.deltamass.abs > pmt }
    when '1'
      limit = pmt / 1000 # computed once rather than per hit
      lambda { |pep| pep.deltamass.abs > limit }
    else
      lambda { |pep| pep.ppm.abs > pmt }
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    hits = out_file.hits
    count_before = hits.size
    hits.reject! { |pep| exceeds_tolerance.call(pep) }
    next if hits.size == count_before

    out_file.hits = hits
    Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(hits)
    out_file.num_hits = hits.size
  end
  self
end
|
148
|
+
|
149
|
+
# Reads num_files records in which each dta entry is immediately followed
# by its out entry.  fh is first positioned at dta_start_byte.
#
# Returns [dta_files, out_files], two parallel Arrays of length num_files.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dtas = Array.new(num_files)
  outs = Array.new(num_files)
  fh.pos = dta_start_byte

  dtas.each_index do |idx|
    dtas[idx] = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35)
    outs[idx] = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
  [dtas, outs]
end
|
163
|
+
|
164
|
+
# Reads the srf file +filename+ into this object.
#
# opts are the same as for 'new':
#   :filter_by_precursor_mass_tolerance => true | false (default true)
#   :read_pephits                       => true | false (default true)
#
# Returns self.  NOTE: returns nil (having set only @resident_dir and
# @params) when no '[SEQUEST]' params section is found in the file.
#
# Raises NotImplementedError for a combined file when :read_pephits is
# false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  @params, after_params_io_pos = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  # When print_duplicate_references is 0 the srf file lists only 1 protein
  # per peptide (so downstream output will likewise contain a single protein
  # per hit); extra references are only read when it is non-zero.  To
  # capture all duplicate references the sequest parameter
  # 'print_duplicate_references' should be set to 100 or greater.
  dup_refs_gt_0 = @params.print_duplicate_references.to_i != 0

  File.open(filename, 'rb') do |fh|
    @header = Mspire::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    # only version 3.5 uses the alternate record layouts
    unpack_35 = (@version == '3.5')

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      @dta_files, @out_files = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first

      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        # (kept as a safeguard; the early return above already covers nil)
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)

        if fh.eof?
          # appears to be an abortive run (no params in srf file); continue
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12) ## gap between last params entry and index

    @index = read_scan_index(fh, @header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |(first_scan, last_scan, charge), i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = first_scan
      outfile.last_scan = last_scan
      outfile.charge = charge

      pep_hits = outfile.hits
      @peptide_hits.concat(pep_hits)
      pep_hits.each do |pep_hit|
        pep_hit.base_name = @base_name
        pep_hit.first_scan = first_scan
        pep_hit.last_scan = last_scan
        pep_hit.charge = charge
        # real - measured (deltamass)
        pep_hit.deltamass = pep_hit.mh - mass_measured
        pep_hit.ppm = 1.0e6 * pep_hit.deltamass.abs / mass_measured
        pep_hit.srf = self ## link with the srf object
      end
    end

    # honor the documented option: it used to be merged into opts but never
    # consulted, so hits were always filtered
    filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]
  end

  self
end
|
278
|
+
|
279
|
+
# Reads +num+ index records from fh, starting at its current position.
#
# Each record occupies 24 bytes; only its first three 32-bit unsigned
# integers are used.  They are decoded as little-endian ('V'), matching
# the explicit little-endian directives used for the dta records, rather
# than in the byte order of the host.
#
# Returns an Array of num entries, each [first_scan, last_scan, charge].
# An entry is [nil, nil, nil] if the file ends before its record.
def read_scan_index(fh, num)
  record_bytes = 24
  buffer = String.new # reused as the read buffer for every record
  Array.new(num) do
    fh.read(record_bytes, buffer)
    buffer.unpack('V3')
  end
end
|
299
|
+
|
300
|
+
# Positions fh at dta_start_byte and reads num_files consecutive dta
# records.
#
# Returns an Array of num_files dta objects.  Afterwards fh sits directly
# behind the last dta record.
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  # iterate over the num_files argument (the loop used to ignore it and
  # consult header.num_dta_files instead)
  Array.new(num_files) { Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
|
311
|
+
|
312
|
+
# Reads number_files consecutive out records from fh.
#
# filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
# will put the fh there.
#
# Returns an Array of number_files out objects.
def read_out_files(fh, number_files, unpack_35, dup_refs_gt_0)
  # iterate over the number_files argument (the loop used to ignore it and
  # consult header.num_dta_files instead)
  Array.new(number_files) do
    Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
|
321
|
+
|
322
|
+
end
|
323
|
+
|
324
|
+
class Mspire::Sequest::Srf::Header
|
325
|
+
|
326
|
+
# Byte offset of each header field.  Each offset equals the previous
# field's offset plus its Byte_length entry, so the offsets after
# :modifications only hold for the default (non 3.2) lengths.
Start_byte = {
  :enzyme => 438,
  :ion_series => 694,
  :model => 950,
  :modifications => 982,
  :raw_filename => 1822,
  :db_filename => 2082,
  :dta_log_filename => 2602,
  :params_filename => 3122,
  :sequest_log_filename => 3382,
}.freeze

# Number of bytes occupied by each header field.
Byte_length = {
  :enzyme => 256,
  :ion_series => 256,
  :model => 32,
  :modifications => 840,
  :raw_filename => 260,
  :db_filename => 520,
  :dta_log_filename => 520,
  :params_filename => 260,
  # is this really 262?? or should be 260??  (with 262 the header ends at
  # byte 3382 + 262 = 3644)
  :sequest_log_filename => 262,
}.freeze

# Lengths that differ for version 3.2 (merged over Byte_length).
Byte_length_v32 = {
  :modifications => 456,
}.freeze
|
351
|
+
|
352
|
+
attr_accessor :version

# a Mspire::Sequest::Srf::DtaGen object
attr_accessor :dta_gen

# the header fields read by #from_io
attr_accessor :enzyme, :ion_series, :model, :modifications
attr_accessor :raw_filename, :db_filename, :dta_log_filename
attr_accessor :params_filename, :sequest_log_filename

# true if this is a combined file, false if represents a single file
# this is set by examining the DtaGen object for signs of a single file
attr_reader :combined
|
369
|
+
|
370
|
+
# Matches a single carriage return ("\r") or null byte.
NEWLINE_OR_NULL_RE = /[\r\x00]/
|
372
|
+
|
373
|
+
# The number of dta files, as reported by the DtaGen object.
def num_dta_files; @dta_gen.num_dta_files; end
|
376
|
+
|
377
|
+
# Builds a new header and fills it from fh (see #from_io).
def self.from_io(fh)
  header = new
  header.from_io(fh)
end
|
380
|
+
|
381
|
+
# Sets fh to 0 and grabs the information it wants: the version, the DtaGen
# parameters, whether the file is a combined file, and the header fields
# listed in Byte_length.
#
# Returns self.
def from_io(fh)
  # the version word is the first 4 bytes of the file; rewind so that the
  # read does not depend on where the caller left fh
  fh.pos = 0
  @version = '3.' + fh.read(4).unpack('V').first.to_s
  @dta_gen = Mspire::Sequest::Srf::DtaGen.from_io(fh)

  # if the start_mass end_mass start_scan and end_scan are all zero, its a
  # combined srf file:
  @combined = [:start_mass, :end_mass, :start_scan, :end_scan].all? do |field|
    @dta_gen.send(field) == 0
  end

  ## get the rest of the info
  byte_length = Byte_length.dup
  byte_length.merge!(Byte_length_v32) if @version == '3.2'

  # the fields are contiguous, so they are read back to back from :enzyme
  fh.pos = Start_byte[:enzyme]
  [:enzyme, :ion_series, :model, :modifications, :raw_filename,
   :db_filename, :dta_log_filename, :params_filename,
   :sequest_log_filename].each do |field|
    send("#{field}=", get_null_padded_string(fh, byte_length[field], @combined))
  end
  self
end
|
402
|
+
|
403
|
+
private
|
404
|
+
# Reads +bytes+ bytes from fh and returns the text held in that field.
#
# Returns '' for an empty declaration: nothing could be read, or the field
# starts with a null byte.  (The first byte is tested with getbyte because
# String#[] returns a String, never an Integer, on Ruby >= 1.9.)
#
# combined - when true the text is cut at the first carriage return or
#            null byte (the whole field is returned if there is none);
#            otherwise trailing whitespace/null padding is stripped.
def get_null_padded_string(fh, bytes, combined=false)
  raw = fh.read(bytes)
  return '' if raw.nil? || raw.empty? || raw.getbyte(0) == 0

  if combined
    cut = raw.index(NEWLINE_OR_NULL_RE)
    cut ? raw[0, cut] : raw
  else
    raw.rstrip!
    raw
  end
end
|
417
|
+
|
418
|
+
|
419
|
+
end
|
420
|
+
|
421
|
+
# the Dta Generation Params
|
422
|
+
class Mspire::Sequest::Srf::DtaGen
|
423
|
+
|
424
|
+
# Float (not sure if this is correct)
attr_accessor :start_time
# Floats
attr_accessor :start_mass, :end_mass
# Integers
attr_accessor :num_dta_files, :group_scan
# Integer (not sure if this is correct)
attr_accessor :min_group_count
# Integer
attr_accessor :min_ion_threshold
#attr_accessor :intensity_threshold # can't find yet
#attr_accessor :precursor_tolerance # can't find yet
# Integers
attr_accessor :start_scan, :end_scan
|
446
|
+
|
447
|
+
# Builds a new DtaGen and fills it from io (see #from_io).
def self.from_io(io)
  dta_gen = new
  dta_gen.from_io(io)
end
|
450
|
+
|
451
|
+
# Rewinds io if necessary, reads its first 148 bytes and sets the nine
# attributes of self from them.  Returns self.
def from_io(io)
  io.pos = 0 unless io.pos == 0
  values = io.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
  @start_time, @start_mass, @end_mass,
    @num_dta_files, @group_scan, @min_group_count,
    @min_ion_threshold, @start_scan, @end_scan = values
  self
end
|
458
|
+
end
|
459
|
+
|
460
|
+
# total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
# unknown is, well, unknown...
Mspire::Sequest::Srf::Dta = Struct.new(
  :mh, :dta_tic, :num_peaks, :charge, :ms_level, :unknown,
  :total_num_possible_charge_states, :peaks
)
|
464
|
+
|
465
|
+
class Mspire::Sequest::Srf::Dta
|
466
|
+
# Unpack formats for the fixed-size start of a dta record; both yield the
# first 7 struct members.  (The original format was "EeIvvvv".)
Unpack_32 = "EeIvvvv".freeze
Unpack_35 = "Ex8eVx2vvvv".freeze
|
470
|
+
|
471
|
+
|
472
|
+
# note on peaks (self[7])
|
473
|
+
# this is a byte array of floats, you can get the peaks out with
|
474
|
+
# unpack("e*")
|
475
|
+
|
476
|
+
undef_method :inspect
|
477
|
+
# Compact inspect: the peaks byte string is summarized by its size instead
# of being dumped.
def inspect
  peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
  "<Mspire::Sequest::Srf::Dta @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
end
|
482
|
+
|
483
|
+
# Reads one dta record from fh at its current position and returns it.
#
# unpack_35 - truthy to use the 3.5 layout (34 byte header, 22 byte
#             spacer), otherwise the 3.2 layout (24 byte header, 24 byte
#             spacer).
def self.from_io(fh, unpack_35)
  format, header_bytes, spacer_bytes =
    unpack_35 ? [Unpack_35, 34, 22] : [Unpack_32, 24, 24]

  # a single unpack of the header sets the first 7 attributes
  dta = new(*fh.read(header_bytes).unpack(format))

  # Scan numbers are given at the end in an index, so the spacer is
  # thrown away
  fh.read(spacer_bytes)

  # the peaks stay packed: num_peaks * 8 bytes
  dta[7] = fh.read(dta.num_peaks * 8)
  dta
end
|
501
|
+
|
502
|
+
# Returns the text of a dta file for this spectrum: a first line with mh
# (6 decimal places) and charge, then one line per peak with the first
# value to 4 decimal places and the second rounded to an integer.  Lines
# end with "\r\n".
def to_dta_file_data
  lines = ["#{round(mh, 6)} #{charge}"]
  peaks.unpack('e*').each_slice(2) do |first_value, second_value|
    # %d is equivalent to floor, so we round by adding 0.5!
    lines << "#{round(first_value, 4)} #{(second_value + 0.5).floor}"
  end
  lines.map { |line| line + "\r\n" }.join
end
|
512
|
+
|
513
|
+
# Prints the dta file text (see to_dta_file_data) to the io object.
def write_dta_file(io)
  io.print(to_dta_file_data)
end
|
517
|
+
|
518
|
+
# Formats float with exactly decimal_places digits after the decimal point
# and returns the result as a String.
def round(float, decimal_places)
  format("%.#{decimal_places}f", float)
end
|
523
|
+
|
524
|
+
end
|
525
|
+
|
526
|
+
|
527
|
+
# An out file record.  The first seven members are unpacked from the
# record itself; hits, first_scan, last_scan and charge are filled in
# afterwards.
Mspire::Sequest::Srf::Out = Struct.new(
  :num_hits, :computer, :date_time, :total_inten, :lowest_sp,
  :num_matched_peptides, :db_locus_count, :hits, :first_scan, :last_scan,
  :charge
)

# 0=num_hits, 1=computer, 2=date_time, 3=total_inten, 4=lowest_sp, 5=num_matched_peptides, 6=db_locus_count, 7=hits, 8=first_scan, 9=last_scan, 10=charge
|
531
|
+
|
532
|
+
class Mspire::Sequest::Srf::Out
|
533
|
+
# Unpack formats for the 96 byte out header; both yield
# num_hits, computer and date_time.
Unpack_32 = '@36vx2Z*@60Z*'.freeze
Unpack_35 = '@36vx4Z*@62Z*'.freeze
|
535
|
+
|
536
|
+
undef_method :inspect
|
537
|
+
# Compact inspect: the hits are summarized by their count (and left out
# entirely when not set).
def inspect
  hits_s = self.hits ? ", @hits(#)=#{hits.size}" : ''
  "<Mspire::Sequest::Srf::Out first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
end
|
546
|
+
|
547
|
+
# Reads one out record (96 byte header, its hits and, when dup_refs_gt_0,
# the extra protein references) from fh.
#
# returns an Mspire::Sequest::Srf::Out object
def self.from_io(fh, unpack_35, dup_refs_gt_0)
  ## EMPTY out file is 96 bytes
  header_bytes = fh.read(96)

  # num_hits computer date_time, followed by
  # total_inten lowest_sp num_matched_peptides db_locus_count
  fields = header_bytes.unpack(unpack_35 ? Unpack_35 : Unpack_32)
  fields.concat(header_bytes.unpack('@8eex4Ix4I'))
  out = new(*fields)

  extra_reference_count = 0
  hits = Array.new(out.num_hits) do
    hit = Mspire::Sequest::Srf::Out::Peptide.from_io(fh, unpack_35)
    extra_reference_count += hit.num_other_loci
    hit
  end

  unless hits.empty?
    if dup_refs_gt_0
      Mspire::Sequest::Srf::Out::Peptide.read_extra_references(fh, extra_reference_count, hits)
    end
    ## The xcorrs are already ordered by best to worst hit
    ## ADJUST the deltacn's to be meaningful for the top hit:
    ## (the same as bioworks and prophet)
    Mspire::Sequest::Srf::Out::Peptide.set_deltacn_from_deltacn_orig(hits)
  end

  out.hits = hits
  out[1].chomp! # computer
  out
end
|
581
|
+
|
582
|
+
end
|
583
|
+
|
584
|
+
|
585
|
+
|
586
|
+
# deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
# deltacn - modified to be that of the next best hit (by xcorr) and the last
# hit takes 1.1.  This is what is called deltacn by bioworks and pepprophet
# (at least for the first few years).  If filtering occurs, it will be
# updated.
# deltacn_orig_updated - the latest updated value of deltacn.
# Originally, this will be equal to deltacn_orig.  After filtering, this will
# be recalculated.  To know if this will be different from deltacn_orig, query
# match.srf.filtered_by_precursor_mass_tolerance.  If this is changed, then
# deltacn should also be changed to reflect it.
# mh - the theoretical mass + h
# proteins are created as SRF prot objects with a reference and linked to their
# peptide_hits (from global hash by reference)
# ppm = 10^6 * ∆m_accuracy / mass_measured  [ where ∆m_accuracy = mass_real – mass_measured ]
# This is calculated for the M+H mass!
# num_other_loci is the number of other loci that the peptide matches beyond
# the first one listed
# srf = the srf object this scan came from
Mspire::Sequest::Srf::Out::Peptide = Struct.new(
  :mh, :deltacn_orig, :sf, :sp, :xcorr, :id, :num_other_loci, :rsp,
  :ions_matched, :ions_total, :sequence, :proteins, :deltamass, :ppm,
  :aaseq, :base_name, :first_scan, :last_scan, :charge, :srf, :deltacn,
  :deltacn_orig_updated
)
# 0=mh 1=deltacn_orig 2=sf 3=sp 4=xcorr 5=id 6=num_other_loci 7=rsp 8=ions_matched 9=ions_total 10=sequence 11=proteins 12=deltamass 13=ppm 14=aaseq 15=base_name 16=first_scan 17=last_scan 18=charge 19=srf 20=deltacn 21=deltacn_orig_updated
|
606
|
+
|
607
|
+
class Mspire::Sequest::Srf::Out::Peptide
|
608
|
+
|
609
|
+
# creates the deltacn that is meaningful for the top hit (the deltacn_orig
# of the second best hit and so on); the last hit gets 1.1.
# assumes sorted
#
# Does nothing (and returns nil) for an empty array.
def self.set_deltacn_from_deltacn_orig(ar)
  return if ar.empty?

  ar.each_cons(2) { |hit, next_hit| hit.deltacn = next_hit.deltacn_orig }
  ar.last.deltacn = 1.1
end
|
616
|
+
|
617
|
+
# (assumes sorted)
# recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn
#
# With s(i) = 1.0 - xcorr(i) / xcorr(top hit):
#   deltacn_orig_updated of hit i is s(i) (0.0 for the top hit)
#   deltacn of hit i is s(i + 1)          (1.1 for the last hit)
# An empty array is left alone.
def self.update_deltacns_from_xcorr(ar)
  return if ar.empty?

  best_xcorr = ar.first[4]
  # all scores are computed before anything is modified
  scores = ar.drop(1).map { |hit| 1.0 - (hit[4] / best_xcorr) }

  ar.first[21] = 0.0
  scores.each_with_index do |score, i|
    ar[i][20] = score     # deltacn
    ar[i + 1][21] = score # deltacn_orig_updated
  end
  ar.last[20] = 1.1
end
|
633
|
+
|
634
|
+
# Reads num_extra_references extra protein references from fh and appends
# each one to the proteins of the hit it belongs to.
#
# Every reference is read as 8 bytes (the second 4 hold the 1-based
# position of the hit in pep_hits) followed by 80 bytes of text, of which
# the first 38 characters are kept.
# NOTE(review): an earlier comment here said "80 bytes total (with index
# number)", but 8 + 80 bytes are consumed per reference - confirm.
def self.read_extra_references(fh, num_extra_references, pep_hits)
  num_extra_references.times do
    hit_number = fh.read(8).unpack('x4I').first
    owner = pep_hits[hit_number - 1]

    reference = fh.read(80).unpack('A*').first
    owner[11] << Mspire::Sequest::Srf::Out::Protein.new(reference[0, 38])
  end
  # fh.read(6) if unpack_35
end
|
644
|
+
|
645
|
+
# Unpack formats for one hit record; both yield the first 12 struct
# members (mh .. proteins).
#
# translation of Unpack_35: @64=(64 bytes in to the record), E=mH,
# x8=8 unknown bytes, e=deltacn, x8=8 unknown bytes, e=sf, e=sp, e=xcorr,
# I=ID#, x18=18 unknown bytes, I=num_other_loci, v=rsp, x2=2 unknown bytes,
# v=ions_matched, v=ions_total, x8=8 unknown bytes, Z*=sequence,
# @246Z*=at byte 246 grab the string (which is proteins).
# Unpack_32 differs in skipping 14 bytes after the ID#, having no gap
# after rsp and reading the proteins at byte 240.
Unpack_35 = '@64Ex8ex8eeeIx18Ivx2vvx8Z*@246Z*'.freeze
#Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
Unpack_32 = '@64Ex8ex8eeeIx14Ivvvx8Z*@240Z*'.freeze
Unpack_four_null_bytes = 'a*'.freeze
Unpack_Zstar = 'Z*'.freeze

# number of bytes read per hit record
Read_35 = 426
Read_32 = 320

FourNullBytes_as_string = "\0\0\0\0".freeze
#NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
NewRecordStart = (0x01.chr + 0x00.chr).freeze
Sequest_record_start = "[SEQUEST]".freeze
|
661
|
+
|
662
|
+
undef_method :inspect
|
663
|
+
# Compact inspect: lists the attributes as name=value, shows only the
# number of proteins, and identifies the linked srf object (when set) by
# its base_name.
def inspect
  fields = %w(aaseq sequence mh deltacn_orig sf sp xcorr id rsp ions_matched ions_total proteins deltamass ppm base_name first_scan last_scan charge deltacn)
  parts = fields.map do |field|
    if field == 'proteins'
      "#{field}(#)=#{send(field.to_sym).size}"
    else
      "#{field}=#{send(field.to_sym).inspect}"
    end
  end
  parts.unshift("<#{self.class}")
  parts.push("srf(base_name)=#{srf.base_name.inspect}") if srf
  parts.push('>')
  parts.join(' ')
end
|
681
|
+
# Reads one hit record from fh at its current position and returns it.
#
# unpack_35 - truthy to use the 3.5 layout (Read_35 bytes unpacked with
#             Unpack_35, then 6 more bytes skipped), otherwise the 3.2
#             layout (Read_32 bytes unpacked with Unpack_32).
def self.from_io(fh, unpack_35)
  record_size, format =
    unpack_35 ? [Read_35, Unpack_35] : [Read_32, Unpack_32]

  ## read all the hit data; the unpack sets the first 12 attributes
  peptide = new(*fh.read(record_size).unpack(format))

  # deltacn_orig_updated starts out as deltacn_orig
  peptide[21] = peptide[1]

  # we are slicing the reference to 38 chars to be the same length as
  # duplicate references
  first_reference = peptide[11][0, 38]
  peptide[11] = [Mspire::Sequest::Srf::Out::Protein.new(first_reference)]

  # aaseq is derived from the sequence
  peptide[14] = Mspire::Ident::Peptide.sequence_to_aaseq(peptide[10])

  fh.read(6) if unpack_35

  peptide
end
|
704
|
+
|
705
|
+
end
|
706
|
+
|
707
|
+
# A protein referenced by an srf peptide hit; its id holds the reference
# text (also available as #reference).
class Mspire::Sequest::Srf::Out::Protein < Mspire::Ident::Protein
  alias_method :reference, :id

  # the first entry: the reference up to (not including) its first
  # whitespace
  def first_entry
    reference.split(' ', 2).first
  end
end
|
715
|
+
|