ms-sequest 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History +8 -0
- data/MIT-LICENSE +20 -0
- data/README +23 -0
- data/lib/ms/sequest.rb +6 -0
- data/lib/ms/sequest/params.rb +343 -0
- data/lib/ms/sequest/sqt.rb +363 -0
- data/lib/ms/sequest/srf.rb +707 -0
- data/lib/ms/sequest/srf/sqt.rb +169 -0
- metadata +88 -0
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
|
|
2
|
+
# standard lib
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
|
|
6
|
+
# other gems
|
|
7
|
+
require 'arrayclass'
|
|
8
|
+
|
|
9
|
+
# in library
|
|
10
|
+
require 'ms/id/peptide'
|
|
11
|
+
require 'ms/id/protein'
|
|
12
|
+
require 'ms/id/search'
|
|
13
|
+
require 'ms/sequest/params'
|
|
14
|
+
|
|
15
|
+
# for conversions
|
|
16
|
+
require 'ms/sequest/srf/mgf'
|
|
17
|
+
require 'ms/sequest/srf/sqt'
|
|
18
|
+
require 'ms/sequest/srf/dta'
|
|
19
|
+
|
|
20
|
+
module Ms ; end
|
|
21
|
+
module Ms::Sequest ; end
|
|
22
|
+
|
|
23
|
+
class Ms::Sequest::Srf
|
|
24
|
+
include Ms::Id::Search
|
|
25
|
+
|
|
26
|
+
# inherits peps and prots from Search
|
|
27
|
+
|
|
28
|
+
# a String: 3.5, 3.3 or 3.2
|
|
29
|
+
attr_accessor :version
|
|
30
|
+
|
|
31
|
+
attr_accessor :header
|
|
32
|
+
attr_accessor :dta_files
|
|
33
|
+
attr_accessor :out_files
|
|
34
|
+
attr_accessor :params
|
|
35
|
+
# a parallel array to dta_files and out_files where each entry is:
|
|
36
|
+
# [first_scan, last_scan, charge]
|
|
37
|
+
attr_accessor :index
|
|
38
|
+
attr_accessor :base_name
|
|
39
|
+
|
|
40
|
+
# a boolean to indicate if the results have been filtered by the
|
|
41
|
+
# sequest.params precursor mass tolerance
|
|
42
|
+
attr_accessor :filtered_by_precursor_mass_tolerance
|
|
43
|
+
|
|
44
|
+
# The class used for protein hits belonging to this kind of search.
#
# Returns the Ms::Sequest::Srf::Out::Prot class object.
def protein_class
  ::Ms::Sequest::Srf::Out::Prot
end
|
|
47
|
+
|
|
48
|
+
# returns a Sequest::Params object or nil if none
|
|
49
|
+
# Finds the embedded "[SEQUEST]" parameter section of an .srf file and
# parses it.
#
# Only the second half of the file is read and searched (the last
# occurrence of the marker wins).  The file is opened in binary mode so that
# the index reported by String#rindex is a byte offset; it is added to the
# halfway byte position and passed to IO#seek, which works in bytes.
#
# filename - path of the .srf file
#
# Returns whatever Ms::Sequest::Params#parse_io returns for the handle
# positioned at the marker, or nil (after emitting a warning) when the marker
# is not found.
def self.get_sequest_params(filename)
  File.open(filename, 'rb') do |handle|
    halfway = handle.stat.size / 2
    handle.seek(halfway)
    last_half = handle.read
    sequest_start_index = last_half.rindex('[SEQUEST]')
    if sequest_start_index
      # offset within the second half -> absolute byte offset in the file
      handle.seek(halfway + sequest_start_index)
      Ms::Sequest::Params.new.parse_io(handle)
    else
      warn "#{filename} has no SEQUEST information, may be a truncated/corrupt file!"
      nil
    end
  end
end
|
|
66
|
+
|
|
67
|
+
# Byte position at which the dta records begin for the current @version.
#
# Returns 3260 for '3.2', 3644 for '3.3' and '3.5', and nil for any other
# version value.
def dta_start_byte
  { '3.2' => 3260, '3.3' => 3644, '3.5' => 3644 }[@version]
end
|
|
74
|
+
|
|
75
|
+
# opts:
|
|
76
|
+
# :filter_by_precursor_mass_tolerance => true | false (default true)
|
|
77
|
+
# # this will filter by the sequest params prec tolerance as is
|
|
78
|
+
# # typically done by Bioworks.
|
|
79
|
+
#
|
|
80
|
+
# :link_protein_hits => true | false (default true)
|
|
81
|
+
# # if true, generates the @prot attribute for the :prot method
|
|
82
|
+
# # and creates one protein per reference that is linked to each
|
|
83
|
+
# # relevant peptide hit.
|
|
84
|
+
# # if false, each protein for each peptide hit is a unique object
|
|
85
|
+
# # and the :prots method returns nil. If you are merging multiple
|
|
86
|
+
# # searches then you probably want to set this to false to avoid
|
|
87
|
+
# # recalculation.
|
|
88
|
+
#
|
|
89
|
+
# Creates an empty object (no peptide hits, dta files or out files) and,
# when a filename is supplied, immediately loads it through from_file.
#
# filename - path of an .srf file, or nil to leave the object empty
# opts     - options hash handed unchanged to from_file
def initialize(filename=nil, opts={})
  @peps, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
|
|
98
|
+
|
|
99
|
+
# Formats a number with a fixed count of digits after the decimal point.
#
# float          - the number to format
# decimal_places - how many digits to keep after the decimal point
#
# Returns a String (not a Float).
def round(float, decimal_places)
  format("%.#{decimal_places}f", float)
end
|
|
102
|
+
|
|
103
|
+
# 1. updates the out_file's list of hits based on passing peptides (but not
|
|
104
|
+
# the original hit id; rank is implicit in array ordering)
|
|
105
|
+
# 2. recalculates deltacn values completely if number of hits changed (does
|
|
106
|
+
# not touch deltacn orig)
|
|
107
|
+
#
|
|
108
|
+
# This can spoil proper protein -> peptide linkages. Ms::Id::Search.merge!
|
|
109
|
+
# should be run after this method to ensure correct protein -> peptide
|
|
110
|
+
# linkages.
|
|
111
|
+
# Drops, from every out file, the hits whose precursor mass error exceeds
# params.peptide_mass_tolerance, and marks this object as filtered.
#
# params.peptide_mass_units selects the comparison:
#   '0'            -> |deltamass| against the tolerance
#   '1'            -> |deltamass| against the tolerance divided by 1000
#   anything else  -> |ppm| against the tolerance
#
# For each out file that actually lost hits, the hit list is re-assigned,
# deltacn values are recomputed via Pep.update_deltacns_from_xcorr and
# num_hits is set to the new hit count.
#
# Returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f

  cutoff, attribute =
    case params.peptide_mass_units
    when '0' then [tolerance, :deltamass]
    when '1' then [tolerance / 1000, :deltamass]
    else [tolerance, :ppm]
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out|
    surviving = out.hits
    original_count = surviving.size
    surviving.reject! { |hit| hit.send(attribute).abs > cutoff }
    next if surviving.size == original_count

    out.hits = surviving
    Ms::Sequest::Srf::Out::Pep.update_deltacns_from_xcorr(surviving)
    out.num_hits = surviving.size
  end
  self
end
|
|
149
|
+
|
|
150
|
+
# returns self
|
|
151
|
+
# opts are the same as for 'new'
|
|
152
|
+
# Reads an .srf file and fills in this object.
#
# Steps, in order:
#   1. look up the embedded sequest params (get_sequest_params) and warn when
#      print_duplicate_references is '0';
#   2. read header, dta records, out records and -- unless the file ends
#      right after the out records -- the params section and the scan index;
#   3. copy scan/charge information from the index onto each out file and
#      peptide hit and compute each hit's deltamass and ppm;
#   4. filter hits by precursor mass tolerance when params were found AND
#      opts[:filter_by_precursor_mass_tolerance] is truthy (default true);
#   5. when opts[:link_protein_hits] is truthy (default true), merge peptide
#      hits into shared protein objects via merge!.
#
# filename - path of the .srf file
# opts     - same options as for 'new'
#
# Returns self.
def from_file(filename, opts)
  opts = { :filter_by_precursor_mass_tolerance => true, :link_protein_hits => true }.merge(opts)
  params = Ms::Sequest::Srf.get_sequest_params(filename)
  if params && params.print_duplicate_references == '0'
    warn <<END
***************************************************************************
For complete protein <=> peptide linkages, .srf files must be created with
print_duplicate_references > 0. To capture all duplicate references, set the
sequest parameter 'print_duplicate_references' to 100 or greater.
***************************************************************************
END
  end

  File.open(filename, "rb") do |fh|
    @header = Ms::Sequest::Srf::Header.new.from_io(fh)
    @version = @header.version

    # only version 3.5 records use the alternative binary layout
    unpack_35 = (@version == '3.5')
    @dta_files, measured_mhs = read_dta_files(fh, @header.num_dta_files, unpack_35)
    @out_files = read_out_files(fh, @header.num_dta_files, measured_mhs, unpack_35)
    if fh.eof?
      # nothing follows the out records: no params section and no index
      @params = nil
      @index = []
    else
      @params = Ms::Sequest::Params.new.parse_io(fh)
      # This is very sensitive to where parse_io leaves the handle
      fh.read(12) ## gap between last params entry and index
      @index = read_scan_index(fh, @header.num_dta_files)
    end
  end

  ### UPDATE SOME THINGS:
  @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first
  # give each hit a base_name, first_scan, last_scan, charge
  @index.each_with_index do |ind, i|
    mass_measured = @dta_files[i][0]
    @out_files[i][0,3] = *ind
    pep_hits = @out_files[i][6]
    @peps.push(*pep_hits)
    pep_hits.each do |pep_hit|
      pep_hit[14,4] = @base_name, *ind
      pep_hit[11] = pep_hit[0] - mass_measured               # real - measured (deltamass)
      pep_hit[12] = 1.0e6 * pep_hit[11].abs / mass_measured  ## ppm
      pep_hit[18] = self                                     ## link with the srf object
    end
  end

  # honour the documented option instead of always filtering
  filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]

  if opts[:link_protein_hits]
    (@peps, @prots) = merge!([peps]) do |_prot, _peps|
      Ms::Sequest::Srf::Out::Prot.new(_prot.reference, _peps)
    end
  end

  self
end
|
|
228
|
+
|
|
229
|
+
# returns an index where each entry is [first_scan, last_scan, charge]
|
|
230
|
+
# Reads num index records of 24 bytes each from fh.
#
# Only the first three unsigned integers of every record are decoded; the
# caller treats them as [first_scan, last_scan, charge].
#
# fh  - IO positioned at the first index record
# num - number of records to read
#
# Returns an Array with one three-element Array per record.
def read_scan_index(fh, num)
  record = '0' * 24 # receive buffer, reused for every read
  Array.new(num) do
    fh.read(24, record)
    record.unpack('III')
  end
end
|
|
242
|
+
|
|
243
|
+
# returns an array of dta_files
|
|
244
|
+
# Reads num_files dta records, starting at dta_start_byte.
#
# The handle is moved to dta_start_byte first if it is not already there.
# The number of records read is taken from the num_files argument (the
# result arrays are sized by it as well).
#
# fh        - IO of the .srf file
# num_files - number of dta records to read
# unpack_35 - passed through to DTA#from_io to select the record layout
#
# Returns [dta_files, measured_mhs] where measured_mhs is a parallel array
# holding element 0 of each dta record.
def read_dta_files(fh, num_files, unpack_35)
  start = dta_start_byte
  fh.pos = start unless fh.pos == start

  dta_files = Array.new(num_files) do
    Ms::Sequest::Srf::DTA.new.from_io(fh, unpack_35)
  end
  measured_mhs = dta_files.map { |dta_file| dta_file[0] }
  [dta_files, measured_mhs]
end
|
|
259
|
+
|
|
260
|
+
# filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
|
|
261
|
+
# will put the fh there.
|
|
262
|
+
# Reads number_files out records from fh, which must already be positioned
# at the first out record (read_dta_files leaves it there).
#
# fh           - IO of the .srf file
# number_files - number of out records to read
# measured_mhs - accepted for interface compatibility; not used here
# unpack_35    - passed through to Out#from_io to select the record layout
#
# Returns an Array of the out records, in file order.
def read_out_files(fh,number_files, measured_mhs, unpack_35)
  Array.new(number_files) do
    Ms::Sequest::Srf::Out.new.from_io(fh, unpack_35)
  end
end
|
|
269
|
+
|
|
270
|
+
end
|
|
271
|
+
|
|
272
|
+
class Ms::Sequest::Srf::Header
|
|
273
|
+
|
|
274
|
+
Start_byte = {
|
|
275
|
+
:enzyme => 438,
|
|
276
|
+
:ion_series => 694,
|
|
277
|
+
:model => 950,
|
|
278
|
+
:modifications => 982,
|
|
279
|
+
:raw_filename => 1822,
|
|
280
|
+
:db_filename => 2082,
|
|
281
|
+
:dta_log_filename => 2602,
|
|
282
|
+
:params_filename => 3122,
|
|
283
|
+
:sequest_log_filename => 3382,
|
|
284
|
+
}
|
|
285
|
+
Byte_length = {
|
|
286
|
+
:enzyme => 256,
|
|
287
|
+
:ion_series => 256,
|
|
288
|
+
:model => 32,
|
|
289
|
+
:modifications => 840,
|
|
290
|
+
:raw_filename => 260,
|
|
291
|
+
:db_filename => 520,
|
|
292
|
+
:dta_log_filename => 520,
|
|
293
|
+
:params_filename => 260,
|
|
294
|
+
:sequest_log_filename => 262, ## is this really 262?? or should be 260??
|
|
295
|
+
}
|
|
296
|
+
Byte_length_v32 = {
|
|
297
|
+
:modifications => 456,
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
# a Ms::Sequest::Srf::DTAGen object
|
|
301
|
+
attr_accessor :version
|
|
302
|
+
attr_accessor :dta_gen
|
|
303
|
+
attr_accessor :enzyme
|
|
304
|
+
attr_accessor :ion_series
|
|
305
|
+
attr_accessor :model
|
|
306
|
+
attr_accessor :modifications
|
|
307
|
+
attr_accessor :raw_filename
|
|
308
|
+
attr_accessor :db_filename
|
|
309
|
+
attr_accessor :dta_log_filename
|
|
310
|
+
attr_accessor :params_filename
|
|
311
|
+
attr_accessor :sequest_log_filename
|
|
312
|
+
|
|
313
|
+
# Number of dta files, as reported by the DTAGen object held in @dta_gen.
def num_dta_files
  @dta_gen.num_dta_files
end
|
|
316
|
+
|
|
317
|
+
# sets fh to 0 and grabs the information it wants
|
|
318
|
+
# Populates the header from an .srf file handle.
#
# The first four bytes hold an unsigned integer that becomes the minor part
# of the version string ("3.<n>").  DTAGen#from_io then reads the dta
# generation block, after which the handle is placed at Start_byte[:enzyme]
# and the null-padded text fields are read back to back.  Field widths come
# from Byte_length, overridden by Byte_length_v32 for version '3.2'.
#
# fh - IO of the .srf file, positioned at its beginning
#
# Returns self.
def from_io(fh)
  version_number = fh.read(4).unpack('I').first
  @version = "3.#{version_number}"
  @dta_gen = Ms::Sequest::Srf::DTAGen.new.from_io(fh)

  ## get the rest of the info
  field_sizes = @version == '3.2' ? Byte_length.merge(Byte_length_v32) : Byte_length.dup

  fh.pos = Start_byte[:enzyme]
  fields = [:enzyme, :ion_series, :model, :modifications, :raw_filename,
            :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename]
  fields.each do |field|
    value = get_null_padded_string(fh, field_sizes[field])
    send("#{field}=", value)
  end
  self
end
|
|
333
|
+
|
|
334
|
+
private
|
|
335
|
+
# Reads a fixed-width, null-padded text field.
#
# fh    - IO positioned at the start of the field
# bytes - width of the field in bytes
#
# Returns '' when the field starts with a null byte (an empty declaration);
# otherwise the field with trailing padding removed by String#rstrip!.
def get_null_padded_string(fh,bytes)
  st = fh.read(bytes)
  # Compare the first *byte*: on Ruby >= 1.9 st[0] is a one-character String
  # and would never equal the Integer 0.
  return '' if st.getbyte(0) == 0
  st.rstrip!
  st
end
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
# the DTA Generation Params
|
|
349
|
+
class Ms::Sequest::Srf::DTAGen
|
|
350
|
+
|
|
351
|
+
## not sure if this is correct
|
|
352
|
+
# Float
|
|
353
|
+
attr_accessor :start_time
|
|
354
|
+
# Float
|
|
355
|
+
attr_accessor :start_mass
|
|
356
|
+
# Float
|
|
357
|
+
attr_accessor :end_mass
|
|
358
|
+
# Integer
|
|
359
|
+
attr_accessor :num_dta_files
|
|
360
|
+
# Integer
|
|
361
|
+
attr_accessor :group_scan
|
|
362
|
+
## not sure if this is correct
|
|
363
|
+
# Integer
|
|
364
|
+
attr_accessor :min_group_count
|
|
365
|
+
# Integer
|
|
366
|
+
attr_accessor :min_ion_threshold
|
|
367
|
+
#attr_accessor :intensity_threshold # can't find yet
|
|
368
|
+
#attr_accessor :precursor_tolerance # can't find yet
|
|
369
|
+
# Integer
|
|
370
|
+
attr_accessor :start_scan
|
|
371
|
+
# Integer
|
|
372
|
+
attr_accessor :end_scan
|
|
373
|
+
|
|
374
|
+
#
|
|
375
|
+
# Reads the dta generation parameters from the first 148 bytes of fh.
#
# The handle is rewound to byte 0 if necessary.  Three little-endian
# single-precision floats (start_time, start_mass, end_mass) and six
# unsigned integers (num_dta_files, group_scan, min_group_count,
# min_ion_threshold, start_scan, end_scan) are decoded; the bytes in between
# are skipped.
#
# Returns self.
def from_io(fh)
  fh.pos = 0 unless fh.pos == 0
  fields = fh.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
  @start_time, @start_mass, @end_mass,
    @num_dta_files, @group_scan, @min_group_count,
    @min_ion_threshold, @start_scan, @end_scan = fields
  self
end
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
# total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
|
|
384
|
+
# unknown is, well unknown...
|
|
385
|
+
|
|
386
|
+
Ms::Sequest::Srf::DTA = Arrayclass.new( %w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks) )
|
|
387
|
+
|
|
388
|
+
class Ms::Sequest::Srf::DTA
|
|
389
|
+
# original
|
|
390
|
+
# Unpack = "EeIvvvv"
|
|
391
|
+
Unpack_32 = "EeIvvvv"
|
|
392
|
+
Unpack_35 = "Ex8eVx2vvvv"
|
|
393
|
+
|
|
394
|
+
# note on peaks (self[7])
|
|
395
|
+
# this is a byte array of floats, you can get the peaks out with
|
|
396
|
+
# unpack("e*")
|
|
397
|
+
|
|
398
|
+
undef_method :inspect
|
|
399
|
+
# Compact description of the record.  The raw peak data (element 7) is
# summarised by its byte count, or shown as 'nil' when absent.
def inspect
  peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
  "<Ms::Sequest::Srf::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
end
|
|
404
|
+
|
|
405
|
+
# Reads one dta record from fh into this array.
#
# The layout depends on unpack_35:
#   truthy -> Unpack_35, 34 header bytes followed by a 22 byte spacer
#   falsy  -> Unpack_32, 24 header bytes followed by a 24 byte spacer
# The chosen values are kept in @unpack, @read_header and @read_spacer.
# The header fills elements 0..6; after the spacer, num_peaks * 8 bytes of
# raw peak data are stored, still packed, in element 7.
#
# Returns self.
def from_io(fh, unpack_35)
  @unpack, @read_header, @read_spacer =
    unpack_35 ? [Unpack_35, 34, 22] : [Unpack_32, 24, 24]

  # get the bulk of the data in single unpack
  self[0,7] = fh.read(@read_header).unpack(@unpack)

  # Scan numbers are given at the end in an index, so the spacer is discarded
  fh.read(@read_spacer)

  self[7] = fh.read(num_peaks * 8)
  self
end
|
|
428
|
+
|
|
429
|
+
# Renders this record as the text of a .dta file.
#
# The first line is "<mh> <charge>" with mh printed to six decimals.  Each
# following line is one peak: the first value of the pair printed to four
# decimals and the second rounded to the nearest integer.  Lines end with
# CRLF.  Numbers are formatted inline so that no two-argument `round`
# helper has to exist on this class.
#
# Returns the file content as a String.
def to_dta_file_data
  string = "#{format('%.6f', mh)} #{charge}\r\n"
  # peaks is a packed run of little-endian floats, two per peak
  peaks.unpack('e*').each_slice(2) do |first, second|
    # floor(x + 0.5) rounds the second value to the nearest integer
    string << "#{format('%.4f', first)} #{(second + 0.5).floor}\r\n"
  end
  string
end
|
|
439
|
+
|
|
440
|
+
# write a class dta file to the io object
|
|
441
|
+
# Prints the .dta text produced by to_dta_file_data to the given io object.
#
# io - any object responding to print
def write_dta_file(io)
  io.print(to_dta_file_data)
end
|
|
444
|
+
|
|
445
|
+
end
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
Ms::Sequest::Srf::Out = Arrayclass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
|
|
449
|
+
|
|
450
|
+
# 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
|
|
451
|
+
|
|
452
|
+
class Ms::Sequest::Srf::Out
|
|
453
|
+
Unpack_32 = '@36vx2Z*@60Z*'
|
|
454
|
+
Unpack_35 = '@36vx4Z*@62Z*'
|
|
455
|
+
|
|
456
|
+
undef_method :inspect
|
|
457
|
+
# Compact description of the out record.  The hit list (element 6) is shown
# only as a count, and is left out entirely when it is not set.
def inspect
  hits_s = self[6] ? ", @hits(#)=#{hits.size}" : ''
  "<Ms::Sequest::Srf::Out first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
end
|
|
466
|
+
|
|
467
|
+
# Reads one out record (a 96 byte header followed by its peptide hits) from
# fh into this array.
#
# The header supplies elements 3..5 (num_hits, computer, date_time; layout
# chosen by unpack_35) and elements 7..10.  num_hits peptide hits are then
# read with Pep#from_io.  When there is at least one hit, the extra protein
# references announced by the hits (sum of their num_other_loci) are read
# and attached, and deltacn is derived from deltacn_orig.  The hit list is
# stored in element 6.
#
# Returns self.
def from_io(fh, unpack_35)
  ## EMPTY out file is 96 bytes
  header_bytes = fh.read(96)
  layout = unpack_35 ? Unpack_35 : Unpack_32

  self[3,3] = header_bytes.unpack(layout)
  self[7,4] = header_bytes.unpack('@8eex4Ix4I')

  pep_class = Ms::Sequest::Srf::Out::Pep
  peptide_hits = Array.new(self[3]) { pep_class.new.from_io(fh, unpack_35) }

  unless peptide_hits.empty?
    extra_reference_count = peptide_hits.inject(0) { |sum, hit| sum + hit.num_other_loci }
    pep_class.read_extra_references(fh, extra_reference_count, peptide_hits)
    ## The xcorrs are already ordered by best to worst hit
    ## ADJUST the deltacn's to be meaningful for the top hit:
    ## (the same as bioworks and prophet)
    pep_class.set_deltacn_from_deltacn_orig(peptide_hits)
  end

  self[6] = peptide_hits
  self
end
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
|
|
501
|
+
# deltacn - modified to be that of the next best hit (by xcorr) and the last
|
|
502
|
+
# hit takes 1.1. This is what is called deltacn by bioworks and pepprophet
|
|
503
|
+
# (at least for the first few years). If filtering occurs, it will be
|
|
504
|
+
# updated.
|
|
505
|
+
# deltacn_orig_updated - the latest updated value of deltacn.
|
|
506
|
+
# Originally, this will be equal to deltacn_orig. After filtering, this will
|
|
507
|
+
# be recalculated. To know if this will be different from deltacn_orig, query
|
|
508
|
+
# match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then
|
|
509
|
+
# deltacn should also be changed to reflect it.
|
|
510
|
+
# mh - the theoretical mass + h
|
|
511
|
+
# prots are created as SRF prot objects with a reference and linked to their
|
|
512
|
+
# peptides (from global hash by reference)
|
|
513
|
+
# ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
|
|
514
|
+
# This is calculated for the M+H mass!
|
|
515
|
+
# num_other_loci is the number of other loci that the peptide matches beyond
|
|
516
|
+
# the first one listed
|
|
517
|
+
# srf = the srf object this scan came from
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
Ms::Sequest::Srf::Out::Pep = Arrayclass.new( %w(mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )
|
|
521
|
+
|
|
522
|
+
# 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated
|
|
523
|
+
|
|
524
|
+
class Ms::Sequest::Srf::Out::Pep
|
|
525
|
+
#include SpecID::Pep
|
|
526
|
+
|
|
527
|
+
# creates the deltacn that is meaningful for the top hit (the deltacn_orig
|
|
528
|
+
# or the second best hit and so on).
|
|
529
|
+
# assumes sorted
|
|
530
|
+
# Shifts deltacn_orig values up by one hit: every hit receives the
# deltacn_orig of the hit that follows it, and the last hit receives 1.1.
# Assumes ar is already ordered from best to worst hit.
#
# ar - Array of hits responding to deltacn_orig and deltacn=
#
# Does nothing (and returns nil) for an empty list, matching the guard in
# update_deltacns_from_xcorr; otherwise returns 1.1.
def self.set_deltacn_from_deltacn_orig(ar)
  return if ar.empty?
  ar.each_cons(2) { |hit, next_hit| hit.deltacn = next_hit.deltacn_orig }
  ar.last.deltacn = 1.1
end
|
|
534
|
+
|
|
535
|
+
# (assumes sorted)
|
|
536
|
+
# recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn
|
|
537
|
+
# Recomputes deltacn (element 19) and deltacn_orig_updated (element 20) from
# the xcorr values (element 3).  Assumes ar is ordered best to worst.
#
# For every hit after the first, gap = 1.0 - xcorr / top_xcorr.  That gap is
# stored as deltacn_orig_updated on the hit itself and as deltacn on the hit
# just before it.  The top hit gets deltacn_orig_updated 0.0 and the last
# hit gets deltacn 1.1.  An empty list is left untouched.
def self.update_deltacns_from_xcorr(ar)
  return if ar.empty?

  top_xcorr = ar.first[3]
  ar.first[20] = 0.0
  ar.each_cons(2) do |hit, next_hit|
    gap = 1.0 - (next_hit[3] / top_xcorr)
    hit[19] = gap       # deltacn
    next_hit[20] = gap  # deltacn_orig_updated
  end
  ar.last[19] = 1.1
end
|
|
551
|
+
|
|
552
|
+
# Reads num_extra_references additional protein references from fh and
# appends each one to the protein list (element 10) of the hit it names.
#
# Each entry is an 8 byte prefix whose second unsigned integer is the
# 1-based number of the hit, followed by 80 bytes of reference text.  The
# text is stripped of trailing padding and cut to its first 38 characters
# before being wrapped in a Prot.
#
# Returns num_extra_references.
def self.read_extra_references(fh, num_extra_references, pep_hits)
  num_extra_references.times do
    hit_number = fh.read(8).unpack('x4I').first
    reference = fh.read(80).unpack('A*').first
    pep_hits[hit_number - 1][10] << Ms::Sequest::Srf::Out::Prot.new(reference[0,38])
  end
end
|
|
562
|
+
|
|
563
|
+
# x2=???
|
|
564
|
+
#Unpack_35 = '@64Ex8ex12eeIx22vx2vvx8Z*@246Z*'
|
|
565
|
+
### NOTE:
|
|
566
|
+
# I need to verify that this is correct (I mean the 'I' after x18)
|
|
567
|
+
Unpack_35 = '@64Ex8ex12eeIx18Ivx2vvx8Z*@246Z*'
|
|
568
|
+
# translation: @64=(64 bytes in to the record), E=mH, x8=8unknown bytes, e=deltacn,
|
|
569
|
+
# x12=12unknown bytes, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, v=rsp,
|
|
570
|
+
# v=ions_matched, v=ions_total, x8=8unknown bytes, Z*=sequence, 240Z*=at
|
|
571
|
+
# byte 240 grab the string (which is proteins).
|
|
572
|
+
#Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
|
|
573
|
+
Unpack_32 = '@64Ex8ex12eeIx14Ivvvx8Z*@240Z*'
|
|
574
|
+
Unpack_four_null_bytes = 'a*'
|
|
575
|
+
Unpack_Zstar = 'Z*'
|
|
576
|
+
Read_35 = 426
|
|
577
|
+
Read_32 = 320
|
|
578
|
+
|
|
579
|
+
FourNullBytes_as_string = "\0\0\0\0"
|
|
580
|
+
#NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
|
|
581
|
+
NewRecordStart = 0x01.chr + 0x00.chr
|
|
582
|
+
Sequest_record_start = "[SEQUEST]"
|
|
583
|
+
|
|
584
|
+
undef_method :inspect
|
|
585
|
+
# Compact one-line description of the hit.
#
# Every listed field is shown as name=value.inspect, except prots, which is
# shown only as a count ("prots(#)=N").  When the hit is linked to an srf
# object, that object's base_name is appended instead of the whole object.
# (The former `is_a? Array` branch tested the field *name*, which is always
# a String, so it could never run and has been removed.)
def inspect
  fields = %w(aaseq sequence mh deltacn_orig sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge deltacn).map do |name|
    value = send(name.to_sym)
    if name == 'prots'
      "#{name}(#)=#{value.size}"
    else
      "#{name}=#{value.inspect}"
    end
  end
  fields.unshift("<#{self.class}")
  fields.push("srf(base_name)=#{srf.base_name.inspect}") if srf
  fields.push('>')
  fields.join(' ')
end
|
|
603
|
+
# extra_references_array is an array that grows with peptides as extra
|
|
604
|
+
# references are discovered.
|
|
605
|
+
# Reads one peptide hit record from fh into this array.
#
# Read_35 bytes (Unpack_35 layout) or Read_32 bytes (Unpack_32 layout) are
# consumed depending on unpack_35.  After unpacking:
#   * element 20 (deltacn_orig_updated) is initialised from element 1;
#   * the protein reference text is cut to 38 characters (the same length
#     as duplicate references) and wrapped in a one-element Prot list
#     stored in element 10;
#   * element 13 (aaseq) is derived from the sequence in element 9.
# Six further bytes are skipped for the 3.5 layout.
#
# Returns self.
def from_io(fh, unpack_35)
  layout, record_size = unpack_35 ? [Unpack_35, Read_35] : [Unpack_32, Read_32]

  ## read all the hit data in one go
  record = fh.read(record_size)
  self[0,10] = record.unpack(layout)

  # set deltacn_orig_updated
  self[20] = self[1]

  reference = self[10][0,38]
  self[10] = [Ms::Sequest::Srf::Out::Prot.new(reference)]

  self[13] = Ms::Id::Peptide.sequence_to_aaseq(self[9])

  fh.read(6) if unpack_35

  self
end
|
|
629
|
+
|
|
630
|
+
end
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
Ms::Sequest::Srf::Out::Prot = Arrayclass.new( %w(reference peps) )
|
|
634
|
+
|
|
635
|
+
class Ms::Sequest::Srf::Out::Prot
|
|
636
|
+
include Ms::Id::Protein
|
|
637
|
+
## we shouldn't have to do this because this is included in SpecID::Prot, but
|
|
638
|
+
## under some circumstances it won't work without explicitly calling it.
|
|
639
|
+
#include ProteinReferenceable
|
|
640
|
+
|
|
641
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
|
642
|
+
# Builds a protein entry.
#
# reference - the protein reference (stored in element 0)
# peps      - list of peptide hits for this protein (stored in element 1)
#
# The underlying array is first created by the superclass with
# self.class.size slots.
def initialize(reference=nil, peps=[])
  super(self.class.size)
  self[0] = reference
  self[1] = peps
end
|
|
649
|
+
$VERBOSE = tmp
|
|
650
|
+
|
|
651
|
+
# "<Ms::Sequest::Srf::Out::Prot reference=\"#{@reference}\">"
|
|
652
|
+
|
|
653
|
+
undef_method :inspect
|
|
654
|
+
# Short description: the reference plus the number of linked peptide hits
# (the hits themselves are not expanded).
def inspect
  format('<Ms::Sequest::Srf::Out::Prot @reference=%s, @peps(#)=%s>', reference, peps.size)
end
|
|
657
|
+
end
|
|
658
|
+
|
|
659
|
+
class Ms::Sequest::SrfGroup
|
|
660
|
+
include Ms::Id::SearchGroup
|
|
661
|
+
|
|
662
|
+
# inherits an array of Ms::Sequest::Srf::Out::Pep objects
|
|
663
|
+
# inherits an array of Ms::Sequest::Srf::Out::Prot objects
|
|
664
|
+
|
|
665
|
+
# see Ms::Id::Search for acceptable arguments
|
|
666
|
+
# (filename, filenames, array of objects)
|
|
667
|
+
# opts =
|
|
668
|
+
# :filter_by_precursor_mass_tolerance => true | false (default true)
|
|
669
|
+
# Builds the group through the inherited initializer.
#
# The individual searches are always created with
# :link_protein_hits => false.  Inside the block handed to the inherited
# initializer, the peptide hits of all searches are merged into shared Prot
# objects, unless the caller explicitly passed :link_protein_hits => false.
# Finally the caller's own block, if any, is invoked with the new group.
#
# arg  - see Ms::Id::Search for acceptable arguments
# opts - options hash (see the comment above this method)
def initialize(arg, opts={}, &block)
  skip_linking = (opts[:link_protein_hits] == false)
  super(arg, opts.merge(:link_protein_hits => false)) do
    next if skip_linking
    puts "MERGING GROUP!"
    all_peps = @searches.map { |search| search.peps }
    (@peps, @prots) = merge!(all_peps) do |_prot, _peps|
      Ms::Sequest::Srf::Out::Prot.new(_prot.reference, _peps)
    end
  end
  block.call(self) if block_given?
end
|
|
682
|
+
|
|
683
|
+
# The class of the individual searches held by this group.
#
# Returns the Ms::Sequest::Srf class object.
def search_class
  ::Ms::Sequest::Srf
end
|
|
686
|
+
|
|
687
|
+
# returns the filename used
|
|
688
|
+
# if the file exists, the name will be expanded to full path, otherwise just
|
|
689
|
+
# what is given
|
|
690
|
+
# Writes the group's srf filenames, one per line, to an .srg file.
#
# A filename that exists on disk is written as its expanded (absolute)
# path; one that does not exist is written exactly as it was given.
#
# srg_filename - path of the file to write (default 'bioworks.srg')
#
# Returns srg_filename.
def to_srg(srg_filename='bioworks.srg')
  File.open(srg_filename, 'w') do |out|
    @filenames.each do |srf_file|
      out.puts(File.exist?(srf_file) ? File.expand_path(srf_file) : srf_file)
    end
  end
  srg_filename
end
|
|
702
|
+
end
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
|