mspire-sequest 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +30 -0
- data/.gitmodules +9 -0
- data/History +79 -0
- data/LICENSE +22 -0
- data/README.rdoc +85 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/srf_to_pepxml.rb +7 -0
- data/bin/srf_to_search.rb +7 -0
- data/bin/srf_to_sqt.rb +8 -0
- data/lib/mspire/sequest/params.rb +331 -0
- data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
- data/lib/mspire/sequest/pepxml/params.rb +32 -0
- data/lib/mspire/sequest/sqt.rb +393 -0
- data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
- data/lib/mspire/sequest/srf/pepxml.rb +333 -0
- data/lib/mspire/sequest/srf/search.rb +158 -0
- data/lib/mspire/sequest/srf/sqt.rb +218 -0
- data/lib/mspire/sequest/srf.rb +715 -0
- data/lib/mspire/sequest.rb +6 -0
- data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
- data/spec/mspire/sequest/params_spec.rb +135 -0
- data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
- data/spec/mspire/sequest/pepxml_spec.rb +311 -0
- data/spec/mspire/sequest/sqt_spec.rb +51 -0
- data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
- data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
- data/spec/mspire/sequest/srf/search_spec.rb +131 -0
- data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
- data/spec/mspire/sequest/srf_spec.rb +113 -0
- data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/spec/testfiles/bioworks31.params +77 -0
- data/spec/testfiles/bioworks32.params +62 -0
- data/spec/testfiles/bioworks33.params +63 -0
- data/spec/testfiles/corrupted_900.srf +0 -0
- data/spec/testfiles/small.sqt +87 -0
- data/spec/testfiles/small2.sqt +176 -0
- metadata +185 -0
@@ -0,0 +1,715 @@
|
|
1
|
+
|
2
|
+
# standard lib
|
3
|
+
require 'set'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'scanf'
|
6
|
+
|
7
|
+
# in library
|
8
|
+
require 'mspire/ident/search'
|
9
|
+
require 'mspire/ident/peptide'
|
10
|
+
require 'mspire/ident/protein'
|
11
|
+
require 'mspire/sequest/params'
|
12
|
+
|
13
|
+
|
14
|
+
module Mspire ; end
|
15
|
+
module Mspire::Sequest ; end
|
16
|
+
|
17
|
+
class Mspire::Sequest::Srf < Mspire::Ident::Search
|
18
|
+
# Error for the case where sequest params information is required but the
# srf file does not carry any.
class NoSequestParamsError < ArgumentError; end
|
20
|
+
|
21
|
+
# inherits peptide_hits from Search
|
22
|
+
|
23
|
+
# a String: 3.5, 3.3 or 3.2
|
24
|
+
attr_accessor :version
|
25
|
+
|
26
|
+
attr_accessor :header
|
27
|
+
attr_accessor :dta_files
|
28
|
+
attr_accessor :out_files
|
29
|
+
attr_accessor :params
|
30
|
+
# a parallel array to dta_files and out_files where each entry is:
|
31
|
+
# [first_scan, last_scan, charge]
|
32
|
+
attr_accessor :index
|
33
|
+
|
34
|
+
# the base name of the file with no extension
|
35
|
+
attr_accessor :base_name
|
36
|
+
|
37
|
+
alias_method :base_name_noext, :base_name
|
38
|
+
alias_method :base_name_noext=, :base_name=
|
39
|
+
|
40
|
+
# the directory the srf file was residing in when the filename was passed
|
41
|
+
# in. May not be available.
|
42
|
+
attr_accessor :resident_dir
|
43
|
+
|
44
|
+
# a boolean to indicate if the results have been filtered by the
|
45
|
+
# sequest.params precursor mass tolerance
|
46
|
+
attr_accessor :filtered_by_precursor_mass_tolerance
|
47
|
+
|
48
|
+
# The class used for the proteins attached to this search's peptide hits.
def protein_class
  ::Mspire::Sequest::Srf::Out::Protein
end
|
51
|
+
|
52
|
+
# Finds and parses the "[SEQUEST]" params section embedded in an srf file.
#
# Only the second half of the file is read and searched (the last
# occurrence of the marker in that half is used).
#
# Returns a two element array [params, io_pos]:
#   params - the result of Mspire::Sequest::Params#parse_io
#   io_pos - the file position right after parsing finished
# Both elements are nil when the marker is not found.
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    marker_offset = io.read.rindex('[SEQUEST]')
    return [nil, nil] unless marker_offset # not found

    # marker_offset is relative to the half that was read
    io.seek(midpoint + marker_offset)
    parsed = Mspire::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
|
74
|
+
|
75
|
+
# Byte offset at which the dta records begin, looked up from the version
# string in @version.  Returns nil for a version that is not listed.
def dta_start_byte
  { '3.2' => 3260, '3.3' => 3644, '3.5' => 3644 }[@version]
end
|
82
|
+
|
83
|
+
|
84
|
+
# opts:
|
85
|
+
# :filter_by_precursor_mass_tolerance => true | false (default true)
|
86
|
+
# this will filter by the sequest params prec tolerance as is
|
87
|
+
# typically done by the Bioworks software.
|
88
|
+
#
|
89
|
+
# :read_pephits => true | false (default true)
|
90
|
+
# will attempt to read peptide hit information (equivalent to .out
|
91
|
+
# files), otherwise, just reads the dta information.
|
92
|
+
# Starts with empty peptide hit, dta and out collections; when a filename is
# given the file is loaded immediately through from_file with the same opts.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
|
100
|
+
|
101
|
+
|
102
|
+
# 1. updates the out_file's list of hits based on passing peptide_hits (but not
|
103
|
+
# the original hit id; rank is implicit in array ordering)
|
104
|
+
# 2. recalculates deltacn values completely if number of hits changed (does
|
105
|
+
# not touch deltacn orig)
|
106
|
+
#
|
107
|
+
# This can spoil proper protein -> peptide linkages. Mspire::Id::Search.merge!
|
108
|
+
# should be run after this method to ensure correct protein -> peptide
|
109
|
+
# linkages.
|
110
|
+
# Removes every hit whose precursor mass error exceeds
# params.peptide_mass_tolerance and repairs the bookkeeping of each out file
# that lost hits (deltacn values and num_hits).
#
# params.peptide_mass_units selects what is compared:
#   '0'           -> |deltamass| against the tolerance
#   '1'           -> |deltamass| against tolerance / 1000 (milli amu)
#   anything else -> |ppm| against the tolerance (normally '2')
#
# Sets filtered_by_precursor_mass_tolerance to true and returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f

  # Decide once which attribute and limit apply instead of re-branching for
  # every single hit.
  error_attr, limit =
    case params.peptide_mass_units
    when '0' then [:deltamass, tolerance]
    when '1' then [:deltamass, tolerance / 1000]
    else          [:ppm, tolerance]
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    hits = out_file.hits
    count_before = hits.size
    hits.reject! { |pep| pep.public_send(error_attr).abs > limit }
    next if hits.size == count_before

    # hits were removed in place: refresh the scores and the cached count
    out_file.hits = hits
    Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(hits)
    out_file.num_hits = hits.size
  end
  self
end
|
148
|
+
|
149
|
+
# Reads num_files consecutive (dta record, out record) pairs from fh,
# starting at dta_start_byte.
#
# Returns [dta_files, out_files], two parallel arrays.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  fh.pos = dta_start_byte
  pairs = Array.new(num_files) do
    dta = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35)
    out = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
    [dta, out]
  end
  # split the pairs back into the two parallel arrays
  [pairs.map { |pair| pair[0] }, pairs.map { |pair| pair[1] }]
end
|
163
|
+
|
164
|
+
# Reads the srf file at filename into this object and returns self.
#
# opts are the same as for 'new':
#   :filter_by_precursor_mass_tolerance => true | false (default true)
#   :read_pephits => true | false (default true)
#
# When the file holds no "[SEQUEST]" params section, only @resident_dir is
# set and self is returned right away.
#
# Raises NotImplementedError for a combined file when :read_pephits is false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  (@params, after_params_io_pos) = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  # Nothing else can be read without the params (and the io position that
  # follows them).  Because of this guard @params is never nil below.
  return self unless @params

  # With print_duplicate_references == 0 the srf file lists only 1 protein
  # per peptide, so downstream output will likewise contain a single protein
  # for each peptide hit.  For complete protein lists the srf file must be
  # created with print_duplicate_references > 0.
  dup_refs_gt_0 = (@params.print_duplicate_references.to_i != 0)

  File.open(filename, 'rb') do |fh|
    @header = Mspire::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    # only '3.5' uses the 3.5 record layouts (nil for an unlisted version)
    unpack_35 =
      case @version
      when '3.2', '3.3' then false
      when '3.5' then true
      end

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first

      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        @out_files = read_out_files(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)

        if fh.eof?
          # appears to be an abortive run
          # NOTE(review): @index is reassigned by read_scan_index below, so
          # only the @params reset survives -- confirm this is intended.
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12) ## gap between last params entry and index

    @index = read_scan_index(fh, @header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |ind, i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = ind[0]
      outfile.last_scan = ind[1]
      outfile.charge = ind[2]

      pep_hits = @out_files[i].hits
      @peptide_hits.push( *pep_hits )
      pep_hits.each do |pep_hit|
        pep_hit[15] = @base_name
        pep_hit[16] = ind[0]
        pep_hit[17] = ind[1]
        pep_hit[18] = ind[2]
        # add the deltamass
        pep_hit[12] = pep_hit[0] - mass_measured  # real - measured (deltamass)
        pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured  ## ppm
        pep_hit[19] = self  ## link with the srf object
      end
    end

    # honor the documented option (it used to be ignored: filtering always ran)
    filter_by_precursor_mass_tolerance! if opts[:filter_by_precursor_mass_tolerance] && params
  end

  self
end
|
278
|
+
|
279
|
+
# Reads num scan index entries from fh (which must already be positioned at
# the start of the index) and returns an array where each entry is
# [first_scan, last_scan, charge].
#
# 24 bytes are consumed per entry; only the three leading native unsigned
# ints of each entry are decoded.
def read_scan_index(fh, num)
  entry_bytes = 24
  buffer = '0' * entry_bytes # reusable 24 byte string to receive data
  Array.new(num) do
    fh.read(entry_bytes, buffer)
    buffer.unpack('III')
  end
end
|
299
|
+
|
300
|
+
# Reads num_files dta records from fh, starting at dta_start_byte, and
# returns them as an array.
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  # Honor the num_files argument: the result array was already sized by it,
  # but the loop used to consult header.num_dta_files instead.
  Array.new(num_files) { Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
|
311
|
+
|
312
|
+
# Reads number_files out records from fh and returns them as an array.
#
# filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
# will put the fh there.
def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0)
  # Honor the number_files argument: the result array was already sized by
  # it, but the loop used to consult header.num_dta_files instead.
  Array.new(number_files) do
    Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
|
321
|
+
|
322
|
+
end
|
323
|
+
|
324
|
+
class Mspire::Sequest::Srf::Header
|
325
|
+
|
326
|
+
Start_byte = {
|
327
|
+
:enzyme => 438,
|
328
|
+
:ion_series => 694,
|
329
|
+
:model => 950,
|
330
|
+
:modifications => 982,
|
331
|
+
:raw_filename => 1822,
|
332
|
+
:db_filename => 2082,
|
333
|
+
:dta_log_filename => 2602,
|
334
|
+
:params_filename => 3122,
|
335
|
+
:sequest_log_filename => 3382,
|
336
|
+
}
|
337
|
+
Byte_length = {
|
338
|
+
:enzyme => 256,
|
339
|
+
:ion_series => 256,
|
340
|
+
:model => 32,
|
341
|
+
:modifications => 840,
|
342
|
+
:raw_filename => 260,
|
343
|
+
:db_filename => 520,
|
344
|
+
:dta_log_filename => 520,
|
345
|
+
:params_filename => 260,
|
346
|
+
:sequest_log_filename => 262, ## is this really 262?? or should be 260??
|
347
|
+
}
|
348
|
+
Byte_length_v32 = {
|
349
|
+
:modifications => 456,
|
350
|
+
}
|
351
|
+
|
352
|
+
attr_accessor :version
|
353
|
+
# a Mspire::Sequest::Srf::DtaGen object
|
354
|
+
attr_accessor :dta_gen
|
355
|
+
attr_accessor :enzyme
|
356
|
+
attr_accessor :ion_series
|
357
|
+
attr_accessor :model
|
358
|
+
attr_accessor :modifications
|
359
|
+
attr_accessor :raw_filename
|
360
|
+
attr_accessor :db_filename
|
361
|
+
attr_accessor :dta_log_filename
|
362
|
+
attr_accessor :params_filename
|
363
|
+
attr_accessor :sequest_log_filename
|
364
|
+
|
365
|
+
|
366
|
+
# true if this is a combined file, false if represents a single file
|
367
|
+
# this is set by examining the DtaGen object for signs of a single file
|
368
|
+
attr_reader :combined
|
369
|
+
|
370
|
+
__chars_re = Regexp.escape( "\r\0" )
|
371
|
+
NEWLINE_OR_NULL_RE = /[#{__chars_re}]/o
|
372
|
+
|
373
|
+
# Number of dta files, delegated to the DtaGen object held in @dta_gen.
def num_dta_files
  @dta_gen.num_dta_files
end
|
376
|
+
|
377
|
+
# Convenience constructor: creates a new instance and fills it from fh.
def self.from_io(fh)
  new.from_io(fh)
end
|
380
|
+
|
381
|
+
# Sets fh to 0 and grabs the information it wants.  Returns self.
#
# The version integer is the first 4 bytes of the stream and DtaGen.from_io
# reads from byte 0 as well, so fh is repositioned to 0 up front rather than
# trusting the position it arrives with.  The string fields are then read
# one after another starting at Start_byte[:enzyme].
def from_io(fh)
  fh.pos = 0
  @version = '3.' + fh.read(4).unpack('I').first.to_s
  @dta_gen = Mspire::Sequest::Srf::DtaGen.from_io(fh)

  # if the start_mass end_mass start_scan and end_scan are all zero, its a
  # combined srf file:
  @combined = [0.0, 0.0, 0, 0].zip(%w(start_mass end_mass start_scan end_scan)).all? do |expected, attr|
    expected == @dta_gen.send(attr.to_sym)
  end

  ## get the rest of the info (version 3.2 overrides some field lengths)
  byte_length = Byte_length.dup
  byte_length.merge! Byte_length_v32 if @version == '3.2'

  fh.pos = Start_byte[:enzyme]
  [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename].each do |param|
    send("#{param}=".to_sym, get_null_padded_string(fh, byte_length[param], @combined))
  end
  self
end
|
402
|
+
|
403
|
+
private
|
404
|
+
# Reads bytes bytes from fh and returns the text with its padding removed.
#
# * '' is returned when nothing could be read or when the field starts with
#   a null byte (an empty declaration).
# * combined: the text is cut at the first match of NEWLINE_OR_NULL_RE; when
#   there is no match the whole field is returned.
# * otherwise: the text is right-stripped in place.
def get_null_padded_string(fh, bytes, combined=false)
  st = fh.read(bytes)
  # String#[] gives a one character String (never an Integer) on ruby >= 1.9,
  # so look at the first byte itself.  fh.read gives nil at end of file.
  return '' if st.nil? || st.empty? || st.getbyte(0) == 0

  if combined
    cut = st.index(NEWLINE_OR_NULL_RE)
    # no terminator found: the field is completely filled with text
    cut ? st[0, cut] : st
  else
    st.rstrip!
    st
  end
end
|
417
|
+
|
418
|
+
|
419
|
+
end
|
420
|
+
|
421
|
+
# the Dta Generation Params
|
422
|
+
class Mspire::Sequest::Srf::DtaGen
|
423
|
+
|
424
|
+
## not sure if this is correct
|
425
|
+
# Float
|
426
|
+
attr_accessor :start_time
|
427
|
+
# Float
|
428
|
+
attr_accessor :start_mass
|
429
|
+
# Float
|
430
|
+
attr_accessor :end_mass
|
431
|
+
# Integer
|
432
|
+
attr_accessor :num_dta_files
|
433
|
+
# Integer
|
434
|
+
attr_accessor :group_scan
|
435
|
+
## not sure if this is correct
|
436
|
+
# Integer
|
437
|
+
attr_accessor :min_group_count
|
438
|
+
# Integer
|
439
|
+
attr_accessor :min_ion_threshold
|
440
|
+
#attr_accessor :intensity_threshold # can't find yet
|
441
|
+
#attr_accessor :precursor_tolerance # can't find yet
|
442
|
+
# Integer
|
443
|
+
attr_accessor :start_scan
|
444
|
+
# Integer
|
445
|
+
attr_accessor :end_scan
|
446
|
+
|
447
|
+
# Convenience constructor: creates a new instance and fills it from io.
def self.from_io(io)
  new.from_io(io)
end
|
450
|
+
|
451
|
+
# Sets self from the first 148 bytes of io (the stream is moved back to
# byte 0 when it is somewhere else) and returns self.
def from_io(io)
  io.pos = 0 unless io.pos == 0
  fields = io.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
  @start_time, @start_mass, @end_mass,
    @num_dta_files, @group_scan, @min_group_count,
    @min_ion_threshold, @start_scan, @end_scan = fields
  self
end
|
458
|
+
end
|
459
|
+
|
460
|
+
# total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
|
461
|
+
# unknown is, well unknown...
|
462
|
+
|
463
|
+
Mspire::Sequest::Srf::Dta = Struct.new(
  :mh,
  :dta_tic,
  :num_peaks,
  :charge,
  :ms_level,
  :unknown,
  :total_num_possible_charge_states,
  :peaks
)
|
464
|
+
|
465
|
+
class Mspire::Sequest::Srf::Dta
|
466
|
+
# original
|
467
|
+
# Unpack = "EeIvvvv"
|
468
|
+
Unpack_32 = "EeIvvvv"
|
469
|
+
Unpack_35 = "Ex8eVx2vvvv"
|
470
|
+
|
471
|
+
|
472
|
+
# note on peaks (self[7])
|
473
|
+
# this is a byte array of floats, you can get the peaks out with
|
474
|
+
# unpack("e*")
|
475
|
+
|
476
|
+
undef_method :inspect
|
477
|
+
# Short summary; the peaks (self[7]) are reported by byte count only and
# the 'unknown' attribute is left out.
def inspect
  peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
  "<Mspire::Sequest::Srf::Dta @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
end
|
482
|
+
|
483
|
+
# Reads one dta record from fh and returns it.
#
# unpack_35 picks the record layout: Unpack_35 with a 34 byte header and a
# 22 byte spacer, otherwise Unpack_32 with a 24 byte header and a 24 byte
# spacer.
def self.from_io(fh, unpack_35)
  template, header_bytes, spacer_bytes =
    unpack_35 ? [Unpack_35, 34, 22] : [Unpack_32, 24, 24]

  # a single unpack of the header sets the first 7 attributes
  dta = new(*fh.read(header_bytes).unpack(template))

  # throw away the spacer (scan numbers are given at the end in an index!)
  fh.read(spacer_bytes)

  # (num_peaks * 8) bytes of peak data, kept as raw bytes in self[7]
  dta[7] = fh.read(dta.num_peaks * 8)
  dta
end
|
501
|
+
|
502
|
+
# Renders this object as dta file text: a "mh charge" line (mh with 6
# decimals) followed by one "m/z intensity" line per peak (m/z with 4
# decimals), every line ending in "\r\n".
def to_dta_file_data
  lines = ["#{round(mh, 6)} #{charge}\r\n"]
  peaks.unpack('e*').each_slice(2) do |mz, intensity|
    # the intensity is rounded to the nearest integer: floor(value + 0.5)
    lines << "#{round(mz, 4)} #{(intensity + 0.5).floor}\r\n"
  end
  lines.join
end
|
512
|
+
|
513
|
+
# Prints the dta file text of this object (see to_dta_file_data) to io.
def write_dta_file(io)
  io.print(to_dta_file_data)
end
|
517
|
+
|
518
|
+
# Formats float with exactly decimal_places digits after the decimal point
# and returns the result as a String.
def round(float, decimal_places)
  format("%.#{decimal_places}f", float)
end
|
523
|
+
|
524
|
+
end
|
525
|
+
|
526
|
+
|
527
|
+
#Mspire::Sequest::Srf::Out = Struct.new( *%w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count).map(&:to_sym) )
|
528
|
+
Mspire::Sequest::Srf::Out = Struct.new(
  :num_hits,
  :computer,
  :date_time,
  :total_inten,
  :lowest_sp,
  :num_matched_peptides,
  :db_locus_count,
  :hits,
  :first_scan,
  :last_scan,
  :charge
)
|
529
|
+
|
530
|
+
# 0=num_hits, 1=computer, 2=date_time, 3=total_inten, 4=lowest_sp, 5=num_matched_peptides, 6=db_locus_count, 7=hits, 8=first_scan, 9=last_scan, 10=charge
|
531
|
+
|
532
|
+
class Mspire::Sequest::Srf::Out
|
533
|
+
Unpack_32 = '@36vx2Z*@60Z*'
|
534
|
+
Unpack_35 = '@36vx4Z*@62Z*'
|
535
|
+
|
536
|
+
undef_method :inspect
|
537
|
+
# Short summary; the hits are reported as a count, and only when present.
def inspect
  hits_s = hits ? ", @hits(#)=#{hits.size}" : ''
  "<Mspire::Sequest::Srf::Out first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
end
|
546
|
+
|
547
|
+
# Reads one out record (a 96 byte header followed by its peptide hits) from
# fh and returns it as an Mspire::Sequest::Srf::Out object.
#
# unpack_35 picks the header/hit layout; when dup_refs_gt_0 is true the
# extra protein references that follow the hits are read as well.
def self.from_io(fh, unpack_35, dup_refs_gt_0)
  header = fh.read(96)

  # num_hits computer date_time
  fields = header.unpack(unpack_35 ? Unpack_35 : Unpack_32)
  # total_inten lowest_sp num_matched_peptides db_locus_count
  fields.concat(header.unpack('@8eex4Ix4I'))
  out_obj = new(*fields)

  hits = Array.new(out_obj.num_hits) do
    Mspire::Sequest::Srf::Out::Peptide.from_io(fh, unpack_35)
  end

  unless hits.empty?
    extra_reference_count = hits.inject(0) { |sum, hit| sum + hit.num_other_loci }
    if dup_refs_gt_0
      Mspire::Sequest::Srf::Out::Peptide.read_extra_references(fh, extra_reference_count, hits)
    end
    ## The xcorrs are already ordered by best to worst hit
    ## ADJUST the deltacn's to be meaningful for the top hit:
    ## (the same as bioworks and prophet)
    Mspire::Sequest::Srf::Out::Peptide.set_deltacn_from_deltacn_orig(hits)
  end

  out_obj.hits = hits
  out_obj[1].chomp! # computer
  out_obj
end
|
581
|
+
|
582
|
+
end
|
583
|
+
|
584
|
+
|
585
|
+
|
586
|
+
# deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
|
587
|
+
# deltacn - modified to be that of the next best hit (by xcorr) and the last
|
588
|
+
# hit takes 1.1. This is what is called deltacn by bioworks and pepprophet
|
589
|
+
# (at least for the first few years). If filtering occurs, it will be
|
590
|
+
# updated.
|
591
|
+
# deltacn_orig_updated - the latest updated value of deltacn.
|
592
|
+
# Originally, this will be equal to deltacn_orig. After filtering, this will
|
593
|
+
# be recalculated. To know if this will be different from deltacn_orig, query
|
594
|
+
# match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then
|
595
|
+
# deltacn should also be changed to reflect it.
|
596
|
+
# mh - the theoretical mass + h
|
597
|
+
# proteins are created as SRF prot objects with a reference and linked to their
|
598
|
+
# peptide_hits (from global hash by reference)
|
599
|
+
# ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
|
600
|
+
# This is calculated for the M+H mass!
|
601
|
+
# num_other_loci is the number of other loci that the peptide matches beyond
|
602
|
+
# the first one listed
|
603
|
+
# srf = the srf object this scan came from
|
604
|
+
Mspire::Sequest::Srf::Out::Peptide = Struct.new(
  :mh,
  :deltacn_orig,
  :sf,
  :sp,
  :xcorr,
  :id,
  :num_other_loci,
  :rsp,
  :ions_matched,
  :ions_total,
  :sequence,
  :proteins,
  :deltamass,
  :ppm,
  :aaseq,
  :base_name,
  :first_scan,
  :last_scan,
  :charge,
  :srf,
  :deltacn,
  :deltacn_orig_updated
)
|
605
|
+
# 0=mh 1=deltacn_orig 2=sf 3=sp 4=xcorr 5=id 6=num_other_loci 7=rsp 8=ions_matched 9=ions_total 10=sequence 11=proteins 12=deltamass 13=ppm 14=aaseq 15=base_name 16=first_scan 17=last_scan 18=charge 19=srf 20=deltacn 21=deltacn_orig_updated
|
606
|
+
|
607
|
+
class Mspire::Sequest::Srf::Out::Peptide
|
608
|
+
|
609
|
+
# Creates the deltacn that is meaningful for the top hit: every hit takes
# the deltacn_orig of the hit ranked right below it, and the last hit takes
# 1.1.
#
# assumes sorted.  An empty array is left untouched (nil is returned)
# instead of raising.
def self.set_deltacn_from_deltacn_orig(ar)
  return if ar.empty?
  ar.each_cons(2) { |hit, next_hit| hit.deltacn = next_hit.deltacn_orig }
  ar.last.deltacn = 1.1
end
|
616
|
+
|
617
|
+
# (assumes sorted)
# Recalculates the deltacns from the xcorrs (index 4), relative to the first
# hit's xcorr: each hit's deltacn (index 20) becomes the relative drop of
# the hit below it and that hit's deltacn_orig_updated (index 21) gets the
# same value.  The first hit's deltacn_orig_updated is 0.0 and the last
# hit's deltacn is 1.1.  Does nothing for an empty array.
def self.update_deltacns_from_xcorr(ar)
  return nil unless ar.size > 0

  top_xcorr = ar.first[4]
  ar.first[21] = 0.0
  ar.each_cons(2) do |hit, next_hit|
    drop = 1.0 - (next_hit[4] / top_xcorr)
    hit[20] = drop       # deltacn
    next_hit[21] = drop  # deltacn_orig_updated
  end
  ar.last[20] = 1.1
end
|
633
|
+
|
634
|
+
# Reads num_extra_references additional protein references from fh and
# appends each one, as a Protein built from the first 38 characters of the
# reference, to the proteins (index 11) of the hit it belongs to.
def self.read_extra_references(fh, num_extra_references, pep_hits)
  num_extra_references.times do
    # 8 byte prefix: 4 skipped bytes, then the 1-based number of the hit
    owner = pep_hits[fh.read(8).unpack('x4I').first - 1]
    # 80 byte reference field; 'A*' drops the trailing padding
    reference = fh.read(80).unpack('A*').first
    owner[11] << Mspire::Sequest::Srf::Out::Protein.new(reference[0, 38])
  end
end
|
644
|
+
|
645
|
+
Unpack_35 = '@64Ex8ex8eeeIx18Ivx2vvx8Z*@246Z*'
|
646
|
+
# translation: @64=(64 bytes in to the record), E=mH, x8=8 unknown bytes, e=deltacn_orig,
|
647
|
+
# x8=8 unknown bytes, e=sf, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, I=num_other_loci,
|
648
|
+
# v=rsp, x2=2 unknown bytes, v=ions_matched, v=ions_total, x8=8 unknown bytes, Z*=sequence,
|
649
|
+
# @246Z*=at byte 246 grab the string (which is proteins).
|
650
|
+
#Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
|
651
|
+
Unpack_32 = '@64Ex8ex8eeeIx14Ivvvx8Z*@240Z*'
|
652
|
+
Unpack_four_null_bytes = 'a*'
|
653
|
+
Unpack_Zstar = 'Z*'
|
654
|
+
Read_35 = 426
|
655
|
+
Read_32 = 320
|
656
|
+
|
657
|
+
FourNullBytes_as_string = "\0\0\0\0"
|
658
|
+
#NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
|
659
|
+
NewRecordStart = 0x01.chr + 0x00.chr
|
660
|
+
Sequest_record_start = "[SEQUEST]"
|
661
|
+
|
662
|
+
undef_method :inspect
|
663
|
+
# One-line summary of the hit.  proteins is shown as a count, every other
# listed attribute through its own inspect, and the linked srf object (when
# there is one) by its base_name only.
def inspect
  shown = %w(aaseq sequence mh deltacn_orig sf sp xcorr id rsp ions_matched ions_total proteins deltamass ppm base_name first_scan last_scan charge deltacn)
  parts = shown.map do |name|
    value = send(name.to_sym)
    # (the old Array check tested the attribute *name*, so it never fired)
    name == 'proteins' ? "#{name}(#)=#{value.size}" : "#{name}=#{value.inspect}"
  end
  parts.unshift("<#{self.class}")
  parts.push("srf(base_name)=#{srf.base_name.inspect}") if srf
  parts.push('>')
  parts.join(' ')
end
|
681
|
+
# Reads one peptide hit record from fh and returns it.
#
# unpack_35 picks the record size and layout (Read_35/Unpack_35, otherwise
# Read_32/Unpack_32).  Extra protein references are not read here.
def self.from_io(fh, unpack_35)
  record_bytes, template =
    unpack_35 ? [Read_35, Unpack_35] : [Read_32, Unpack_32]

  ## read all the hit data; the unpack sets the first 12 attributes
  ## (mh through proteins)
  peptide = new(*fh.read(record_bytes).unpack(template))

  # deltacn_orig_updated starts out equal to deltacn_orig
  peptide[21] = peptide[1]

  # proteins (index 11) holds the raw reference string at this point; we
  # slice it to 38 chars to be the same length as duplicate references
  peptide[11] = [Mspire::Sequest::Srf::Out::Protein.new(peptide[11][0,38])]

  # aaseq, derived from sequence (index 10)
  peptide[14] = Mspire::Ident::Peptide.sequence_to_aaseq(peptide[10])

  # with the 3.5 layout 6 more bytes are consumed after the record
  fh.read(6) if unpack_35

  peptide
end
|
704
|
+
|
705
|
+
end
|
706
|
+
|
707
|
+
class Mspire::Sequest::Srf::Out::Protein < Mspire::Ident::Protein
|
708
|
+
alias_method :reference, :id
|
709
|
+
|
710
|
+
# the first entry: the first whitespace-delimited token of reference
def first_entry
  reference.split(' ', 2).first
end
|
714
|
+
end
|
715
|
+
|