mspire-sequest 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/.autotest +30 -0
  2. data/.gitmodules +9 -0
  3. data/History +79 -0
  4. data/LICENSE +22 -0
  5. data/README.rdoc +85 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/bin/srf_to_pepxml.rb +7 -0
  9. data/bin/srf_to_search.rb +7 -0
  10. data/bin/srf_to_sqt.rb +8 -0
  11. data/lib/mspire/sequest/params.rb +331 -0
  12. data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
  13. data/lib/mspire/sequest/pepxml/params.rb +32 -0
  14. data/lib/mspire/sequest/sqt.rb +393 -0
  15. data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
  16. data/lib/mspire/sequest/srf/pepxml.rb +333 -0
  17. data/lib/mspire/sequest/srf/search.rb +158 -0
  18. data/lib/mspire/sequest/srf/sqt.rb +218 -0
  19. data/lib/mspire/sequest/srf.rb +715 -0
  20. data/lib/mspire/sequest.rb +6 -0
  21. data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
  22. data/spec/mspire/sequest/params_spec.rb +135 -0
  23. data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
  24. data/spec/mspire/sequest/pepxml_spec.rb +311 -0
  25. data/spec/mspire/sequest/sqt_spec.rb +51 -0
  26. data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
  27. data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
  28. data/spec/mspire/sequest/srf/search_spec.rb +131 -0
  29. data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
  30. data/spec/mspire/sequest/srf_spec.rb +113 -0
  31. data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
  32. data/spec/spec_helper.rb +22 -0
  33. data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  34. data/spec/testfiles/bioworks31.params +77 -0
  35. data/spec/testfiles/bioworks32.params +62 -0
  36. data/spec/testfiles/bioworks33.params +63 -0
  37. data/spec/testfiles/corrupted_900.srf +0 -0
  38. data/spec/testfiles/small.sqt +87 -0
  39. data/spec/testfiles/small2.sqt +176 -0
  40. metadata +185 -0
@@ -0,0 +1,715 @@
1
+
2
+ # standard lib
3
+ require 'set'
4
+ require 'fileutils'
5
+ require 'scanf'
6
+
7
+ # in library
8
+ require 'mspire/ident/search'
9
+ require 'mspire/ident/peptide'
10
+ require 'mspire/ident/protein'
11
+ require 'mspire/sequest/params'
12
+
13
+
14
+ module Mspire ; end
15
+ module Mspire::Sequest ; end
16
+
17
+ class Mspire::Sequest::Srf < Mspire::Ident::Search
# Raised by from_file when peptide hits are requested but no sequest params
# information is available for the srf file.
class NoSequestParamsError < ArgumentError; end
20
+
21
+ # inherits peptide_hits from Search
22
+
23
+ # a String: 3.5, 3.3 or 3.2
24
+ attr_accessor :version
25
+
26
+ attr_accessor :header
27
+ attr_accessor :dta_files
28
+ attr_accessor :out_files
29
+ attr_accessor :params
30
+ # a parallel array to dta_files and out_files where each entry is:
31
+ # [first_scan, last_scan, charge]
32
+ attr_accessor :index
33
+
34
+ # the base name of the file with no extension
35
+ attr_accessor :base_name
36
+
37
+ alias_method :base_name_noext, :base_name
38
+ alias_method :base_name_noext=, :base_name=
39
+
40
+ # the directory the srf file was residing in when the filename was passed
41
+ # in. May not be available.
42
+ attr_accessor :resident_dir
43
+
44
+ # a boolean to indicate if the results have been filtered by the
45
+ # sequest.params precursor mass tolerance
46
+ attr_accessor :filtered_by_precursor_mass_tolerance
47
+
# The class used for the protein objects attached to this search's peptide
# hits.
def protein_class
  Mspire::Sequest::Srf::Out::Protein
end
51
+
# Locates and parses the sequest params section embedded in an srf file.
#
# Only the tail of the file is scanned for the last '[SEQUEST]' marker. The
# scan starts just before the midpoint (backed up by marker length - 1 bytes)
# so that a marker straddling the midpoint is still found.
#
# Returns a two element array [params, finish_pos]:
#   params     - the result of Mspire::Sequest::Params#parse_io, or nil when
#                no marker was found
#   finish_pos - the io position right after parsing, or nil when no marker
#                was found
def self.get_sequest_params_and_finish_pos(filename)
  marker = '[SEQUEST]'
  params = nil
  finish_parsing_io_pos = nil
  File.open(filename, 'rb') do |handle|
    halfway = handle.stat.size / 2
    # back up so a marker that begins before the midpoint but ends after it
    # is entirely inside the scanned tail
    scan_start = [halfway - (marker.bytesize - 1), 0].max
    handle.seek(scan_start)
    tail = handle.read
    marker_offset = tail.rindex(marker)
    if marker_offset
      handle.seek(scan_start + marker_offset)
      params = Mspire::Sequest::Params.new.parse_io(handle)
      finish_parsing_io_pos = handle.pos
    end
  end
  [params, finish_parsing_io_pos]
end
74
+
# Byte offset at which the dta records start for the srf version held in
# @version. Returns nil when @version is not '3.2', '3.3' or '3.5'.
def dta_start_byte
  case @version
  when '3.2' then 3260
  when '3.3', '3.5' then 3644
  end
end
82
+
83
+
# Creates an empty Srf object and, when a filename is given, loads it with
# from_file.
#
# opts (forwarded as-is to from_file):
#     :filter_by_precursor_mass_tolerance => true | false (default true)
#       intended to filter hits by the sequest params precursor tolerance,
#       as is typically done by the Bioworks software.
#
#     :read_pephits => true | false (default true)
#       will attempt to read peptide hit information (equivalent to .out
#       files), otherwise, just reads the dta information.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
100
+
101
+
# Removes, from every out file, the hits whose precursor mass error exceeds
# params.peptide_mass_tolerance. params.peptide_mass_units selects the test:
#   '0'           - |deltamass| compared against the tolerance
#   '1'           - |deltamass| compared against tolerance / 1000
#   anything else - |ppm| compared against the tolerance ('2' is the
#                   explicitly handled value)
#
# For each out file that lost hits:
# 1. the out file's hit list is updated (the original hit id is untouched;
#    rank is implicit in array ordering)
# 2. deltacn values are recalculated from xcorr (deltacn_orig is untouched)
# 3. num_hits is set to the new hit count
#
# This can spoil proper protein -> peptide linkages. Mspire::Id::Search.merge!
# should be run after this method to ensure correct protein -> peptide
# linkages.
#
# Sets filtered_by_precursor_mass_tolerance to true and returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f

  exceeds_tolerance =
    case params.peptide_mass_units
    when '0' then lambda { |pep| pep.deltamass.abs > tolerance }
    when '1' then lambda { |pep| pep.deltamass.abs > (tolerance/1000) }
    else lambda { |pep| pep.ppm.abs > tolerance }
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    kept = out_file.hits
    size_before = kept.size
    kept.reject! { |pep| exceeds_tolerance.call(pep) }
    next if kept.size == size_before

    out_file.hits = kept
    Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(kept)
    out_file.num_hits = kept.size
  end
  self
end
148
+
# Reads num_files (dta, out) record pairs that are stored alternately,
# starting at dta_start_byte.
#
# Returns [dta_files, out_files], two parallel arrays of
# Mspire::Sequest::Srf::Dta.from_io and Mspire::Sequest::Srf::Out.from_io
# results.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dtas = Array.new(num_files)
  outs = Array.new(num_files)
  fh.pos = dta_start_byte

  dtas.each_index do |idx|
    dtas[idx] = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35)
    outs[idx] = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
  [dtas, outs]
end
163
+
# Populates this object from the srf file at +filename+.
#
# opts (merged over the defaults shown; same keys as for 'new'):
#   :filter_by_precursor_mass_tolerance => true
#       run filter_by_precursor_mass_tolerance! once the hits are read
#   :read_pephits => true
#       also read the peptide hit (.out equivalent) records; false is not
#       implemented for combined files and raises NotImplementedError
#
# Returns self, or nil when no sequest params section is found in the file
# (in that case only @resident_dir has been set).
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  (@params, after_params_io_pos) = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  # When print_duplicate_references is 0 the srf file lists only one protein
  # per peptide, so downstream output will likewise contain a single protein
  # per hit. The extra reference records are only read when it is > 0.
  dup_refs_gt_0 = @params.print_duplicate_references.to_i != 0

  File.open(filename, 'rb') do |fh|
    @header = Mspire::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    # only version 3.5 uses the wider record layouts
    unpack_35 = (@version == '3.5')

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      raw_filename = @header.raw_filename
      named = raw_filename.match(/[\\\/]([^\\\/]+)\.RAW$/)
      @base_name =
        if named
          named[1]
        else
          # no "<separator>name.RAW" shape (e.g. lower case extension or no
          # directory part): fall back to the last path component without
          # its extension instead of failing on a nil match
          raw_filename.split(/[\\\/]/).last.to_s.sub(/\.[^.]*\z/, '')
        end

      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)

        if fh.eof?
          # looks like an abortive run (nothing follows the out records)
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12)  ## gap between last params entry and index

    @index = read_scan_index(fh, @header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |(first_scan, last_scan, charge), i|
      mass_measured = @dta_files[i].mh
      outfile = @out_files[i]
      outfile.first_scan = first_scan
      outfile.last_scan = last_scan
      outfile.charge = charge

      pep_hits = outfile.hits
      @peptide_hits.concat(pep_hits)
      pep_hits.each do |pep_hit|
        pep_hit.base_name = @base_name
        pep_hit.first_scan = first_scan
        pep_hit.last_scan = last_scan
        pep_hit.charge = charge
        # real - measured (deltamass)
        pep_hit.deltamass = pep_hit.mh - mass_measured
        pep_hit.ppm = 1.0e6 * pep_hit.deltamass.abs / mass_measured
        pep_hit.srf = self  ## link with the srf object
      end
    end

    # honor the documented option (it defaults to true)
    filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]
  end

  self
end
278
+
# Reads +num+ consecutive 24 byte index records from fh.
#
# Returns an index where each entry is [first_scan, last_scan, charge],
# unpacked ('III') from the start of each record; the rest of the record is
# ignored.
def read_scan_index(fh, num)
  record_bytes = 24
  buffer = '0' * record_bytes  # reusable receive buffer for fh.read
  Array.new(num) do
    fh.read(record_bytes, buffer)
    buffer.unpack('III')
  end
end
299
+
# Positions fh at dta_start_byte and reads num_files consecutive dta records.
#
# Returns an array of Mspire::Sequest::Srf::Dta.from_io results with exactly
# num_files entries (the count is taken from the argument, not re-read from
# the header).
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  Array.new(num_files) { Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
311
+
# filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
# will put the fh there.
#
# Returns an array of Mspire::Sequest::Srf::Out.from_io results with exactly
# number_files entries (the count is taken from the argument, not re-read
# from the header).
def read_out_files(fh, number_files, unpack_35, dup_refs_gt_0)
  Array.new(number_files) do
    Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
321
+
322
+ end
323
+
324
+ class Mspire::Sequest::Srf::Header
325
+
326
+ Start_byte = {
327
+ :enzyme => 438,
328
+ :ion_series => 694,
329
+ :model => 950,
330
+ :modifications => 982,
331
+ :raw_filename => 1822,
332
+ :db_filename => 2082,
333
+ :dta_log_filename => 2602,
334
+ :params_filename => 3122,
335
+ :sequest_log_filename => 3382,
336
+ }
337
+ Byte_length = {
338
+ :enzyme => 256,
339
+ :ion_series => 256,
340
+ :model => 32,
341
+ :modifications => 840,
342
+ :raw_filename => 260,
343
+ :db_filename => 520,
344
+ :dta_log_filename => 520,
345
+ :params_filename => 260,
346
+ :sequest_log_filename => 262, ## is this really 262?? or should be 260??
347
+ }
348
+ Byte_length_v32 = {
349
+ :modifications => 456,
350
+ }
351
+
352
+ attr_accessor :version
353
+ # a Mspire::Sequest::Srf::DtaGen object
354
+ attr_accessor :dta_gen
355
+ attr_accessor :enzyme
356
+ attr_accessor :ion_series
357
+ attr_accessor :model
358
+ attr_accessor :modifications
359
+ attr_accessor :raw_filename
360
+ attr_accessor :db_filename
361
+ attr_accessor :dta_log_filename
362
+ attr_accessor :params_filename
363
+ attr_accessor :sequest_log_filename
364
+
365
+
366
+ # true if this is a combined file, false if represents a single file
367
+ # this is set by examining the DtaGen object for signs of a single file
368
+ attr_reader :combined
369
+
370
+ __chars_re = Regexp.escape( "\r\0" )
371
+ NEWLINE_OR_NULL_RE = /[#{__chars_re}]/o
372
+
# Number of dta files, as reported by the DtaGen object held in @dta_gen.
def num_dta_files
  @dta_gen.num_dta_files
end
376
+
# Builds a new header and fills it from fh (see the instance method from_io).
def self.from_io(fh)
  new.from_io(fh)
end
380
+
# Sets fh to 0 and grabs the information it wants: the version (first 4
# bytes), the DtaGen block, then the null padded header strings, which are
# read back to back starting at Start_byte[:enzyme].
#
# The modifications field is shorter in version 3.2 (Byte_length_v32), which
# shifts every field read after it.
#
# Returns self.
def from_io(fh)
  # rewind explicitly so the version is read from the start of the file
  # whatever position the caller left fh at
  fh.pos = 0
  @version = '3.' + fh.read(4).unpack('I').first.to_s
  @dta_gen = Mspire::Sequest::Srf::DtaGen.from_io(fh)

  # if the start_mass end_mass start_scan and end_scan are all zero, its a
  # combined srf file:
  @combined = %w(start_mass end_mass start_scan end_scan).all? do |field|
    0 == @dta_gen.send(field.to_sym)
  end

  ## get the rest of the info
  byte_length = Byte_length.dup
  byte_length.merge!(Byte_length_v32) if @version == '3.2'

  fh.pos = Start_byte[:enzyme]
  [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename].each do |param|
    send("#{param}=".to_sym, get_null_padded_string(fh, byte_length[param], @combined))
  end
  self
end
402
+
403
+ private
# Reads +bytes+ bytes from fh and returns the string they hold.
#
# * a field whose first byte is NUL is an empty declaration and gives ''
# * combined files: the string is cut at the first CR or NUL
#   (NEWLINE_OR_NULL_RE); if neither is present the whole field is returned
# * otherwise: trailing padding is removed with rstrip!
def get_null_padded_string(fh, bytes, combined=false)
  st = fh.read(bytes)
  # compare the byte value: st[0] is a one character String and never equals
  # the Integer 0
  return '' if st.getbyte(0) == 0

  if combined
    cut = st.index(NEWLINE_OR_NULL_RE)
    # no terminator found: keep the field as is rather than slicing with nil
    cut ? st[0, cut] : st
  else
    st.rstrip!
    st
  end
end
417
+
418
+
419
+ end
420
+
421
+ # the Dta Generation Params
422
+ class Mspire::Sequest::Srf::DtaGen
423
+
424
+ ## not sure if this is correct
425
+ # Float
426
+ attr_accessor :start_time
427
+ # Float
428
+ attr_accessor :start_mass
429
+ # Float
430
+ attr_accessor :end_mass
431
+ # Integer
432
+ attr_accessor :num_dta_files
433
+ # Integer
434
+ attr_accessor :group_scan
435
+ ## not sure if this is correct
436
+ # Integer
437
+ attr_accessor :min_group_count
438
+ # Integer
439
+ attr_accessor :min_ion_threshold
440
+ #attr_accessor :intensity_threshold # can't find yet
441
+ #attr_accessor :precursor_tolerance # can't find yet
442
+ # Integer
443
+ attr_accessor :start_scan
444
+ # Integer
445
+ attr_accessor :end_scan
446
+
# Builds a new DtaGen and fills it from io (see the instance method from_io).
def self.from_io(io)
  new.from_io(io)
end
450
+
# Rewinds io if needed, reads the first 148 bytes and sets start_time,
# start_mass, end_mass, num_dta_files, group_scan, min_group_count,
# min_ion_threshold, start_scan and end_scan from them.
#
# Returns self.
def from_io(io)
  io.pos = 0 unless io.pos.zero?
  fields = io.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
  @start_time, @start_mass, @end_mass, @num_dta_files, @group_scan,
    @min_group_count, @min_ion_threshold, @start_scan, @end_scan = fields
  self
end
458
+ end
459
+
460
+ # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
461
+ # unknown is, well unknown...
462
+
463
+ Mspire::Sequest::Srf::Dta = Struct.new( *%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks).map(&:to_sym) )
464
+
465
+ class Mspire::Sequest::Srf::Dta
466
+ # original
467
+ # Unpack = "EeIvvvv"
468
+ Unpack_32 = "EeIvvvv"
469
+ Unpack_35 = "Ex8eVx2vvvv"
470
+
471
+
472
+ # note on peaks (self[7])
473
+ # this is a byte array of floats, you can get the peaks out with
474
+ # unpack("e*")
475
+
476
+ undef_method :inspect
# Compact description that reports the size of the peaks byte string (or
# 'nil' when there is none) instead of dumping it.
def inspect
  peaks_st = peaks ? "[#{peaks.size} bytes]" : 'nil'
  "<Mspire::Sequest::Srf::Dta @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
end
482
+
# Reads one dta record from fh.
#
# The fixed size header (34 bytes unpacked with Unpack_35, or 24 bytes
# unpacked with Unpack_32) sets the first 7 attributes in a single unpack. A
# spacer (22 or 24 bytes) is then discarded; scan numbers are not here, they
# are given at the end of the file in an index. Finally num_peaks * 8 bytes
# are stored, still packed, in peaks.
def self.from_io(fh, unpack_35)
  template, header_bytes, spacer_bytes =
    unpack_35 ? [Unpack_35, 34, 22] : [Unpack_32, 24, 24]

  dta = new(*fh.read(header_bytes).unpack(template))
  fh.read(spacer_bytes)  # throwaway the spacer
  dta.peaks = fh.read(dta.num_peaks * 8)
  dta
end
501
+
# Returns the text of a dta file for this record: a first line with mh (6
# decimal places) and charge, then one line per peak with the m/z (4 decimal
# places) and the intensity rounded to the nearest integer. Lines end with
# "\r\n".
def to_dta_file_data
  text = "#{round(mh, 6)} #{charge}\r\n"
  peaks.unpack('e*').each_slice(2) do |mz, intensity|
    # %d is equivalent to floor, so we round by adding 0.5!
    text << "#{round(mz, 4)} #{(intensity + 0.5).floor}\r\n"
  end
  text
end
512
+
# Prints the dta file text for this record (see to_dta_file_data) to the io
# object.
def write_dta_file(io)
  io.print(to_dta_file_data)
end
517
+
# Returns a String: the float formatted with exactly decimal_places digits
# after the decimal point.
def round(float, decimal_places)
  format("%.#{decimal_places}f", float)
end
523
+
524
+ end
525
+
526
+
527
+ #Mspire::Sequest::Srf::Out = Struct.new( *%w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count).map(&:to_sym) )
528
+ Mspire::Sequest::Srf::Out = Struct.new( *%w(num_hits computer date_time total_inten lowest_sp num_matched_peptides db_locus_count hits first_scan last_scan charge).map(&:to_sym) )
529
+
530
+ # 0=num_hits, 1=computer, 2=date_time, 3=total_inten, 4=lowest_sp, 5=num_matched_peptides, 6=db_locus_count, 7=hits, 8=first_scan, 9=last_scan, 10=charge
531
+
532
+ class Mspire::Sequest::Srf::Out
533
+ Unpack_32 = '@36vx2Z*@60Z*'
534
+ Unpack_35 = '@36vx4Z*@62Z*'
535
+
536
+ undef_method :inspect
# Compact description that reports the number of hits (when hits is set)
# instead of dumping every hit.
def inspect
  hits_part = hits ? ", @hits(#)=#{hits.size}" : ''
  "<Mspire::Sequest::Srf::Out first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_part}>"
end
546
+
# Reads one out record from fh and returns an Mspire::Sequest::Srf::Out
# object.
#
# The fixed part of the record is 96 bytes. It supplies num_hits, computer
# and date_time (Unpack_35 or Unpack_32) plus total_inten, lowest_sp,
# num_matched_peptides and db_locus_count. num_hits peptide records follow;
# when dup_refs_gt_0 is true they are followed by one extra reference record
# per "other locus" counted over all the hits.
def self.from_io(fh, unpack_35, dup_refs_gt_0)
  fixed = fh.read(96)

  # num_hits computer date_time
  fields = fixed.unpack(unpack_35 ? Unpack_35 : Unpack_32)
  # total_inten lowest_sp num_matched_peptides db_locus_count
  fields.concat(fixed.unpack('@8eex4Ix4I'))
  out_obj = new(*fields)

  hits = Array.new(out_obj.num_hits) do
    Mspire::Sequest::Srf::Out::Peptide.from_io(fh, unpack_35)
  end

  unless hits.empty?
    extra_reference_count = hits.inject(0) { |sum, hit| sum + hit.num_other_loci }
    if dup_refs_gt_0
      Mspire::Sequest::Srf::Out::Peptide.read_extra_references(fh, extra_reference_count, hits)
    end
    ## The xcorrs are already ordered by best to worst hit
    ## ADJUST the deltacn's to be meaningful for the top hit:
    ## (the same as bioworks and prophet)
    Mspire::Sequest::Srf::Out::Peptide.set_deltacn_from_deltacn_orig(hits)
  end

  out_obj.hits = hits
  out_obj.computer.chomp!
  out_obj
end
581
+
582
+ end
583
+
584
+
585
+
586
+ # deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
587
+ # deltacn - modified to be that of the next best hit (by xcorr) and the last
588
+ # hit takes 1.1. This is what is called deltacn by bioworks and pepprophet
589
+ # (at least for the first few years). If filtering occurs, it will be
590
+ # updated.
591
+ # deltacn_orig_updated - the latest updated value of deltacn.
592
+ # Originally, this will be equal to deltacn_orig. After filtering, this will
593
+ # be recalculated. To know if this will be different from deltacn_orig, query
594
+ # match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then
595
+ # deltacn should also be changed to reflect it.
596
+ # mh - the theoretical mass + h
597
+ # proteins are created as SRF prot objects with a reference and linked to their
598
+ # peptide_hits (from global hash by reference)
599
+ # ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
600
+ # This is calculated for the M+H mass!
601
+ # num_other_loci is the number of other loci that the peptide matches beyond
602
+ # the first one listed
603
+ # srf = the srf object this scan came from
604
+ Mspire::Sequest::Srf::Out::Peptide = Struct.new( *%w(mh deltacn_orig sf sp xcorr id num_other_loci rsp ions_matched ions_total sequence proteins deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated).map(&:to_sym) )
605
+ # 0=mh 1=deltacn_orig 2=sf 3=sp 4=xcorr 5=id 6=num_other_loci 7=rsp 8=ions_matched 9=ions_total 10=sequence 11=proteins 12=deltamass 13=ppm 14=aaseq 15=base_name 16=first_scan 17=last_scan 18=charge 19=srf 20=deltacn 21=deltacn_orig_updated
606
+
607
+ class Mspire::Sequest::Srf::Out::Peptide
608
+
# creates the deltacn that is meaningful for the top hit: each hit takes the
# deltacn_orig of the hit that follows it, and the last hit takes 1.1.
# assumes sorted
#
# Does nothing (and returns nil) for an empty array; otherwise returns 1.1.
def self.set_deltacn_from_deltacn_orig(ar)
  return if ar.empty?

  ar.each_cons(2) { |hit, next_hit| hit.deltacn = next_hit.deltacn_orig }
  ar.last.deltacn = 1.1
end
616
+
# (assumes sorted)
# recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn
#
# With drop(i) = 1.0 - xcorr(i) / xcorr(top hit):
#   deltacn_orig_updated is 0.0 for the top hit and drop(i) for hit i
#   deltacn of a hit is the drop of the hit that follows it; the last hit
#   takes 1.1
# Does nothing for an empty array.
def self.update_deltacns_from_xcorr(ar)
  return if ar.empty?

  top_xcorr = ar.first.xcorr
  ar.first.deltacn_orig_updated = 0.0
  ar.each_cons(2) do |hit, next_hit|
    drop = 1.0 - (next_hit.xcorr/top_xcorr)
    hit.deltacn = drop
    next_hit.deltacn_orig_updated = drop
  end
  ar.last.deltacn = 1.1
end
633
+
# Reads num_extra_references extra protein reference records from fh and
# appends each one, as a Protein, to the proteins of the hit it points to.
#
# Per record the code consumes 8 bytes (the second native int is the 1-based
# position of the hit in pep_hits) and then 80 bytes of reference text, of
# which the first 38 characters are kept.
# NOTE(review): the original comment described the record as "80 bytes total
# (with index number)" while 88 bytes are consumed - confirm against the
# file format.
def self.read_extra_references(fh, num_extra_references, pep_hits)
  num_extra_references.times do
    hit_number = fh.read(8).unpack('x4I').first
    target = pep_hits[hit_number - 1]

    reference = fh.read(80).unpack('A*').first
    target.proteins << Mspire::Sequest::Srf::Out::Protein.new(reference[0,38])
  end
end
644
+
645
+ Unpack_35 = '@64Ex8ex8eeeIx18Ivx2vvx8Z*@246Z*'
646
+ # translation: @64=(64 bytes into the record), E=mH, x8=8 unknown bytes, e=deltacn,
647
+ # x8=8 unknown bytes, e=sf, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, I=num_other_loci,
648
+ # v=rsp, x2=2 unknown bytes, v=ions_matched, v=ions_total, x8=8 unknown bytes, Z*=sequence,
649
+ # @246Z*=at byte 246 grab the string (which is the protein reference).
650
+ #Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
651
+ Unpack_32 = '@64Ex8ex8eeeIx14Ivvvx8Z*@240Z*'
652
+ Unpack_four_null_bytes = 'a*'
653
+ Unpack_Zstar = 'Z*'
654
+ Read_35 = 426
655
+ Read_32 = 320
656
+
657
+ FourNullBytes_as_string = "\0\0\0\0"
658
+ #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
659
+ NewRecordStart = 0x01.chr + 0x00.chr
660
+ Sequest_record_start = "[SEQUEST]"
661
+
662
+ undef_method :inspect
# Compact description: every listed attribute is shown as name=value.inspect,
# except proteins, which is shown as a count, and the srf link, which (when
# set) is shown by its base_name only.
def inspect
  names = %w(aaseq sequence mh deltacn_orig sf sp xcorr id rsp ions_matched ions_total proteins deltamass ppm base_name first_scan last_scan charge deltacn)
  parts = names.map do |name|
    value = send(name.to_sym)
    name == 'proteins' ? "#{name}(#)=#{value.size}" : "#{name}=#{value.inspect}"
  end
  parts.unshift("<#{self.class}")
  parts.push("srf(base_name)=#{srf.base_name.inspect}") if srf
  parts.push('>')
  parts.join(' ')
end
# Reads one peptide hit record from fh (Read_35 or Read_32 bytes) and returns
# the new Peptide.
#
# The unpacked record sets mh through sequence plus the protein reference.
# Afterwards:
#   deltacn_orig_updated starts out equal to deltacn_orig
#   proteins becomes a one element array holding a Protein built from the
#     reference, sliced to 38 chars to be the same length as duplicate
#     references
#   aaseq is derived from sequence with
#     Mspire::Ident::Peptide.sequence_to_aaseq
# For the 3.5 layout 6 more bytes are consumed after the record.
def self.from_io(fh, unpack_35)
  record_bytes, template = unpack_35 ? [Read_35, Unpack_35] : [Read_32, Unpack_32]

  ## read all the hit data
  peptide = new(*fh.read(record_bytes).unpack(template))

  peptide.deltacn_orig_updated = peptide.deltacn_orig
  peptide.proteins = [Mspire::Sequest::Srf::Out::Protein.new(peptide.proteins[0,38])]
  peptide.aaseq = Mspire::Ident::Peptide.sequence_to_aaseq(peptide.sequence)

  fh.read(6) if unpack_35

  peptide
end
704
+
705
+ end
706
+
707
+ class Mspire::Sequest::Srf::Out::Protein < Mspire::Ident::Protein
708
+ alias_method :reference, :id
709
+
# the first entry: the part of the reference before its first run of
# whitespace (nil when the reference is empty)
def first_entry
  reference.split(' ', 2).first
end
714
+ end
715
+