mspire-sequest 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/.autotest +30 -0
  2. data/.gitmodules +9 -0
  3. data/History +79 -0
  4. data/LICENSE +22 -0
  5. data/README.rdoc +85 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/bin/srf_to_pepxml.rb +7 -0
  9. data/bin/srf_to_search.rb +7 -0
  10. data/bin/srf_to_sqt.rb +8 -0
  11. data/lib/mspire/sequest/params.rb +331 -0
  12. data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
  13. data/lib/mspire/sequest/pepxml/params.rb +32 -0
  14. data/lib/mspire/sequest/sqt.rb +393 -0
  15. data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
  16. data/lib/mspire/sequest/srf/pepxml.rb +333 -0
  17. data/lib/mspire/sequest/srf/search.rb +158 -0
  18. data/lib/mspire/sequest/srf/sqt.rb +218 -0
  19. data/lib/mspire/sequest/srf.rb +715 -0
  20. data/lib/mspire/sequest.rb +6 -0
  21. data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
  22. data/spec/mspire/sequest/params_spec.rb +135 -0
  23. data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
  24. data/spec/mspire/sequest/pepxml_spec.rb +311 -0
  25. data/spec/mspire/sequest/sqt_spec.rb +51 -0
  26. data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
  27. data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
  28. data/spec/mspire/sequest/srf/search_spec.rb +131 -0
  29. data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
  30. data/spec/mspire/sequest/srf_spec.rb +113 -0
  31. data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
  32. data/spec/spec_helper.rb +22 -0
  33. data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  34. data/spec/testfiles/bioworks31.params +77 -0
  35. data/spec/testfiles/bioworks32.params +62 -0
  36. data/spec/testfiles/bioworks33.params +63 -0
  37. data/spec/testfiles/corrupted_900.srf +0 -0
  38. data/spec/testfiles/small.sqt +87 -0
  39. data/spec/testfiles/small2.sqt +176 -0
  40. metadata +185 -0
@@ -0,0 +1,715 @@
1
+
2
+ # standard lib
3
+ require 'set'
4
+ require 'fileutils'
5
+ require 'scanf'
6
+
7
+ # in library
8
+ require 'mspire/ident/search'
9
+ require 'mspire/ident/peptide'
10
+ require 'mspire/ident/protein'
11
+ require 'mspire/sequest/params'
12
+
13
+
14
+ module Mspire ; end
15
+ module Mspire::Sequest ; end
16
+
17
+ class Mspire::Sequest::Srf < Mspire::Ident::Search
18
# Raised when peptide hits are requested but no sequest params are available.
class NoSequestParamsError < ArgumentError; end
20
+
21
+ # inherits peptide_hits from Search
22
+
23
+ # a String: 3.5, 3.3 or 3.2
24
+ attr_accessor :version
25
+
26
+ attr_accessor :header
27
+ attr_accessor :dta_files
28
+ attr_accessor :out_files
29
+ attr_accessor :params
30
+ # a parallel array to dta_files and out_files where each entry is:
31
+ # [first_scan, last_scan, charge]
32
+ attr_accessor :index
33
+
34
+ # the base name of the file with no extension
35
+ attr_accessor :base_name
36
+
37
+ alias_method :base_name_noext, :base_name
38
+ alias_method :base_name_noext=, :base_name=
39
+
40
+ # the directory the srf file was residing in when the filename was passed
41
+ # in. May not be available.
42
+ attr_accessor :resident_dir
43
+
44
+ # a boolean to indicate if the results have been filtered by the
45
+ # sequest.params precursor mass tolerance
46
+ attr_accessor :filtered_by_precursor_mass_tolerance
47
+
48
# The class used for protein objects attached to this search's peptide hits.
def protein_class
  Mspire::Sequest::Srf::Out::Protein
end
51
+
52
# Finds the "[SEQUEST]" params section of the srf file and parses it.
#
# Only the second half of the file is read and searched (the params section
# is expected to be there); the last occurrence of the marker is used.
#
# Returns a two element array [params, io_pos_after_parsing]. Both entries
# are nil when the marker is not found.
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    marker_offset = io.read.rindex('[SEQUEST]')
    next [nil, nil] if marker_offset.nil? # not found

    io.seek(marker_offset + midpoint)
    parsed = Mspire::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
74
+
75
# Byte position at which reading of the dta records begins, looked up from
# the version string in @version ('3.2', '3.3' or '3.5'); nil for any other
# value.
def dta_start_byte
  { '3.2' => 3260, '3.3' => 3644, '3.5' => 3644 }[@version]
end
82
+
83
+
84
# Creates an empty Srf object and, when a filename is given, fills it by
# calling from_file(filename, opts).
#
# opts:
#     :filter_by_precursor_mass_tolerance => true | false (default true)
#       this will filter by the sequest params prec tolerance as is
#       typically done by the Bioworks software.
#
#     :read_pephits => true | false (default true)
#       will attempt to read peptide hit information (equivalent to .out
#       files), otherwise, just reads the dta information.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
100
+
101
+
102
# Removes, from every out_file, the hits whose precursor mass error exceeds
# params.peptide_mass_tolerance, and marks this object as filtered.
#
# 1. updates the out_file's list of hits based on passing peptide_hits (but not
#    the original hit id; rank is implicit in array ordering)
# 2. recalculates deltacn values completely if number of hits changed (does
#    not touch deltacn orig)
#
# This can spoil proper protein -> peptide linkages.  Mspire::Id::Search.merge!
# should be run after this method to ensure correct protein -> peptide
# linkages.
#
# Returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f

  # peptide_mass_units: '0' compares deltamass against the tolerance as is,
  # '1' compares deltamass against tolerance/1000, anything else compares ppm.
  out_of_tolerance =
    case params.peptide_mass_units
    when '0' then lambda { |hit| hit.deltamass.abs > tolerance }
    when '1' then lambda { |hit| hit.deltamass.abs > (tolerance/1000) }
    else          lambda { |hit| hit.ppm.abs > tolerance }
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    kept = out_file.hits
    count_before = kept.size
    kept.reject!(&out_of_tolerance)
    next if kept.size == count_before

    # something was removed: resync the out_file and its deltacn values
    out_file.hits = kept
    Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(kept)
    out_file.num_hits = kept.size
  end
  self
end
148
+
149
# Reads num_files (dta, out) record pairs that are stored alternately in fh,
# starting at dta_start_byte.
#
# Returns [dta_files, out_files], two parallel arrays.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dtas = Array.new(num_files)
  outs = Array.new(num_files)
  fh.pos = dta_start_byte

  dtas.each_index do |idx|
    dtas[idx] = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35)
    outs[idx] = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
  [dtas, outs]
end
163
+
164
# Reads the srf file at filename into this object.
#
# opts are the same as for 'new':
#     :filter_by_precursor_mass_tolerance => true | false (default true)
#     :read_pephits => true | false (default true)
#
# Returns self. Returns nil (after setting only @resident_dir and @params)
# when no "[SEQUEST]" params section can be found in the file.
#
# Raises NotImplementedError for a combined file with :read_pephits => false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  (@params, after_params_io_pos) = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  # When print_duplicate_references is 0 the srf file lists only 1 protein per
  # peptide, so downstream output will likewise only contain a single protein
  # for each peptide hit.  For complete protein lists per peptide hit, .srf
  # files must be created with print_duplicate_references > 0.
  dup_refs_gt_0 = (@params.print_duplicate_references.to_i != 0)

  File.open(filename, 'rb') do |fh|
    @header = Mspire::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    # only version '3.5' uses the 3.5 record layouts
    unpack_35 = (@version == '3.5')

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      # Fall back on the srf file's own base name when the stored raw filename
      # does not look like <dir>/<name>.RAW (previously a NoMethodError).
      @base_name = @header.raw_filename[/[\\\/]([^\\\/]+)\.RAW$/, 1] || File.basename(filename, '.*')

      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0)

        if fh.eof?
          # appears to be an abortive run (no params in srf file); still continuing
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12)  ## gap between last params entry and index

    @index = read_scan_index(fh,@header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |ind,i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = ind[0]
      outfile.last_scan = ind[1]
      outfile.charge = ind[2]

      pep_hits = outfile.hits
      @peptide_hits.push( *pep_hits )
      pep_hits.each do |pep_hit|
        pep_hit[15] = @base_name
        pep_hit[16] = ind[0]
        pep_hit[17] = ind[1]
        pep_hit[18] = ind[2]
        # add the deltamass
        pep_hit[12] = pep_hit[0] - mass_measured  # real - measured (deltamass)
        pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured  ## ppm
        pep_hit[19] = self  ## link with the srf object
      end
    end

    # honor the documented option (it used to be merged into opts but ignored)
    filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]
  end

  self
end
278
+
279
# returns an index where each entry is [first_scan, last_scan, charge]
#
# Reads num consecutive 24 byte entries from fh; the first three native
# unsigned integers of each entry are kept.
def read_scan_index(fh, num)
  entry_bytes = 24
  buffer = '0' * entry_bytes  ## a 24 byte string to receive data (reused)
  Array.new(num) do
    fh.read(entry_bytes, buffer)
    buffer.unpack('III')
  end
end
299
+
300
# returns an array of dta_files
#
# Positions fh at dta_start_byte and reads num_files consecutive dta records.
# (The count now comes from the num_files argument; the loop used to ignore
# it and consult header.num_dta_files instead.)
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  Array.new(num_files) { Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
311
+
312
# filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
# will put the fh there.
#
# Reads number_files consecutive out records and returns them as an array.
# (The count now comes from the number_files argument; the loop used to
# ignore it and consult header.num_dta_files instead.)
def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0)
  Array.new(number_files) do
    Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
321
+
322
+ end
323
+
324
+ class Mspire::Sequest::Srf::Header
325
+
326
+ Start_byte = {
327
+ :enzyme => 438,
328
+ :ion_series => 694,
329
+ :model => 950,
330
+ :modifications => 982,
331
+ :raw_filename => 1822,
332
+ :db_filename => 2082,
333
+ :dta_log_filename => 2602,
334
+ :params_filename => 3122,
335
+ :sequest_log_filename => 3382,
336
+ }
337
+ Byte_length = {
338
+ :enzyme => 256,
339
+ :ion_series => 256,
340
+ :model => 32,
341
+ :modifications => 840,
342
+ :raw_filename => 260,
343
+ :db_filename => 520,
344
+ :dta_log_filename => 520,
345
+ :params_filename => 260,
346
+ :sequest_log_filename => 262, ## is this really 262?? or should be 260??
347
+ }
348
+ Byte_length_v32 = {
349
+ :modifications => 456,
350
+ }
351
+
352
+ attr_accessor :version
353
+ # a Mspire::Sequest::Srf::DtaGen object
354
+ attr_accessor :dta_gen
355
+ attr_accessor :enzyme
356
+ attr_accessor :ion_series
357
+ attr_accessor :model
358
+ attr_accessor :modifications
359
+ attr_accessor :raw_filename
360
+ attr_accessor :db_filename
361
+ attr_accessor :dta_log_filename
362
+ attr_accessor :params_filename
363
+ attr_accessor :sequest_log_filename
364
+
365
+
366
+ # true if this is a combined file, false if represents a single file
367
+ # this is set by examining the DtaGen object for signs of a single file
368
+ attr_reader :combined
369
+
370
+ __chars_re = Regexp.escape( "\r\0" )
371
+ NEWLINE_OR_NULL_RE = /[#{__chars_re}]/o
372
+
373
# The number of dta files, as held by the DtaGen object in @dta_gen.
def num_dta_files
  @dta_gen.num_dta_files
end
376
+
377
# Builds a new header object and fills it from fh (see #from_io).
def self.from_io(fh)
  new.from_io(fh)
end
380
+
381
# sets fh to 0 and grabs the information it wants
#
# Reads the version, the DtaGen section and then the null padded string
# fields (enzyme through sequest_log_filename) beginning at
# Start_byte[:enzyme].  Returns self.
def from_io(fh)
  # the version integer is the first 4 bytes of the file, so make sure we
  # really start there no matter where the caller left fh
  fh.pos = 0
  @version = '3.' + fh.read(4).unpack('I').first.to_s
  @dta_gen = Mspire::Sequest::Srf::DtaGen.from_io(fh)

  # if the start_mass end_mass start_scan and end_scan are all zero, its a
  # combined srf file:
  @combined =
    0.0 == @dta_gen.start_mass &&
    0.0 == @dta_gen.end_mass &&
    0 == @dta_gen.start_scan &&
    0 == @dta_gen.end_scan

  ## get the rest of the info
  byte_length = Byte_length.dup
  byte_length.merge! Byte_length_v32 if @version == '3.2'

  # the fields are stored back to back, so they are read sequentially
  fh.pos = Start_byte[:enzyme]
  [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename].each do |param|
    send("#{param}=".to_sym, get_null_padded_string(fh, byte_length[param], @combined))
  end
  self
end
402
+
403
+ private
404
# Reads bytes bytes from fh and returns the text held in a null padded field.
#
# Returns '' when nothing could be read or the field starts with a null byte
# (an empty declaration).  For combined files the text stops at the first
# character matching NEWLINE_OR_NULL_RE (the whole field is returned when
# there is no such character); otherwise trailing whitespace and null
# characters are stripped.
def get_null_padded_string(fh, bytes, combined=false)
  st = fh.read(bytes)
  # String#[] returns a String (never an Integer) on ruby >= 1.9, so the
  # leading byte has to be inspected with getbyte
  return '' if st.nil? || st.empty? || st.getbyte(0) == 0

  if combined
    cut = st.index(NEWLINE_OR_NULL_RE)
    cut ? st[0, cut] : st
  else
    st.rstrip
  end
end
417
+
418
+
419
+ end
420
+
421
+ # the Dta Generation Params
422
+ class Mspire::Sequest::Srf::DtaGen
423
+
424
+ ## not sure if this is correct
425
+ # Float
426
+ attr_accessor :start_time
427
+ # Float
428
+ attr_accessor :start_mass
429
+ # Float
430
+ attr_accessor :end_mass
431
+ # Integer
432
+ attr_accessor :num_dta_files
433
+ # Integer
434
+ attr_accessor :group_scan
435
+ ## not sure if this is correct
436
+ # Integer
437
+ attr_accessor :min_group_count
438
+ # Integer
439
+ attr_accessor :min_ion_threshold
440
+ #attr_accessor :intensity_threshold # can't find yet
441
+ #attr_accessor :precursor_tolerance # can't find yet
442
+ # Integer
443
+ attr_accessor :start_scan
444
+ # Integer
445
+ attr_accessor :end_scan
446
+
447
# Builds a new DtaGen object and fills it from io (see #from_io).
def self.from_io(io)
  new.from_io(io)
end
450
+
451
# sets self based on the io object and returns self
#
# The io is repositioned to its start and the first 148 bytes are unpacked
# into the nine attributes of this object.
def from_io(io)
  io.pos = 0 unless io.pos == 0
  fields = io.read(148).unpack('x36ex12ex4ex48Ix12IIIII')
  @start_time, @start_mass, @end_mass,
    @num_dta_files, @group_scan, @min_group_count,
    @min_ion_threshold, @start_scan, @end_scan = fields
  self
end
458
+ end
459
+
460
+ # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
461
+ # unknown is, well unknown...
462
+
463
Mspire::Sequest::Srf::Dta = Struct.new(:mh, :dta_tic, :num_peaks, :charge, :ms_level, :unknown, :total_num_possible_charge_states, :peaks)
464
+
465
+ class Mspire::Sequest::Srf::Dta
466
+ # original
467
+ # Unpack = "EeIvvvv"
468
+ Unpack_32 = "EeIvvvv"
469
+ Unpack_35 = "Ex8eVx2vvvv"
470
+
471
+
472
+ # note on peaks (self[7])
473
+ # this is a byte array of floats, you can get the peaks out with
474
+ # unpack("e*")
475
+
476
+ undef_method :inspect
477
# A compact description: the peaks string (self[7]) is summarized by its
# size in bytes rather than printed.
def inspect
  peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
  "<Mspire::Sequest::Srf::Dta @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
end
482
+
483
# Reads one dta record from fh and returns it as a new object.
#
# unpack_35 selects the 3.5 layout (34 header bytes, 22 spacer bytes)
# instead of the 3.2 layout (24 header bytes, 24 spacer bytes).
def self.from_io(fh, unpack_35)
  template, header_bytes, spacer_bytes =
    unpack_35 ? [Unpack_35, 34, 22] : [Unpack_32, 24, 24]

  # a single unpack sets the first 7 attributes
  dta = new(*fh.read(header_bytes).unpack(template))

  # Scan numbers are given at the end in an index!
  fh.read(spacer_bytes) # throwaway the spacer

  # the peaks are kept as the raw string: num_peaks * 8 bytes
  dta[7] = fh.read(dta.num_peaks * 8)
  dta
end
501
+
502
# Returns the contents of a dta file as a string of "\r\n" terminated lines:
# first "mh charge" (mh with 6 decimal places), then one line per consecutive
# pair of floats unpacked from peaks, the first with 4 decimal places and the
# second rounded to an integer.
def to_dta_file_data
  lines = ["#{round(mh, 6)} #{charge}\r\n"]
  peaks.unpack('e*').each_slice(2) do |first, second|
    # %d is equivalent to floor, so we round by adding 0.5!
    lines << "#{round(first, 4)} #{(second + 0.5).floor}\r\n"
  end
  lines.join
end
512
+
513
# prints the dta file contents (see to_dta_file_data) to the io object
def write_dta_file(io)
  io.print(to_dta_file_data)
end
517
+
518
# returns a string where the float has been rounded to the specified number
# of decimal places
def round(float, decimal_places)
  format("%.#{decimal_places}f", float)
end
523
+
524
+ end
525
+
526
+
527
+ #Mspire::Sequest::Srf::Out = Struct.new( *%w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count).map(&:to_sym) )
528
Mspire::Sequest::Srf::Out = Struct.new(:num_hits, :computer, :date_time, :total_inten, :lowest_sp, :num_matched_peptides, :db_locus_count, :hits, :first_scan, :last_scan, :charge)
529
+
530
+ # 0=num_hits, 1=computer, 2=date_time, 3=total_inten, 4=lowest_sp, 5=num_matched_peptides, 6=db_locus_count, 7=hits, 8=first_scan, 9=last_scan, 10=charge
531
+
532
+ class Mspire::Sequest::Srf::Out
533
+ Unpack_32 = '@36vx2Z*@60Z*'
534
+ Unpack_35 = '@36vx4Z*@62Z*'
535
+
536
+ undef_method :inspect
537
# A compact description: the hits are summarized by their count (and left
# out entirely when hits is nil).
def inspect
  hits_s = hits ? ", @hits(#)=#{hits.size}" : ''
  "<Mspire::Sequest::Srf::Out first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
end
546
+
547
# returns an Mspire::Sequest::Srf::Out object
#
# Reads the 96 byte out file header from fh, then num_hits peptide hits and,
# when dup_refs_gt_0 is true, the extra protein references that follow them.
def self.from_io(fh, unpack_35, dup_refs_gt_0)
  ## EMPTY out file is 96 bytes
  ## each hit is 320 bytes
  header = fh.read(96)

  # num_hits computer date_time
  template = unpack_35 ? Unpack_35 : Unpack_32
  # ... followed by total_inten lowest_sp num_matched_peptides db_locus_count
  out_obj = new(*(header.unpack(template) + header.unpack('@8eex4Ix4I')))

  hit_list = Array.new(out_obj.num_hits)
  unless hit_list.empty?
    extra_reference_count = 0
    hit_list.map! do
      hit = Mspire::Sequest::Srf::Out::Peptide.from_io(fh, unpack_35)
      extra_reference_count += hit.num_other_loci
      hit
    end
    if dup_refs_gt_0
      Mspire::Sequest::Srf::Out::Peptide.read_extra_references(fh, extra_reference_count, hit_list)
    end
    ## The xcorrs are already ordered by best to worst hit
    ## ADJUST the deltacn's to be meaningful for the top hit:
    ## (the same as bioworks and prophet)
    Mspire::Sequest::Srf::Out::Peptide.set_deltacn_from_deltacn_orig(hit_list)
  end
  out_obj.hits = hit_list
  out_obj[1].chomp! # computer
  out_obj
end
581
+
582
+ end
583
+
584
+
585
+
586
+ # deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
587
+ # deltacn - modified to be that of the next best hit (by xcorr) and the last
588
+ # hit takes 1.1. This is what is called deltacn by bioworks and pepprophet
589
+ # (at least for the first few years). If filtering occurs, it will be
590
+ # updated.
591
+ # deltacn_orig_updated - the latest updated value of deltacn.
592
+ # Originally, this will be equal to deltacn_orig. After filtering, this will
593
+ # be recalculated. To know if this will be different from deltacn_orig, query
594
+ # match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then
595
+ # deltacn should also be changed to reflect it.
596
+ # mh - the theoretical mass + h
597
+ # proteins are created as SRF prot objects with a reference and linked to their
598
+ # peptide_hits (from global hash by reference)
599
+ # ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
600
+ # This is calculated for the M+H mass!
601
+ # num_other_loci is the number of other loci that the peptide matches beyond
602
+ # the first one listed
603
+ # srf = the srf object this scan came from
604
Mspire::Sequest::Srf::Out::Peptide = Struct.new(:mh, :deltacn_orig, :sf, :sp, :xcorr, :id, :num_other_loci, :rsp, :ions_matched, :ions_total, :sequence, :proteins, :deltamass, :ppm, :aaseq, :base_name, :first_scan, :last_scan, :charge, :srf, :deltacn, :deltacn_orig_updated)
605
+ # 0=mh 1=deltacn_orig 2=sf 3=sp 4=xcorr 5=id 6=num_other_loci 7=rsp 8=ions_matched 9=ions_total 10=sequence 11=proteins 12=deltamass 13=ppm 14=aaseq 15=base_name 16=first_scan 17=last_scan 18=charge 19=srf 20=deltacn 21=deltacn_orig_updated
606
+
607
+ class Mspire::Sequest::Srf::Out::Peptide
608
+
609
# creates the deltacn that is meaningful for the top hit (the deltacn_orig
# of the second best hit and so on).
# assumes sorted
#
# Each hit receives the deltacn_orig of the hit following it and the last
# hit receives 1.1.  An empty array is left alone (returns nil) instead of
# raising NoMethodError.
def self.set_deltacn_from_deltacn_orig(ar)
  return if ar.empty?
  ar.each_cons(2) { |hit, next_hit| hit.deltacn = next_hit.deltacn_orig }
  ar.last.deltacn = 1.1
end
616
+
617
# (assumes sorted)
# recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn
#
# With d(i) = 1.0 - xcorr(i)/xcorr(top hit): hit i gets deltacn_orig_updated
# (index 21) = d(i) and the hit before it gets deltacn (index 20) = d(i).
# The top hit gets deltacn_orig_updated 0.0 and the last hit deltacn 1.1.
# Does nothing for an empty array.
def self.update_deltacns_from_xcorr(ar)
  return if ar.empty?

  best_xcorr = ar.first[4]
  deltas = ar.drop(1).map { |hit| 1.0 - (hit[4]/best_xcorr) }

  ar.first[21] = 0.0
  deltas.each_with_index do |delta, i|
    ar[i][20] = delta    # deltacn
    ar[i+1][21] = delta  # deltacn_orig_updated
  end
  ar.last[20] = 1.1
end
633
+
634
# Reads num_extra_references records from fh.  Each record is 8 bytes, the
# last 4 of which hold the 1-based number of the hit it belongs to, followed
# by 80 bytes of reference text.  A protein made from the first 38
# characters of the reference is appended to that hit's proteins (index 11).
def self.read_extra_references(fh, num_extra_references, pep_hits)
  num_extra_references.times do
    hit_number = fh.read(8).unpack('x4I').first
    reference = fh.read(80).unpack('A*').first
    pep_hits[hit_number - 1][11] << Mspire::Sequest::Srf::Out::Protein.new(reference[0,38])
  end
end
644
+
645
+ Unpack_35 = '@64Ex8ex8eeeIx18Ivx2vvx8Z*@246Z*'
646
+ # translation: @64=(64 bytes in to the record), E=mH, x8=8unknown bytes, e=deltacn,
647
+ # x8=8unknown bytes, e=sf, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, I=num_other_loci,
648
+ # v=rsp, x2=2 unknown bytes, v=ions_matched, v=ions_total, x8=8unknown bytes, Z*=sequence,
649
+ # @246Z*=at byte 246 grab the string (which is proteins).
650
+ #Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
651
+ Unpack_32 = '@64Ex8ex8eeeIx14Ivvvx8Z*@240Z*'
652
+ Unpack_four_null_bytes = 'a*'
653
+ Unpack_Zstar = 'Z*'
654
+ Read_35 = 426
655
+ Read_32 = 320
656
+
657
+ FourNullBytes_as_string = "\0\0\0\0"
658
+ #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
659
+ NewRecordStart = 0x01.chr + 0x00.chr
660
+ Sequest_record_start = "[SEQUEST]"
661
+
662
+ undef_method :inspect
663
# A compact one line description.  proteins is shown as a count, every other
# attribute with its inspect value, and the linked srf object (when set) is
# shown by its base_name only.
#
# (The former `v.is_a? Array` branch tested the attribute *name*, which is
# always a String, so it could never run and has been removed.)
def inspect
  st = %w(aaseq sequence mh deltacn_orig sf sp xcorr id rsp ions_matched ions_total proteins deltamass ppm base_name first_scan last_scan charge deltacn).map do |v|
    value = send(v.to_sym)
    if v == 'proteins'
      "#{v}(#)=#{value.size}"
    else
      "#{v}=#{value.inspect}"
    end
  end
  st.unshift("<#{self.class}")
  st.push("srf(base_name)=#{srf.base_name.inspect}") if srf
  st.push('>')
  st.join(' ')
end
681
# Reads one peptide hit record from fh (Read_35 or Read_32 bytes depending
# on unpack_35) and returns it as a new object.  For the 3.5 layout a
# further 6 bytes are consumed after the record.
def self.from_io(fh, unpack_35)
  ## read all the hit data
  record = fh.read(unpack_35 ? Read_35 : Read_32)

  # the unpack sets the attributes mh through proteins
  peptide = new(*record.unpack(unpack_35 ? Unpack_35 : Unpack_32))

  peptide.deltacn_orig_updated = peptide.deltacn_orig

  # we are slicing the reference to 38 chars to be the same length as
  # duplicate references
  peptide.proteins = [Mspire::Sequest::Srf::Out::Protein.new(peptide.proteins[0,38])]

  peptide.aaseq = Mspire::Ident::Peptide.sequence_to_aaseq(peptide.sequence)

  fh.read(6) if unpack_35

  peptide
end
704
+
705
+ end
706
+
707
# A protein referenced by a peptide hit; its id doubles as the reference
# string.
class Mspire::Sequest::Srf::Out::Protein < Mspire::Ident::Protein
  alias_method :reference, :id

  # the first whitespace delimited entry of the reference
  def first_entry
    reference.split(' ', 2).first
  end
end
715
+