mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,486 @@
1
+ require 'spec_id/sequest'
2
+
3
# Mixin with helpers for pulling fixed-width fields out of a binary file.
module BinaryReader
  # First element of a one-NUL string: an Integer on ruby 1.8, a one-character
  # String on 1.9+. Comparing against str[0] therefore works on both.
  Null_char = "\0"[0] ## change for ruby 1.9 or 2.0

  # Reads +bytes+ bytes from +fh+ (which must already be positioned at the
  # start of the field) and returns the field with trailing padding removed.
  #
  # Returns '' when the field starts with a NUL byte (an empty declaration)
  # or when the handle is already at end-of-file (read returns nil).
  def get_null_padded_string(fh,bytes)
    raw = fh.read(bytes)
    # nil means EOF; a leading NUL means the field was never filled in
    return '' if raw.nil? || raw[0] == Null_char
    # rstrip removes trailing NUL bytes as well as ordinary whitespace
    raw.rstrip
  end
end
17
+
18
# class to extract information from <file>_dta.log files
class DTALog
  # the first line beginning with " m" marks the start of the per-dta entries
  First_entry_re = /^ m/
  # captures: first scan, last scan, first dta filename, remainder of the line
  Scan_line_re = /scan: (\d+) - (\d+), Datafile: (.*?) (.*)/
  # a second dta filename that may follow on the same line
  Other_dta_re = /Datafile: (.*?) /

  # returns an array indexed by the dta file number (starting at 0)
  # each entry is an array [first_scan, last_scan, dta_filename_noext]
  # this is now obsolete since I found the scan # index at the end of the srf
  # files
  #
  # Raises EOFError (from readline) if the file has fewer than 14 lines.
  def self.dta_and_scans_by_dta_index(file)
    dta_index = []
    File.open(file) do |fh|
      # 10 preamble lines, the "scan range" line, then 3 more preamble lines
      14.times { fh.readline }
      in_entries = false
      fh.each do |line|
        unless in_entries
          # everything before the first " m" line is ignored
          next unless line =~ First_entry_re
          in_entries = true
        end
        append_scan_entries(dta_index, line)
      end
    end
    dta_index
  end

  # Appends one entry per dta file named on +line+ (at most two) to
  # +dta_index+; lines that do not look like scan lines are skipped.
  def self.append_scan_entries(dta_index, line)
    scan = Scan_line_re.match(line)
    return unless scan
    first_scan = scan[1].to_i
    last_scan = scan[2].to_i
    dta_index << [first_scan, last_scan, scan[3].sub(/\.dta/,'')]
    other = Other_dta_re.match(scan[4])
    dta_index << [first_scan, last_scan, other[1].sub(/\.dta/,'')] if other
  end
  private_class_method :append_scan_entries
end
75
+
76
# In-memory representation of an .srf file: the header, the dta and out
# records, the search parameters and the trailing scan index.
class SRF

  # a string 3.3 or 3.2
  attr_accessor :version

  # an SRF::Header
  attr_accessor :header
  # array of SRF::DTA records
  attr_accessor :dta_files
  # array of SRF::OUT records
  attr_accessor :out_files
  # whatever SpecID::Sequest::Params#parse_handle returns
  attr_accessor :params
  # a parallel array to dta_files and out_files where each entry is:
  # [first_scan, last_scan, charge]
  attr_accessor :index

  # Byte offset of the first dta record for the current version
  # (nil when the version is not '3.2' or '3.3').
  def dta_start_byte
    case @version
    when '3.2' then 3260
    when '3.3' then 3644
    end
  end

  # Creates an empty object, or reads +filename+ when one is given.
  def initialize(filename=nil)
    @dta_files = []
    @out_files = []
    from_file(filename) if filename
  end

  # Reads the whole srf file: header, dta records, out records, params and
  # scan index, in that order.
  # returns self
  def from_file(filename)
    File.open(filename, "rb") do |fh|
      @header = SRF::Header.new.from_handle(fh)
      @version = @header.version
      @dta_files = read_dta_files(fh,@header.num_dta_files)
      @out_files = read_out_files(fh,@header.num_dta_files)
      @params = SpecID::Sequest::Params.new.parse_handle(fh)
      fh.read(12)  ## gap between last params entry and index
      @index = read_scan_index(fh,@header.num_dta_files)
    end
    self
  end

  # returns an index where each entry is [first_scan, last_scan, charge]
  #
  # Each record is 24 bytes long; only the first three 32-bit integers are
  # decoded. They are read as little-endian ('V'), like the 'e' and 'v'
  # fields unpacked elsewhere in this file, instead of native-endian 'I'.
  def read_scan_index(fh, num)
    record_bytes = 24
    buffer = '0' * record_bytes  # reusable receive buffer for IO#read
    Array.new(num) do
      fh.read(record_bytes, buffer)
      buffer.unpack('VVV')
    end
  end

  # given a zero indexed list where each entry is [first_scan, last_scan,
  # dta_filename] updates the out info
  # returns self
  def update_out_scan_info_from_dta_log(dta_log)
    scan_info = DTALog.dta_and_scans_by_dta_index(dta_log)
    @out_files.each_with_index do |out, i|
      entry = scan_info[i]
      # The log may list fewer entries than there are out files. Assigning
      # nil to a three-element slice would shrink the record and move
      # +hits+ out of slot 7, so such records are left untouched.
      next unless entry
      out[4,3] = entry  # contingent on the slot layout of SRF::OUT
    end
    self
  end

  # Positions +fh+ at dta_start_byte and reads +num_files+ dta records.
  # returns an array of dta_files
  def read_dta_files(fh, num_files)
    start = dta_start_byte
    fh.pos = start unless fh.pos == start
    Array.new(num_files) { SRF::DTA.new.from_handle(fh) }
  end

  # filehandle (fh) must be at the start of the outfiles.  'read_dta_files'
  # will put the fh there.
  # Reads +number_files+ out records and returns them as an array.
  def read_out_files(fh,number_files)
    Array.new(number_files) { SRF::OUT.new.from_handle(fh) }
  end

end
171
+
172
# The fixed-position header at the start of an srf file: format version,
# dta generation parameters and a run of NUL-padded text fields.
class SRF::Header
  include BinaryReader

  # Byte offsets of the text fields. Each offset equals the previous offset
  # plus the previous Byte_length, i.e. they describe the layout that uses
  # the 840-byte modifications field. from_handle only seeks to :enzyme and
  # reads the remaining fields sequentially.
  Start_byte = {
    :enzyme => 438,
    :ion_series => 694,
    :model => 950,
    :modifications => 982,
    :raw_filename => 1822,
    :db_filename => 2082,
    :dta_log_filename => 2602,
    :params_filename => 3122,
    :sequest_log_filename => 3382,
  }
  # Width in bytes of each text field.
  Byte_length = {
    :enzyme => 256,
    :ion_series => 256,
    :model => 32,
    :modifications => 840,
    :raw_filename => 260,
    :db_filename => 520,
    :dta_log_filename => 520,
    :params_filename => 260,
    # original author's question: is this really 262 or should it be 260?
    # (with 262 the fields end at byte 3644, which is SRF#dta_start_byte
    # for version 3.3)
    :sequest_log_filename => 262,
  }
  # Widths that differ for version 3.2 files.
  Byte_length_v32 = {
    :modifications => 456,
  }
  # Order in which the text fields are laid out in the file.
  Field_order = [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename]

  # a string such as '3.2' or '3.3'
  attr_accessor :version
  # a SRF::DTAGen object
  attr_accessor :dta_gen
  attr_accessor :enzyme
  attr_accessor :ion_series
  attr_accessor :model
  attr_accessor :modifications
  attr_accessor :raw_filename
  attr_accessor :db_filename
  attr_accessor :dta_log_filename
  attr_accessor :params_filename
  attr_accessor :sequest_log_filename

  # Number of dta files, as recorded in the dta generation parameters.
  def num_dta_files
    @dta_gen.num_dta_files
  end

  # sets fh to 0 and grabs the information it wants
  # returns self
  def from_handle(fh)
    # The version lives in the first four bytes, so rewind explicitly
    # instead of trusting the caller to hand over a fresh handle.
    fh.pos = 0
    # little-endian 32-bit integer ('V'), consistent with the 'e'/'v'
    # directives used for the other records in this file
    @version = '3.' + fh.read(4).unpack('V').first.to_s
    @dta_gen = SRF::DTAGen.new.from_handle(fh)

    ## get the rest of the info
    byte_length = Byte_length.dup
    byte_length.merge!(Byte_length_v32) if @version == '3.2'

    fh.pos = Start_byte[:enzyme]
    Field_order.each do |param|
      send("#{param}=", get_null_padded_string(fh, byte_length[param]))
    end
    self
  end

end
236
+
237
# the DTA Generation Params
#
# Decoded from the first 148 bytes of an srf file.
class SRF::DTAGen

  ## not sure if this is correct
  attr_accessor :start_time
  # group scan (not sure if this is correct)
  attr_accessor :start_mass
  attr_accessor :end_mass
  attr_accessor :num_dta_files
  attr_accessor :group_scan
  ## not sure if this is correct
  attr_accessor :min_group_count
  attr_accessor :min_ion_threshold
  #attr_accessor :intensity_threshold # can't find yet
  #attr_accessor :precursor_tolerance # can't find yet
  attr_accessor :start_scan
  attr_accessor :end_scan

  # Rewinds +fh+ and fills every attribute from the first 148 bytes.
  # returns self
  def from_handle(fh)
    fh.pos = 0 if fh.pos != 0
    st = fh.read(148)
    # Layout (148 bytes in total): three little-endian floats ('e') at
    # offsets 36, 52 and 60, one 32-bit integer at offset 112 and five more
    # starting at offset 128. Integers are read as little-endian 'V' rather
    # than native-endian 'I' so they agree with the 'e' directives.
    (@start_time, @start_mass, @end_mass, @num_dta_files, @group_scan, @min_group_count, @min_ion_threshold, @start_scan, @end_scan) = st.unpack('x36ex12ex4ex48Vx12VVVVV')
    self
  end
end
263
+
264
# One dta record of an srf file, stored as an Array with named accessors
# for each slot.
class SRF::DTA < Array

  # is this universal?
  # (it equals SRF#dta_start_byte for version 3.3; that method uses 3260
  # for version 3.2)
  First_record_start_byte = 3644

  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
  # slot number of every named field
  ind_keys = {:mh => 0, :dta_tic => 1, :num_peaks => 2, :charge => 3, :ms_level => 4, :unknown => 5, :total_num_possible_charge_states => 6, :peaks => 7}
  @@arr_size = ind_keys.size
  # lookup used by initialize: accepts :name, :name=, 'name' and 'name='
  @@ind = {}
  ind_keys.each do |key, slot|
    [key, "#{key}=".to_sym].each do |name|
      @@ind[name] = slot
      @@ind[name.to_s] = slot
    end
  end

  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
  def dta_tic ; self[1] end ; def dta_tic=(oth) ; self[1] = oth end
  def num_peaks ; self[2] end ; def num_peaks=(oth) ; self[2] = oth end
  def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
  def ms_level ; self[4] end ; def ms_level=(oth) ; self[4] = oth end
  def unknown ; self[5] end ; def unknown=(oth) ; self[5] = oth end
  def total_num_possible_charge_states ; self[6] end ; def total_num_possible_charge_states=(oth) ; self[6] = oth end

  # this is a byte array of floats, you can get the peaks out with
  # unpack("e*")
  def peaks ; self[7] end
  # expects the same packed byte string that #peaks returns
  def peaks=(oth) ; self[7] = oth end

  # Creates a record with one nil slot per field. +args+ may be a Hash
  # keyed by field name; any other argument is ignored.
  def initialize(args=nil)
    # one slot per field (the field count itself, not Integer#size of it)
    super(@@arr_size)
    if args.is_a? Hash
      args.each do |k,v|
        self[@@ind[k]] = v
      end
    end
  end

  # Short description that reports the size of the peak data instead of
  # dumping the raw bytes.
  def inspect
    peaks_st = 'nil'
    if self[7] ; peaks_st = "[#{self[7].size} bytes]" end
    "<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
  end

  # Reads one record from +fh+, which must be positioned at its start.
  # returns self
  def from_handle(fh)
    st = fh.read(24)
    # get the bulk of the data in single unpack: a little-endian double,
    # float, 32-bit integer and four 16-bit integers fill slots 0-6
    self[0,7] = st.unpack("EeVvvvv")

    # Scan numbers possibly hidden in this next sequence of bytes (I think)
    fh.read(24)

    # the peak data is num_peaks * 8 bytes and is kept packed
    self[7] = fh.read(num_peaks * 8)
    self
  end

end
325
+
326
+
327
# One out record of an srf file, stored as an Array with named accessors
# for each slot.
class SRF::OUT < Array
  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
  # slot number of every named field
  ind_keys = {:num_hits => 0, :charge => 1, :computer => 2, :date_time => 3, :first_scan => 4, :last_scan => 5, :filename_noext => 6, :hits => 7}
  @@arr_size = ind_keys.size
  # lookup used by initialize: accepts :name, :name=, 'name' and 'name='
  @@ind = {}
  ind_keys.each do |key, slot|
    [key, "#{key}=".to_sym].each do |name|
      @@ind[name] = slot
      @@ind[name.to_s] = slot
    end
  end

  def num_hits ; self[0] end ; def num_hits=(oth) ; self[0] = oth end
  def charge ; self[1] end ; def charge=(oth) ; self[1] = oth end
  def computer ; self[2] end ; def computer=(oth) ; self[2] = oth end
  def date_time ; self[3] end ; def date_time=(oth) ; self[3] = oth end
  def first_scan ; self[4] end ; def first_scan=(oth) ; self[4] = oth end
  def last_scan ; self[5] end ; def last_scan=(oth) ; self[5] = oth end
  def filename_noext ; self[6] end ; def filename_noext=(oth) ; self[6] = oth end
  def hits ; self[7] end ; def hits=(oth) ; self[7] = oth end

  # Creates a record with one nil slot per field. +args+ may be a Hash
  # keyed by field name; any other argument is ignored.
  def initialize(args=nil)
    # one slot per field (the field count itself, not Integer#size of it)
    super(@@arr_size)
    if args.is_a? Hash
      args.each do |k,v|
        self[@@ind[k]] = v
      end
    end
  end

  # The scan and filename fields are shown only once first_scan is set.
  def inspect
    if first_scan
      ins = "@first_scan=#{first_scan}, @last_scan=#{last_scan}, @filename_noext=#{filename_noext}, "
    end
    "<SRF::OUT @num_hits=#{num_hits}, @charge=#{charge}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
  end

  # Reads one record and all of its hits from +fh+, which must be
  # positioned at the start of the record.
  # returns self
  def from_handle(fh)
    ## EMPTY out file is 96 bytes
    ## each hit is 320 bytes
    ## num_hits and charge:
    st = fh.read(96)
    # two 16-bit integers at offset 36, then NUL-terminated strings starting
    # at offsets 40 and 60 fill slots 0-3
    self[0,4] = st.unpack("@36vvZ*@60Z*")

    self[7] = Array.new(self[0]) { SRF::OUT::Hit.new.from_handle(fh) }

    self
  end

end
383
+
384
# One peptide hit inside an out record, stored as an Array with named
# accessors for each slot.
class SRF::OUT::Hit < Array
  FourNullBytes_as_string = "\0\0\0\0"
  #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
  NewRecordStart = 0x01.chr + 0x00.chr
  Sequest_record_start = "[SEQUEST]"

  ## mucky details.  This should be encapsulated into a class to inherit from, etc.
  # slot number of every named field (num_tot_proteins is the counter kept
  # in slot 10 by from_handle / read_extra_references)
  ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9, :num_tot_proteins => 10 }
  @@arr_size = ind_keys.size
  # lookup used by initialize: accepts :name, :name=, 'name' and 'name='
  # (without this table every Hash passed to new raised a TypeError)
  @@ind = {}
  ind_keys.each do |key, slot|
    [key, "#{key}=".to_sym].each do |name|
      @@ind[name] = slot
      @@ind[name.to_s] = slot
    end
  end

  def mh ; self[0] end ; def mh=(oth) ; self[0] = oth end
  def deltacn ; self[1] end ; def deltacn=(oth) ; self[1] = oth end
  def sp ; self[2] end ; def sp=(oth) ; self[2] = oth end
  def xcorr ; self[3] end ; def xcorr=(oth) ; self[3] = oth end
  def id ; self[4] end ; def id=(oth) ; self[4] = oth end
  def rsp ; self[5] end ; def rsp=(oth) ; self[5] = oth end
  def ions_matched ; self[6] end ; def ions_matched=(oth) ; self[6] = oth end
  def ions_total ; self[7] end ; def ions_total=(oth) ; self[7] = oth end
  def peptide ; self[8] end ; def peptide=(oth) ; self[8] = oth end
  def reference ; self[9] end ; def reference=(oth) ; self[9] = oth end
  # The number of total proteins sharing this peptide
  def num_tot_proteins ; self[10] end ; def num_tot_proteins=(oth) ; self[10] = oth end

  # Creates a hit with one nil slot per field. +args+ may be a Hash keyed
  # by field name; any other argument is ignored.
  def initialize(args=nil)
    # one slot per field (the field count itself, not Integer#size of it)
    super(@@arr_size)
    if args.is_a? Hash
      args.each do |k,v|
        self[@@ind[k]] = v
      end
    end
  end

  def inspect
    "<SRF::OUT::Hit @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @peptide=#{peptide}, @reference=#{reference}, @num_tot_proteins=#{num_tot_proteins}>"
  end

  ## There must be a better way to do this.
  ## We are checking that there are no additional protein references only
  ## so that we are in register for the next reading
  #
  # Consumes any extra protein-reference blocks that follow the hit,
  # incrementing num_tot_proteins once per block, and leaves +fh+ at the
  # first byte that does not belong to such a block. Always returns nil.
  # Iterates instead of recursing so a long run of references cannot
  # exhaust the stack.
  def read_extra_references(fh)
    loop do
      record_start = fh.pos
      # still published for backward compatibility with code that inspects
      # this global; the logic below relies only on the local variable
      $SRF_OUT_HIT_FH_POS = record_start

      lead = fh.read(4)
      ## if we see 0000 0000 we are done (also stop at end of file)
      if lead.nil? || lead.size < 4 || lead == FourNullBytes_as_string
        fh.pos = record_start
        return nil
      end

      ## NOTE: in context of 4 bytes read above!
      ## bytes 38-39 of the candidate block mark the start of a new record
      probe = fh.read(36)
      if probe.nil? || probe.size < 36 || probe[34,2] == NewRecordStart
        fh.pos = record_start
        return nil
      end

      # is this the end of the outfiles?
      ## BACK to beginning of this section
      fh.pos = record_start
      if fh.read(9) == Sequest_record_start
        fh.pos = record_start
        return nil
      end

      ## we have extra references
      self[10] += 1
      # skip the rest of the block (9 bytes were consumed by the check above)
      fh.read(79)
    end
  end

  # Reads one 320-byte hit from +fh+ and then any extra protein references
  # that follow it.
  # returns self
  def from_handle(fh)
    ## get the first part of the info
    st = fh.read(320) ## read all the hit data
    # Slots 0-9: a little-endian double at offset 64, three floats, a 32-bit
    # integer, three 16-bit integers and two NUL-terminated strings. The
    # integer is read as little-endian 'V' rather than native-endian 'I'.
    self[0,10] = st.unpack('@64Ex8ex12eeVx18vvvx8Z*@240Z*')
    # the hit's own reference counts as the first protein
    self[10] = 1
    read_extra_references(fh)
    self
  end

end
486
+