mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,486 @@
1
+ require 'spec_id/sequest'
2
+
3
# Mixin with helpers for pulling fixed-width fields out of a binary stream.
module BinaryReader
  # First element of a one-character null string.  Indexing a String yields
  # an Integer on ruby 1.8 and a one-character String on 1.9+; because the
  # field is indexed the same way below, the comparison holds on both.
  Null_char = "\0"[0]

  # Reads a fixed-width, null-padded string field.
  #
  # fh::    an IO-like object already positioned at the start of the field
  # bytes:: the width of the field in bytes
  #
  # Returns the field with trailing padding/whitespace stripped.  Returns ''
  # when the field starts with a null byte (an empty declaration) or when
  # the handle is already at end-of-file (fh.read returns nil there).
  # The handle is advanced by however many bytes fh.read consumed.
  def get_null_padded_string(fh, bytes)
    raw = fh.read(bytes)
    return '' if raw.nil? || raw[0] == Null_char
    raw.rstrip!
    raw
  end
end
17
+
18
# Extracts scan information from <file>_dta.log files.
class DTALog
  # A line starting like this marks where the per-dta entries begin.
  First_entry_re = /^ m/
  # Captures: first scan, last scan, first dta filename, rest of the line.
  Scan_line_re = /scan: (\d+) - (\d+), Datafile: (.*?) (.*)/
  # A second dta file named on the same line (it shares the scan range).
  Other_dta_re = /Datafile: (.*?) /
  Dta_ext_re = /\.dta/

  # Returns an array indexed by the dta file number (starting at 0) where
  # each entry is [first_scan, last_scan, dta_filename_noext].
  #
  # This is now obsolete: the same scan information is available from the
  # index at the end of the srf file.
  #
  # file:: path to the _dta.log file
  #
  # Raises EOFError (from IO#readline) if the file has fewer than the 14
  # header lines that are skipped before parsing starts.
  def self.dta_and_scans_by_dta_index(file)
    dta_index = nil
    File.open(file) do |fh|
      10.times { fh.readline }
      dta_index =
        if fh.readline =~ /scan range\s+= \d+ - (\d+)/
          # an overestimate (MS scans have no dta), trimmed by compact! below
          Array.new($1.to_i)
        else
          []
        end
      3.times { fh.readline }

      dta_cnt = 0
      # Skip ahead to the first line that looks like an entry; it is parsed
      # as a scan line if it is one, and is consumed either way.
      fh.each do |line|
        next unless line =~ First_entry_re
        dta_cnt = record_scan_line(line, dta_index, dta_cnt)
        break
      end
      # Every remaining line is a candidate scan line.
      fh.each do |line|
        dta_cnt = record_scan_line(line, dta_index, dta_cnt)
      end
    end
    dta_index.compact! # remove the unused, preallocated trailing nils
    dta_index
  end

  # Stores the one or two dta entries found on +line+ into +dta_index+
  # starting at slot +dta_cnt+ and returns the next free slot number
  # (+dta_cnt+ unchanged when the line is not a scan line).
  def self.record_scan_line(line, dta_index, dta_cnt)
    scan = Scan_line_re.match(line)
    return dta_cnt unless scan

    first_scan = scan[1].to_i
    last_scan = scan[2].to_i
    dta_index[dta_cnt] = [first_scan, last_scan, scan[3].sub(Dta_ext_re, '')]
    dta_cnt += 1

    other = Other_dta_re.match(scan[4])
    if other
      dta_index[dta_cnt] = [first_scan, last_scan, other[1].sub(Dta_ext_re, '')]
      dta_cnt += 1
    end
    dta_cnt
  end
  private_class_method :record_scan_line
end
75
+
76
# In-memory representation of a sequest .srf results file: a header, one
# dta record and one out record per spectrum, the search params and a scan
# index.
class SRF

  # a string 3.3 or 3.2
  attr_accessor :version

  # the SRF::Header read from the start of the file
  attr_accessor :header
  # array of SRF::DTA objects
  attr_accessor :dta_files
  # array of SRF::OUT objects
  attr_accessor :out_files
  # whatever SpecID::Sequest::Params#parse_handle returns
  attr_accessor :params
  # a parallel array to dta_files and out_files where each entry is:
  # [first_scan, last_scan, charge]
  attr_accessor :index

  # Byte offset of the first dta record for the current version.
  # Returns nil for any version other than '3.2' or '3.3'.
  def dta_start_byte
    case @version
    when '3.2' then 3260
    when '3.3' then 3644
    end
  end

  # filename:: optional path to an srf file; when given it is read
  #            immediately (see from_file)
  def initialize(filename=nil)
    @dta_files = []
    @out_files = []
    from_file(filename) if filename
  end

  # Reads every section of the srf file at +filename+ into this object.
  # returns self
  def from_file(filename)
    File.open(filename, "rb") do |fh|
      @header = SRF::Header.new.from_handle(fh)
      @version = @header.version
      num_files = @header.num_dta_files
      @dta_files = read_dta_files(fh, num_files)
      @out_files = read_out_files(fh, num_files)
      @params = SpecID::Sequest::Params.new.parse_handle(fh)
      fh.read(12) ## gap between last params entry and index
      @index = read_scan_index(fh, num_files)
    end
    self
  end

  # returns an index where each entry is [first_scan, last_scan, charge]
  #
  # fh::  handle positioned at the first index record
  # num:: number of records to read
  #
  # Each record occupies 24 bytes, of which the first three native unsigned
  # ints are decoded.
  def read_scan_index(fh, num)
    ind_len = 24
    buffer = '0' * ind_len ## reusable 24 byte string to receive data
    Array.new(num) do
      fh.read(ind_len, buffer)
      buffer.unpack('III')
    end
  end

  # Overwrites first_scan, last_scan and filename_noext (slots 4..6) of each
  # out file with the entry at the same position in the dta log.
  #
  # dta_log:: path to the <file>_dta.log file
  #
  # Out files with no corresponding log entry are left untouched (assigning
  # nil to the three-slot slice would collapse it into a single element and
  # shift the hits slot).
  # returns self
  def update_out_scan_info_from_dta_log(dta_log)
    index = DTALog.dta_and_scans_by_dta_index(dta_log)
    @out_files.each_with_index do |ot, i|
      entry = index[i]
      next unless entry
      ot[4,3] = entry # contingent on the slot layout of SRF::OUT
    end
    self
  end

  # returns an array of dta_files
  #
  # fh::        handle on the srf file; it is moved to dta_start_byte if it
  #             is not already there (so @version must be set first)
  # num_files:: number of dta records to read
  def read_dta_files(fh, num_files)
    start = dta_start_byte
    fh.pos = start unless fh.pos == start
    Array.new(num_files) { SRF::DTA.new.from_handle(fh) }
  end

  # filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
  # will put the fh there.
  #
  # number_files:: number of out records to read
  # returns an array of SRF::OUT objects
  def read_out_files(fh, number_files)
    Array.new(number_files) { SRF::OUT.new.from_handle(fh) }
  end

end
171
+
172
# The fixed-layout header at the start of an srf file.
class SRF::Header
  include BinaryReader

  # Absolute byte offset of each string field.  Only :enzyme is used for
  # seeking; the remaining fields are read back to back from there.
  Start_byte = {
    :enzyme => 438,
    :ion_series => 694,
    :model => 950,
    :modifications => 982,
    :raw_filename => 1822,
    :db_filename => 2082,
    :dta_log_filename => 2602,
    :params_filename => 3122,
    :sequest_log_filename => 3382,
  }
  # Width in bytes of each string field.
  Byte_length = {
    :enzyme => 256,
    :ion_series => 256,
    :model => 32,
    :modifications => 840,
    :raw_filename => 260,
    :db_filename => 520,
    :dta_log_filename => 520,
    :params_filename => 260,
    # is this really 262?? or should be 260??  (3382 + 262 = 3644, which is
    # the offset SRF#dta_start_byte uses for version 3.3)
    :sequest_log_filename => 262,
  }
  # Field widths that differ in version 3.2 files.
  Byte_length_v32 = {
    :modifications => 456,
  }

  # String fields in the order they are laid out in the file.
  Field_order = [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename]

  # a string: '3.' followed by the integer stored in the first four bytes
  attr_accessor :version
  # a SRF::DTAGen object
  attr_accessor :dta_gen
  attr_accessor :enzyme
  attr_accessor :ion_series
  attr_accessor :model
  attr_accessor :modifications
  attr_accessor :raw_filename
  attr_accessor :db_filename
  attr_accessor :dta_log_filename
  attr_accessor :params_filename
  attr_accessor :sequest_log_filename

  # The number of dta files, as recorded in the DTA generation params.
  def num_dta_files
    @dta_gen.num_dta_files
  end

  # sets fh to 0 and grabs the information it wants
  #
  # fh:: a seekable handle on the srf file (opened in binary mode)
  #
  # On return the handle sits just past the last string field.
  # returns self
  def from_handle(fh)
    # The version lives in the first four bytes, so rewind before reading
    # rather than trusting the caller to pass a fresh handle.
    fh.pos = 0 unless fh.pos == 0
    @version = '3.' + fh.read(4).unpack('I').first.to_s
    @dta_gen = SRF::DTAGen.new.from_handle(fh)

    ## get the rest of the info
    byte_length = Byte_length.dup
    byte_length.merge!(Byte_length_v32) if @version == '3.2'

    fh.pos = Start_byte[:enzyme]
    Field_order.each do |param|
      send("#{param}=", get_null_padded_string(fh, byte_length[param]))
    end
    self
  end

end
236
+
237
# The DTA generation parameters stored in the first 148 bytes of an srf
# file.
#
# NOTE: the original author marked the meaning of start_time, group_scan
# and min_group_count as uncertain; intensity_threshold and
# precursor_tolerance have not been located in the file yet.
class SRF::DTAGen

  attr_accessor :start_time, :start_mass, :end_mass
  attr_accessor :num_dta_files
  attr_accessor :group_scan, :min_group_count, :min_ion_threshold
  attr_accessor :start_scan, :end_scan

  # Layout of the 148 byte block: three little-endian floats followed by
  # six native unsigned ints, separated by skipped regions.
  Unpack_format = 'x36ex12ex4ex48Ix12IIIII'

  # Rewinds +fh+ to the start of the file if needed and fills in all the
  # attributes from the first 148 bytes.
  # returns self
  def from_handle(fh)
    fh.pos = 0 unless fh.pos == 0
    values = fh.read(148).unpack(Unpack_format)
    @start_time, @start_mass, @end_mass,
      @num_dta_files,
      @group_scan, @min_group_count, @min_ion_threshold,
      @start_scan, @end_scan = values
    self
  end
end
263
+
264
# One dta (spectrum) record of an srf file, stored as an Array whose slots
# are exposed through named accessors.
class SRF::DTA < Array

  # is this universal?
  First_record_start_byte = 3644

  ## mucky details. This should be encapsulated into a class to inherit from, etc.
  # slot number for each named field
  ind_keys = {:mh => 0, :dta_tic => 1, :num_peaks => 2, :charge => 3, :ms_level => 4, :unknown => 5, :total_num_possible_charge_states => 6, :peaks => 7}
  @@arr_size = ind_keys.size

  # Lookup used by initialize: each field is reachable by its name or its
  # setter name, given as a Symbol or a String (:mh, 'mh', :mh=, 'mh=').
  @@ind = {}
  ind_keys.each do |name, slot|
    setter = "#{name}=".to_sym
    [name, setter].each do |key|
      @@ind[key] = slot
      @@ind[key.to_s] = slot
    end
    # reader and writer for the slot.  Note that peaks is a byte string of
    # floats; you can get the values out with unpack("e*")
    define_method(name) { self[slot] }
    define_method(setter) { |oth| self[slot] = oth }
  end

  # args:: optional Hash of field name => value (see the key forms above);
  #        any non-Hash argument is ignored.
  def initialize(args=nil)
    # one nil slot per field (Integer#size would give the byte width of the
    # integer instead of the field count)
    super(@@arr_size)
    return unless args.is_a?(Hash)
    args.each do |k, v|
      self[@@ind[k]] = v
    end
  end

  # Short description that reports the size of the peak data instead of
  # dumping the raw bytes.
  def inspect
    peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
    "<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
  end

  # Reads one record from +fh+, which must be positioned at its start.
  # returns self
  def from_handle(fh)
    # get the bulk of the data in single unpack (fills slots 0..6)
    self[0,7] = fh.read(24).unpack("EeIvvvv")

    # Scan numbers possibly hidden in this next sequence of bytes (I think);
    # currently read only to stay in register
    fh.read(24)

    # 8 bytes per peak, kept packed until someone needs them
    self[7] = fh.read(num_peaks * 8)
    self
  end

end
325
+
326
+
327
# One out (search result) record of an srf file, stored as an Array whose
# slots are exposed through named accessors.
class SRF::OUT < Array
  ## mucky details. This should be encapsulated into a class to inherit from, etc.
  # slot number for each named field
  ind_keys = {:num_hits => 0, :charge => 1, :computer => 2, :date_time => 3, :first_scan => 4, :last_scan => 5, :filename_noext => 6, :hits => 7}
  @@arr_size = ind_keys.size

  # Lookup used by initialize: each field is reachable by its name or its
  # setter name, given as a Symbol or a String.
  @@ind = {}
  ind_keys.each do |name, slot|
    setter = "#{name}=".to_sym
    [name, setter].each do |key|
      @@ind[key] = slot
      @@ind[key.to_s] = slot
    end
    define_method(name) { self[slot] }
    define_method(setter) { |oth| self[slot] = oth }
  end

  # args:: optional Hash of field name => value; any non-Hash argument is
  #        ignored.
  def initialize(args=nil)
    # one nil slot per field (Integer#size would give the byte width of the
    # integer instead of the field count)
    super(@@arr_size)
    return unless args.is_a?(Hash)
    args.each do |k, v|
      self[@@ind[k]] = v
    end
  end

  # The scan/filename fields are only shown once first_scan has been set.
  def inspect
    if first_scan
      ins = "@first_scan=#{first_scan}, @last_scan=#{last_scan}, @filename_noext=#{filename_noext}, "
    end
    "<SRF::OUT @num_hits=#{num_hits}, @charge=#{charge}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
  end

  # Reads one record and all of its hits from +fh+, which must be
  # positioned at the start of the record.
  # returns self
  def from_handle(fh)
    ## EMPTY out file is 96 bytes
    ## each hit is 320 bytes
    ## num_hits and charge, then the computer and date_time strings:
    self[0,4] = fh.read(96).unpack("@36vvZ*@60Z*")

    self[7] = Array.new(num_hits) { SRF::OUT::Hit.new.from_handle(fh) }
    self
  end

end
383
+
384
# A single peptide hit inside an out record, stored as an Array whose slots
# are exposed through named accessors.
class SRF::OUT::Hit < Array
  FourNullBytes_as_string = "\0\0\0\0"
  NewRecordStart = 0x01.chr + 0x00.chr
  Sequest_record_start = "[SEQUEST]"

  ## mucky details. This should be encapsulated into a class to inherit from, etc.
  # slot number for each field decoded from the 320 byte hit record
  ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
  @@arr_size = ind_keys.size

  # Lookup used by initialize: each field is reachable by its name or its
  # setter name, given as a Symbol or a String.  num_tot_proteins (slot 10)
  # is derived while reading rather than decoded from the record, but it
  # can be set through the same lookup.
  @@ind = {}
  ind_keys.merge(:num_tot_proteins => 10).each do |name, slot|
    setter = "#{name}=".to_sym
    [name, setter].each do |key|
      @@ind[key] = slot
      @@ind[key.to_s] = slot
    end
    # num_tot_proteins is the number of total proteins sharing this peptide
    define_method(name) { self[slot] }
    define_method(setter) { |oth| self[slot] = oth }
  end

  # args:: optional Hash of field name => value; any non-Hash argument is
  #        ignored.
  def initialize(args=nil)
    # one nil slot per decoded field (Integer#size would give the byte width
    # of the integer instead of the field count)
    super(@@arr_size)
    return unless args.is_a?(Hash)
    args.each do |k, v|
      self[@@ind[k]] = v
    end
  end

  def inspect
    "<SRF::OUT::Hit @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @peptide=#{peptide}, @reference=#{reference}, @num_tot_proteins=#{num_tot_proteins}>"
  end

  ## There must be a better way to do this.
  ## We are checking that there are no additional protein references only
  ## so that we are in register for the next reading
  #
  # Starting at the current position of +fh+, consumes extra protein
  # reference records (88 bytes each), adding one to num_tot_proteins for
  # each.  Stops, with the handle put back at the start of the bytes that
  # were being examined, when those bytes look like something else:
  #
  # * four null bytes,
  # * NewRecordStart at offset 38, or
  # * the "[SEQUEST]" marker (the end of the out files), or
  # * the end of the file.
  #
  # Always returns nil.
  def read_extra_references(fh)
    loop do
      record_start = fh.pos

      leading = fh.read(4)
      ## if we see 0000 0000 we are done
      if leading.nil? || leading == FourNullBytes_as_string
        fh.pos = record_start
        return nil
      end

      ## NOTE: in context of the 4 bytes read above!
      following = fh.read(36)
      if following.nil? || following[34,2] == NewRecordStart
        fh.pos = record_start
        return nil
      end

      # is this the end of the outfiles?
      ## BACK to beginning of this section
      fh.pos = record_start
      if fh.read(9) == Sequest_record_start
        fh.pos = record_start
        return nil
      end

      ## we have extra references: count it and skip the rest of the record
      self[10] += 1
      fh.read(79)
    end
  end

  # Reads one 320 byte hit record from +fh+ followed by any extra protein
  # references.
  # returns self
  def from_handle(fh)
    ## get the first part of the info
    st = fh.read(320) ## read all the hit data
    self[0,10] = st.unpack('@64Ex8ex12eeIx18vvvx8Z*@240Z*')
    # the reference decoded above is the first protein
    self[10] = 1
    read_extra_references(fh)
    self
  end

end
486
+