mspire 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
@@ -0,0 +1,794 @@
1
+
2
+ require 'spec_id'
3
+ require 'optparse'
4
+ require 'ostruct'
5
+ require 'spec_id/aa_freqs'
6
+ require 'shuffle'
7
+ require 'vec'
8
+ require 'table'
9
+
10
+
11
+ ########################################################
12
+ WRITE_CYS_FIND = false
13
+ ########################################################
14
+
15
+
16
+ module SpecID
17
+ attr_accessor :orig_peps, :passed_peps, :passed_prots
18
+ # The filename passed in for filtering
19
+ attr_accessor :passed_in_filename
20
+
21
+ # returns the top peptide hits per file dta (first_scan + charge)
22
+ # all hits with same score as top score are returned
23
+ # assumes that all fields are strings...
24
+ # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
25
+ # deletes the protein array (but not relevant proteins)
26
+ # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
27
+ # sets the @orig_peps attribute to those passing
28
def top_peps_prefilter!
  # Bioworks peptides hold text fields: coerce the ones used for filtering
  # into numeric types. (Srf peptides need no transformation.)
  if peps.first.is_a? Bioworks::Pep
    peps.each do |hit|
      hit.xcorr = hit.xcorr.to_f
      hit.deltacn = hit.deltacn.to_f
      hit.deltamass = hit.deltamass.to_f
      hit.mass = hit.mass.to_f
      hit.charge = hit.charge.to_i
      hit.first_scan = hit.first_scan.to_i
    end
  end

  # Group hits by [base_name, first_scan, charge] (equivalent to one .out
  # file) and keep every hit tied for the best xcorr within its group.
  winners = []
  grouped = self.peps.hash_by { |hit| [hit.base_name, hit.first_scan, hit.charge] }
  grouped.values.each do |group|
    ranked = group.sort_by { |hit| hit.xcorr }.reverse
    best_score = ranked.first.xcorr
    winners.concat(ranked.take_while { |hit| hit.xcorr == best_score })
  end
  @orig_peps = winners
end
57
+
58
+ # (xcorr1, xcorr2, xcorr3, deltacn, ppm)
59
+ # interface very unstable. For now, keeping it very loose...
60
+ # assumed that peptide xcorr, deltacn, deltamass, mass, ppm are Floats
61
+ # assumed that peptide charge is Integer
62
+ # returns peps_passed
63
+ # must respond to 'peps'
64
+ # DOES NOT UPDATE the prot.peps attribute!!
65
def filter_sequest(args, include_deltacnstar=false)
  min_x1, min_x2, min_x3, min_deltacn, max_ppm = args
  self.peps.select do |hit|
    dcn = hit.deltacn
    z = hit.charge

    # deltacn floor first; the remaining checks are skipped when it fails
    next false unless dcn >= min_deltacn

    # per-charge xcorr floor; charges other than 1, 2 or 3 never pass
    xcorr_ok = (z == 1 && hit.xcorr >= min_x1) ||
               (z == 2 && hit.xcorr >= min_x2) ||
               (z == 3 && hit.xcorr >= min_x3)
    next false unless xcorr_ok
    next false unless hit.ppm <= max_ppm

    # deltacn above 1.0 marks the "deltacn star" hits (the lowest score is
    # often assigned 1.10 in bioworks); they are kept only on request
    include_deltacnstar || !(dcn > 1.0)
  end
end
87
+
88
+
89
+ # given some list of SpecID::Pep based objects, finds the list of proteins
90
+ # associated with those peptides
91
+ # update_prot_peps => when true, updates prot.peps attribute given the list
92
+ # of pephits
93
+ # kind =
94
+ # :no_update (current proteins are returned, but their peps attribute
95
+ # is not updated)
96
+ # :update (current proteins returned with peps attribute updated)
97
+ # :new (new proteins are created complete with peps attribute)
98
def self.passing_proteins(pephits, kind=:no_update)
  saved_prots = []

  case kind
  when :new
    # Swap each hit's proteins for fresh copies (one per reference, with an
    # empty peps list), remembering the originals so they can be restored.
    fresh = {}
    pephits.each_with_index do |hit, idx|
      saved_prots[idx] = hit.prots
      hit.prots = hit.prots.map do |prot|
        fresh.fetch(prot.reference) do
          copy = prot.dup
          copy.peps = []
          fresh[copy.reference] = copy
        end
      end
    end
  when :update
    # Start the existing proteins from a clean peps list.
    pephits.each do |hit|
      hit.prots.each { |prot| prot.peps = [] }
    end
  end

  collect_peps = (kind == :update || kind == :new)
  found = {}
  pephits.each do |hit|
    hit.prots.each do |prot|
      found[prot.reference] = prot
      prot.peps << hit if collect_peps
    end
  end

  # Put the original protein lists back on the hits.
  if kind == :new
    pephits.each_with_index { |hit, idx| hit.prots = saved_prots[idx] }
  end

  found.values
end
149
+ end
150
+
151
+
152
+ class SpecID::Filter
153
+
154
+ NUM_PROT_FPPR_ITERATIONS = 10
155
+
156
# Convenience entry point: builds an instance and hands it the argument list.
def self.run_from_argv(argv)
  new.run_from_argv(argv)
end
160
+
161
# Parses argv, loads (and prefilters) every given file, assembles the
# false-positive estimation inputs (decoys, cysteine frequencies, known true
# positives) and then runs one or more filtering rounds:
#   * one round per line of --filters_file, or
#   * an interactive prompt loop with -i, or
#   * a single round with the command line thresholds.
# All output goes through #out, so it is mirrored to the --log file if given.
# Returns nil.
def run_from_argv(argv)
  reply = get_options(argv)
  return unless reply
  files, opt = reply

  $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE
  spec_ids = files.map { |file| file_to_prefiltered_spec_id(file, opt) }

  ## the options hash
  hash = {}
  if opt.cys
    # second element is the optional background frequency (defaults to 0.0)
    opt.cys[1] = opt.cys[1] ? opt.cys[1].to_f : 0.0
    hash[:cys] = opt.cys
  end

  hash[:tps] =
    if opt.tps
      Fasta.new.read_file(opt.tps).prots.map { |prot| prot.aaseq.chomp }
    end

  hash[:dcy] =
    if opt.false
      new_spec_ids = []
      prefixes_or_files = SpecID.extend_args(opt.false, files.size)
      false_spec_ids = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file|
        if File.exist? prefix_or_file
          # decoy hits live in their own file
          new_spec_ids << spec_id
          file_to_prefiltered_spec_id(prefix_or_file, opt)
        else
          # decoy hits are mixed in and recognized by prefix: split them out
          (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
          fps_specid = spec_id.class.new
          tps_specid = spec_id.class.new

          fps_specid.peps = fps
          tps_specid.peps = tps
          new_spec_ids << tps_specid
          fps_specid
        end
      end
      spec_ids = new_spec_ids
      false_spec_ids
    end

  defaults = {
    :dcy => nil, # { spec_id => false_spec_id }
    :cys => nil, # [cys_background_freq, cys_containing_freq]
    :tps => nil,
    :tmm => nil,
    :occams_razor => opt.occams_razor,
  }
  args = defaults.merge hash

  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm]

  # only the estimation methods that actually received input are reported
  @fppr_methods = [:tmm, :tps, :cys, :dcy].select { |x| args[x] }
  @groups_reporting = [:pephits, :aaseq, :prothits]
  @groups_reporting.push( :occams_razor ) if args[:occams_razor]

  @cat_labels = {
    :pephits => 'pep_hits',
    :prothits => 'prot_hits',
    :aaseq => 'uniq_aa_hits',
    :occams_razor => 'occams_prot_hits',
  }

  @logfh = opt.log ? File.open(opt.log, 'w') : nil
  begin
    # PRINT FILTER LEGEND
    out filter_legend(@fppr_methods)

    if opt.filters_file
      IO.readlines(opt.filters_file).each do |line|
        line.chomp!
        answer = prep_reply(line, base_args)
        next if answer == false
        base_args = answer
        filter_round(spec_ids, base_args, args)
      end
    elsif opt.i
      ## CLEAR ARGV (since otherwise, gets reads it!)
      ARGV.clear
      out interactive_help
      input_finished = false
      until input_finished
        b = base_args
        out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}"
        loop do
          line = gets
          if line.nil?
            # end of input: leave the prompt loop instead of failing on nil
            input_finished = true
            break
          end
          answer = prep_reply(line.chomp, base_args)
          if answer == false
            out interactive_help
          else
            base_args = answer
            filter_round(spec_ids, base_args, args)
            break
          end
        end
      end
    else
      filter_round(spec_ids, base_args, args)
    end
  ensure
    # runs on errors and on 'q' (exit) too, so the log is always closed
    @logfh.close if @logfh
  end
  nil
end
292
+
293
# Prints string to stdout and mirrors it to the log file when one is open.
def out(string)
  puts string
  @logfh.puts string if @logfh
end
299
+
300
+ # takes a fasta file or a string ( to be cast as a float )
301
def get_cys_freq(arg)
  # anything that is not an existing file is read as a literal frequency
  return arg.to_f unless File.exist? arg
  SpecID::AAFreqs.new(arg).aafreqs[:C]
end
308
+
309
+ # prints shortened number for display
310
def short(num)
  # three decimal places, returned as a String
  format("%.3f", num)
end
313
+
314
+ # if good arguments, returns [files_array, options]
315
+ # else prints an error argument and returns nil
316
def get_options(argv)
  # parse a copy so the caller's array is left untouched
  remaining = argv.dup

  # defaults
  opt = OpenStruct.new(:x1 => 1.0, :x2 => 1.5, :x3 => 2.0, :c => 0.1, :ppm => 1000.0, :false => false)

  parser = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <bioworks.xml | bioworks.srg>"
    op.separator("prints number of peptides/proteins ID'd at given thresholds")
    op.separator "only top hit (by xcorr) per scan+charge is considered"
    op.separator ""
    op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") do |val|
      opt.x1 = val
    end
    op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") do |val|
      opt.x2 = val
    end
    op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") do |val|
      opt.x3 = val
    end
    op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") do |val|
      opt.c = val
    end
    op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") do |val|
      opt.ppm = val
    end
    op.separator " if bioworks.xml, = 10^6deltamass/mass"
    op.on("-i", "--interactive", "interactive filtering") do |val|
      opt.i = val
    end
    op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") do |val|
      opt.false = val
    end
    op.separator(" last given will apply to remaining files")
    op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |val|
      # first element is a fasta file or a literal frequency
      val[0] = get_cys_freq(val[0])
      opt.cys = val
    end
    op.separator(" freq = freq of cysteine as amino acid")
    op.separator(" [bkg] = freq of cys containing peps d: 0.0")
    op.on("--filters_file <file>", "(no -i) file with list of interactive input") do |val|
      opt.filters_file = val
    end
    op.on("-t", "--tps <fasta>", "fasta file containing true hits") do |val|
      opt.tps = val
    end
    # note: --yaml is stored under opt.tabulate
    op.on("--yaml", "spits out yaml-ized data") do |val|
      opt.tabulate = val
    end
    op.on("--combined_score", "shows the combined score") do |val|
      opt.combined_score = val
    end
    op.on("--marshal", "will write marshaled data or read existing") do |val|
      opt.marshal = val
    end
    op.on("--log <file>", "also writes all output to file") do |val|
      opt.log = val
    end
    op.on("--protein_summary", "writes passing proteins to .summary.html files") do |val|
      opt.protein_summary = val
    end
    op.on("-z", "--occams_razor", "will show minimal set of proteins") do |val|
      opt.occams_razor = val
    end
  end

  parser.parse!(remaining)

  # at least one input file is required; otherwise show the usage text
  if remaining.size < 1
    puts parser
    return nil
  end

  [remaining, opt]
end
370
+
371
+ # (actual # with cys, expected # with cys, total#peptides,
372
+ # mean_fraction_of_cysteines_true, std)
373
+ # PepHit(C) = Peptide containing cysteine
374
+ # # Total PepHit(C) # Observed Bad Pep (C)
375
+ # ------------------ proportional_to ----------------------
376
+ # # Total PepHit # Total Bad PepHit (X)
377
+ # returns the fppr and the total number false
378
# Estimates the false positive rate from the excess of cysteine-containing
# peptide hits.
#   ac_num_with_cys        - observed number of hits containing cysteine
#   exp_num_with_cys       - expected number of hits containing cysteine
#   total_peptides         - total number of peptide hits
#   mean_fraction_true_cys - optional fraction of the expected cysteine hits
#                            assumed to be true positives; that share is
#                            subtracted from the observed count
#   std_fraction_true_cys  - accepted but not used
# Returns [fppr, total_number_false]. With no peptides at all there is
# nothing to estimate, so [0.0, 0.0] is returned rather than NaN (which
# cannot be rounded to an integer peptide count afterwards).
def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  return [0.0, 0.0] if total_peptides == 0

  # the number of bona fide BAD cysteine hits
  # (some of the cysteine hits are true positives)
  ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
  ac_num_with_cys = 0.0 if ac_num_with_cys < 0.0

  total_number_false = (ac_num_with_cys * total_peptides).to_f / exp_num_with_cys
  fppr = total_number_false / total_peptides
  [fppr, total_number_false]
end
389
+
390
+ # num_peps_per_protein is an array of the number of peptides per protein hit
391
+ # (these are the true hits)
392
+ # assumes that the number follows a gaussian distribution (binomial
393
+ # distributions tend toward gaussians, I believe, at large N)
394
+ # returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr] fppr
395
# Estimates how many protein hits are entirely false by simulation.
#   num_peps_per_protein  - array with the number of peptide hits per protein
#   number_false_peptides - number of peptide hits assumed to be false
#   num_iterations        - number of random trials
# Each trial removes number_false_peptides randomly chosen peptide hits and
# counts the proteins left with zero peptides.
# Returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr].
# NOTE(review): relies on VecD#sample_stats returning [mean, stdev] -- confirm.
def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10)
  total_protein_peps = 0
  num_peps_per_protein.each { |num| total_protein_peps += num }

  ## At least as many false peptides as peptides in our proteins:
  ## every peptide (and therefore every protein) is wrong every time
  if number_false_peptides >= total_protein_peps
    # [all proteins wrong, fppr=1.0, no variation]
    return [num_peps_per_protein.size, 1.0, 0.0, 0.0]
  end

  num_prots = num_peps_per_protein.size
  sample = VecD.new(num_iterations)

  # one slot per peptide hit; reshuffled (cumulatively) for every trial
  indices = (0...total_protein_peps).to_a

  num_iterations.times do |i|
    # each protein's remaining-peptide count sits in a one element array so
    # that all of its peptide slots reference (and decrement) the same count
    counters = num_peps_per_protein.map { |num| [num] }
    counter_by_pep_index = []
    counters.each do |counter|
      counter.first.times { counter_by_pep_index << counter }
    end

    indices.shuffle!
    number_false_peptides.times do |shuffle_index|
      counter_by_pep_index[indices[shuffle_index]][0] -= 1
    end

    # proteins with no peptides left are the wrong ones in this trial
    sample[i] = counters.count { |counter| counter.first == 0 }
  end

  (mean_num_wrong, stdev) = sample.sample_stats
  mean_fppr = mean_num_wrong / num_prots
  stdev_fppr = stdev / num_prots
  [mean_num_wrong, mean_fppr, stdev, stdev_fppr]
end
461
+
462
+ # returns [total_number_false, fppr, fraction_expected]
463
+ # also takes a hash of pephits keyed on :aaseq
464
def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
  # observed vs. expected number of hits containing cysteine
  actual, expected = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq)
  observed_over_expected = actual.to_f / expected

  # fppr_by_cysteines reports [rate, count]; this method reports count first
  rate, num_false = fppr_by_cysteines(actual, expected, pephits.size, cys_containing_freq)
  [num_false, rate, observed_over_expected]
end
471
+
472
# UNFINISHED: the reporting draft below reads values that are not available
# in this method (pep_nums, i, total_num_false, ac, fraction_of_expected,
# cys_fprate, pep_tps, pep_fpr, x1, x2, x3, deltacn, ppm, opt), so it cannot
# run as written. It now fails with an explicit error instead of an obscure
# NameError; the draft is kept as a reference for whoever finishes it.
def report_cysteines
  raise NotImplementedError, "report_cysteines is unfinished (its inputs are not passed in yet)"

  # cys_tps = pep_nums[i] - total_num_false
  #
  # puts "CYSTEINE FPR: "
  # puts " (# peps containing >= 1 cysteines)"
  # puts " actual: #{ac}"
  # puts "fraction of expected: #{short(fraction_of_expected)}"
  # puts " expected # FP's: " + short(total_num_false)
  # puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
  #
  # puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)"
  # puts "Combined Score & FPR"
  # puts "#{combined_score}\t#{cys_fprate}"
  # puts "Combined Score & fraction of expected"
  # to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
  # puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
  # puts(['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate
end
493
+
494
# Builds the legend printed once before any filter results.
# The explanation of the method abbreviations is appended only when at
# least one false-positive estimation method is active.
def filter_legend(fppr_methods)
  legend = [
    "Note: protein FPPR values are probably optimistic",
    "[this implementation assumes an equal likelihood that a false peptide",
    " comes from a protein with more hits as one with less (which is probably",
    " not the case)]",
    "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)",
  ]
  unless fppr_methods.size.zero?
    legend << "Following are methods for determining false identification rate:"
    ## tmm (transmembrane) is not listed until it is implemented
    legend << ['dcy=decoy', 'cys=cysteine', 'tps=known_true_positives'].join(" ")
  end
  legend.join("\n")
end
509
+
510
+ # does this give aafreq from a fasta file?
511
+ # freq = cysteines.aafreqs[:C]
512
+
513
+ # returns [total_number_false, fppr]
514
+ # pephits can be an array or a hash of peptides keyed on :aaseq
515
def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
  # a Hash is keyed by amino acid sequence; otherwise ask each hit for it
  sequences = pephits.is_a?(Hash) ? pephits.keys : pephits.map { |hit| hit.aaseq }

  # a sequence is a false positive when no known-true protein contains it
  num_false = sequences.count do |pep_aaseq|
    true_pos_aaseqs_ar.none? { |prot_aaseq| prot_aaseq.include? pep_aaseq }
  end

  [num_false, num_false.to_f / pephits.size]
end
536
+
537
# Applies the sequest thresholds to one spec_id and gathers the results:
#   :prothits - proteins of the passing hits (their peps are NOT updated)
#   :pephits  - the passing peptide hits
#   :dcn_cnt  - number of passing hits with deltacn > 1.0
#   :aaseq    - passing hits hashed by :aaseq
# (args is accepted but not consulted here)
def filter_spec_id(spec_id, filter_args, args)
  passing = spec_id.filter_sequest(filter_args)
  {
    :prothits => SpecID.passing_proteins(passing, :no_update),
    :pephits => passing,
    :dcn_cnt => passing.count { |hit| hit.deltacn > 1.0 },
    :aaseq => passing.hash_by(:aaseq),
  }
end
550
+
551
+ # returns [#FP, FPPR]
552
def dcy_fppr(pephits, false_pephits)
  # every decoy hit counts as one false positive
  num_false = false_pephits.size
  rate = num_false.to_f / pephits.size
  [num_false, rate]
end
556
+
557
# Placeholder for the transmembrane based estimate: terminates the program.
def tmm_fppr(pephits)
  Kernel.abort("NEED TO IMPLEMENT")
end
560
+
561
+ # returns [#FP, FPPR]
562
def cys_fppr(pephits, cys_bg_freq, cys_containing_freq)
  # keep [number false, rate]; the trailing fraction-of-expected is dropped
  fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq).values_at(0, 1)
end
566
+
567
# Estimate from known true positives; returns [#FP, FPPR].
def tps_fppr(pephits, true_pos_aaseqs_ar)
  estimate = fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
  estimate
end
570
+
571
+ ## methods should be passed in like this 'cysteine' for cysteine_fppr
572
+ ## all methods should return [number_false, fppr]
573
+ ## returns a hash (by method) for each set of pephits
574
+ ## if :dcy is given as a method, then expects the false pephits array
575
def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil)
  pephits_ar.each_with_index.map do |hits, idx|
    methods.each_with_object({}) do |mth, by_method|
      # every estimator is named <method>_fppr and returns [number_false, fppr]
      estimator = "#{mth}_fppr".to_sym
      by_method[mth.to_sym] =
        case mth
        when :dcy
          # decoy estimate needs the false hits paired with this set of hits
          send(estimator, hits, false_pephits_ar[idx])
        when :cys
          send(estimator, hits, *(args[:cys]))
        when :tps
          send(estimator, hits, args[:tps])
        else
          send(estimator, hits)
        end
    end
  end
end
595
+
596
+ # fpr is a SpecID obj that is the false positives
597
+ # cysteines holds an aafreqs object or nil
598
def filter_round(spec_ids, filter_args, args)
  decoys = args[:dcy]
  with_occams = args[:occams_razor]

  little_tables = []
  spec_ids.each_with_index do |spec_id, i|
    ## FILTER the NORMAL spec_id object
    normal_results = filter_spec_id(spec_id, filter_args, args)

    ## FILTER the FALSE object paired with it (if given)
    false_results = decoys ? filter_spec_id(args[:dcy][i], filter_args, args.dup) : nil
    fr_ar = decoys ? [false_results[:pephits], false_results[:aaseq]] : nil

    ## peptide level estimates, once per hit and once per unique sequence
    hit_sets = [normal_results[:pephits], normal_results[:aaseq]]
    (pephits_fppr_results, aaseq_fppr_results) = calculate_pep_fppr(hit_sets, @fppr_methods, args, fr_ar)

    ## NORMAL prothits, with their peps updated to the passing hits
    ## (this obliterates the original prot.peps arrays)
    updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update)
    pep_cnt_arr = updated_proteins.map { |prot| prot.peps.size }

    ## occams prothits
    if with_occams
      triplets = SpecID::occams_razor(updated_proteins, true)
      occams_pep_cnt_arr = triplets.map { |triplet| triplet[1].size }
      normal_results[:occams_razor] = triplets.map { |triplet| triplet[0] }
    end

    ## protein level estimates, driven by the rounded-up number of false peptides
    prothits_fppr_results = {}
    occams_results = {}
    @fppr_methods.each do |mth|
      num_false_peps = pephits_fppr_results[mth].first.ceil.to_i
      prothits_fppr_results[mth] = protein_fppr(pep_cnt_arr, num_false_peps, NUM_PROT_FPPR_ITERATIONS)
      if with_occams
        num_false_seqs = aaseq_fppr_results[mth].first.ceil.to_i
        occams_results[mth] = protein_fppr(occams_pep_cnt_arr, num_false_seqs, NUM_PROT_FPPR_ITERATIONS)
      end
    end

    fppr_results = {
      :pephits => pephits_fppr_results,
      :aaseq => aaseq_fppr_results,
      :prothits => prothits_fppr_results,
    }
    fppr_results[:occams_razor] = occams_results if with_occams

    ## CHANGE ALL RESULTS INTO PERCENTAGES (second element of each result)
    fppr_results.each_value do |by_method|
      by_method.each_value { |result| result[1] = 100.0 * result[1] }
    end

    little_tables[i] = to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels)
  end

  out filter_params_string(filter_args, @fppr_methods)
  little_tables.each do |tbl|
    out tbl.to_formatted_string(nil, ' ')
    out "-----------------------------------------------\n"
  end
end
675
+
676
+
677
+
678
# Header shown above each round of results: a rule plus the thresholds in
# effect. (fppr_methods is accepted but does not influence the text.)
def filter_params_string(filter_args, fppr_methods)
  x1, x2, x3, deltacn, ppm = filter_args
  [
    "==========================================================================",
    " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} ",
    '',
  ].join("\n")
end
689
+
690
# Builds the result Table for one spec_id: one row per reporting group, a
# 'num' column with the group size, then one percentage column per method.
# The table is titled with the filename the spec_id was read from.
def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels)
  col_labels = ['num'] + fppr_methods.map { |mth| "#{mth}%" }
  row_labels = groups_reporting.map { |grp| cat_labels[grp] }

  rows = groups_reporting.map do |grp|
    # element 1 of each result is the rate
    [normal_results[grp].size] + fppr_methods.map { |mth| fppr_results[grp][mth][1] }
  end

  Table.new(rows, row_labels, col_labels, spec_id.passed_in_filename)
end
708
+
709
# Collapses the five thresholds into a single number:
# x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
def combined_score(filter_args)
  x1, x2, x3, deltacn, ppm = filter_args
  xcorr_total = x1 + x2 + x3
  xcorr_total + 20.0*deltacn + 4000.0*(1.0/ppm)
end
713
+
714
+ # assumes its already chomped
715
+ # updates the 5 globals
716
# Interprets one line of (already chomped) filter input.
#   reply - 'q' (quits the program), a blank line (keep current values), or
#           whitespace separated tokens: bare numbers set x1, x2, x3, dcn,
#           ppm by position; key:value tokens (x1, x2, x3, dcn, ppm) set
#           one value by name
#   base  - the current [x1, x2, x3, deltacn, ppm]; it is not modified
# Returns base itself for a blank line, a new array of Floats with the
# requested changes applied, or false when reply is nil or holds a value
# that is not a number (reported through #out as "BAD ARG").
# An unknown key is reported but otherwise ignored.
def prep_reply(reply, base)
  exit if reply == 'q'
  return base if reply =~ /^\s*$/
  return false unless reply

  # Float() rejects non-numeric text, whereas String#to_f never fails and
  # would silently turn a typo into 0.0
  to_number = lambda do |text|
    begin
      Float(text)
    rescue ArgumentError, TypeError
      nil
    end
  end

  positional = []
  keyed = {}
  # split without a pattern so leading whitespace yields no empty token
  reply.split.each do |token|
    if token.include? ':'
      (key, value) = token.split(':')
      keyed[key] = value
    else
      positional << token
    end
  end

  # work on a copy so a rejected line leaves the current values untouched
  updated = base.dup

  positional.each_with_index do |token, i|
    number = to_number.call(token)
    if number.nil?
      out "BAD ARG: #{token}"
      return false
    end
    updated[i] = number
  end

  slots = { 'x1' => 0, 'x2' => 1, 'x3' => 2, 'dcn' => 3, 'ppm' => 4 }
  keyed.each do |key, value|
    slot = slots[key]
    if slot.nil?
      out "BAD ARG: #{key}:#{value}"
      next
    end
    number = to_number.call(value)
    if number.nil?
      out "BAD ARG: #{key}:#{value}"
      return false
    end
    updated[slot] = number
  end

  updated.map { |v| v.to_f }
end
756
+
757
# Returns the prefiltered SpecID object for file.
# If "<file>.prefiltered.msh" exists, the object is restored from it
# (regardless of opt.marshal). Otherwise the file is read, its name is
# recorded, the top-hit prefilter is applied and, when opt.marshal is set,
# the result is cached in that .msh file.
# Marshal data is binary, so the cache is read and written in binary mode.
def file_to_prefiltered_spec_id(file, opt)
  marshal_file = file + ".prefiltered.msh"

  if File.exist?(marshal_file)
    # NOTE(review): Marshal.load will instantiate whatever the file
    # describes -- only point this at cache files you created yourself.
    return File.open(marshal_file, "rb") { |fh| Marshal.load(fh) }
  end

  spec_id = SpecID.new(file)
  spec_id.passed_in_filename = file
  spec_id.top_peps_prefilter!
  ## marshal it!
  if opt.marshal
    File.open(marshal_file, "wb") { |fh| Marshal.dump(spec_id, fh) }
  end
  spec_id
end
777
+
778
# Help text for the interactive prompt (one String, lines joined by "\n").
def interactive_help
  [
    "********************************************************",
    "INTERACTIVE FILTERING HELP:",
    "enter: <x1> <x2> <x3> <dcn> <ppm>",
    "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> ppm:<ppm>",
    "or : dcn:<dcn>",
    "or : <x1> <x2> ppm:<ppm>",
    "etc...",
    "<enter> to (re)run current values",
    "'q' to quit",
    "********************************************************",
  ].join("\n")
end
792
+
793
+
794
+ end