mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
@@ -0,0 +1,794 @@
1
+
2
+ require 'spec_id'
3
+ require 'optparse'
4
+ require 'ostruct'
5
+ require 'spec_id/aa_freqs'
6
+ require 'shuffle'
7
+ require 'vec'
8
+ require 'table'
9
+
10
+
11
+ ########################################################
12
+ WRITE_CYS_FIND = false
13
+ ########################################################
14
+
15
+
16
+ module SpecID
17
+ attr_accessor :orig_peps, :passed_peps, :passed_prots
18
+ # The filename passed in for filtering
19
+ attr_accessor :passed_in_filename
20
+
21
# Keeps only the best-scoring hit(s) of every spectrum and stores them in
# @orig_peps (which is also the return value).
#
# * Hits are grouped on [pep.base_name, pep.first_scan, pep.charge]
#   (the equivalent of one .out file).
# * Within a group every hit whose xcorr equals the group's highest xcorr
#   is kept, so ties are all returned.
# * When the first pep is a Bioworks::Pep (text based fields), xcorr,
#   deltacn, deltamass and mass are cast to Float and charge and first_scan
#   to Integer on every pep before grouping.  Other peps are used as-is.
def top_peps_prefilter!
  if peps.first.is_a?(Bioworks::Pep)
    peps.each do |hit|
      hit.xcorr = hit.xcorr.to_f
      hit.deltacn = hit.deltacn.to_f
      hit.deltamass = hit.deltamass.to_f
      hit.mass = hit.mass.to_f
      hit.charge = hit.charge.to_i
      hit.first_scan = hit.first_scan.to_i
    end
  end

  winners = []
  by_spectrum = peps.hash_by {|hit| [hit.base_name, hit.first_scan, hit.charge] }
  by_spectrum.each_value do |group|
    ranked = group.sort_by {|hit| hit.xcorr }.reverse
    best_xcorr = ranked.first.xcorr
    ranked.each do |hit|
      # ranked is best-first, so the first non-tie ends the group
      break unless hit.xcorr == best_xcorr
      winners << hit
    end
  end
  @orig_peps = winners
end
57
+
58
# Selects the peptides that pass the sequest thresholds.
#
# args is [xcorr1, xcorr2, xcorr3, deltacn, ppm].  A peptide passes when
#   * its deltacn is >= deltacn, and
#   * its charge is 1, 2 or 3 and its xcorr is >= the threshold for that
#     charge (any other charge never passes), and
#   * its ppm is <= ppm.
# Unless include_deltacnstar is true, peptides with deltacn > 1.0 are
# rejected even if they pass everything else (the lowest score is often
# assigned a deltacn of 1.10 in bioworks).
#
# Assumes xcorr, deltacn and ppm are Floats and charge is an Integer.
# The object must respond to 'peps'.  Returns the passing peptides.
# DOES NOT UPDATE the prot.peps attribute!!
def filter_sequest(args, include_deltacnstar=false)
  x1, x2, x3, min_deltacn, max_ppm = args
  peps.select do |hit|
    hit_deltacn = hit.deltacn
    next false unless hit_deltacn >= min_deltacn

    min_xcorr =
      case hit.charge
      when 1 then x1
      when 2 then x2
      when 3 then x3
      else next false
      end
    next false unless hit.xcorr >= min_xcorr
    next false unless hit.ppm <= max_ppm

    # deltacn* hits only survive when explicitly requested
    include_deltacnstar || !(hit_deltacn > 1.0)
  end
end
87
+
88
+
89
# Given a list of SpecID::Pep based objects, returns the proteins associated
# with those peptides (one protein per distinct prot.reference; when several
# protein objects share a reference the last one seen is returned).
#
# kind =
#   :no_update  current proteins are returned, their peps attribute is left
#               alone (any unrecognized kind behaves the same way)
#   :update     current proteins are returned with their peps attribute
#               rebuilt from pephits
#   :new        duplicates of the proteins (prt.dup, one per reference) are
#               returned with peps built from pephits; every pep.prots is put
#               back to its original value before returning
def self.passing_proteins(pephits, kind=:no_update)
  rebuild_peps = (kind == :update || kind == :new)
  saved_prots = []

  if kind == :new
    copies = {}
    pephits.each do |hit|
      saved_prots << hit.prots
      # temporarily point the hit at the copies so the shared code below
      # fills the copies' peps rather than the originals'
      hit.prots = hit.prots.map do |prot|
        unless copies.key?(prot.reference)
          copy = prot.dup
          copy.peps = []
          copies[copy.reference] = copy
        end
        copies[prot.reference]
      end
    end
  elsif kind == :update
    pephits.each do |hit|
      hit.prots.each {|prot| prot.peps = [] }
    end
  end

  by_reference = {}
  pephits.each do |hit|
    hit.prots.each do |prot|
      by_reference[prot.reference] = prot
      prot.peps << hit if rebuild_peps
    end
  end

  ## Reset the original protein hits
  if kind == :new
    pephits.each_with_index {|hit, i| hit.prots = saved_prots[i] }
  end

  by_reference.values
end
149
+ end
150
+
151
+
152
+ class SpecID::Filter
153
+
154
+ NUM_PROT_FPPR_ITERATIONS = 10
155
+
156
# Class-level convenience: builds a fresh filter and hands argv to its
# instance-level run_from_argv, returning whatever that returns.
def self.run_from_argv(argv)
  new.run_from_argv(argv)
end
160
+
161
# Command line entry point.
#
# Parses argv with get_options (returns immediately when that yields nil),
# loads and prefilters every input file, assembles the false-positive
# estimation inputs (:cys, :tps, :dcy, :occams_razor) and then runs one or
# more filter rounds:
#   * --filters_file : one round per line of the file (lines rejected by
#                      prep_reply are skipped)
#   * --interactive  : rounds are driven from standard input until 'q' or
#                      end of input
#   * otherwise      : a single round with the command line thresholds
# When --log is given all output is also written to that file; the file is
# closed on every exit path (including exceptions and 'q').
# Always returns nil.
def run_from_argv(argv)
  reply = get_options(argv)
  return unless reply
  files, opt = reply

  $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE
  spec_ids = files.map {|file| file_to_prefiltered_spec_id(file, opt) }

  ## the options hash
  hash = {}
  if opt.cys
    # second element is the frequency of cys containing peptides (d: 0.0)
    opt.cys[1] = opt.cys[1] ? opt.cys[1].to_f : 0.0
    hash[:cys] = opt.cys
  end

  hash[:tps] =
    if opt.tps
      Fasta.new.read_file(opt.tps).prots.map {|prot| prot.aaseq.chomp }
    end

  hash[:dcy] =
    if opt.false
      new_spec_ids = []
      prefixes_or_files = SpecID.extend_args(opt.false, files.size)
      false_spec_ids = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file|
        if File.exist? prefix_or_file
          # decoys live in their own file: the normal spec_id is kept whole
          new_spec_ids << spec_id
          file_to_prefiltered_spec_id(prefix_or_file, opt)
        else
          # decoys are mixed in and recognized by prefix: split the peptides
          (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
          fps_specid = spec_id.class.new
          tps_specid = spec_id.class.new

          fps_specid.peps = fps
          tps_specid.peps = tps
          new_spec_ids << tps_specid
          fps_specid
        end
      end
      spec_ids = new_spec_ids
      false_spec_ids
    end

  defaults = {
    :dcy => nil,  # array of false spec_ids, parallel to spec_ids
    :cys => nil,  # [cys_background_freq, cys_containing_freq]
    :tps => nil,
    :tmm => nil,
    :occams_razor => opt.occams_razor,
  }
  args = defaults.merge hash

  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm]

  @fppr_methods = [:tmm, :tps, :cys, :dcy].select {|mth| args[mth] }
  @groups_reporting = [:pephits, :aaseq, :prothits]
  @groups_reporting.push( :occams_razor ) if args[:occams_razor]

  @cat_labels = {
    :pephits => 'pep_hits',
    :prothits => 'prot_hits',
    :aaseq => 'uniq_aa_hits',
    :occams_razor => 'occams_prot_hits',
  }

  @logfh = opt.log ? File.open(opt.log, 'w') : nil
  begin
    out filter_legend(@fppr_methods)

    if opt.filters_file
      IO.readlines(opt.filters_file).each do |line|
        answer = prep_reply(line.chomp, base_args)
        next if answer == false
        base_args = answer
        filter_round(spec_ids, base_args, args)
      end
    elsif opt.i
      interactive_filtering(spec_ids, base_args, args)
    else
      filter_round(spec_ids, base_args, args)
    end
  ensure
    @logfh.close if @logfh
  end
  nil
end

# Internal helper for run_from_argv: prompts with the current thresholds,
# reads replies from Kernel#gets and runs a filter round for every reply
# accepted by prep_reply.  Returns when input is exhausted ('q' exits from
# inside prep_reply).
def interactive_filtering(spec_ids, base_args, args)
  ## CLEAR ARGV (since otherwise, gets reads it!)
  ARGV.clear
  out interactive_help
  loop do
    b = base_args
    out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}"
    loop do
      reply = gets
      return if reply.nil?  # end of input: gets returns nil
      answer = prep_reply(reply.chomp, base_args)
      if answer == false
        out interactive_help
      else
        base_args = answer
        filter_round(spec_ids, base_args, args)
        break
      end
    end
  end
end
292
+
293
# Writes string to standard output and, when a log file handle (@logfh) is
# set, to the log as well.
def out(string)
  puts string
  @logfh.puts string if @logfh
end
299
+
300
# arg is either the name of an existing file or a number in string form.
# An existing file is handed to SpecID::AAFreqs and the :C entry of its
# aafreqs is returned; anything else is returned cast to a Float.
def get_cys_freq(arg)
  return arg.to_f unless File.exist?(arg)
  SpecID::AAFreqs.new(arg).aafreqs[:C]
end
308
+
309
# Returns num formatted with three decimal places (as a String) for display.
def short(num)
  format("%.3f", num)
end
313
+
314
# Parses the command line.
#
# Returns [files_array, options] where options is an OpenStruct holding the
# thresholds (x1, x2, x3, c, ppm) and the optional switches.
# Returns nil after printing the usage when no file argument is left, or --
# after also printing the parser's message to stderr -- when an option is
# unknown or malformed.
# argv itself is never modified (a copy is parsed).
def get_options(argv)
  remaining = argv.dup

  opt = OpenStruct.new
  opt.x1 = 1.0
  opt.x2 = 1.5
  opt.x3 = 2.0
  opt.c = 0.1
  opt.ppm = 1000.0
  opt.false = false

  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <bioworks.xml | bioworks.srg>"
    op.separator("prints number of peptides/proteins ID'd at given thresholds")
    op.separator "only top hit (by xcorr) per scan+charge is considered"

    op.separator ""
    op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v}
    op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v}
    op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v}
    op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") {|v| opt.c = v}
    op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
    op.separator " if bioworks.xml, = 10^6deltamass/mass"
    op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
    op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
    op.separator(" last given will apply to remaining files")
    op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
      # first element: a fasta file to measure, or the frequency itself
      v[0] = get_cys_freq(v[0])
      opt.cys = v
    end
    op.separator(" freq = freq of cysteine as amino acid")
    op.separator(" [bkg] = freq of cys containing peps d: 0.0")
    op.on("--filters_file <file>", "(no -i) file with list of interactive input") {|v| opt.filters_file = v}
    op.on("-t", "--tps <fasta>", "fasta file containing true hits") {|v| opt.tps = v }
    #op.on("--tmm <toppred.out>", "toppred.out file with transmembr. topology") {|v| opt.tps = v }
    # NOTE: --yaml is stored under opt.tabulate
    op.on("--yaml", "spits out yaml-ized data") {|v| opt.tabulate = v }
    op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
    op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
    op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
    op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
    op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
  end

  begin
    opts.parse!(remaining)
  rescue OptionParser::ParseError => e
    # unknown switch, missing or malformed argument
    $stderr.puts e.message
    puts opts
    return nil
  end

  if remaining.size < 1
    puts opts
    return nil
  end

  [remaining, opt]
end
370
+
371
# Estimates the false positive rate from the cysteine containing hits.
#
# PepHit(C) = Peptide containing cysteine
#   # Total PepHit(C)                      # Observed Bad Pep (C)
#   ------------------  proportional_to  ----------------------
#   # Total PepHit                         # Total Bad PepHit (X)
#
# ac_num_with_cys        : observed number of hits containing cysteine
# exp_num_with_cys       : expected number of hits containing cysteine
# total_peptides         : total number of hits
# mean_fraction_true_cys : when given, exp_num_with_cys times this value is
#                          subtracted from the observed number first (some
#                          cysteine hits are true positives); the result is
#                          floored at 0.0
# std_fraction_true_cys  : accepted but not used
#
# returns [fppr, total_number_false]
def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  bad_cys_hits = ac_num_with_cys
  if mean_fraction_true_cys
    bad_cys_hits -= exp_num_with_cys * mean_fraction_true_cys
  end
  bad_cys_hits = 0.0 if bad_cys_hits < 0.0

  num_false = (bad_cys_hits * total_peptides).to_f / exp_num_with_cys
  [num_false / total_peptides, num_false]
end
389
+
390
# Estimates how many proteins are false given how many of their peptides are
# false, by simulation.
#
# num_peps_per_protein  : array with the number of peptide hits of each
#                         protein (not modified)
# number_false_peptides : Integer number of peptide hits assumed false
# num_iterations        : number of random trials
#
# Each trial removes number_false_peptides peptide hits chosen at random
# (without replacement) from the pool of all hits and counts the proteins
# left with exactly zero peptides.  The mean and standard deviation over the
# trials come from VecD#sample_stats.
#
# When number_false_peptides >= the total number of peptide hits every
# protein is considered wrong and no simulation is run.
#
# returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr]
def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10)
  num_prots = num_peps_per_protein.size
  total_protein_peps = num_peps_per_protein.inject(0) {|sum, num| sum + num }

  ## All peptides will be wrong every time!
  ## which means all proteins will be wrong every time!
  if number_false_peptides >= total_protein_peps
    return [num_prots, 1.0, 0.0, 0.0]
  end

  # owner_of_pep[k] is the index of the protein that peptide hit k belongs to
  owner_of_pep = []
  num_peps_per_protein.each_with_index do |num, prot_index|
    num.times { owner_of_pep << prot_index }
  end
  indices = (0...owner_of_pep.size).to_a

  sample = VecD.new(num_iterations)
  num_iterations.times do |trial|
    remaining = num_peps_per_protein.dup
    indices.shuffle!
    # the first number_false_peptides shuffled hits are the false ones
    number_false_peptides.times do |k|
      remaining[owner_of_pep[indices[k]]] -= 1
    end
    sample[trial] = remaining.select {|count| count == 0 }.size
  end

  (mean_num_wrong, stdev) = sample.sample_stats
  [mean_num_wrong, mean_num_wrong / num_prots, stdev, stdev / num_prots]
end
461
+
462
# Cysteine based false positive estimate for a set of hits.
#
# pephits may be an array of hits or a hash of hits keyed on :aaseq; it is
# handed to SpecID::AAFreqs#actual_and_expected_number_containing_cysteines
# together with cys_bg_freq, and its size is used as the total hit count.
# cys_containing_freq is forwarded to fppr_by_cysteines as the fraction of
# cysteine hits taken to be true.
#
# returns [total_number_false, fppr, fraction_of_expected] where
# fraction_of_expected = actual / expected number of cysteine hits
def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
  actual, expected = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq)
  observed_over_expected = actual.to_f / expected

  fppr, num_false = fppr_by_cysteines(actual, expected, pephits.size, cys_containing_freq)
  [num_false, fppr, observed_over_expected]
end
471
+
472
# NOTE(review): unfinished code (see the "UNDERWAY" marker below).
# The body reads pep_nums, i, total_num_false, ac, fraction_of_expected,
# cys_fprate, pep_tps, pep_fpr, x1, x2, x3, deltacn, ppm and opt, none of
# which is a parameter or assigned in this method, and it calls
# combined_score without the filter_args argument that combined_score
# requires.  Unless those names are supplied from somewhere outside the
# visible source, calling this method raises -- TODO confirm and either
# finish it (passing the values in) or remove it.
# Intended output, as far as the puts lines show: a cysteine based false
# positive report plus optional WRITE_CYS_FIND / TABULATE tab separated lines.
+ def report_cysteines
473
+ #### UNDERWAY:::
474
+ cys_tps = pep_nums[i] - total_num_false
475
+
476
+ puts "CYSTEINE FPR: "
477
+ puts " (# peps containing >= 1 cysteines)"
478
+ puts " actual: #{ac}"
479
+ puts "fraction of expected: #{short(fraction_of_expected)}"
480
+ puts " expected # FP's: " + short(total_num_false)
481
+ puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
482
+
483
+ puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)"
484
+ puts "Combined Score & FPR"
485
+ puts "#{combined_score}\t#{cys_fprate}"
486
+ puts "Combined Score & fraction of expected"
487
+ #puts "#{combined_score} #{fraction_of_expected}"
488
+ to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
489
+ puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
490
+ puts(['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate
491
+
492
+ end
493
+
494
# Builds the legend printed before any filtering output.
# The caveat about protein FPPR values and the deltacn_star explanation are
# always present; the key to the false identification methods is appended
# only when fppr_methods is not empty.
# Returns one newline-joined string (no trailing newline).
def filter_legend(fppr_methods)
  legend = [
    "Note: protein FPPR values are probably optimistic",
    "[this implementation assumes an equal likelihood that a false peptide",
    " comes from a protein with more hits as one with less (which is probably",
    " not the case)]",
    "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)",
  ]
  if fppr_methods.size > 0
    legend.push("Following are methods for determining false identification rate:")
    ## when tmm is implemented, add 'tmm=transmembrane' after 'cys=cysteine'
    legend.push(%w[dcy=decoy cys=cysteine tps=known_true_positives].join(" "))
  end
  legend * "\n"
end
509
+
510
+ # does this give aafreq from a fasta file?
511
+ # freq = cysteines.aafreqs[:C]
512
+
513
# Counts hits that cannot be found in any known true positive protein.
#
# pephits            : array of hits (each responding to aaseq) or a hash of
#                      hits keyed on the amino acid sequence
# true_pos_aaseqs_ar : array of protein sequences; a hit is true when its
#                      sequence is a substring of at least one of them
#
# returns [total_number_false, fppr] with fppr = number false / pephits.size
def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
  seqs = pephits.is_a?(Hash) ? pephits.keys : pephits.map {|hit| hit.aaseq }

  unexplained = seqs.reject do |pep_aaseq|
    true_pos_aaseqs_ar.any? {|prot_aaseq| prot_aaseq.include? pep_aaseq }
  end

  num_false = unexplained.size
  [num_false, num_false.to_f / pephits.size]
end
536
+
537
# Applies the sequest thresholds (filter_args) to one spec_id and collects
# the passing hits under several views.  args is accepted but not used here.
#
# returns a hash with
#   :pephits  => the hits returned by spec_id.filter_sequest
#   :prothits => the proteins of those hits (prot.peps is NOT updated)
#   :dcn_cnt  => how many of the hits have deltacn > 1.0
#   :aaseq    => the hits hashed by :aaseq (be aware: a hash keyed by aaseq
#                whose values are arrays of hits sharing that aaseq!)
def filter_spec_id(spec_id, filter_args, args)
  passing = spec_id.filter_sequest(filter_args)
  {
    :prothits => SpecID.passing_proteins(passing, :no_update),
    :pephits => passing,
    :dcn_cnt => passing.select {|hit| hit.deltacn > 1.0 }.size,
    :aaseq => passing.hash_by(:aaseq),
  }
end
550
+
551
# Decoy based estimate: every passing decoy hit counts as one false positive.
# returns [#FP, FPPR] with FPPR = #FP / pephits.size
def dcy_fppr(pephits, false_pephits)
  num_false = false_pephits.size
  rate = num_false.to_f / pephits.size
  [num_false, rate]
end
556
+
557
# Placeholder for the transmembrane based estimate: not implemented yet, so
# calling it aborts the program with a message.
def tmm_fppr(pephits)
  Kernel.abort("NEED TO IMPLEMENT")
end
560
+
561
# Cysteine based estimate: the first two values reported by
# fraction_false_by_cysteines (its third value is dropped).
# returns [#FP, FPPR]
def cys_fppr(pephits, cys_bg_freq, cys_containing_freq)
  num_false, rate, _fraction_of_expected = fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
  [num_false, rate]
end
566
+
567
# Known-true-positives based estimate: passes both arguments straight to
# fraction_false_by_true_pos and returns its result unchanged.  The name
# follows the "<method>_fppr" pattern that calculate_pep_fppr dispatches on.
+ def tps_fppr(pephits, true_pos_aaseqs_ar)
568
+ fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
569
+ end
570
+
571
## Runs every requested estimation method on every set of pephits.
## methods are given as symbols (e.g. :cys); each is dispatched to the method
## named "<symbol>_fppr", which should return [number_false, fppr]:
##   :dcy  is called with the pephits and the false pephits at the same
##         position of false_pephits_ar (so that array must be given)
##   :cys  is called with the pephits and the splatted args[:cys]
##   :tps  is called with the pephits and args[:tps]
##   any other method is called with the pephits only
## returns one hash (keyed by method) per entry of pephits_ar
def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil)
  results = []
  pephits_ar.each_with_index do |hits, position|
    per_method = {}
    methods.each do |mth|
      estimator = "#{mth}_fppr".to_sym
      per_method[mth.to_sym] =
        case mth
        when :dcy then send(estimator, hits, false_pephits_ar[position])
        when :cys then send(estimator, hits, *(args[:cys]))
        when :tps then send(estimator, hits, args[:tps])
        else send(estimator, hits)
        end
    end
    results << per_method
  end
  results
end
595
+
596
# Runs one round of filtering with the thresholds in filter_args on every
# spec_id and prints one table per spec_id (through 'out').
#
# args is the hash assembled in run_from_argv:
#   args[:dcy]          array of false (decoy) spec_ids parallel to spec_ids,
#                       or nil
#   args[:occams_razor] when set, the occams razor protein set is reported too
#   args[:cys], args[:tps] are consumed by calculate_pep_fppr
#
# HOW FPPR IS CALCULATED FOR EVERYTHING:
#   pephits      Fpephits   C/Tpephits   TPpephits
#   uniqaa       Funiqaa    C/Tuniqaa    TPuniqaa
#   prothits     ProtFPR(Fpephits, prothits)   ProtFPR(C/Tpephits, prothits)   ProtFPR(total-TPpephits, prothits)
#   OccProthits  ProtFPR(Funiqaa, OccProthits) ProtFPR(C/Tuniqaa, OccProthits) ProtFPR(total-TPuniqaa, OccProthits)
#   C/T = cystein or Transmembrane method
#
# Note that the original prot.peps arrays are obliterated by this (the
# proteins are updated in place); they would need to be re-updated if
# someone wanted them.
def filter_round(spec_ids, filter_args, args)
  little_tables = []
  spec_ids.each_with_index do |spec_id, i|
    ## FILTER the NORMAL spec_id object
    normal_results = filter_spec_id(spec_id, filter_args, args)

    ## FILTER the FALSE object (if given) and set up the false results array
    fr_ar = nil
    if args[:dcy]
      decoy_results = filter_spec_id(args[:dcy][i], filter_args, args.dup)
      fr_ar = [decoy_results[:pephits], decoy_results[:aaseq]]
    end

    pephits_fppr, aaseq_fppr = calculate_pep_fppr(
      [normal_results[:pephits], normal_results[:aaseq]], @fppr_methods, args, fr_ar)

    ## NORMAL prothits (this updates the proteins' peptides)
    updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update)
    pep_counts = updated_proteins.map {|prot| prot.peps.size }

    ## occams prothits
    occams_pep_counts = nil
    if args[:occams_razor]
      # each entry is indexed below as [protein, something with a size, ...]
      triplets = SpecID.occams_razor(updated_proteins, true)
      occams_pep_counts = triplets.map {|triplet| triplet[1].size }
      normal_results[:occams_razor] = triplets.map {|triplet| triplet[0] }
    end

    prothits_fppr = {}
    occams_fppr = {}
    @fppr_methods.each do |mth|
      prothits_fppr[mth] = protein_fppr(pep_counts, pephits_fppr[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
      next unless args[:occams_razor]
      occams_fppr[mth] = protein_fppr(occams_pep_counts, aaseq_fppr[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
    end

    fppr_results = {
      :pephits => pephits_fppr,
      :aaseq => aaseq_fppr,
      :prothits => prothits_fppr,
    }
    fppr_results[:occams_razor] = occams_fppr if args[:occams_razor]

    ## CHANGE ALL RESULTS INTO PERCENTAGES (second element of every result)
    fppr_results.each_value do |by_method|
      by_method.each_value {|result| result[1] = 100.0 * result[1] }
    end

    little_tables << to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels)
  end

  out filter_params_string(filter_args, @fppr_methods)
  little_tables.each do |tbl|
    out tbl.to_formatted_string(nil, ' ')
    out "-----------------------------------------------\n"
  end
end
675
+
676
+
677
+
678
# Header printed above the tables of a filter round: a rule line followed by
# the thresholds in use and a trailing newline.
# filter_args is [x1, x2, x3, deltacn, ppm]; fppr_methods is not used.
def filter_params_string(filter_args, fppr_methods)
  x1, x2, x3, deltacn, ppm = filter_args
  rule = "=========================================================================="
  thresholds = " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} "
  "#{rule}\n#{thresholds}\n"
end
689
+
690
# Builds the Table reported for one spec_id.
#   title   : spec_id.passed_in_filename
#   columns : 'num' followed by one "<method>%" column per fppr method
#   rows    : one per reported group, labeled through cat_labels, holding
#             the size of normal_results[group] and then, per method, the
#             second element of fppr_results[group][method]
# args is accepted but not used.
def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels)
  title = spec_id.passed_in_filename
  col_labels = ['num'] + fppr_methods.map {|mth| "#{mth}%" }
  row_labels = groups_reporting.map {|grp| cat_labels[grp] }

  rows = groups_reporting.map do |grp|
    [normal_results[grp].size] + fppr_methods.map {|mth| fppr_results[grp][mth][1] }
  end

  Table.new(rows, row_labels, col_labels, title)
end
708
+
709
# Collapses the five thresholds [x1, x2, x3, deltacn, ppm] into one number:
#   x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
def combined_score(filter_args)
  x1, x2, x3, deltacn, ppm = filter_args
  x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
end
713
+
714
# Turns one line of (already chomped) filter input into a new threshold
# array [x1, x2, x3, dcn, ppm].
#
#   'q'                 exits the program
#   blank reply         returns base itself (re-run the current values)
#   "<x1> <x2> ..."     positional values fill the slots from the left
#   "dcn:<v> ppm:<v>"   keyed values (x1, x2, x3, dcn, ppm) set their slot
#                       and win over positional values for the same slot
#
# Returns the new thresholds as an array of Floats (base is updated in place
# to the same values), or false -- leaving base untouched -- when reply is
# nil, holds a value that is not a number, or has more positional values
# than there are slots.  An unknown key is reported with "BAD ARG" and
# otherwise ignored.
def prep_reply(reply, base)
  exit if reply == 'q'
  return false if reply.nil?
  return base if reply =~ /^\s*$/

  slots = { 'x1' => 0, 'x2' => 1, 'x3' => 2, 'dcn' => 3, 'ppm' => 4 }
  # split without a pattern also discards leading whitespace, which would
  # otherwise produce an empty first value and shift every slot
  positional, keyed = reply.split.partition {|token| !token.include?(':') }

  if positional.size > base.size
    out "BAD ARG: #{positional[base.size]}"
    return false
  end

  # [slot index, raw text, text to show when it is bad]
  assignments = []
  positional.each_with_index {|token, i| assignments << [i, token, token] }
  keyed.each do |token|
    (k, v) = token.split(':')
    if slots.key?(k)
      assignments << [slots[k], v, token]
    else
      out "BAD ARG: #{k}:#{v}"
    end
  end

  updated = base.dup
  assignments.each do |index, raw, shown|
    begin
      updated[index] = Float(raw)
    rescue ArgumentError, TypeError
      out "BAD ARG: #{shown}"
      return false
    end
  end

  updated = updated.map {|v| v.to_f }
  base.replace(updated)
  updated
end
756
+
757
# Returns the prefiltered spec_id object for file.
#
# When "<file>.prefiltered.msh" exists, the object is read back from that
# marshal cache and returned as stored.  Otherwise the file is read with
# SpecID.new, passed_in_filename is set to file, top_peps_prefilter! is run
# and, if opt.marshal is set, the result is written to the cache.
#
# The cache is opened in binary mode for both reading and writing: marshal
# data is binary and is corrupted by newline translation in text mode.
#
# NOTE(security): Marshal.load can instantiate arbitrary objects, so a
# .prefiltered.msh file must only come from a trusted source.
def file_to_prefiltered_spec_id(file, opt)
  marshal_file = file + ".prefiltered.msh"
  if File.exist?(marshal_file)
    return File.open(marshal_file, "rb") {|fh| Marshal.load(fh) }
  end

  spec_id = SpecID.new(file)
  spec_id.passed_in_filename = file
  spec_id.top_peps_prefilter!
  ## marshal it!
  if opt.marshal
    File.open(marshal_file, "wb") {|fh| Marshal.dump(spec_id, fh) }
  end
  spec_id
end
777
+
778
# Help text for interactive filtering, as one newline-joined string (no
# trailing newline).
def interactive_help
  banner = "********************************************************"
  [
    banner,
    "INTERACTIVE FILTERING HELP:",
    "enter: <x1> <x2> <x3> <dcn> <ppm>",
    "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> ppm:<ppm>",
    "or : dcn:<dcn>",
    "or : <x1> <x2> ppm:<ppm>",
    "etc...",
    "<enter> to (re)run current values",
    "'q' to quit",
    banner,
  ].join("\n")
end
792
+
793
+
794
+ end