mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/lib/spec_id.rb CHANGED
@@ -7,7 +7,7 @@ require 'sample_enzyme' # for others
7
7
  require 'spec_id/bioworks'
8
8
  require 'spec_id/sequest'
9
9
  require 'spec_id/proph'
10
- require 'spec_id/false_positive_rate'
10
+ require 'spec_id/precision'
11
11
 
12
12
 
13
13
  class Mass
@@ -112,11 +112,12 @@ class SpecID
112
112
  "<#{self.class} #peps=\"#{peps.size}\">"
113
113
  end
114
114
 
115
- # returns the top peptide hits per dta (first_scan + charge)
115
+ # returns the top peptide hits per file dta (first_scan + charge)
116
116
  # all hits with same score as top score are returned
117
117
  # assumes that all fields are strings...
118
118
  # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
119
119
  # deletes the protein array (but not relevant proteins)
120
+ # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
120
121
  def top_peps_prefilter!
121
122
  peps.each do |pep|
122
123
  pep.xcorr = pep.xcorr.to_f
@@ -127,7 +128,8 @@ class SpecID
127
128
  end
128
129
  # get the top peptide by firstscan/charge (equivalent to .out files)
129
130
  top_peps = []
130
- self.peps.hash_by {|pep| [pep.first_scan.to_i, pep.charge.to_i]}.map do |k,v|
131
+ #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
132
+ self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
131
133
  best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
132
134
  top_score = best_to_worst.first.xcorr
133
135
  best_to_worst.each do |pep|
@@ -158,6 +160,7 @@ class SpecID
158
160
  pep_deltacn = pep.deltacn
159
161
  pep_charge = pep.charge
160
162
  (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
163
+ #truth = (pep_deltacn >= deltacn) and
161
164
  (
162
165
  (pep_charge == 1 && pep.xcorr >= x1) or
163
166
  (pep_charge == 2 && pep.xcorr >= x2) or
@@ -166,6 +169,8 @@ class SpecID
166
169
  ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
167
170
  end
168
171
 
172
+ #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
173
+
169
174
  hash = peps_passed.hash_by(:prot)
170
175
 
171
176
  prots_passed = hash.map do |prot,pep_arr|
@@ -173,14 +178,15 @@ class SpecID
173
178
  prot
174
179
  end
175
180
  [prots_passed, peps_passed]
181
+ #[prots_passed, peps_passed, deltacnstar_cnt]
176
182
  else
177
183
  abort "#{kind} not implemented"
178
184
  end
179
185
  end
180
186
 
181
187
  ## basically, this is the command line wrapper
182
- def self.false_positive_rate(argv)
183
- SpecID::FalsePositiveRate.new.run_cmd_line(argv)
188
+ def self.precision(argv)
189
+ SpecID::Precision.new.run_cmd_line(argv)
184
190
  end
185
191
 
186
192
 
@@ -266,16 +272,6 @@ class SpecID
266
272
  return tp, fp
267
273
  end
268
274
 
269
- # type_of_analysis can be (:precision|...)
270
- def area_under_curve(items, fp_prefix)
271
- if items == :prots
272
- (tp,fp) = classify_by_prefix(items, fp_prefix)
273
- (tp, prec, fpr2) = tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
274
-
275
-
276
- ############################################## HERERERERER!!!!
277
- end
278
- end
279
275
 
280
276
  # returns a proc for getting all probabilities so that an ascending sort
281
277
  # will put the best scores first
@@ -299,22 +295,43 @@ class SpecID
299
295
  end
300
296
  end
301
297
 
298
+ # sorts the probabilities and then
299
+ # calcs predicted number hits and precision for protein probabilities
300
+ # (summing probabilities)
301
+ # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
302
+ # called false positive rate
303
+ # SUM(1-probX)/#prots
304
+ def num_hits_and_ppv_for_protein_prophet_probabilities
305
+ current_sum_one_minus_prob = 0.0
306
+ num_prots = []
307
+ ppv = []
308
+ prot_cnt = 0
309
+ probs = prots.map {|v| v.probability}
310
+ sorted = probs.sort.reverse
311
+ sorted.each do |prob|
312
+ prot_cnt += 1
313
+ num_prots << prot_cnt
314
+ current_sum_one_minus_prob += 1.0 - prob
315
+ ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
316
+ # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
317
+ end
318
+ [num_prots, ppv]
319
+ end
320
+
302
321
  # convenience method for the common task of determining precision for
303
322
  # proteins (with decoy proteins found by prefix)
304
- # returns (tps1, precs, fprs)
305
- def tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
323
+ # returns (num_hits, precision)
324
+ def num_hits_and_ppv_for_prob(fp_prefix)
306
325
  regex = /^#{Regexp.escape(fp_prefix)}/
307
326
  prob_proc = probability_proc
308
327
  myproc = proc { |prt|
309
328
  if prt.reference =~ regex ; false
310
329
  else ; true end
311
330
  }
312
- tp, fp = rank_and_classify(:prots, prob_proc, myproc)
313
- tps1, precs = by_tps(:precision, tp, fp)
314
- tps2, fprs = by_tps(:fpr2_times2, tp, fp)
315
- if tps1 != tps2 ; puts "true positives not the same for precision and fpr2_times2. Exiting"
316
- end
317
- [tps1, precs, fprs]
331
+ real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
332
+
333
+ (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
334
+ [num_hits, precision]
318
335
  end
319
336
 
320
337
  def method_missing(symbol, *args)
@@ -389,11 +406,17 @@ class SpecID
389
406
  sorted_probabilities(peps)
390
407
  end
391
408
 
409
+ ##########################################################################
410
+ # WARNING! These might be dangerous to your health if there are multiple
411
+ # files collected in your bioworks file
412
+ ##########################################################################
413
+
392
414
  # (prob_list_by_min, prob_list_by_best10)
393
415
  # returns 2 sorted lists of probabilities based on:
394
416
  # 1. best peptide hit
395
417
  # 2. top 10 peptide hits
396
418
  # on a per scan basis
419
+ # NOTE: you may want to hash on base_name first!
397
420
  def pep_probs_by_scan
398
421
  hash = peps.hash_by(:first_scan, :last_scan)
399
422
  return min_and_best10(hash)
@@ -402,6 +425,7 @@ class SpecID
402
425
 
403
426
  #(prob_list_by_min, prob_list_by_best10)
404
427
  # same as pep_probs_by_scan but per charge state
428
+ # NOTE: you may want to hash on base_name first!
405
429
  def pep_probs_by_scan_charge
406
430
  hash = peps.hash_by(:first_scan, :last_scan, :charge)
407
431
  return min_and_best10(hash)
@@ -410,6 +434,7 @@ class SpecID
410
434
  # (prob_list_by_min)
411
435
  # hashes on seq-charge and returns the sorted list of probabilities of top
412
436
  # hit per seq-charge
437
+ # NOTE: you may want to hash on base_name first!
413
438
  def pep_probs_by_seq_charge
414
439
  hash = peps.hash_by(:sequence, :charge)
415
440
  min_peptides = hash.collect do |k,v|
@@ -418,6 +443,42 @@ class SpecID
418
443
  sorted_probabilities(min_peptides)
419
444
  end
420
445
 
446
+ ##########################################################################
447
+ # USE these if you have multiple files in your bioworks.xml file
448
+ ##########################################################################
449
+ # (prob_list_by_min, prob_list_by_best10)
450
+ # returns 2 sorted lists of probabilities based on:
451
+ # 1. best peptide hit
452
+ # 2. top 10 peptide hits
453
+ # on a per scan basis
454
+ # NOTE: this variant already hashes on base_name.
455
+ def pep_probs_by_bn_scan
456
+ hash = peps.hash_by(:base_name, :first_scan, :last_scan)
457
+ return min_and_best10(hash)
458
+ end
459
+
460
+
461
+ #(prob_list_by_min, prob_list_by_best10)
462
+ # same as pep_probs_by_bn_scan but per charge state
463
+ # NOTE: this variant already hashes on base_name.
464
+ def pep_probs_by_bn_scan_charge
465
+ hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
466
+ return min_and_best10(hash)
467
+ end
468
+
469
+ # (prob_list_by_min)
470
+ # hashes on base_name-seq-charge and returns the sorted list of probabilities of top
471
+ # hit per base_name-seq-charge
472
+ # NOTE: this variant already hashes on base_name.
473
+ def pep_probs_by_bn_seq_charge
474
+ hash = peps.hash_by(:base_name, :sequence, :charge)
475
+ min_peptides = hash.collect do |k,v|
476
+ v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
477
+ end
478
+ sorted_probabilities(min_peptides)
479
+ end
480
+
481
+
421
482
  # A Generic spectraID protein
422
483
  class Prot
423
484
  # probability is always a float!
@@ -458,6 +519,23 @@ end
458
519
  # concatenation into a file
459
520
  module SpecIDXML
460
521
 
522
+ Special_chrs_hash = {
523
+ '"' => '&quot;',
524
+ '&' => '&amp;',
525
+ "'" => '&apos;',
526
+ '<' => '&lt;',
527
+ '>' => '&gt;',
528
+ }
529
+
530
+ # substitutes special xml chars
531
+ def escape_special_chars(string)
532
+ string.split('').map do |char|
533
+ if Special_chrs_hash.key? char ; Special_chrs_hash[char]
534
+ # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
535
+ else ; char end
536
+ end.join
537
+ end
538
+
461
539
  $DEPTH = 0
462
540
 
463
541
  def tabs
@@ -486,6 +564,12 @@ module SpecIDXML
486
564
  "#{tabs}<#{element} #{att_string}/>\n"
487
565
  end
488
566
 
567
+ # uses the given element_name as the tag name (does not read '@xml_element_name')
568
+ # displays all *instance_variables* (does not call methods!)
569
+ def short_element_xml_from_instance_vars(element_name)
570
+ string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
571
+ "#{tabs}<#{element_name} #{string}/>\n"
572
+ end
489
573
 
490
574
  # takes an element as a symbol and returns the
491
575
  def element_xml_no_atts(element)
data/release_notes.txt ADDED
@@ -0,0 +1,11 @@
1
+
2
+ Note: two potentially significant bugs in the software have been corrected (see the
3
+ changelog). I haven't finished modifying the tests to reflect these changes,
4
+ but I wanted to get the faulty software off the top of the stack. A new
5
+ release will shortly follow that passes all tests. Use this release only as a
6
+ correction to the previous.
7
+
8
+ tests currently failing:
9
+ gi
10
+ spec_id
11
+ id_precision
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ ## The yeast Scal db mean background is: 0.00984
4
+ ## The yeast Cysteine background freq is: 0.0131986582396467
5
+ pep_seq_re = /<search_hit .* peptide="(\w+)"/o
6
+ pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
7
+
8
+ if ARGV.size != 3
9
+ puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
10
+ puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
11
+ abort
12
+ end
13
+
14
+ def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
15
+ File.open(base_toplot, "w") do |fh|
16
+ fh.puts 'XYData'
17
+ fh.puts base
18
+ fh.puts title
19
+ fh.puts xaxis
20
+ fh.puts yaxis
21
+ cats.each do |ar|
22
+ fh.puts ar.join(" & ")
23
+ ar.each do |a|
24
+ fh.puts hash[a].join(" ")
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ ############################################################################
31
+ #### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
32
+ #### CHANGE HIM THERE (eventually we need to put him in a lib file)
33
+ # (actual # with cys, expected # with cys, total#peptides,
34
+ # mean_fraction_of_cysteines_true, std)
35
+ # PepHit(C) = Peptide containing cysteine
36
+ # # Total PepHit(C) # Observed Bad Pep (C)
37
+ # ------------------ proportional_to ----------------------
38
+ # # Total PepHit # Total Bad PepHit (X)
39
+ def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
40
+
41
+ # the number of bona fide BAD cysteine hits
42
+ # (some of the cysteine hits (~5%) are true positives)
43
+
44
+ ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
45
+ if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
46
+ total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
47
+ fpr = total_number_false / total_peptides
48
+ [fpr, total_number_false]
49
+ end
50
+ ############################################################################
51
+
52
+
53
+
54
+
55
+ (cysteine_background_freq, background_freq, file) = ARGV
56
+ cysteine_background_freq = cysteine_background_freq.to_f
57
+ background_freq = background_freq.to_f
58
+
59
+ seq_probs = []
60
+ last_seq_prob = nil
61
+ File.open(file) do |fh|
62
+ fh.each do |line|
63
+ if line =~ pep_seq_re
64
+ ar = Array.new(2)
65
+ ar[0] = $1
66
+ seq_probs << ar
67
+ last_seq_prob = ar
68
+ elsif line =~ pep_prob_re
69
+ last_seq_prob[1] = $1.to_f
70
+ end
71
+ end
72
+ end
73
+
74
+ #seq_probs.each do |seq|
75
+ # if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
76
+ # abort "BAD PARSING!!"
77
+ # end
78
+ #end
79
+ amino_acid_as_st = 'C'
80
+
81
+ sorted = seq_probs.sort_by {|v| v[1] }.reverse
82
+
83
+ ## traverse the peptides
84
+ actual_cys_containing_peps = 0
85
+ expected_cys_containing_peps = 0.0
86
+ current_sum_one_minus_prob = 0.0
87
+ prob_estimated_fpr = 0.0
88
+ pep_cnt = 0
89
+ one_minus_freq = 1.0 - cysteine_background_freq
90
+
91
+ ## tabulate:
92
+ pep_cnts = []
93
+ probs = []
94
+ prob_fprs = []
95
+ prob_tps = []
96
+ cys_fprs = []
97
+ cys_tps = []
98
+ fpr_diff = []
99
+
100
+
101
+ sorted.each do |ar|
102
+ pep_cnt += 1
103
+
104
+ pep = ar[0]
105
+ prob = ar[1]
106
+
107
+ ## Cysteine FPR: ##
108
+ # Expected:
109
+ expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
110
+ # Actual:
111
+ if pep.include?(amino_acid_as_st)
112
+ actual_cys_containing_peps += 1
113
+ end
114
+ (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
115
+ cys_tp = pep_cnt.to_f - total_num_false_by_cys
116
+
117
+
118
+ ## FPR by prob: ##
119
+ # SUM(1-probX)/#peps
120
+ current_sum_one_minus_prob += 1.0 - prob
121
+ prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
122
+ prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
123
+
124
+ ## GRAB or report the data:
125
+ pep_cnts << pep_cnt
126
+ probs << prob
127
+ prob_fprs << prob_estimated_fpr
128
+ prob_tps << prob_tp
129
+ cys_fprs << cys_fpr
130
+ cys_tps << cys_tp
131
+ fpr_diff << prob_estimated_fpr - cys_fpr
132
+
133
+ #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
134
+ end
135
+
136
+ hash = {
137
+ 'pep_cnts' => pep_cnts,
138
+ 'probs' => probs,
139
+ 'prob_fprs' => prob_fprs,
140
+ 'prob_tps' => prob_tps,
141
+ 'cys_fprs' => cys_fprs,
142
+ 'cys_tps' => cys_tps,
143
+ 'fpr_diff' => fpr_diff,
144
+ }
145
+
146
+
147
+ real_base = file.sub(/\.xml/,'')
148
+
149
+
150
+
151
+ ## TPS vs FPR
152
+ base = real_base.dup
153
+ base << "." << "tps_vs_fpr"
154
+ base_toplot = base + '.to_plot'
155
+ title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
156
+ xaxis = "TPs"
157
+ yaxis = "FPR"
158
+ cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
159
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
160
+
161
+ ## PEPHITS vs FPR
162
+ base = real_base.dup
163
+ base << "." << "num_pep_hits_vs_fpr"
164
+ base_toplot = base + '.to_plot'
165
+ title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
166
+ xaxis = "num peptide hits"
167
+ yaxis = "FPR"
168
+ cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
169
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
170
+
171
+ ## PEPHITS VS FPR DIFF
172
+ base = real_base.dup
173
+ base << "." << "num_pep_hits_vs_fpr_diff"
174
+ base_toplot = base + '.to_plot'
175
+ title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
176
+ xaxis = "num peptide hits"
177
+ yaxis = "FPR diff (prob - cysteine)"
178
+ cats = [['pep_cnts', 'fpr_diff']]
179
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
180
+
181
+ ## PROB VS FPR DIFF
182
+ base = real_base.dup
183
+ base << "." << "prob_vs_fpr_diff"
184
+ base_toplot = base + '.to_plot'
185
+ title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
186
+ xaxis = "peptide probability"
187
+ yaxis = "FPR diff (prob - cysteine)"
188
+ cats = [['probs', 'fpr_diff']]
189
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
190
+
191
+
192
+
193
+ =begin
194
+
195
+ returns [number_of_prots, actual_fpr]
196
+ def num_prots_above_fpr(prots, desired_fpr)
197
+ current_fpr_rate_percent = 0.0
198
+ previous_fpr_rate_percent = 0.0
199
+ current_sum_one_minus_prob = 0.0
200
+ proteins_within_fpr = 0
201
+ actual_fpr = nil
202
+ already_found = false
203
+ prot_cnt = 0
204
+ prots.each do |prot|
205
+ prot_cnt += 1
206
+ # SUM(1-probX)/#prots
207
+ current_sum_one_minus_prob += 1.0 - prot._probability.to_f
208
+ current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
209
+
210
+ if current_fpr_rate_percent > desired_fpr && !already_found
211
+ actual_fpr = previous_fpr_rate_percent
212
+ proteins_within_fpr = prot_cnt
213
+ already_found = true
214
+ end
215
+ previous_fpr_rate_percent = current_fpr_rate_percent
216
+ end
217
+ [proteins_within_fpr, actual_fpr]
218
+ end
219
+
220
+ =end
221
+
222
+
223
+
224
+
225
+
226
+
@@ -80,13 +80,13 @@ def number_passing(peps)
80
80
  np = {}
81
81
  np["PepProts"] = filter(peps).size
82
82
 
83
- by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge).values
83
+ by_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge).values
84
84
  analyze(by_scan_charge, "ScanCharge", np)
85
85
 
86
- by_scan = peps.hash_by(:first_scan, :last_scan).values
86
+ by_scan = peps.hash_by(:base_name, :first_scan, :last_scan).values
87
87
  analyze(by_scan, "Scan", np)
88
88
 
89
- by_seq_charge = peps.hash_by(:sequence, :charge).values
89
+ by_seq_charge = peps.hash_by(:base_name, :sequence, :charge).values
90
90
  analyze(by_seq_charge, "SeqCharge", np)
91
91
 
92
92
  np
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'vec'
4
+
5
+ # For the SCer yeast db and orbi mudpit7, the mean_actual_vs_expected fraction
6
+ # is 0.0101409563168847
7
+
8
+ # <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
9
+
10
+ def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
11
+ File.open(base_toplot, "w") do |fh|
12
+ fh.puts 'XYData'
13
+ fh.puts base
14
+ fh.puts title
15
+ fh.puts xaxis
16
+ fh.puts yaxis
17
+ cats.each do |ar|
18
+ fh.puts ar.join(" & ")
19
+ ar.each do |a|
20
+ fh.puts hash[a].join(" ")
21
+ end
22
+ end
23
+ end
24
+ system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
25
+ end
26
+
27
+ peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
28
+
29
+ unless ARGV.size == 2
30
+ abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
31
+ end
32
+
33
+ (cysteine_background_freq, file) = ARGV
34
+
35
+ # each pep = [nsp_prob, init_prob, SEQUENCE]
36
+ peps = []
37
+ File.open(file) do |fh|
38
+ fh.each do |line|
39
+ if line =~ peptide_re
40
+ peps << [$3.to_f,$2.to_f,$1]
41
+ end
42
+ end
43
+ end
44
+
45
+
46
+ amino_acid_as_st = 'C'
47
+ one_minus_freq = 1.0 - cysteine_background_freq.to_f
48
+ actual_cys_containing_peps = 0
49
+ expected_cys_containing_peps = 0.0
50
+ current_sum_one_minus_prob = 0.0
51
+ prob_estimated_fpr = 0.0
52
+ pep_cnt = 0
53
+
54
+ the_probs = []
55
+ the_fractions = []
56
+ special_probs = []
57
+
58
+
59
+
60
+
61
+ #peps.sort.reverse.each do |ar|
62
+ #peps.sort.each do |ar|
63
+ peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
64
+ (nsp_prob, init_prob, pep) = ar
65
+ ## Cysteine FPR: ##
66
+ # Expected:
67
+ expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
68
+ # Actual:
69
+ if pep.include?(amino_acid_as_st)
70
+ actual_cys_containing_peps += 1
71
+ end
72
+ fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
73
+
74
+ special_prob = (3.0 * nsp_prob) + init_prob
75
+
76
+ ## Get the final fraction
77
+ #if special_prob < 4.0
78
+ # #puts the_fractions.join(" ")
79
+ # puts the_fractions.last
80
+ # abort
81
+ #end
82
+
83
+ # gather data to plot
84
+ the_probs << nsp_prob
85
+ special_probs << special_prob
86
+ the_fractions << fraction_ac_exp
87
+
88
+ end
89
+
90
+
91
+
92
+ hash = {
93
+ 'probs' => the_probs,
94
+ 'fractions' => the_fractions,
95
+ 'special_probs' => special_probs,
96
+ }
97
+
98
+ real_base = file.sub(/\.xml/,'')
99
+
100
+
101
+ =begin
102
+ ## PROB VS FPR DIFF
103
+ base = real_base.dup
104
+ base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
105
+ base_toplot = base + '.to_plot'
106
+ title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
107
+ xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
108
+ yaxis = "fraction with cysteines (actual/expected)"
109
+ cats = [['probs', 'fractions']]
110
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
111
+ =end
112
+
113
+
114
+ =begin
115
+ ## PROB VS FPR DIFF
116
+ base = real_base.dup
117
+ base << "." << "prob_vs_actual_expected_fraction"
118
+ base_toplot = base + '.to_plot'
119
+ title = "peptide prob vs fraction with cysteines (actual/expected)"
120
+ xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
121
+ yaxis = "fraction with cysteines (actual/expected)"
122
+ cats = [['probs', 'fractions']]
123
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
124
+ =end
125
+
126
+ ## SPECIAL PROB VS ACTUAL/EXPECTED FRACTION
127
+ base = real_base.dup
128
+ base << "." << "special_prob_vs_actual_expected_fraction"
129
+ base_toplot = base + '.to_plot'
130
+ title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
131
+ xaxis = "(3 * nsp_prob) + init_prob"
132
+ yaxis = "fraction with cysteines (actual/expected)"
133
+ cats = [['special_probs', 'fractions']]
134
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
135
+
136
+
137
+
@@ -109,10 +109,12 @@ def run_sequest ; "Run Sequest with a Normal and an Inverse Database
109
109
 
110
110
  If you don't already have one, here's how to make an inverse database:
111
111
 
112
- fasta_mod.rb invert <yourfile.fasta>
112
+ fasta_shaker.rb reverse <yourfile.fasta>
113
113
 
114
- This will create a file with the trailing tag '_INV.fasta'. Just type
115
- `fasta_mod.rb` for more details.
114
+ This will create a file with the trailing tag '_reverse.fasta'. Just type
115
+ `fasta_shaker.rb` for more details.
116
+
117
+ Run sequest with 'report duplicate references' set to >= 40
116
118
  "
117
119
  end
118
120
 
@@ -166,11 +168,13 @@ def run_sequest ; "Run Sequest with a Concatenated Inverse Database
166
168
 
167
169
  If you don't already have one, here's how to make one:
168
170
 
169
- fasta_cat_mod.rb invert <yourfile.fasta>
171
+ fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
172
+
173
+ This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
174
+ inverted protein name will be prefixed with 'INV_'. Just type
175
+ `fasta_shaker.rb` for more details.
170
176
 
171
- This will create a file with the trailing tag '_CAT_INV.fasta'. Each inverted
172
- protein name will be prefixed with 'INV_'. Just type `fasta_cat_mod.rb` for
173
- more details.
177
+ Run sequest with 'report duplicate references' set to >= 40
174
178
  "
175
179
  end
176
180