mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/lib/spec_id.rb CHANGED
@@ -7,7 +7,7 @@ require 'sample_enzyme' # for others
7
7
  require 'spec_id/bioworks'
8
8
  require 'spec_id/sequest'
9
9
  require 'spec_id/proph'
10
- require 'spec_id/false_positive_rate'
10
+ require 'spec_id/precision'
11
11
 
12
12
 
13
13
  class Mass
@@ -112,11 +112,12 @@ class SpecID
112
112
  "<#{self.class} #peps=\"#{peps.size}\">"
113
113
  end
114
114
 
115
- # returns the top peptide hits per dta (first_scan + charge)
115
+ # returns the top peptide hits per file dta (first_scan + charge)
116
116
  # all hits with same score as top score are returned
117
117
  # assumes that all fields are strings...
118
118
  # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
119
119
  # deletes the protein array (but not relevant proteins)
120
+ # intended hash key: [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]; NOTE(review): the active code below keys on [split sequence, charge] instead -- confirm
120
121
  def top_peps_prefilter!
121
122
  peps.each do |pep|
122
123
  pep.xcorr = pep.xcorr.to_f
@@ -127,7 +128,8 @@ class SpecID
127
128
  end
128
129
  # get the top peptide by firstscan/charge (equivalent to .out files)
129
130
  top_peps = []
130
- self.peps.hash_by {|pep| [pep.first_scan.to_i, pep.charge.to_i]}.map do |k,v|
131
+ #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
132
+ self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
131
133
  best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
132
134
  top_score = best_to_worst.first.xcorr
133
135
  best_to_worst.each do |pep|
@@ -158,6 +160,7 @@ class SpecID
158
160
  pep_deltacn = pep.deltacn
159
161
  pep_charge = pep.charge
160
162
  (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
163
+ #truth = (pep_deltacn >= deltacn) and
161
164
  (
162
165
  (pep_charge == 1 && pep.xcorr >= x1) or
163
166
  (pep_charge == 2 && pep.xcorr >= x2) or
@@ -166,6 +169,8 @@ class SpecID
166
169
  ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
167
170
  end
168
171
 
172
+ #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
173
+
169
174
  hash = peps_passed.hash_by(:prot)
170
175
 
171
176
  prots_passed = hash.map do |prot,pep_arr|
@@ -173,14 +178,15 @@ class SpecID
173
178
  prot
174
179
  end
175
180
  [prots_passed, peps_passed]
181
+ #[prots_passed, peps_passed, deltacnstar_cnt]
176
182
  else
177
183
  abort "#{kind} not implemented"
178
184
  end
179
185
  end
180
186
 
181
187
  ## basically, this is the command line wrapper
182
- def self.false_positive_rate(argv)
183
- SpecID::FalsePositiveRate.new.run_cmd_line(argv)
188
+ def self.precision(argv)
189
+ SpecID::Precision.new.run_cmd_line(argv)
184
190
  end
185
191
 
186
192
 
@@ -266,16 +272,6 @@ class SpecID
266
272
  return tp, fp
267
273
  end
268
274
 
269
- # type_of_analysis can be (:precision|...)
270
- def area_under_curve(items, fp_prefix)
271
- if items == :prots
272
- (tp,fp) = classify_by_prefix(items, fp_prefix)
273
- (tp, prec, fpr2) = tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
274
-
275
-
276
- ############################################## HERERERERER!!!!
277
- end
278
- end
279
275
 
280
276
  # returns a proc for getting all probabilities so that an ascending sort
281
277
  # will put the best scores first
@@ -299,22 +295,43 @@ class SpecID
299
295
  end
300
296
  end
301
297
 
298
+ # sorts the probabilities and then
299
+ # calcs predicted number hits and precision for protein probabilities
300
+ # (summing probabilities)
301
+ # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
302
+ # called false positive rate
303
+ # ppv = 1 - SUM(1-probX)/#prots; returns [num_prots, ppv]
304
+ def num_hits_and_ppv_for_protein_prophet_probabilities
305
+ current_sum_one_minus_prob = 0.0
306
+ num_prots = []
307
+ ppv = []
308
+ prot_cnt = 0
309
+ probs = prots.map {|v| v.probability}
310
+ sorted = probs.sort.reverse
311
+ sorted.each do |prob|
312
+ prot_cnt += 1
313
+ num_prots << prot_cnt
314
+ current_sum_one_minus_prob += 1.0 - prob
315
+ ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
316
+ # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
317
+ end
318
+ [num_prots, ppv]
319
+ end
320
+
302
321
  # convenience method for the common task of determining precision for
303
322
  # proteins (with decoy proteins found by prefix)
304
- # returns (tps1, precs, fprs)
305
- def tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
323
+ # returns (num_hits, precision)
324
+ def num_hits_and_ppv_for_prob(fp_prefix)
306
325
  regex = /^#{Regexp.escape(fp_prefix)}/
307
326
  prob_proc = probability_proc
308
327
  myproc = proc { |prt|
309
328
  if prt.reference =~ regex ; false
310
329
  else ; true end
311
330
  }
312
- tp, fp = rank_and_classify(:prots, prob_proc, myproc)
313
- tps1, precs = by_tps(:precision, tp, fp)
314
- tps2, fprs = by_tps(:fpr2_times2, tp, fp)
315
- if tps1 != tps2 ; puts "true positives not the same for precision and fpr2_times2. Exiting"
316
- end
317
- [tps1, precs, fprs]
331
+ real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
332
+
333
+ (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
334
+ [num_hits, precision]
318
335
  end
319
336
 
320
337
  def method_missing(symbol, *args)
@@ -389,11 +406,17 @@ class SpecID
389
406
  sorted_probabilities(peps)
390
407
  end
391
408
 
409
+ ##########################################################################
410
+ # WARNING! These might be dangerous to your health if there are multiple
411
+ # files collected in your bioworks file
412
+ ##########################################################################
413
+
392
414
  # (prob_list_by_min, prob_list_by_best10)
393
415
  # returns 2 sorted lists of probabilities based on:
394
416
  # 1. best peptide hit
395
417
  # 2. top 10 peptide hits
396
418
  # on a per scan basis
419
+ # NOTE: you may want to hash on base_name first!
397
420
  def pep_probs_by_scan
398
421
  hash = peps.hash_by(:first_scan, :last_scan)
399
422
  return min_and_best10(hash)
@@ -402,6 +425,7 @@ class SpecID
402
425
 
403
426
  #(prob_list_by_min, prob_list_by_best10)
404
427
  # same as pep_probs_by_scan but per charge state
428
+ # NOTE: you may want to hash on base_name first!
405
429
  def pep_probs_by_scan_charge
406
430
  hash = peps.hash_by(:first_scan, :last_scan, :charge)
407
431
  return min_and_best10(hash)
@@ -410,6 +434,7 @@ class SpecID
410
434
  # (prob_list_by_min)
411
435
  # hashes on seq-charge and returns the sorted list of probabilities of top
412
436
  # hit per seq-charge
437
+ # NOTE: you may want to hash on base_name first!
413
438
  def pep_probs_by_seq_charge
414
439
  hash = peps.hash_by(:sequence, :charge)
415
440
  min_peptides = hash.collect do |k,v|
@@ -418,6 +443,42 @@ class SpecID
418
443
  sorted_probabilities(min_peptides)
419
444
  end
420
445
 
446
+ ##########################################################################
447
+ # USE these if you have multiple files in your bioworks.xml file
448
+ ##########################################################################
449
+ # (prob_list_by_min, prob_list_by_best10)
450
+ # returns 2 sorted lists of probabilities based on:
451
+ # 1. best peptide hit
452
+ # 2. top 10 peptide hits
453
+ # on a per base_name + scan basis
454
+ # NOTE: this variant already hashes on base_name first
455
+ def pep_probs_by_bn_scan
456
+ hash = peps.hash_by(:base_name, :first_scan, :last_scan)
457
+ return min_and_best10(hash)
458
+ end
459
+
460
+
461
+ #(prob_list_by_min, prob_list_by_best10)
462
+ # same as pep_probs_by_bn_scan but per charge state
463
+ # NOTE: this variant already hashes on base_name first
464
+ def pep_probs_by_bn_scan_charge
465
+ hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
466
+ return min_and_best10(hash)
467
+ end
468
+
469
+ # (prob_list_by_min)
470
+ # hashes on base_name-seq-charge and returns the sorted list of probabilities of top
471
+ # hit per base_name-seq-charge
472
+ # NOTE: this variant already hashes on base_name first
473
+ def pep_probs_by_bn_seq_charge
474
+ hash = peps.hash_by(:base_name, :sequence, :charge)
475
+ min_peptides = hash.collect do |k,v|
476
+ v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
477
+ end
478
+ sorted_probabilities(min_peptides)
479
+ end
480
+
481
+
421
482
  # A Generic spectraID protein
422
483
  class Prot
423
484
  # probability is always a float!
@@ -458,6 +519,23 @@ end
458
519
  # concatenation into a file
459
520
  module SpecIDXML
460
521
 
522
+ Special_chrs_hash = {
523
+ '"' => '&quot;',
524
+ '&' => '&amp;',
525
+ "'" => '&apos;',
526
+ '<' => '&lt;',
527
+ '>' => '&gt;',
528
+ }
529
+
530
+ # substitutes special xml chars
531
+ def escape_special_chars(string)
532
+ string.split('').map do |char|
533
+ if Special_chrs_hash.key? char ; Special_chrs_hash[char]
534
+ # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
535
+ else ; char end
536
+ end.join
537
+ end
538
+
461
539
  $DEPTH = 0
462
540
 
463
541
  def tabs
@@ -486,6 +564,12 @@ module SpecIDXML
486
564
  "#{tabs}<#{element} #{att_string}/>\n"
487
565
  end
488
566
 
567
+ # takes the element name as an argument (does not read '@xml_element_name')
568
+ # displays all *instance_variables* (does not call methods!)
569
+ def short_element_xml_from_instance_vars(element_name)
570
+ string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
571
+ "#{tabs}<#{element_name} #{string}/>\n"
572
+ end
489
573
 
490
574
  # takes an element as a symbol and returns the
491
575
  def element_xml_no_atts(element)
data/release_notes.txt ADDED
@@ -0,0 +1,11 @@
1
+
2
+ Note that two potentially significant bugs in the software were corrected (see the
3
+ changelog). I haven't finished modifying the tests to reflect these changes,
4
+ but I wanted to get the faulty software off the top of the stack. A new
5
+ release will shortly follow that passes all tests. Use this release only as a
6
+ correction to the previous.
7
+
8
+ tests currently failing:
9
+ gi
10
+ spec_id
11
+ id_precision
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ ## The yeast Scal db mean background is: 0.00984
4
+ ## The yeast Cysteine background freq is: 0.0131986582396467
5
+ pep_seq_re = /<search_hit .* peptide="(\w+)"/o
6
+ pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
7
+
8
+ if ARGV.size != 3
9
+ puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
10
+ puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
11
+ abort
12
+ end
13
+
14
+ def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
15
+ File.open(base_toplot, "w") do |fh|
16
+ fh.puts 'XYData'
17
+ fh.puts base
18
+ fh.puts title
19
+ fh.puts xaxis
20
+ fh.puts yaxis
21
+ cats.each do |ar|
22
+ fh.puts ar.join(" & ")
23
+ ar.each do |a|
24
+ fh.puts hash[a].join(" ")
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ ############################################################################
31
+ #### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
32
+ #### CHANGE HIM THERE (eventually we need to put him in a lib file)
33
+ # (actual # with cys, expected # with cys, total#peptides,
34
+ # mean_fraction_of_cysteines_true, std)
35
+ # PepHit(C) = Peptide containing cysteine
36
+ # # Total PepHit(C) # Observed Bad Pep (C)
37
+ # ------------------ proportional_to ----------------------
38
+ # # Total PepHit # Total Bad PepHit (X)
39
+ def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
40
+
41
+ # the number of bona fide BAD cysteine hits
42
+ # (some of the cysteine hits (~5%) are true positives)
43
+
44
+ ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
45
+ if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
46
+ total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
47
+ fpr = total_number_false / total_peptides
48
+ [fpr, total_number_false]
49
+ end
50
+ ############################################################################
51
+
52
+
53
+
54
+
55
+ (cysteine_background_freq, background_freq, file) = ARGV
56
+ cysteine_background_freq = cysteine_background_freq.to_f
57
+ background_freq = background_freq.to_f
58
+
59
+ seq_probs = []
60
+ last_seq_prob = nil
61
+ File.open(file) do |fh|
62
+ fh.each do |line|
63
+ if line =~ pep_seq_re
64
+ ar = Array.new(2)
65
+ ar[0] = $1
66
+ seq_probs << ar
67
+ last_seq_prob = ar
68
+ elsif line =~ pep_prob_re
69
+ last_seq_prob[1] = $1.to_f
70
+ end
71
+ end
72
+ end
73
+
74
+ #seq_probs.each do |seq|
75
+ # if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
76
+ # abort "BAD PARSING!!"
77
+ # end
78
+ #end
79
+ amino_acid_as_st = 'C'
80
+
81
+ sorted = seq_probs.sort_by {|v| v[1] }.reverse
82
+
83
+ ## traverse the peptides
84
+ actual_cys_containing_peps = 0
85
+ expected_cys_containing_peps = 0.0
86
+ current_sum_one_minus_prob = 0.0
87
+ prob_estimated_fpr = 0.0
88
+ pep_cnt = 0
89
+ one_minus_freq = 1.0 - cysteine_background_freq
90
+
91
+ ## tabulate:
92
+ pep_cnts = []
93
+ probs = []
94
+ prob_fprs = []
95
+ prob_tps = []
96
+ cys_fprs = []
97
+ cys_tps = []
98
+ fpr_diff = []
99
+
100
+
101
+ sorted.each do |ar|
102
+ pep_cnt += 1
103
+
104
+ pep = ar[0]
105
+ prob = ar[1]
106
+
107
+ ## Cysteine FPR: ##
108
+ # Expected:
109
+ expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
110
+ # Actual:
111
+ if pep.include?(amino_acid_as_st)
112
+ actual_cys_containing_peps += 1
113
+ end
114
+ (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
115
+ cys_tp = pep_cnt.to_f - total_num_false_by_cys
116
+
117
+
118
+ ## FPR by prob: ##
119
+ # SUM(1-probX)/#peps
120
+ current_sum_one_minus_prob += 1.0 - prob
121
+ prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
122
+ prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
123
+
124
+ ## GRAB or report the data:
125
+ pep_cnts << pep_cnt
126
+ probs << prob
127
+ prob_fprs << prob_estimated_fpr
128
+ prob_tps << prob_tp
129
+ cys_fprs << cys_fpr
130
+ cys_tps << cys_tp
131
+ fpr_diff << prob_estimated_fpr - cys_fpr
132
+
133
+ #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
134
+ end
135
+
136
+ hash = {
137
+ 'pep_cnts' => pep_cnts,
138
+ 'probs' => probs,
139
+ 'prob_fprs' => prob_fprs,
140
+ 'prob_tps' => prob_tps,
141
+ 'cys_fprs' => cys_fprs,
142
+ 'cys_tps' => cys_tps,
143
+ 'fpr_diff' => fpr_diff,
144
+ }
145
+
146
+
147
+ real_base = file.sub(/\.xml/,'')
148
+
149
+
150
+
151
+ ## TPS vs FPR
152
+ base = real_base.dup
153
+ base << "." << "tps_vs_fpr"
154
+ base_toplot = base + '.to_plot'
155
+ title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
156
+ xaxis = "TPs"
157
+ yaxis = "FPR"
158
+ cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
159
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
160
+
161
+ ## PEPHITS vs FPR
162
+ base = real_base.dup
163
+ base << "." << "num_pep_hits_vs_fpr"
164
+ base_toplot = base + '.to_plot'
165
+ title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
166
+ xaxis = "num peptide hits"
167
+ yaxis = "FPR"
168
+ cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
169
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
170
+
171
+ ## PEPHITS VS FPR DIFF
172
+ base = real_base.dup
173
+ base << "." << "num_pep_hits_vs_fpr_diff"
174
+ base_toplot = base + '.to_plot'
175
+ title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
176
+ xaxis = "num peptide hits"
177
+ yaxis = "FPR diff (prob - cysteine)"
178
+ cats = [['pep_cnts', 'fpr_diff']]
179
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
180
+
181
+ ## PROB VS FPR DIFF
182
+ base = real_base.dup
183
+ base << "." << "prob_vs_fpr_diff"
184
+ base_toplot = base + '.to_plot'
185
+ title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
186
+ xaxis = "peptide probability"
187
+ yaxis = "FPR diff (prob - cysteine)"
188
+ cats = [['probs', 'fpr_diff']]
189
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
190
+
191
+
192
+
193
+ =begin
194
+
195
+ returns [number_of_prots, actual_fpr]
196
+ def num_prots_above_fpr(prots, desired_fpr)
197
+ current_fpr_rate_percent = 0.0
198
+ previous_fpr_rate_percent = 0.0
199
+ current_sum_one_minus_prob = 0.0
200
+ proteins_within_fpr = 0
201
+ actual_fpr = nil
202
+ already_found = false
203
+ prot_cnt = 0
204
+ prots.each do |prot|
205
+ prot_cnt += 1
206
+ # SUM(1-probX)/#prots
207
+ current_sum_one_minus_prob += 1.0 - prot._probability.to_f
208
+ current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
209
+
210
+ if current_fpr_rate_percent > desired_fpr && !already_found
211
+ actual_fpr = previous_fpr_rate_percent
212
+ proteins_within_fpr = prot_cnt
213
+ already_found = true
214
+ end
215
+ previous_fpr_rate_percent = current_fpr_rate_percent
216
+ end
217
+ [proteins_within_fpr, actual_fpr]
218
+ end
219
+
220
+ =end
221
+
222
+
223
+
224
+
225
+
226
+
@@ -80,13 +80,13 @@ def number_passing(peps)
80
80
  np = {}
81
81
  np["PepProts"] = filter(peps).size
82
82
 
83
- by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge).values
83
+ by_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge).values
84
84
  analyze(by_scan_charge, "ScanCharge", np)
85
85
 
86
- by_scan = peps.hash_by(:first_scan, :last_scan).values
86
+ by_scan = peps.hash_by(:base_name, :first_scan, :last_scan).values
87
87
  analyze(by_scan, "Scan", np)
88
88
 
89
- by_seq_charge = peps.hash_by(:sequence, :charge).values
89
+ by_seq_charge = peps.hash_by(:base_name, :sequence, :charge).values
90
90
  analyze(by_seq_charge, "SeqCharge", np)
91
91
 
92
92
  np
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'vec'
4
+
5
+ # For the SCer yeast db and the orbi mudpit7 data, the mean_actual_vs_expected fraction
6
+ # is 0.0101409563168847
7
+
8
+ # <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
9
+
10
+ def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
11
+ File.open(base_toplot, "w") do |fh|
12
+ fh.puts 'XYData'
13
+ fh.puts base
14
+ fh.puts title
15
+ fh.puts xaxis
16
+ fh.puts yaxis
17
+ cats.each do |ar|
18
+ fh.puts ar.join(" & ")
19
+ ar.each do |a|
20
+ fh.puts hash[a].join(" ")
21
+ end
22
+ end
23
+ end
24
+ system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
25
+ end
26
+
27
+ peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
28
+
29
+ unless ARGV.size == 2
30
+ abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
31
+ end
32
+
33
+ (cysteine_background_freq, file) = ARGV
34
+
35
+ # each pep = [nsp_prob, init_prob, SEQUENCE]
36
+ peps = []
37
+ File.open(file) do |fh|
38
+ fh.each do |line|
39
+ if line =~ peptide_re
40
+ peps << [$3.to_f,$2.to_f,$1]
41
+ end
42
+ end
43
+ end
44
+
45
+
46
+ amino_acid_as_st = 'C'
47
+ one_minus_freq = 1.0 - cysteine_background_freq.to_f
48
+ actual_cys_containing_peps = 0
49
+ expected_cys_containing_peps = 0.0
50
+ current_sum_one_minus_prob = 0.0
51
+ prob_estimated_fpr = 0.0
52
+ pep_cnt = 0
53
+
54
+ the_probs = []
55
+ the_fractions = []
56
+ special_probs = []
57
+
58
+
59
+
60
+
61
+ #peps.sort.reverse.each do |ar|
62
+ #peps.sort.each do |ar|
63
+ peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
64
+ (nsp_prob, init_prob, pep) = ar
65
+ ## Cysteine FPR: ##
66
+ # Expected:
67
+ expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
68
+ # Actual:
69
+ if pep.include?(amino_acid_as_st)
70
+ actual_cys_containing_peps += 1
71
+ end
72
+ fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
73
+
74
+ special_prob = (3.0 * nsp_prob) + init_prob
75
+
76
+ ## Get the final fraction
77
+ #if special_prob < 4.0
78
+ # #puts the_fractions.join(" ")
79
+ # puts the_fractions.last
80
+ # abort
81
+ #end
82
+
83
+ # gather data to plot
84
+ the_probs << nsp_prob
85
+ special_probs << special_prob
86
+ the_fractions << fraction_ac_exp
87
+
88
+ end
89
+
90
+
91
+
92
+ hash = {
93
+ 'probs' => the_probs,
94
+ 'fractions' => the_fractions,
95
+ 'special_probs' => special_probs,
96
+ }
97
+
98
+ real_base = file.sub(/\.xml/,'')
99
+
100
+
101
+ =begin
102
+ ## PROB VS FPR DIFF
103
+ base = real_base.dup
104
+ base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
105
+ base_toplot = base + '.to_plot'
106
+ title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
107
+ xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
108
+ yaxis = "fraction with cysteines (actual/expected)"
109
+ cats = [['probs', 'fractions']]
110
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
111
+ =end
112
+
113
+
114
+ =begin
115
+ ## PROB VS FPR DIFF
116
+ base = real_base.dup
117
+ base << "." << "prob_vs_actual_expected_fraction"
118
+ base_toplot = base + '.to_plot'
119
+ title = "peptide prob vs fraction with cysteines (actual/expected)"
120
+ xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
121
+ yaxis = "fraction with cysteines (actual/expected)"
122
+ cats = [['probs', 'fractions']]
123
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
124
+ =end
125
+
126
+ ## SPECIAL PROB VS ACTUAL/EXPECTED CYSTEINE FRACTION
127
+ base = real_base.dup
128
+ base << "." << "special_prob_vs_actual_expected_fraction"
129
+ base_toplot = base + '.to_plot'
130
+ title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
131
+ xaxis = "(3 * nsp_prob) + init_prob"
132
+ yaxis = "fraction with cysteines (actual/expected)"
133
+ cats = [['special_probs', 'fractions']]
134
+ plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
135
+
136
+
137
+
@@ -109,10 +109,12 @@ def run_sequest ; "Run Sequest with a Normal and an Inverse Database
109
109
 
110
110
  If you don't already have one, here's how to make an inverse database:
111
111
 
112
- fasta_mod.rb invert <yourfile.fasta>
112
+ fasta_shaker.rb reverse <yourfile.fasta>
113
113
 
114
- This will create a file with the trailing tag '_INV.fasta'. Just type
115
- `fasta_mod.rb` for more details.
114
+ This will create a file with the trailing tag '_reverse.fasta'. Just type
115
+ `fasta_shaker.rb` for more details.
116
+
117
+ Run sequest with 'report duplicate references' set to >= 40
116
118
  "
117
119
  end
118
120
 
@@ -166,11 +168,13 @@ def run_sequest ; "Run Sequest with a Concatenated Inverse Database
166
168
 
167
169
  If you don't already have one, here's how to make one:
168
170
 
169
- fasta_cat_mod.rb invert <yourfile.fasta>
171
+ fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
172
+
173
+ This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
174
+ inverted protein name will be prefixed with 'INV_'. Just type
175
+ `fasta_shaker.rb` for more details.
170
176
 
171
- This will create a file with the trailing tag '_CAT_INV.fasta'. Each inverted
172
- protein name will be prefixed with 'INV_'. Just type `fasta_cat_mod.rb` for
173
- more details.
177
+ Run sequest with 'report duplicate references' set to >= 40
174
178
  "
175
179
  end
176
180