mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/lib/spec_id.rb
CHANGED
@@ -7,7 +7,7 @@ require 'sample_enzyme' # for others
|
|
7
7
|
require 'spec_id/bioworks'
|
8
8
|
require 'spec_id/sequest'
|
9
9
|
require 'spec_id/proph'
|
10
|
-
require 'spec_id/false_positive_rate'
|
10
|
+
require 'spec_id/precision'
|
11
11
|
|
12
12
|
|
13
13
|
class Mass
|
@@ -112,11 +112,12 @@ class SpecID
|
|
112
112
|
"<#{self.class} #peps=\"#{peps.size}\">"
|
113
113
|
end
|
114
114
|
|
115
|
-
# returns the top peptide hits per dta (first_scan + charge)
|
115
|
+
# returns the top peptide hits per file dta (first_scan + charge)
|
116
116
|
# all hits with same score as top score are returned
|
117
117
|
# assumes that all fields are strings...
|
118
118
|
# converts xcorr, deltacn, deltamass, mass, and charge into numerical types
|
119
119
|
# deletes the protein array (but not relevant proteins)
|
120
|
+
# hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
|
120
121
|
def top_peps_prefilter!
|
121
122
|
peps.each do |pep|
|
122
123
|
pep.xcorr = pep.xcorr.to_f
|
@@ -127,7 +128,8 @@ class SpecID
|
|
127
128
|
end
|
128
129
|
# get the top peptide by firstscan/charge (equivalent to .out files)
|
129
130
|
top_peps = []
|
130
|
-
self.peps.hash_by {|pep| [pep.first_scan.to_i, pep.charge.to_i]}.map do |
|
131
|
+
#self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
|
132
|
+
self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
|
131
133
|
best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
|
132
134
|
top_score = best_to_worst.first.xcorr
|
133
135
|
best_to_worst.each do |pep|
|
@@ -158,6 +160,7 @@ class SpecID
|
|
158
160
|
pep_deltacn = pep.deltacn
|
159
161
|
pep_charge = pep.charge
|
160
162
|
(pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
|
163
|
+
#truth = (pep_deltacn >= deltacn) and
|
161
164
|
(
|
162
165
|
(pep_charge == 1 && pep.xcorr >= x1) or
|
163
166
|
(pep_charge == 2 && pep.xcorr >= x2) or
|
@@ -166,6 +169,8 @@ class SpecID
|
|
166
169
|
((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
|
167
170
|
end
|
168
171
|
|
172
|
+
#deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
|
173
|
+
|
169
174
|
hash = peps_passed.hash_by(:prot)
|
170
175
|
|
171
176
|
prots_passed = hash.map do |prot,pep_arr|
|
@@ -173,14 +178,15 @@ class SpecID
|
|
173
178
|
prot
|
174
179
|
end
|
175
180
|
[prots_passed, peps_passed]
|
181
|
+
#[prots_passed, peps_passed, deltacnstar_cnt]
|
176
182
|
else
|
177
183
|
abort "#{kind} not implemented"
|
178
184
|
end
|
179
185
|
end
|
180
186
|
|
181
187
|
## basically, this is the command line wrapper
|
182
|
-
def self.
|
183
|
-
SpecID::
|
188
|
+
def self.precision(argv)
|
189
|
+
SpecID::Precision.new.run_cmd_line(argv)
|
184
190
|
end
|
185
191
|
|
186
192
|
|
@@ -266,16 +272,6 @@ class SpecID
|
|
266
272
|
return tp, fp
|
267
273
|
end
|
268
274
|
|
269
|
-
# type_of_analysis can be (:precision|...)
|
270
|
-
def area_under_curve(items, fp_prefix)
|
271
|
-
if items == :prots
|
272
|
-
(tp,fp) = classify_by_prefix(items, fp_prefix)
|
273
|
-
(tp, prec, fpr2) = tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
|
274
|
-
|
275
|
-
|
276
|
-
############################################## HERERERERER!!!!
|
277
|
-
end
|
278
|
-
end
|
279
275
|
|
280
276
|
# returns a proc for getting all probabilities so that an ascending sort
|
281
277
|
# will put the best scores first
|
@@ -299,22 +295,43 @@ class SpecID
|
|
299
295
|
end
|
300
296
|
end
|
301
297
|
|
298
|
+
# sorts the probabilities and then
|
299
|
+
# calcs predicted number hits and precision for protein probabilities
|
300
|
+
# (summing probabilities)
|
301
|
+
# one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
|
302
|
+
# called false positive rate
|
303
|
+
# SUM(1-probX)/#prots
|
304
|
+
def num_hits_and_ppv_for_protein_prophet_probabilities
|
305
|
+
current_sum_one_minus_prob = 0.0
|
306
|
+
num_prots = []
|
307
|
+
ppv = []
|
308
|
+
prot_cnt = 0
|
309
|
+
probs = prots.map {|v| v.probability}
|
310
|
+
sorted = probs.sort.reverse
|
311
|
+
sorted.each do |prob|
|
312
|
+
prot_cnt += 1
|
313
|
+
num_prots << prot_cnt
|
314
|
+
current_sum_one_minus_prob += 1.0 - prob
|
315
|
+
ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
|
316
|
+
# current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
|
317
|
+
end
|
318
|
+
[num_prots, ppv]
|
319
|
+
end
|
320
|
+
|
302
321
|
# convenience method for the common task of determining precision for
|
303
322
|
# proteins (with decoy proteins found by prefix)
|
304
|
-
# returns (
|
305
|
-
def
|
323
|
+
# returns (num_hits, precision)
|
324
|
+
def num_hits_and_ppv_for_prob(fp_prefix)
|
306
325
|
regex = /^#{Regexp.escape(fp_prefix)}/
|
307
326
|
prob_proc = probability_proc
|
308
327
|
myproc = proc { |prt|
|
309
328
|
if prt.reference =~ regex ; false
|
310
329
|
else ; true end
|
311
330
|
}
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
end
|
317
|
-
[tps1, precs, fprs]
|
331
|
+
real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
|
332
|
+
|
333
|
+
(num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
|
334
|
+
[num_hits, precision]
|
318
335
|
end
|
319
336
|
|
320
337
|
def method_missing(symbol, *args)
|
@@ -389,11 +406,17 @@ class SpecID
|
|
389
406
|
sorted_probabilities(peps)
|
390
407
|
end
|
391
408
|
|
409
|
+
##########################################################################
|
410
|
+
# WARNING! These might be dangerous to your health if there are multiple
|
411
|
+
# files collected in your bioworks file
|
412
|
+
##########################################################################
|
413
|
+
|
392
414
|
# (prob_list_by_min, prob_list_by_best10)
|
393
415
|
# returns 2 sorted lists of probabilities based on:
|
394
416
|
# 1. best peptide hit
|
395
417
|
# 2. top 10 peptide hits
|
396
418
|
# on a per scan basis
|
419
|
+
# NOTE: you may want to hash on base_name first!
|
397
420
|
def pep_probs_by_scan
|
398
421
|
hash = peps.hash_by(:first_scan, :last_scan)
|
399
422
|
return min_and_best10(hash)
|
@@ -402,6 +425,7 @@ class SpecID
|
|
402
425
|
|
403
426
|
#(prob_list_by_min, prob_list_by_best10)
|
404
427
|
# same as pep_probs_by_scan but per charge state
|
428
|
+
# NOTE: you may want to hash on base_name first!
|
405
429
|
def pep_probs_by_scan_charge
|
406
430
|
hash = peps.hash_by(:first_scan, :last_scan, :charge)
|
407
431
|
return min_and_best10(hash)
|
@@ -410,6 +434,7 @@ class SpecID
|
|
410
434
|
# (prob_list_by_min)
|
411
435
|
# hashes on seq-charge and returns the sorted list of probabilities of top
|
412
436
|
# hit per seq-charge
|
437
|
+
# NOTE: you may want to hash on base_name first!
|
413
438
|
def pep_probs_by_seq_charge
|
414
439
|
hash = peps.hash_by(:sequence, :charge)
|
415
440
|
min_peptides = hash.collect do |k,v|
|
@@ -418,6 +443,42 @@ class SpecID
|
|
418
443
|
sorted_probabilities(min_peptides)
|
419
444
|
end
|
420
445
|
|
446
|
+
##########################################################################
|
447
|
+
# USE these if you have multiple files in your bioworks.xml file
|
448
|
+
##########################################################################
|
449
|
+
# (prob_list_by_min, prob_list_by_best10)
|
450
|
+
# returns 2 sorted lists of probabilities based on:
|
451
|
+
# 1. best peptide hit
|
452
|
+
# 2. top 10 peptide hits
|
453
|
+
# on a per scan basis
|
454
|
+
# NOTE: you may want to hash on base_name first!
|
455
|
+
def pep_probs_by_bn_scan
|
456
|
+
hash = peps.hash_by(:base_name, :first_scan, :last_scan)
|
457
|
+
return min_and_best10(hash)
|
458
|
+
end
|
459
|
+
|
460
|
+
|
461
|
+
#(prob_list_by_min, prob_list_by_best10)
|
462
|
+
# same as pep_probs_by_scan but per charge state
|
463
|
+
# NOTE: you may want to hash on base_name first!
|
464
|
+
def pep_probs_by_bn_scan_charge
|
465
|
+
hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
|
466
|
+
return min_and_best10(hash)
|
467
|
+
end
|
468
|
+
|
469
|
+
# (prob_list_by_min)
|
470
|
+
# hashes on seq-charge and returns the sorted list of probabilities of top
|
471
|
+
# hit per seq-charge
|
472
|
+
# NOTE: you may want to hash on base_name first!
|
473
|
+
def pep_probs_by_bn_seq_charge
|
474
|
+
hash = peps.hash_by(:base_name, :sequence, :charge)
|
475
|
+
min_peptides = hash.collect do |k,v|
|
476
|
+
v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
|
477
|
+
end
|
478
|
+
sorted_probabilities(min_peptides)
|
479
|
+
end
|
480
|
+
|
481
|
+
|
421
482
|
# A Generic spectraID protein
|
422
483
|
class Prot
|
423
484
|
# probability is always a float!
|
@@ -458,6 +519,23 @@ end
|
|
458
519
|
# concatenation into a file
|
459
520
|
module SpecIDXML
|
460
521
|
|
522
|
+
Special_chrs_hash = {
|
523
|
+
'"' => '&quot;',
|
524
|
+
'&' => '&amp;',
|
525
|
+
"'" => '&apos;',
|
526
|
+
'<' => '&lt;',
|
527
|
+
'>' => '&gt;',
|
528
|
+
}
|
529
|
+
|
530
|
+
# substitutes special xml chars
|
531
|
+
def escape_special_chars(string)
|
532
|
+
string.split('').map do |char|
|
533
|
+
if Special_chrs_hash.key? char ; Special_chrs_hash[char]
|
534
|
+
# if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
|
535
|
+
else ; char end
|
536
|
+
end.join
|
537
|
+
end
|
538
|
+
|
461
539
|
$DEPTH = 0
|
462
540
|
|
463
541
|
def tabs
|
@@ -486,6 +564,12 @@ module SpecIDXML
|
|
486
564
|
"#{tabs}<#{element} #{att_string}/>\n"
|
487
565
|
end
|
488
566
|
|
567
|
+
# requires that obj have attribute '@xml_element_name'
|
568
|
+
# displays all *instance_variables* (does not call methods!)
|
569
|
+
def short_element_xml_from_instance_vars(element_name)
|
570
|
+
string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
|
571
|
+
"#{tabs}<#{element_name} #{string}/>\n"
|
572
|
+
end
|
489
573
|
|
490
574
|
# takes an element as a symbol and returns the
|
491
575
|
def element_xml_no_atts(element)
|
data/release_notes.txt
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
|
2
|
+
Note: two potentially significant bugs in the software have been corrected (see the
|
3
|
+
changelog). I haven't finished modifying the tests to reflect these changes,
|
4
|
+
but I wanted to get the faulty software off the top of the stack. A new
|
5
|
+
release will shortly follow that passes all tests. Use this release only as a
|
6
|
+
correction to the previous.
|
7
|
+
|
8
|
+
tests currently failing:
|
9
|
+
gi
|
10
|
+
spec_id
|
11
|
+
id_precision
|
@@ -0,0 +1,226 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
## The yeast Scal db mean background is: 0.00984
|
4
|
+
## The yeast Cysteine background freq is: 0.0131986582396467
|
5
|
+
pep_seq_re = /<search_hit .* peptide="(\w+)"/o
|
6
|
+
pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
|
7
|
+
|
8
|
+
if ARGV.size != 3
|
9
|
+
puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
|
10
|
+
puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
|
11
|
+
abort
|
12
|
+
end
|
13
|
+
|
14
|
+
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
15
|
+
File.open(base_toplot, "w") do |fh|
|
16
|
+
fh.puts 'XYData'
|
17
|
+
fh.puts base
|
18
|
+
fh.puts title
|
19
|
+
fh.puts xaxis
|
20
|
+
fh.puts yaxis
|
21
|
+
cats.each do |ar|
|
22
|
+
fh.puts ar.join(" & ")
|
23
|
+
ar.each do |a|
|
24
|
+
fh.puts hash[a].join(" ")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
############################################################################
|
31
|
+
#### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
|
32
|
+
#### CHANGE HIM THERE (eventually we need to put him in a lib file)
|
33
|
+
# (actual # with cys, expected # with cys, total#peptides,
|
34
|
+
# mean_fraction_of_cysteines_true, std)
|
35
|
+
# PepHit(C) = Peptide containing cysteine
|
36
|
+
# # Total PepHit(C) # Observed Bad Pep (C)
|
37
|
+
# ------------------ proportional_to ----------------------
|
38
|
+
# # Total PepHit # Total Bad PepHit (X)
|
39
|
+
def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
|
40
|
+
|
41
|
+
# the number of bona fide BAD cysteine hits
|
42
|
+
# (some of the cysteine hits (~5%) are true positives)
|
43
|
+
|
44
|
+
ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
|
45
|
+
if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
|
46
|
+
total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
|
47
|
+
fpr = total_number_false / total_peptides
|
48
|
+
[fpr, total_number_false]
|
49
|
+
end
|
50
|
+
############################################################################
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
(cysteine_background_freq, background_freq, file) = ARGV
|
56
|
+
cysteine_background_freq = cysteine_background_freq.to_f
|
57
|
+
background_freq = background_freq.to_f
|
58
|
+
|
59
|
+
seq_probs = []
|
60
|
+
last_seq_prob = nil
|
61
|
+
File.open(file) do |fh|
|
62
|
+
fh.each do |line|
|
63
|
+
if line =~ pep_seq_re
|
64
|
+
ar = Array.new(2)
|
65
|
+
ar[0] = $1
|
66
|
+
seq_probs << ar
|
67
|
+
last_seq_prob = ar
|
68
|
+
elsif line =~ pep_prob_re
|
69
|
+
last_seq_prob[1] = $1.to_f
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
#seq_probs.each do |seq|
|
75
|
+
# if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
|
76
|
+
# abort "BAD PARSING!!"
|
77
|
+
# end
|
78
|
+
#end
|
79
|
+
amino_acid_as_st = 'C'
|
80
|
+
|
81
|
+
sorted = seq_probs.sort_by {|v| v[1] }.reverse
|
82
|
+
|
83
|
+
## traverse the peptides
|
84
|
+
actual_cys_containing_peps = 0
|
85
|
+
expected_cys_containing_peps = 0.0
|
86
|
+
current_sum_one_minus_prob = 0.0
|
87
|
+
prob_estimated_fpr = 0.0
|
88
|
+
pep_cnt = 0
|
89
|
+
one_minus_freq = 1.0 - cysteine_background_freq
|
90
|
+
|
91
|
+
## tabulate:
|
92
|
+
pep_cnts = []
|
93
|
+
probs = []
|
94
|
+
prob_fprs = []
|
95
|
+
prob_tps = []
|
96
|
+
cys_fprs = []
|
97
|
+
cys_tps = []
|
98
|
+
fpr_diff = []
|
99
|
+
|
100
|
+
|
101
|
+
sorted.each do |ar|
|
102
|
+
pep_cnt += 1
|
103
|
+
|
104
|
+
pep = ar[0]
|
105
|
+
prob = ar[1]
|
106
|
+
|
107
|
+
## Cysteine FPR: ##
|
108
|
+
# Expected:
|
109
|
+
expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
|
110
|
+
# Actual:
|
111
|
+
if pep.include?(amino_acid_as_st)
|
112
|
+
actual_cys_containing_peps += 1
|
113
|
+
end
|
114
|
+
(cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
|
115
|
+
cys_tp = pep_cnt.to_f - total_num_false_by_cys
|
116
|
+
|
117
|
+
|
118
|
+
## FPR by prob: ##
|
119
|
+
# SUM(1-probX)/#peps
|
120
|
+
current_sum_one_minus_prob += 1.0 - prob
|
121
|
+
prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
|
122
|
+
prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
|
123
|
+
|
124
|
+
## GRAB or report the data:
|
125
|
+
pep_cnts << pep_cnt
|
126
|
+
probs << prob
|
127
|
+
prob_fprs << prob_estimated_fpr
|
128
|
+
prob_tps << prob_tp
|
129
|
+
cys_fprs << cys_fpr
|
130
|
+
cys_tps << cys_tp
|
131
|
+
fpr_diff << prob_estimated_fpr - cys_fpr
|
132
|
+
|
133
|
+
#puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
|
134
|
+
end
|
135
|
+
|
136
|
+
hash = {
|
137
|
+
'pep_cnts' => pep_cnts,
|
138
|
+
'probs' => probs,
|
139
|
+
'prob_fprs' => prob_fprs,
|
140
|
+
'prob_tps' => prob_tps,
|
141
|
+
'cys_fprs' => cys_fprs,
|
142
|
+
'cys_tps' => cys_tps,
|
143
|
+
'fpr_diff' => fpr_diff,
|
144
|
+
}
|
145
|
+
|
146
|
+
|
147
|
+
real_base = file.sub(/\.xml/,'')
|
148
|
+
|
149
|
+
|
150
|
+
|
151
|
+
## TPS vs FPR
|
152
|
+
base = real_base.dup
|
153
|
+
base << "." << "tps_vs_fpr"
|
154
|
+
base_toplot = base + '.to_plot'
|
155
|
+
title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
|
156
|
+
xaxis = "TPs"
|
157
|
+
yaxis = "FPR"
|
158
|
+
cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
|
159
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
160
|
+
|
161
|
+
## PEPHITS vs FPR
|
162
|
+
base = real_base.dup
|
163
|
+
base << "." << "num_pep_hits_vs_fpr"
|
164
|
+
base_toplot = base + '.to_plot'
|
165
|
+
title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
|
166
|
+
xaxis = "num peptide hits"
|
167
|
+
yaxis = "FPR"
|
168
|
+
cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
|
169
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
170
|
+
|
171
|
+
## PEPHITS VS FPR DIFF
|
172
|
+
base = real_base.dup
|
173
|
+
base << "." << "num_pep_hits_vs_fpr_diff"
|
174
|
+
base_toplot = base + '.to_plot'
|
175
|
+
title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
|
176
|
+
xaxis = "num peptide hits"
|
177
|
+
yaxis = "FPR diff (prob - cysteine)"
|
178
|
+
cats = [['pep_cnts', 'fpr_diff']]
|
179
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
180
|
+
|
181
|
+
## PROB VS FPR DIFF
|
182
|
+
base = real_base.dup
|
183
|
+
base << "." << "prob_vs_fpr_diff"
|
184
|
+
base_toplot = base + '.to_plot'
|
185
|
+
title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
|
186
|
+
xaxis = "peptide probability"
|
187
|
+
yaxis = "FPR diff (prob - cysteine)"
|
188
|
+
cats = [['probs', 'fpr_diff']]
|
189
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
190
|
+
|
191
|
+
|
192
|
+
|
193
|
+
=begin
|
194
|
+
|
195
|
+
returns [number_of_prots, actual_fpr]
|
196
|
+
def num_prots_above_fpr(prots, desired_fpr)
|
197
|
+
current_fpr_rate_percent = 0.0
|
198
|
+
previous_fpr_rate_percent = 0.0
|
199
|
+
current_sum_one_minus_prob = 0.0
|
200
|
+
proteins_within_fpr = 0
|
201
|
+
actual_fpr = nil
|
202
|
+
already_found = false
|
203
|
+
prot_cnt = 0
|
204
|
+
prots.each do |prot|
|
205
|
+
prot_cnt += 1
|
206
|
+
# SUM(1-probX)/#prots
|
207
|
+
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
208
|
+
current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
209
|
+
|
210
|
+
if current_fpr_rate_percent > desired_fpr && !already_found
|
211
|
+
actual_fpr = previous_fpr_rate_percent
|
212
|
+
proteins_within_fpr = prot_cnt
|
213
|
+
already_found = true
|
214
|
+
end
|
215
|
+
previous_fpr_rate_percent = current_fpr_rate_percent
|
216
|
+
end
|
217
|
+
[proteins_within_fpr, actual_fpr]
|
218
|
+
end
|
219
|
+
|
220
|
+
=end
|
221
|
+
|
222
|
+
|
223
|
+
|
224
|
+
|
225
|
+
|
226
|
+
|
data/script/filter-peps.rb
CHANGED
@@ -80,13 +80,13 @@ def number_passing(peps)
|
|
80
80
|
np = {}
|
81
81
|
np["PepProts"] = filter(peps).size
|
82
82
|
|
83
|
-
by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge).values
|
83
|
+
by_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge).values
|
84
84
|
analyze(by_scan_charge, "ScanCharge", np)
|
85
85
|
|
86
|
-
by_scan = peps.hash_by(:first_scan, :last_scan).values
|
86
|
+
by_scan = peps.hash_by(:base_name, :first_scan, :last_scan).values
|
87
87
|
analyze(by_scan, "Scan", np)
|
88
88
|
|
89
|
-
by_seq_charge = peps.hash_by(:sequence, :charge).values
|
89
|
+
by_seq_charge = peps.hash_by(:base_name, :sequence, :charge).values
|
90
90
|
analyze(by_seq_charge, "SeqCharge", np)
|
91
91
|
|
92
92
|
np
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'vec'
|
4
|
+
|
5
|
+
# For the SCer yeast db and orbi mudpit7, the mean_actual_vs_expected fraction
|
6
|
+
# is 0.0101409563168847
|
7
|
+
|
8
|
+
# <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
|
9
|
+
|
10
|
+
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
11
|
+
File.open(base_toplot, "w") do |fh|
|
12
|
+
fh.puts 'XYData'
|
13
|
+
fh.puts base
|
14
|
+
fh.puts title
|
15
|
+
fh.puts xaxis
|
16
|
+
fh.puts yaxis
|
17
|
+
cats.each do |ar|
|
18
|
+
fh.puts ar.join(" & ")
|
19
|
+
ar.each do |a|
|
20
|
+
fh.puts hash[a].join(" ")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
|
25
|
+
end
|
26
|
+
|
27
|
+
peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
|
28
|
+
|
29
|
+
unless ARGV.size == 2
|
30
|
+
abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
|
31
|
+
end
|
32
|
+
|
33
|
+
(cysteine_background_freq, file) = ARGV
|
34
|
+
|
35
|
+
# each pep = [nsp_prob, init_prob, SEQUENCE]
|
36
|
+
peps = []
|
37
|
+
File.open(file) do |fh|
|
38
|
+
fh.each do |line|
|
39
|
+
if line =~ peptide_re
|
40
|
+
peps << [$3.to_f,$2.to_f,$1]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
amino_acid_as_st = 'C'
|
47
|
+
one_minus_freq = 1.0 - cysteine_background_freq.to_f
|
48
|
+
actual_cys_containing_peps = 0
|
49
|
+
expected_cys_containing_peps = 0.0
|
50
|
+
current_sum_one_minus_prob = 0.0
|
51
|
+
prob_estimated_fpr = 0.0
|
52
|
+
pep_cnt = 0
|
53
|
+
|
54
|
+
the_probs = []
|
55
|
+
the_fractions = []
|
56
|
+
special_probs = []
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
#peps.sort.reverse.each do |ar|
|
62
|
+
#peps.sort.each do |ar|
|
63
|
+
peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
|
64
|
+
(nsp_prob, init_prob, pep) = ar
|
65
|
+
## Cysteine FPR: ##
|
66
|
+
# Expected:
|
67
|
+
expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
|
68
|
+
# Actual:
|
69
|
+
if pep.include?(amino_acid_as_st)
|
70
|
+
actual_cys_containing_peps += 1
|
71
|
+
end
|
72
|
+
fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
|
73
|
+
|
74
|
+
special_prob = (3.0 * nsp_prob) + init_prob
|
75
|
+
|
76
|
+
## Get the final fraction
|
77
|
+
#if special_prob < 4.0
|
78
|
+
# #puts the_fractions.join(" ")
|
79
|
+
# puts the_fractions.last
|
80
|
+
# abort
|
81
|
+
#end
|
82
|
+
|
83
|
+
# gather data to plot
|
84
|
+
the_probs << nsp_prob
|
85
|
+
special_probs << special_prob
|
86
|
+
the_fractions << fraction_ac_exp
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
|
92
|
+
hash = {
|
93
|
+
'probs' => the_probs,
|
94
|
+
'fractions' => the_fractions,
|
95
|
+
'special_probs' => special_probs,
|
96
|
+
}
|
97
|
+
|
98
|
+
real_base = file.sub(/\.xml/,'')
|
99
|
+
|
100
|
+
|
101
|
+
=begin
|
102
|
+
## PROB VS FPR DIFF
|
103
|
+
base = real_base.dup
|
104
|
+
base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
|
105
|
+
base_toplot = base + '.to_plot'
|
106
|
+
title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
|
107
|
+
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
108
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
109
|
+
cats = [['probs', 'fractions']]
|
110
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
111
|
+
=end
|
112
|
+
|
113
|
+
|
114
|
+
=begin
|
115
|
+
## PROB VS FPR DIFF
|
116
|
+
base = real_base.dup
|
117
|
+
base << "." << "prob_vs_actual_expected_fraction"
|
118
|
+
base_toplot = base + '.to_plot'
|
119
|
+
title = "peptide prob vs fraction with cysteines (actual/expected)"
|
120
|
+
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
121
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
122
|
+
cats = [['probs', 'fractions']]
|
123
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
124
|
+
=end
|
125
|
+
|
126
|
+
## SPECIAL PROB VS FPR DIFF
|
127
|
+
base = real_base.dup
|
128
|
+
base << "." << "special_prob_vs_actual_expected_fraction"
|
129
|
+
base_toplot = base + '.to_plot'
|
130
|
+
title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
|
131
|
+
xaxis = "(3 * nsp_prob) + init_prob"
|
132
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
133
|
+
cats = [['special_probs', 'fractions']]
|
134
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
135
|
+
|
136
|
+
|
137
|
+
|
@@ -109,10 +109,12 @@ def run_sequest ; "Run Sequest with a Normal and an Inverse Database
|
|
109
109
|
|
110
110
|
If you don't already have one, here's how to make an inverse database:
|
111
111
|
|
112
|
-
|
112
|
+
fasta_shaker.rb reverse <yourfile.fasta>
|
113
113
|
|
114
|
-
This will create a file with the trailing tag '
|
115
|
-
`
|
114
|
+
This will create a file with the trailing tag '_reverse.fasta'. Just type
|
115
|
+
`fasta_shaker.rb` for more details.
|
116
|
+
|
117
|
+
Run sequest with 'report duplicate references' set to >= 40
|
116
118
|
"
|
117
119
|
end
|
118
120
|
|
@@ -166,11 +168,13 @@ def run_sequest ; "Run Sequest with a Concatenated Inverse Database
|
|
166
168
|
|
167
169
|
If you don't already have one, here's how to make one:
|
168
170
|
|
169
|
-
|
171
|
+
fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
|
172
|
+
|
173
|
+
This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
|
174
|
+
inverted protein name will be prefixed with 'INV_'. Just type
|
175
|
+
`fasta_shaker.rb` for more details.
|
170
176
|
|
171
|
-
|
172
|
-
protein name will be prefixed with 'INV_'. Just type `fasta_cat_mod.rb` for
|
173
|
-
more details.
|
177
|
+
Run sequest with 'report duplicate references' set to >= 40
|
174
178
|
"
|
175
179
|
end
|
176
180
|
|