mspire 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/lib/spec_id.rb
CHANGED
@@ -7,7 +7,7 @@ require 'sample_enzyme' # for others
|
|
7
7
|
require 'spec_id/bioworks'
|
8
8
|
require 'spec_id/sequest'
|
9
9
|
require 'spec_id/proph'
|
10
|
-
require 'spec_id/
|
10
|
+
require 'spec_id/precision'
|
11
11
|
|
12
12
|
|
13
13
|
class Mass
|
@@ -112,11 +112,12 @@ class SpecID
|
|
112
112
|
"<#{self.class} #peps=\"#{peps.size}\">"
|
113
113
|
end
|
114
114
|
|
115
|
-
# returns the top peptide hits per dta (first_scan + charge)
|
115
|
+
# returns the top peptide hits per file dta (first_scan + charge)
|
116
116
|
# all hits with same score as top score are returned
|
117
117
|
# assumes that all fields are strings...
|
118
118
|
# converts xcorr, deltacn, deltamass, mass, and charge into numerical types
|
119
119
|
# deletes the protein array (but not relevant proteins)
|
120
|
+
# hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
|
120
121
|
def top_peps_prefilter!
|
121
122
|
peps.each do |pep|
|
122
123
|
pep.xcorr = pep.xcorr.to_f
|
@@ -127,7 +128,8 @@ class SpecID
|
|
127
128
|
end
|
128
129
|
# get the top peptide by firstscan/charge (equivalent to .out files)
|
129
130
|
top_peps = []
|
130
|
-
self.peps.hash_by {|pep| [pep.first_scan.to_i, pep.charge.to_i]}.map do |
|
131
|
+
#self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
|
132
|
+
self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
|
131
133
|
best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
|
132
134
|
top_score = best_to_worst.first.xcorr
|
133
135
|
best_to_worst.each do |pep|
|
@@ -158,6 +160,7 @@ class SpecID
|
|
158
160
|
pep_deltacn = pep.deltacn
|
159
161
|
pep_charge = pep.charge
|
160
162
|
(pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
|
163
|
+
#truth = (pep_deltacn >= deltacn) and
|
161
164
|
(
|
162
165
|
(pep_charge == 1 && pep.xcorr >= x1) or
|
163
166
|
(pep_charge == 2 && pep.xcorr >= x2) or
|
@@ -166,6 +169,8 @@ class SpecID
|
|
166
169
|
((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
|
167
170
|
end
|
168
171
|
|
172
|
+
#deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
|
173
|
+
|
169
174
|
hash = peps_passed.hash_by(:prot)
|
170
175
|
|
171
176
|
prots_passed = hash.map do |prot,pep_arr|
|
@@ -173,14 +178,15 @@ class SpecID
|
|
173
178
|
prot
|
174
179
|
end
|
175
180
|
[prots_passed, peps_passed]
|
181
|
+
#[prots_passed, peps_passed, deltacnstar_cnt]
|
176
182
|
else
|
177
183
|
abort "#{kind} not implemented"
|
178
184
|
end
|
179
185
|
end
|
180
186
|
|
181
187
|
## basically, this is the command line wrapper
|
182
|
-
def self.
|
183
|
-
SpecID::
|
188
|
+
def self.precision(argv)
|
189
|
+
SpecID::Precision.new.run_cmd_line(argv)
|
184
190
|
end
|
185
191
|
|
186
192
|
|
@@ -266,16 +272,6 @@ class SpecID
|
|
266
272
|
return tp, fp
|
267
273
|
end
|
268
274
|
|
269
|
-
# type_of_analysis can be (:precision|...)
|
270
|
-
def area_under_curve(items, fp_prefix)
|
271
|
-
if items == :prots
|
272
|
-
(tp,fp) = classify_by_prefix(items, fp_prefix)
|
273
|
-
(tp, prec, fpr2) = tps_and_precision_and_fpr2_times2_for_prob(fp_prefix)
|
274
|
-
|
275
|
-
|
276
|
-
############################################## HERERERERER!!!!
|
277
|
-
end
|
278
|
-
end
|
279
275
|
|
280
276
|
# returns a proc for getting all probabilities so that an ascending sort
|
281
277
|
# will put the best scores first
|
@@ -299,22 +295,43 @@ class SpecID
|
|
299
295
|
end
|
300
296
|
end
|
301
297
|
|
298
|
+
# sorts the probabilities and then
|
299
|
+
# calcs predicted number hits and precision for protein probabilities
|
300
|
+
# (summing probabilities)
|
301
|
+
# one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
|
302
|
+
# called false positive rate
|
303
|
+
# SUM(1-probX)/#prots
|
304
|
+
def num_hits_and_ppv_for_protein_prophet_probabilities
|
305
|
+
current_sum_one_minus_prob = 0.0
|
306
|
+
num_prots = []
|
307
|
+
ppv = []
|
308
|
+
prot_cnt = 0
|
309
|
+
probs = prots.map {|v| v.probability}
|
310
|
+
sorted = probs.sort.reverse
|
311
|
+
sorted.each do |prob|
|
312
|
+
prot_cnt += 1
|
313
|
+
num_prots << prot_cnt
|
314
|
+
current_sum_one_minus_prob += 1.0 - prob
|
315
|
+
ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
|
316
|
+
# current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
|
317
|
+
end
|
318
|
+
[num_prots, ppv]
|
319
|
+
end
|
320
|
+
|
302
321
|
# convenience method for the common task of determining precision for
|
303
322
|
# proteins (with decoy proteins found by prefix)
|
304
|
-
# returns (
|
305
|
-
def
|
323
|
+
# returns (num_hits, precision)
|
324
|
+
def num_hits_and_ppv_for_prob(fp_prefix)
|
306
325
|
regex = /^#{Regexp.escape(fp_prefix)}/
|
307
326
|
prob_proc = probability_proc
|
308
327
|
myproc = proc { |prt|
|
309
328
|
if prt.reference =~ regex ; false
|
310
329
|
else ; true end
|
311
330
|
}
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
end
|
317
|
-
[tps1, precs, fprs]
|
331
|
+
real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
|
332
|
+
|
333
|
+
(num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
|
334
|
+
[num_hits, precision]
|
318
335
|
end
|
319
336
|
|
320
337
|
def method_missing(symbol, *args)
|
@@ -389,11 +406,17 @@ class SpecID
|
|
389
406
|
sorted_probabilities(peps)
|
390
407
|
end
|
391
408
|
|
409
|
+
##########################################################################
|
410
|
+
# WARNING! These might be dangerous to your health if there are multiple
|
411
|
+
# files collected in your bioworks file
|
412
|
+
##########################################################################
|
413
|
+
|
392
414
|
# (prob_list_by_min, prob_list_by_best10)
|
393
415
|
# returns 2 sorted lists of probabilities based on:
|
394
416
|
# 1. best peptide hit
|
395
417
|
# 2. top 10 peptide hits
|
396
418
|
# on a per scan basis
|
419
|
+
# NOTE: you may want to hash on base_name first!
|
397
420
|
def pep_probs_by_scan
|
398
421
|
hash = peps.hash_by(:first_scan, :last_scan)
|
399
422
|
return min_and_best10(hash)
|
@@ -402,6 +425,7 @@ class SpecID
|
|
402
425
|
|
403
426
|
#(prob_list_by_min, prob_list_by_best10)
|
404
427
|
# same as pep_probs_by_scan but per charge state
|
428
|
+
# NOTE: you may want to hash on base_name first!
|
405
429
|
def pep_probs_by_scan_charge
|
406
430
|
hash = peps.hash_by(:first_scan, :last_scan, :charge)
|
407
431
|
return min_and_best10(hash)
|
@@ -410,6 +434,7 @@ class SpecID
|
|
410
434
|
# (prob_list_by_min)
|
411
435
|
# hashes on seq-charge and returns the sorted list of probabilities of top
|
412
436
|
# hit per seq-charge
|
437
|
+
# NOTE: you may want to hash on base_name first!
|
413
438
|
def pep_probs_by_seq_charge
|
414
439
|
hash = peps.hash_by(:sequence, :charge)
|
415
440
|
min_peptides = hash.collect do |k,v|
|
@@ -418,6 +443,42 @@ class SpecID
|
|
418
443
|
sorted_probabilities(min_peptides)
|
419
444
|
end
|
420
445
|
|
446
|
+
##########################################################################
|
447
|
+
# USE these if you have multiple files in your bioworks.xml file
|
448
|
+
##########################################################################
|
449
|
+
# (prob_list_by_min, prob_list_by_best10)
|
450
|
+
# returns 2 sorted lists of probabilities based on:
|
451
|
+
# 1. best peptide hit
|
452
|
+
# 2. top 10 peptide hits
|
453
|
+
# on a per scan basis
|
454
|
+
# NOTE: this method already hashes on base_name first.
|
455
|
+
def pep_probs_by_bn_scan
|
456
|
+
hash = peps.hash_by(:base_name, :first_scan, :last_scan)
|
457
|
+
return min_and_best10(hash)
|
458
|
+
end
|
459
|
+
|
460
|
+
|
461
|
+
#(prob_list_by_min, prob_list_by_best10)
|
462
|
+
# same as pep_probs_by_bn_scan but per charge state
|
463
|
+
# NOTE: this method already hashes on base_name first.
|
464
|
+
def pep_probs_by_bn_scan_charge
|
465
|
+
hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
|
466
|
+
return min_and_best10(hash)
|
467
|
+
end
|
468
|
+
|
469
|
+
# (prob_list_by_min)
|
470
|
+
# hashes on base_name-seq-charge and returns the sorted list of probabilities of top
|
471
|
+
# hit per seq-charge
|
472
|
+
# NOTE: this method already hashes on base_name first.
|
473
|
+
def pep_probs_by_bn_seq_charge
|
474
|
+
hash = peps.hash_by(:base_name, :sequence, :charge)
|
475
|
+
min_peptides = hash.collect do |k,v|
|
476
|
+
v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
|
477
|
+
end
|
478
|
+
sorted_probabilities(min_peptides)
|
479
|
+
end
|
480
|
+
|
481
|
+
|
421
482
|
# A Generic spectraID protein
|
422
483
|
class Prot
|
423
484
|
# probability is always a float!
|
@@ -458,6 +519,23 @@ end
|
|
458
519
|
# concatenation into a file
|
459
520
|
module SpecIDXML
|
460
521
|
|
522
|
+
Special_chrs_hash = {
|
523
|
+
'"' => '"',
|
524
|
+
'&' => '&',
|
525
|
+
"'" => ''',
|
526
|
+
'<' => '<',
|
527
|
+
'>' => '>',
|
528
|
+
}
|
529
|
+
|
530
|
+
# substitutes special xml chars
|
531
|
+
def escape_special_chars(string)
|
532
|
+
string.split('').map do |char|
|
533
|
+
if Special_chrs_hash.key? char ; Special_chrs_hash[char]
|
534
|
+
# if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
|
535
|
+
else ; char end
|
536
|
+
end.join
|
537
|
+
end
|
538
|
+
|
461
539
|
$DEPTH = 0
|
462
540
|
|
463
541
|
def tabs
|
@@ -486,6 +564,12 @@ module SpecIDXML
|
|
486
564
|
"#{tabs}<#{element} #{att_string}/>\n"
|
487
565
|
end
|
488
566
|
|
567
|
+
# takes the xml element name as an argument (element_name)
|
568
|
+
# displays all *instance_variables* (does not call methods!)
|
569
|
+
def short_element_xml_from_instance_vars(element_name)
|
570
|
+
string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
|
571
|
+
"#{tabs}<#{element_name} #{string}/>\n"
|
572
|
+
end
|
489
573
|
|
490
574
|
# takes an element as a symbol and returns the
|
491
575
|
def element_xml_no_atts(element)
|
data/release_notes.txt
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
|
2
|
+
Note: two potentially significant bugs in the software have been corrected (see the
|
3
|
+
changelog). I haven't finished modifying the tests to reflect these changes,
|
4
|
+
but I wanted to get the faulty software off the top of the stack. A new
|
5
|
+
release will shortly follow that passes all tests. Use this release only as a
|
6
|
+
correction to the previous.
|
7
|
+
|
8
|
+
tests currently failing:
|
9
|
+
gi
|
10
|
+
spec_id
|
11
|
+
id_precision
|
@@ -0,0 +1,226 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
## The yeast Scal db mean background is: 0.00984
|
4
|
+
## The yeast Cysteine background freq is: 0.0131986582396467
|
5
|
+
pep_seq_re = /<search_hit .* peptide="(\w+)"/o
|
6
|
+
pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
|
7
|
+
|
8
|
+
if ARGV.size != 3
|
9
|
+
puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
|
10
|
+
puts " outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
|
11
|
+
abort
|
12
|
+
end
|
13
|
+
|
14
|
+
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
15
|
+
File.open(base_toplot, "w") do |fh|
|
16
|
+
fh.puts 'XYData'
|
17
|
+
fh.puts base
|
18
|
+
fh.puts title
|
19
|
+
fh.puts xaxis
|
20
|
+
fh.puts yaxis
|
21
|
+
cats.each do |ar|
|
22
|
+
fh.puts ar.join(" & ")
|
23
|
+
ar.each do |a|
|
24
|
+
fh.puts hash[a].join(" ")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
############################################################################
|
31
|
+
#### DO NOT MODIFY THIS GUY! HE IS TAKEN FROM bin/filter_spec_id.rb
|
32
|
+
#### CHANGE HIM THERE (eventually we need to put him in a lib file)
|
33
|
+
# (actual # with cys, expected # with cys, total#peptides,
|
34
|
+
# mean_fraction_of_cysteines_true, std)
|
35
|
+
# PepHit(C) = Peptide containing cysteine
|
36
|
+
# # Total PepHit(C) # Observed Bad Pep (C)
|
37
|
+
# ------------------ proportional_to ----------------------
|
38
|
+
# # Total PepHit # Total Bad PepHit (X)
|
39
|
+
def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
|
40
|
+
|
41
|
+
# the number of bona fide BAD cysteine hits
|
42
|
+
# (some of the cysteine hits (~5%) are true positives)
|
43
|
+
|
44
|
+
ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
|
45
|
+
if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
|
46
|
+
total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
|
47
|
+
fpr = total_number_false / total_peptides
|
48
|
+
[fpr, total_number_false]
|
49
|
+
end
|
50
|
+
############################################################################
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
(cysteine_background_freq, background_freq, file) = ARGV
|
56
|
+
cysteine_background_freq = cysteine_background_freq.to_f
|
57
|
+
background_freq = background_freq.to_f
|
58
|
+
|
59
|
+
seq_probs = []
|
60
|
+
last_seq_prob = nil
|
61
|
+
File.open(file) do |fh|
|
62
|
+
fh.each do |line|
|
63
|
+
if line =~ pep_seq_re
|
64
|
+
ar = Array.new(2)
|
65
|
+
ar[0] = $1
|
66
|
+
seq_probs << ar
|
67
|
+
last_seq_prob = ar
|
68
|
+
elsif line =~ pep_prob_re
|
69
|
+
last_seq_prob[1] = $1.to_f
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
#seq_probs.each do |seq|
|
75
|
+
# if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
|
76
|
+
# abort "BAD PARSING!!"
|
77
|
+
# end
|
78
|
+
#end
|
79
|
+
amino_acid_as_st = 'C'
|
80
|
+
|
81
|
+
sorted = seq_probs.sort_by {|v| v[1] }.reverse
|
82
|
+
|
83
|
+
## traverse the peptides
|
84
|
+
actual_cys_containing_peps = 0
|
85
|
+
expected_cys_containing_peps = 0.0
|
86
|
+
current_sum_one_minus_prob = 0.0
|
87
|
+
prob_estimated_fpr = 0.0
|
88
|
+
pep_cnt = 0
|
89
|
+
one_minus_freq = 1.0 - cysteine_background_freq
|
90
|
+
|
91
|
+
## tabulate:
|
92
|
+
pep_cnts = []
|
93
|
+
probs = []
|
94
|
+
prob_fprs = []
|
95
|
+
prob_tps = []
|
96
|
+
cys_fprs = []
|
97
|
+
cys_tps = []
|
98
|
+
fpr_diff = []
|
99
|
+
|
100
|
+
|
101
|
+
sorted.each do |ar|
|
102
|
+
pep_cnt += 1
|
103
|
+
|
104
|
+
pep = ar[0]
|
105
|
+
prob = ar[1]
|
106
|
+
|
107
|
+
## Cysteine FPR: ##
|
108
|
+
# Expected:
|
109
|
+
expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
|
110
|
+
# Actual:
|
111
|
+
if pep.include?(amino_acid_as_st)
|
112
|
+
actual_cys_containing_peps += 1
|
113
|
+
end
|
114
|
+
(cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
|
115
|
+
cys_tp = pep_cnt.to_f - total_num_false_by_cys
|
116
|
+
|
117
|
+
|
118
|
+
## FPR by prob: ##
|
119
|
+
# SUM(1-probX)/#peps
|
120
|
+
current_sum_one_minus_prob += 1.0 - prob
|
121
|
+
prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
|
122
|
+
prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
|
123
|
+
|
124
|
+
## GRAB or report the data:
|
125
|
+
pep_cnts << pep_cnt
|
126
|
+
probs << prob
|
127
|
+
prob_fprs << prob_estimated_fpr
|
128
|
+
prob_tps << prob_tp
|
129
|
+
cys_fprs << cys_fpr
|
130
|
+
cys_tps << cys_tp
|
131
|
+
fpr_diff << prob_estimated_fpr - cys_fpr
|
132
|
+
|
133
|
+
#puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
|
134
|
+
end
|
135
|
+
|
136
|
+
hash = {
|
137
|
+
'pep_cnts' => pep_cnts,
|
138
|
+
'probs' => probs,
|
139
|
+
'prob_fprs' => prob_fprs,
|
140
|
+
'prob_tps' => prob_tps,
|
141
|
+
'cys_fprs' => cys_fprs,
|
142
|
+
'cys_tps' => cys_tps,
|
143
|
+
'fpr_diff' => fpr_diff,
|
144
|
+
}
|
145
|
+
|
146
|
+
|
147
|
+
real_base = file.sub(/\.xml/,'')
|
148
|
+
|
149
|
+
|
150
|
+
|
151
|
+
## TPS vs FPR
|
152
|
+
base = real_base.dup
|
153
|
+
base << "." << "tps_vs_fpr"
|
154
|
+
base_toplot = base + '.to_plot'
|
155
|
+
title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
|
156
|
+
xaxis = "TPs"
|
157
|
+
yaxis = "FPR"
|
158
|
+
cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
|
159
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
160
|
+
|
161
|
+
## PEPHITS vs FPR
|
162
|
+
base = real_base.dup
|
163
|
+
base << "." << "num_pep_hits_vs_fpr"
|
164
|
+
base_toplot = base + '.to_plot'
|
165
|
+
title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
|
166
|
+
xaxis = "num peptide hits"
|
167
|
+
yaxis = "FPR"
|
168
|
+
cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
|
169
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
170
|
+
|
171
|
+
## PEPHITS VS FPR DIFF
|
172
|
+
base = real_base.dup
|
173
|
+
base << "." << "num_pep_hits_vs_fpr_diff"
|
174
|
+
base_toplot = base + '.to_plot'
|
175
|
+
title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
|
176
|
+
xaxis = "num peptide hits"
|
177
|
+
yaxis = "FPR diff (prob - cysteine)"
|
178
|
+
cats = [['pep_cnts', 'fpr_diff']]
|
179
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
180
|
+
|
181
|
+
## PROB VS FPR DIFF
|
182
|
+
base = real_base.dup
|
183
|
+
base << "." << "prob_vs_fpr_diff"
|
184
|
+
base_toplot = base + '.to_plot'
|
185
|
+
title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
|
186
|
+
xaxis = "peptide probability"
|
187
|
+
yaxis = "FPR diff (prob - cysteine)"
|
188
|
+
cats = [['probs', 'fpr_diff']]
|
189
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
190
|
+
|
191
|
+
|
192
|
+
|
193
|
+
=begin
|
194
|
+
|
195
|
+
returns [number_of_prots, actual_fpr]
|
196
|
+
def num_prots_above_fpr(prots, desired_fpr)
|
197
|
+
current_fpr_rate_percent = 0.0
|
198
|
+
previous_fpr_rate_percent = 0.0
|
199
|
+
current_sum_one_minus_prob = 0.0
|
200
|
+
proteins_within_fpr = 0
|
201
|
+
actual_fpr = nil
|
202
|
+
already_found = false
|
203
|
+
prot_cnt = 0
|
204
|
+
prots.each do |prot|
|
205
|
+
prot_cnt += 1
|
206
|
+
# SUM(1-probX)/#prots
|
207
|
+
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
208
|
+
current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
209
|
+
|
210
|
+
if current_fpr_rate_percent > desired_fpr && !already_found
|
211
|
+
actual_fpr = previous_fpr_rate_percent
|
212
|
+
proteins_within_fpr = prot_cnt
|
213
|
+
already_found = true
|
214
|
+
end
|
215
|
+
previous_fpr_rate_percent = current_fpr_rate_percent
|
216
|
+
end
|
217
|
+
[proteins_within_fpr, actual_fpr]
|
218
|
+
end
|
219
|
+
|
220
|
+
=end
|
221
|
+
|
222
|
+
|
223
|
+
|
224
|
+
|
225
|
+
|
226
|
+
|
data/script/filter-peps.rb
CHANGED
@@ -80,13 +80,13 @@ def number_passing(peps)
|
|
80
80
|
np = {}
|
81
81
|
np["PepProts"] = filter(peps).size
|
82
82
|
|
83
|
-
by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge).values
|
83
|
+
by_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge).values
|
84
84
|
analyze(by_scan_charge, "ScanCharge", np)
|
85
85
|
|
86
|
-
by_scan = peps.hash_by(:first_scan, :last_scan).values
|
86
|
+
by_scan = peps.hash_by(:base_name, :first_scan, :last_scan).values
|
87
87
|
analyze(by_scan, "Scan", np)
|
88
88
|
|
89
|
-
by_seq_charge = peps.hash_by(:sequence, :charge).values
|
89
|
+
by_seq_charge = peps.hash_by(:base_name, :sequence, :charge).values
|
90
90
|
analyze(by_seq_charge, "SeqCharge", np)
|
91
91
|
|
92
92
|
np
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'vec'
|
4
|
+
|
5
|
+
# For the SCer yeast db and orbi mudpit7, the mean_actual_vs_expected fraction
|
6
|
+
# is 0.0101409563168847
|
7
|
+
|
8
|
+
# <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
|
9
|
+
|
10
|
+
def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
11
|
+
File.open(base_toplot, "w") do |fh|
|
12
|
+
fh.puts 'XYData'
|
13
|
+
fh.puts base
|
14
|
+
fh.puts title
|
15
|
+
fh.puts xaxis
|
16
|
+
fh.puts yaxis
|
17
|
+
cats.each do |ar|
|
18
|
+
fh.puts ar.join(" & ")
|
19
|
+
ar.each do |a|
|
20
|
+
fh.puts hash[a].join(" ")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
|
25
|
+
end
|
26
|
+
|
27
|
+
peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
|
28
|
+
|
29
|
+
unless ARGV.size == 2
|
30
|
+
abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
|
31
|
+
end
|
32
|
+
|
33
|
+
(cysteine_background_freq, file) = ARGV
|
34
|
+
|
35
|
+
# each pep = [nsp_prob, init_prob, SEQUENCE]
|
36
|
+
peps = []
|
37
|
+
File.open(file) do |fh|
|
38
|
+
fh.each do |line|
|
39
|
+
if line =~ peptide_re
|
40
|
+
peps << [$3.to_f,$2.to_f,$1]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
amino_acid_as_st = 'C'
|
47
|
+
one_minus_freq = 1.0 - cysteine_background_freq.to_f
|
48
|
+
actual_cys_containing_peps = 0
|
49
|
+
expected_cys_containing_peps = 0.0
|
50
|
+
current_sum_one_minus_prob = 0.0
|
51
|
+
prob_estimated_fpr = 0.0
|
52
|
+
pep_cnt = 0
|
53
|
+
|
54
|
+
the_probs = []
|
55
|
+
the_fractions = []
|
56
|
+
special_probs = []
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
#peps.sort.reverse.each do |ar|
|
62
|
+
#peps.sort.each do |ar|
|
63
|
+
peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
|
64
|
+
(nsp_prob, init_prob, pep) = ar
|
65
|
+
## Cysteine FPR: ##
|
66
|
+
# Expected:
|
67
|
+
expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
|
68
|
+
# Actual:
|
69
|
+
if pep.include?(amino_acid_as_st)
|
70
|
+
actual_cys_containing_peps += 1
|
71
|
+
end
|
72
|
+
fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
|
73
|
+
|
74
|
+
special_prob = (3.0 * nsp_prob) + init_prob
|
75
|
+
|
76
|
+
## Get the final fraction
|
77
|
+
#if special_prob < 4.0
|
78
|
+
# #puts the_fractions.join(" ")
|
79
|
+
# puts the_fractions.last
|
80
|
+
# abort
|
81
|
+
#end
|
82
|
+
|
83
|
+
# gather data to plot
|
84
|
+
the_probs << nsp_prob
|
85
|
+
special_probs << special_prob
|
86
|
+
the_fractions << fraction_ac_exp
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
|
92
|
+
hash = {
|
93
|
+
'probs' => the_probs,
|
94
|
+
'fractions' => the_fractions,
|
95
|
+
'special_probs' => special_probs,
|
96
|
+
}
|
97
|
+
|
98
|
+
real_base = file.sub(/\.xml/,'')
|
99
|
+
|
100
|
+
|
101
|
+
=begin
|
102
|
+
## PROB VS FPR DIFF
|
103
|
+
base = real_base.dup
|
104
|
+
base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
|
105
|
+
base_toplot = base + '.to_plot'
|
106
|
+
title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
|
107
|
+
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
108
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
109
|
+
cats = [['probs', 'fractions']]
|
110
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
111
|
+
=end
|
112
|
+
|
113
|
+
|
114
|
+
=begin
|
115
|
+
## PROB VS FPR DIFF
|
116
|
+
base = real_base.dup
|
117
|
+
base << "." << "prob_vs_actual_expected_fraction"
|
118
|
+
base_toplot = base + '.to_plot'
|
119
|
+
title = "peptide prob vs fraction with cysteines (actual/expected)"
|
120
|
+
xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
|
121
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
122
|
+
cats = [['probs', 'fractions']]
|
123
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
124
|
+
=end
|
125
|
+
|
126
|
+
## SPECIAL PROB VS ACTUAL/EXPECTED CYSTEINE FRACTION
|
127
|
+
base = real_base.dup
|
128
|
+
base << "." << "special_prob_vs_actual_expected_fraction"
|
129
|
+
base_toplot = base + '.to_plot'
|
130
|
+
title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
|
131
|
+
xaxis = "(3 * nsp_prob) + init_prob"
|
132
|
+
yaxis = "fraction with cysteines (actual/expected)"
|
133
|
+
cats = [['special_probs', 'fractions']]
|
134
|
+
plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
|
135
|
+
|
136
|
+
|
137
|
+
|
@@ -109,10 +109,12 @@ def run_sequest ; "Run Sequest with a Normal and an Inverse Database
|
|
109
109
|
|
110
110
|
If you don't already have one, here's how to make an inverse database:
|
111
111
|
|
112
|
-
|
112
|
+
fasta_shaker.rb reverse <yourfile.fasta>
|
113
113
|
|
114
|
-
This will create a file with the trailing tag '
|
115
|
-
`
|
114
|
+
This will create a file with the trailing tag '_reverse.fasta'. Just type
|
115
|
+
`fasta_shaker.rb` for more details.
|
116
|
+
|
117
|
+
Run sequest with 'report duplicate references' set to >= 40
|
116
118
|
"
|
117
119
|
end
|
118
120
|
|
@@ -166,11 +168,13 @@ def run_sequest ; "Run Sequest with a Concatenated Inverse Database
|
|
166
168
|
|
167
169
|
If you don't already have one, here's how to make one:
|
168
170
|
|
169
|
-
|
171
|
+
fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
|
172
|
+
|
173
|
+
This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
|
174
|
+
inverted protein name will be prefixed with 'INV_'. Just type
|
175
|
+
`fasta_shaker.rb` for more details.
|
170
176
|
|
171
|
-
|
172
|
-
protein name will be prefixed with 'INV_'. Just type `fasta_cat_mod.rb` for
|
173
|
-
more details.
|
177
|
+
Run sequest with 'report duplicate references' set to >= 40
|
174
178
|
"
|
175
179
|
end
|
176
180
|
|