mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/bin/gi2annot.rb
CHANGED
@@ -1,13 +1,6 @@
|
|
1
1
|
#!/usr/bin/ruby -w
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'rexml/document'
|
5
|
-
require 'rexml/streamlistener'
|
6
|
-
|
7
|
-
BATCH_SIZE = 500
|
8
|
-
|
9
|
-
$LOG = nil
|
10
|
-
$ANNOTS = []
|
3
|
+
require 'gi'
|
11
4
|
|
12
5
|
if ARGV.size < 1
|
13
6
|
puts "usage: #{File.basename(__FILE__)} <gi> ..."
|
@@ -15,108 +8,7 @@ if ARGV.size < 1
|
|
15
8
|
end
|
16
9
|
|
17
10
|
|
18
|
-
# db=
|
19
|
-
# retstart=
|
20
|
-
# retmax=
|
21
|
-
|
22
|
-
class Listener
|
23
|
-
include REXML
|
24
|
-
include StreamListener
|
25
|
-
def initialize
|
26
|
-
@get_title = false
|
27
|
-
end
|
28
|
-
|
29
|
-
def tag_start(name, attributes)
|
30
|
-
#puts "NAME" + name
|
31
|
-
#p attributes
|
32
|
-
if name == "Item" && attributes["Name"] == "Title"
|
33
|
-
@get_title = true
|
34
|
-
end
|
35
|
-
end
|
36
|
-
def text(text)
|
37
|
-
#puts "TEXT: " + text + @get_title.to_s
|
38
|
-
if @get_title
|
39
|
-
#puts "GETTING TITLE!"
|
40
|
-
$ANNOTS.push text.chomp
|
41
|
-
@get_title = false
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
|
47
|
-
# Returns a list of Annotation strings
|
48
|
-
def parse_etool_output(handle)
|
49
|
-
listener = Listener.new
|
50
|
-
parser = REXML::Parsers::StreamParser.new(handle, listener)
|
51
|
-
parser.parse
|
52
|
-
|
53
|
-
$ANNOTS
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
#$LOG = File.open("log.log", "w")
|
58
|
-
|
59
11
|
gis = ARGV.to_a.dup
|
60
12
|
|
61
|
-
|
62
|
-
batch = gis.slice!(0..BATCH_SIZE)
|
63
|
-
if batch.size == 0 then break end
|
64
|
-
string = batch.join(",")
|
65
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
|
66
|
-
#puts url
|
67
|
-
annots = []
|
68
|
-
open(url) do |handle|
|
69
|
-
annots = parse_etool_output(handle)
|
70
|
-
end
|
71
|
-
puts annots.join("\n")
|
72
|
-
end
|
73
|
-
|
74
|
-
|
75
|
-
#$LOG.close
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
=begin
|
81
|
-
|
82
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
83
|
-
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
|
84
|
-
<eSummaryResult>
|
85
|
-
|
86
|
-
<DocSum>
|
87
|
-
<Id>24115498</Id>
|
88
|
-
<Item Name="Caption" Type="String">NP_710008</Item>
|
89
|
-
<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
|
90
|
-
<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
|
91
|
-
<Item Name="Gi" Type="Integer">24115498</Item>
|
92
|
-
<Item Name="CreateDate" Type="String">2002/10/16</Item>
|
93
|
-
|
94
|
-
<Item Name="UpdateDate" Type="String">2006/04/03</Item>
|
95
|
-
<Item Name="Flags" Type="Integer">512</Item>
|
96
|
-
<Item Name="TaxId" Type="Integer">198214</Item>
|
97
|
-
<Item Name="Status" Type="String">live</Item>
|
98
|
-
<Item Name="ReplacedBy" Type="String"></Item>
|
99
|
-
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
100
|
-
</DocSum>
|
101
|
-
|
102
|
-
|
103
|
-
<DocSum>
|
104
|
-
<Id>434011</Id>
|
105
|
-
<Item Name="Caption" Type="String">CAA24741</Item>
|
106
|
-
|
107
|
-
<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
|
108
|
-
<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
|
109
|
-
<Item Name="Gi" Type="Integer">434011</Item>
|
110
|
-
<Item Name="CreateDate" Type="String">1983/12/06</Item>
|
111
|
-
<Item Name="UpdateDate" Type="String">2005/04/18</Item>
|
112
|
-
<Item Name="Flags" Type="Integer">0</Item>
|
113
|
-
<Item Name="TaxId" Type="Integer">562</Item>
|
114
|
-
<Item Name="Status" Type="String">live</Item>
|
115
|
-
<Item Name="ReplacedBy" Type="String"></Item>
|
116
|
-
|
117
|
-
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
118
|
-
</DocSum>
|
119
|
-
|
120
|
-
</eSummaryResult>
|
13
|
+
puts( GI.gi2annot(gis).join("\n") )
|
121
14
|
|
122
|
-
=end
|
data/bin/id_class_anal.rb
CHANGED
@@ -4,6 +4,7 @@ require 'spec_id'
|
|
4
4
|
require 'generator'
|
5
5
|
require 'optparse'
|
6
6
|
require 'ostruct'
|
7
|
+
require 'roc'
|
7
8
|
|
8
9
|
def file_noext(file)
|
9
10
|
file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
|
@@ -21,7 +22,8 @@ jtplot_file = jtplot_base + '.toplot'
|
|
21
22
|
OptionParser.new do |op|
|
22
23
|
op.on("-p", "--prefix PREFIX", "prefix for false positive proteins") {|v| opt.p = v.split(',') }
|
23
24
|
op.on("-j", "--jtplot", "output file '#{jtplot_file}' for jtp plotting program") {|v| opt.j = v }
|
24
|
-
op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
|
25
|
+
# op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
|
26
|
+
op.on("-a", "--area", "outputs area under the curve") {|v| opt.a = v }
|
25
27
|
end.parse!
|
26
28
|
|
27
29
|
if ARGV.size < 1
|
@@ -32,55 +34,59 @@ if ARGV.size < 1
|
|
32
34
|
probabilities) or protein_prophet-prot.xml file which has been run with
|
33
35
|
decoy proteins.
|
34
36
|
|
35
|
-
Outputs tp's
|
36
|
-
|
37
|
-
|
38
|
-
To capture to file:
|
37
|
+
Outputs tp's and precision.
|
38
|
+
[The false positive predictive rate (FPPR) is 1 - precision]
|
39
|
+
The two columns will be labeled at the top.
|
40
|
+
(delimited by '\\t') to STDOUT. To capture to file:
|
41
|
+
#{File.basename(__FILE__)} protein_file.xml > out.csv
|
39
42
|
|
40
|
-
Also takes gzipped (extension: xml.gz) files.
|
41
|
-
|
42
43
|
OPTIONS:
|
43
44
|
<s> = string
|
44
45
|
-p --prefix <s[,s...]> Prefix(s) by which to determine decoy proteins (default #{def_pre})
|
45
46
|
-j --jtplot outputs #{jtplot_file} for plotting by plot.rb
|
46
47
|
[% plot.rb -w lp --yrange n0.1:1.1 --noenhanced <file> ]
|
48
|
+
-a --area outputs area under the curve instead of tps/precision
|
47
49
|
|
48
50
|
NOTE: protein prophet files not yet functional!!!
|
49
51
|
ABBR:
|
50
52
|
TP = True Positives
|
51
53
|
FP = False Positives
|
52
54
|
Prec = Precision = TP/(TP+FP)
|
53
|
-
FPR = False Positive Rate (as defined by Gygi) 2*[FP/(TP+FP)]
|
54
55
|
"
|
55
56
|
exit
|
56
57
|
end
|
57
58
|
|
59
|
+
###########################################################
|
60
|
+
# I DON'T think option -e is functional yet...
|
61
|
+
###########################################################
|
62
|
+
|
58
63
|
files = ARGV.to_a
|
59
64
|
|
60
65
|
out = nil
|
61
66
|
if opt.j
|
62
67
|
out = File.open(jtplot_file, "w")
|
63
|
-
lines = ['XYData', jtplot_base, "Classification Analysis", "Num
|
68
|
+
lines = ['XYData', jtplot_base, "Classification Analysis", "Num Hits", "Precision"]
|
64
69
|
lines.each {|l| out.puts l}
|
65
70
|
end
|
66
71
|
|
67
72
|
headings = files.collect do |file|
|
68
|
-
%w(TP
|
73
|
+
%w(TP Precision).collect {|v| v + " (#{file_noext(file)})" }
|
69
74
|
end
|
70
|
-
#headings = ["# True Positives", "Precision (TP/(TP+FP))", "FP Rate 2*(FP/(TP+FP))"]
|
71
|
-
puts headings.flatten.join(delimiter)
|
72
75
|
|
73
76
|
all_arrs = []
|
74
77
|
files.each_with_index do |file,i|
|
75
78
|
sp = SpecID.new(file)
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
79
|
+
headers = [file_noext(file)]
|
80
|
+
arrs = sp.num_hits_and_ppv_for_prob(opt.p[i])
|
81
|
+
|
82
|
+
if opt.a
|
83
|
+
(num_hits, prec) = arrs
|
84
|
+
roc = ROC.new
|
85
|
+
prec_area = roc.area_under_curve(num_hits, prec)
|
86
|
+
puts "#{file} (area under curve [num_hits, precision])"
|
87
|
+
puts "Prec [#TPPrec = TP/(TP+FP)]:\t#{prec_area}"
|
83
88
|
end
|
89
|
+
|
84
90
|
all_arrs.push(*arrs)
|
85
91
|
|
86
92
|
lns = []
|
@@ -95,8 +101,12 @@ files.each_with_index do |file,i|
|
|
95
101
|
end
|
96
102
|
end
|
97
103
|
|
98
|
-
|
99
|
-
|
104
|
+
|
105
|
+
unless opt.a
|
106
|
+
puts headings.flatten.join(delimiter)
|
107
|
+
SyncEnumerator.new(*all_arrs).each do |row|
|
108
|
+
puts row.join(delimiter)
|
109
|
+
end
|
100
110
|
end
|
101
111
|
|
102
112
|
out.close if opt.j
|
data/bin/id_precision.rb
CHANGED
@@ -13,7 +13,7 @@ opts = OptionParser.new do |op|
|
|
13
13
|
op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml"
|
14
14
|
op.separator ""
|
15
15
|
op.separator "takes Bioworks 3.2 xml output files (with probabilities)"
|
16
|
-
op.separator "rank orders the probabilities and outputs
|
16
|
+
op.separator "rank orders the probabilities and outputs num hits and precision"
|
17
17
|
op.separator "Also takes gzipped (xml.gz) files labeled as such"
|
18
18
|
op.separator ""
|
19
19
|
op.separator "Outputs a comma separated value to STDOUT (.csv)"
|
@@ -50,10 +50,11 @@ tp_obj.peps = tp
|
|
50
50
|
two_lists = [tp_obj, fp_obj].map do |obj|
|
51
51
|
list = []
|
52
52
|
list.push( obj.pep_probs_by_pep_prots )
|
53
|
-
|
53
|
+
|
54
|
+
list.push( obj.pep_probs_by_bn_seq_charge )
|
54
55
|
# These each have a by_min and a by_top10
|
55
|
-
list.push(*( obj.
|
56
|
-
list.push(*( obj.
|
56
|
+
list.push(*( obj.pep_probs_by_bn_scan ) )
|
57
|
+
list.push(*( obj.pep_probs_by_bn_scan_charge ) )
|
57
58
|
list
|
58
59
|
end
|
59
60
|
|
@@ -61,19 +62,22 @@ end
|
|
61
62
|
headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"]
|
62
63
|
csv_headings = []
|
63
64
|
headings.each do |head|
|
64
|
-
csv_headings << head + ":
|
65
|
+
csv_headings << head + ": NH"
|
65
66
|
csv_headings << head + ": PR"
|
66
67
|
end
|
67
68
|
|
68
69
|
pairs = two_lists[0].zip two_lists[1]
|
69
70
|
|
70
|
-
roc =
|
71
|
+
roc = DecoyROC.new
|
71
72
|
x_y= []
|
72
73
|
area_under_curve = []
|
73
74
|
#start_x = []
|
74
75
|
#end_x = []
|
75
76
|
pairs.each do |pair|
|
76
|
-
x,y = roc.
|
77
|
+
#x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1])
|
78
|
+
(num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1])
|
79
|
+
x = num_hits
|
80
|
+
y = ppv
|
77
81
|
if $AREAS_ONLY
|
78
82
|
x.unshift 0
|
79
83
|
y.unshift 1.0
|
@@ -99,7 +103,7 @@ end
|
|
99
103
|
# X axis is the number of peptides id# (i.e., # of peps in TP db)
|
100
104
|
# Y axis is the precision = TP/(TP+FP)
|
101
105
|
|
102
|
-
|
106
|
+
puts "# NH = number of hits"
|
103
107
|
puts "# TP = true positives"
|
104
108
|
puts "# FP = false positives"
|
105
109
|
puts "# PR = precision = TP/(TP+FP)"
|
data/bin/protein_summary.rb
CHANGED
@@ -8,6 +8,7 @@ require 'spec_id'
|
|
8
8
|
|
9
9
|
#############################################################
|
10
10
|
# GLOBALS:
|
11
|
+
PRECISION_PROGRAM_BASE = 'precision'
|
11
12
|
DEF_PREFIX = "INV_"
|
12
13
|
DEF_PERCENT_FP = "5.0"
|
13
14
|
#############################################################
|
@@ -62,7 +63,7 @@ class Runner
|
|
62
63
|
background-color: #FF0000;
|
63
64
|
color: #FFFFFF
|
64
65
|
}
|
65
|
-
div.file_info, div.software, div.
|
66
|
+
div.file_info, div.software, div.fppr, div.num_proteins{
|
66
67
|
margin-left: 20px;
|
67
68
|
margin-top: 20px;
|
68
69
|
}
|
@@ -173,38 +174,38 @@ class Runner
|
|
173
174
|
end
|
174
175
|
|
175
176
|
# assumes that these are sorted on probability
|
176
|
-
#
|
177
|
-
# returns [number_of_prots,
|
178
|
-
def
|
179
|
-
|
180
|
-
|
177
|
+
# desired_fppr is a float
|
178
|
+
# returns [number_of_prots, actual_fppr]
|
179
|
+
def num_prots_above_fppr(prots, desired_fppr)
|
180
|
+
current_fppr_rate_percent = 0.0
|
181
|
+
previous_fppr_rate_percent = 0.0
|
181
182
|
current_sum_one_minus_prob = 0.0
|
182
|
-
|
183
|
-
|
183
|
+
proteins_within_fppr = 0
|
184
|
+
actual_fppr = nil
|
184
185
|
already_found = false
|
185
186
|
prot_cnt = 0
|
186
187
|
prots.each do |prot|
|
187
188
|
prot_cnt += 1
|
188
189
|
# SUM(1-probX)/#prots
|
189
190
|
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
190
|
-
|
191
|
+
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
191
192
|
|
192
|
-
if
|
193
|
-
|
194
|
-
|
193
|
+
if current_fppr_rate_percent > desired_fppr && !already_found
|
194
|
+
actual_fppr = previous_fppr_rate_percent
|
195
|
+
proteins_within_fppr = prot_cnt
|
195
196
|
already_found = true
|
196
197
|
end
|
197
|
-
|
198
|
+
previous_fppr_rate_percent = current_fppr_rate_percent
|
198
199
|
end
|
199
|
-
[
|
200
|
+
[proteins_within_fppr, actual_fppr]
|
200
201
|
end
|
201
202
|
|
202
|
-
#### #
|
203
|
+
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
203
204
|
|
204
205
|
# returns a string of the table rows
|
205
206
|
# false_positive_rate (give as a %) is the cutoff mark
|
206
|
-
# returns the number of proteins at the
|
207
|
-
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols,
|
207
|
+
# returns the number of proteins at the desired_fppr (if given)
|
208
|
+
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
|
208
209
|
prot_cnt = 0
|
209
210
|
uniq_prots.map do |prot|
|
210
211
|
tr do
|
@@ -267,18 +268,20 @@ class Runner
|
|
267
268
|
"<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
|
268
269
|
end
|
269
270
|
|
270
|
-
def proph_output(file, outfn, opt,
|
271
|
+
def proph_output(file, outfn, opt, fppr_output_as_html)
|
271
272
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
|
272
273
|
num_cols = header_anchors.size
|
273
274
|
theaders = ths(header_anchors)
|
274
275
|
|
275
276
|
root = AXML.parse_file(file)
|
276
277
|
prots = []
|
277
|
-
## find the min_prob at a
|
278
|
-
min_prob_redline = 1.01 # if no
|
278
|
+
## find the min_prob at a fppr of XX
|
279
|
+
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
|
279
280
|
|
280
|
-
if opt.c
|
281
|
+
if opt.c
|
281
282
|
actual_percent_fp = opt.c.to_f
|
283
|
+
elsif opt.cut_at
|
284
|
+
actual_percent_fp = opt.cut_at.to_f
|
282
285
|
else
|
283
286
|
actual_percent_fp = nil
|
284
287
|
end
|
@@ -289,20 +292,24 @@ class Runner
|
|
289
292
|
end
|
290
293
|
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
291
294
|
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
292
|
-
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
293
295
|
|
294
296
|
## num proteins above cutoff (if opt.c)
|
295
297
|
num_prots_html = ''
|
296
|
-
if opt.c
|
297
|
-
(num_prots,
|
298
|
-
num_prots_html = num_prots_to_html(
|
298
|
+
if opt.c || opt.cut_at
|
299
|
+
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
|
300
|
+
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
|
301
|
+
end
|
302
|
+
if opt.cut_at
|
303
|
+
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
|
299
304
|
end
|
300
305
|
|
306
|
+
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
307
|
+
|
301
308
|
table_string = table do
|
302
309
|
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
|
303
310
|
end
|
304
|
-
er_info = opt.
|
305
|
-
html_pieces = [outfn, header,
|
311
|
+
er_info = opt.precision ? error_info(file) : ""
|
312
|
+
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
|
306
313
|
print_html_pieces(*html_pieces)
|
307
314
|
end # proph_output
|
308
315
|
|
@@ -311,7 +318,7 @@ class Runner
|
|
311
318
|
"<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
|
312
319
|
end
|
313
320
|
|
314
|
-
def bioworks_output(file, outfn, opt,
|
321
|
+
def bioworks_output(file, outfn, opt, fppr_output_as_html)
|
315
322
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
316
323
|
num_cols = header_anchors.size
|
317
324
|
theaders = ths(header_anchors)
|
@@ -339,7 +346,7 @@ class Runner
|
|
339
346
|
table_string = table do
|
340
347
|
tr{theaders} + rows
|
341
348
|
end
|
342
|
-
print_html_pieces(outfn, header,
|
349
|
+
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
|
343
350
|
end # bioworks_output
|
344
351
|
|
345
352
|
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
|
@@ -358,7 +365,7 @@ class Runner
|
|
358
365
|
#puts lines ?? is this supposed to be commented out?
|
359
366
|
lines = lines.reject do |obj| obj =~ /\*{10}/ end
|
360
367
|
lines.map! do |line| "#{line}<br/>" end
|
361
|
-
"<div class=\"
|
368
|
+
"<div class=\"fppr\">
|
362
369
|
<h3>Classification Analysis</h3>
|
363
370
|
#{lines.join("\n")}
|
364
371
|
</div>"
|
@@ -366,7 +373,7 @@ class Runner
|
|
366
373
|
|
367
374
|
# transforms the output string of file_as_decoy into html
|
368
375
|
def prefix_as_decoy_to_html(string)
|
369
|
-
"<div class=\"
|
376
|
+
"<div class=\"fppr\">
|
370
377
|
<h3>Classification Analysis</h3>
|
371
378
|
</div>" +
|
372
379
|
string
|
@@ -384,21 +391,18 @@ class Runner
|
|
384
391
|
op.separator " outputs: <file>.summary.html"
|
385
392
|
op.separator ""
|
386
393
|
op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
387
|
-
op.
|
388
|
-
op.separator(" to
|
394
|
+
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
395
|
+
op.separator(" if --precision then -f is used to specify a file or prefix")
|
396
|
+
op.separator(" that indicates the false positives.")
|
389
397
|
op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
|
390
398
|
op.separator ""
|
391
|
-
op.separator "Options for
|
392
|
-
op.on("
|
393
|
-
op.separator(" type '
|
394
|
-
op.separator(" These options are passed on:")
|
395
|
-
op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
|
396
|
-
op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
|
397
|
-
op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
|
398
|
-
|
399
|
+
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
|
400
|
+
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
401
|
+
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
399
402
|
op.separator ""
|
400
403
|
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
401
|
-
op.on("-c", "--cutoff percent", "
|
404
|
+
op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
|
405
|
+
op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
|
402
406
|
end
|
403
407
|
|
404
408
|
opts.parse!
|
@@ -408,31 +412,23 @@ class Runner
|
|
408
412
|
exit
|
409
413
|
end
|
410
414
|
|
411
|
-
|
415
|
+
fppr_output_as_html = ''
|
412
416
|
files = argv.to_a
|
413
417
|
files.each do |file|
|
414
418
|
outfn = file.gsub(/\.xml$/, '.summary.html')
|
415
419
|
## False Positive Rate Calculation:
|
416
|
-
if opt.
|
420
|
+
if opt.precision
|
417
421
|
opt.o = outfn # won't actually be written over, but used
|
418
|
-
to_use_argv =
|
419
|
-
(out_string, opt
|
420
|
-
|
421
|
-
## DISABLE the opt.f (it's a filename) so it doesn't interfere with
|
422
|
-
## filtering:
|
423
|
-
opt.f = nil
|
424
|
-
|
425
|
-
fpr_output_as_html = file_as_decoy_to_html(out_string)
|
426
|
-
else
|
427
|
-
fpr_output_as_html = prefix_as_decoy_to_html(out_string)
|
428
|
-
end
|
422
|
+
to_use_argv = create_precision_argv(file, opt)
|
423
|
+
(out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
|
424
|
+
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
|
429
425
|
end
|
430
426
|
|
431
427
|
case SpecID.file_type(file)
|
432
428
|
when "protproph"
|
433
|
-
proph_output(file, outfn, opt,
|
429
|
+
proph_output(file, outfn, opt, fppr_output_as_html)
|
434
430
|
when "bioworks"
|
435
|
-
bioworks_output(file, outfn, opt,
|
431
|
+
bioworks_output(file, outfn, opt, fppr_output_as_html)
|
436
432
|
else
|
437
433
|
abort "filetype for #{file} not recognized!"
|
438
434
|
end
|
@@ -440,15 +436,12 @@ class Runner
|
|
440
436
|
|
441
437
|
end # method go
|
442
438
|
|
443
|
-
def
|
439
|
+
def create_precision_argv(file, opt)
|
444
440
|
# include only those options specific
|
445
441
|
new_argv = [file]
|
446
442
|
if opt.f ; new_argv << '-f' << opt.f end
|
447
|
-
if opt.g ; new_argv << '-g' end
|
448
|
-
if opt.p ; new_argv << '-p' end
|
449
|
-
if opt.n ; new_argv << '-n' end
|
450
443
|
if opt.o ; new_argv << '-o' << opt.o end
|
451
|
-
new_argv
|
444
|
+
new_argv
|
452
445
|
end
|
453
446
|
|
454
447
|
end # Runner
|
data/changelog.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
|
2
|
+
1. A couple of scripts and subroutines were hashing peptides with keys that omitted the file
|
3
|
+
basename. This would result in slightly incorrect results (any time there
|
4
|
+
were overlapping scan numbers in multiple datasets, only the top one would be
|
5
|
+
chosen). The results would be correct for single runs.
|
6
|
+
|
7
|
+
Output files that could be affected:
|
8
|
+
*.top_per_scan.txt
|
9
|
+
*.all_peps_per_scan.txt
|
10
|
+
|
11
|
+
Scripts that could be affected:
|
12
|
+
script/top_hit_per_scan.rb
|
13
|
+
bin/filter_spec_id.rb
|
14
|
+
script/filter-peps.rb
|
15
|
+
bin/id_precision.rb
|
16
|
+
|
17
|
+
Subroutines that were affected:
|
18
|
+
spec_id.rb (pep_probs_by_* )
|
19
|
+
spec_id.rb (top_peps_prefilter!)
|
20
|
+
proph.rb uniq_by_seqcharge
|
21
|
+
align.rb called uniq_by_seqcharge
|
22
|
+
|
23
|
+
|
24
|
+
2. false_positive_rate.rb and protein_summary.rb (by extension) were using
|
25
|
+
'number of true positives' as the x axis label while in reality I was plotting the
|
26
|
+
number of hits. I've updated x axis labels to reflect this change. In
|
27
|
+
addition, since the term 'false positive rate' has such a distinct definition
|
28
|
+
in classical ROC plots and binary statistics, I've decided to work primarily
|
29
|
+
in terms of precision (TP/(TP+FP)). I've purged the terms 'False Positive
|
30
|
+
Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
|
31
|
+
called the False Positive Predictive Rate (FPPR). I will probably implement
|
32
|
+
this in a future release.
|
33
|
+
|
34
|
+
|