mspire 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/bin/gi2annot.rb
CHANGED
@@ -1,13 +1,6 @@
|
|
1
1
|
#!/usr/bin/ruby -w
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'rexml/document'
|
5
|
-
require 'rexml/streamlistener'
|
6
|
-
|
7
|
-
BATCH_SIZE = 500
|
8
|
-
|
9
|
-
$LOG = nil
|
10
|
-
$ANNOTS = []
|
3
|
+
require 'gi'
|
11
4
|
|
12
5
|
if ARGV.size < 1
|
13
6
|
puts "usage: #{File.basename(__FILE__)} <gi> ..."
|
@@ -15,108 +8,7 @@ if ARGV.size < 1
|
|
15
8
|
end
|
16
9
|
|
17
10
|
|
18
|
-
# db=
|
19
|
-
# retstart=
|
20
|
-
# retmax=
|
21
|
-
|
22
|
-
class Listener
|
23
|
-
include REXML
|
24
|
-
include StreamListener
|
25
|
-
def initialize
|
26
|
-
@get_title = false
|
27
|
-
end
|
28
|
-
|
29
|
-
def tag_start(name, attributes)
|
30
|
-
#puts "NAME" + name
|
31
|
-
#p attributes
|
32
|
-
if name == "Item" && attributes["Name"] == "Title"
|
33
|
-
@get_title = true
|
34
|
-
end
|
35
|
-
end
|
36
|
-
def text(text)
|
37
|
-
#puts "TEXT: " + text + @get_title.to_s
|
38
|
-
if @get_title
|
39
|
-
#puts "GETTING TITLE!"
|
40
|
-
$ANNOTS.push text.chomp
|
41
|
-
@get_title = false
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
|
47
|
-
# Returns a list of Annotation strings
|
48
|
-
def parse_etool_output(handle)
|
49
|
-
listener = Listener.new
|
50
|
-
parser = REXML::Parsers::StreamParser.new(handle, listener)
|
51
|
-
parser.parse
|
52
|
-
|
53
|
-
$ANNOTS
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
#$LOG = File.open("log.log", "w")
|
58
|
-
|
59
11
|
gis = ARGV.to_a.dup
|
60
12
|
|
61
|
-
|
62
|
-
batch = gis.slice!(0..BATCH_SIZE)
|
63
|
-
if batch.size == 0 then break end
|
64
|
-
string = batch.join(",")
|
65
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
|
66
|
-
#puts url
|
67
|
-
annots = []
|
68
|
-
open(url) do |handle|
|
69
|
-
annots = parse_etool_output(handle)
|
70
|
-
end
|
71
|
-
puts annots.join("\n")
|
72
|
-
end
|
73
|
-
|
74
|
-
|
75
|
-
#$LOG.close
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
=begin
|
81
|
-
|
82
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
83
|
-
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
|
84
|
-
<eSummaryResult>
|
85
|
-
|
86
|
-
<DocSum>
|
87
|
-
<Id>24115498</Id>
|
88
|
-
<Item Name="Caption" Type="String">NP_710008</Item>
|
89
|
-
<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
|
90
|
-
<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
|
91
|
-
<Item Name="Gi" Type="Integer">24115498</Item>
|
92
|
-
<Item Name="CreateDate" Type="String">2002/10/16</Item>
|
93
|
-
|
94
|
-
<Item Name="UpdateDate" Type="String">2006/04/03</Item>
|
95
|
-
<Item Name="Flags" Type="Integer">512</Item>
|
96
|
-
<Item Name="TaxId" Type="Integer">198214</Item>
|
97
|
-
<Item Name="Status" Type="String">live</Item>
|
98
|
-
<Item Name="ReplacedBy" Type="String"></Item>
|
99
|
-
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
100
|
-
</DocSum>
|
101
|
-
|
102
|
-
|
103
|
-
<DocSum>
|
104
|
-
<Id>434011</Id>
|
105
|
-
<Item Name="Caption" Type="String">CAA24741</Item>
|
106
|
-
|
107
|
-
<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
|
108
|
-
<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
|
109
|
-
<Item Name="Gi" Type="Integer">434011</Item>
|
110
|
-
<Item Name="CreateDate" Type="String">1983/12/06</Item>
|
111
|
-
<Item Name="UpdateDate" Type="String">2005/04/18</Item>
|
112
|
-
<Item Name="Flags" Type="Integer">0</Item>
|
113
|
-
<Item Name="TaxId" Type="Integer">562</Item>
|
114
|
-
<Item Name="Status" Type="String">live</Item>
|
115
|
-
<Item Name="ReplacedBy" Type="String"></Item>
|
116
|
-
|
117
|
-
<Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
|
118
|
-
</DocSum>
|
119
|
-
|
120
|
-
</eSummaryResult>
|
13
|
+
puts( GI.gi2annot(gis).join("\n") )
|
121
14
|
|
122
|
-
=end
|
data/bin/id_class_anal.rb
CHANGED
@@ -4,6 +4,7 @@ require 'spec_id'
|
|
4
4
|
require 'generator'
|
5
5
|
require 'optparse'
|
6
6
|
require 'ostruct'
|
7
|
+
require 'roc'
|
7
8
|
|
8
9
|
def file_noext(file)
|
9
10
|
file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
|
@@ -21,7 +22,8 @@ jtplot_file = jtplot_base + '.toplot'
|
|
21
22
|
OptionParser.new do |op|
|
22
23
|
op.on("-p", "--prefix PREFIX", "prefix for false positive proteins") {|v| opt.p = v.split(',') }
|
23
24
|
op.on("-j", "--jtplot", "output file '#{jtplot_file}' for jtp plotting program") {|v| opt.j = v }
|
24
|
-
op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
|
25
|
+
# op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
|
26
|
+
op.on("-a", "--area", "outputs area under the curve") {|v| opt.a = v }
|
25
27
|
end.parse!
|
26
28
|
|
27
29
|
if ARGV.size < 1
|
@@ -32,55 +34,59 @@ if ARGV.size < 1
|
|
32
34
|
probabilities) or protein_prophet-prot.xml file which has been run with
|
33
35
|
decoy proteins.
|
34
36
|
|
35
|
-
Outputs tp's
|
36
|
-
|
37
|
-
|
38
|
-
To capture to file:
|
37
|
+
Outputs tp's and precision.
|
38
|
+
[The false positive predictive rate (FPPR) is 1 - precision]
|
39
|
+
The two columns will be labeled at the top.
|
40
|
+
(delimited by '\\t') to STDOUT. To capture to file:
|
41
|
+
#{File.basename(__FILE__)} protein_file.xml > out.csv
|
39
42
|
|
40
|
-
Also takes gzipped (extension: xml.gz) files.
|
41
|
-
|
42
43
|
OPTIONS:
|
43
44
|
<s> = string
|
44
45
|
-p --prefix <s[,s...]> Prefix(s) by which to determine decoy proteins (default #{def_pre})
|
45
46
|
-j --jtplot outputs #{jtplot_file} for plotting by plot.rb
|
46
47
|
[% plot.rb -w lp --yrange n0.1:1.1 --noenhanced <file> ]
|
48
|
+
-a --area outputs area under the curve instead of tps/precision
|
47
49
|
|
48
50
|
NOTE: protein prophet files not yet functional!!!
|
49
51
|
ABBR:
|
50
52
|
TP = True Positives
|
51
53
|
FP = False Positives
|
52
54
|
Prec = Precision = TP/(TP+FP)
|
53
|
-
FPR = False Positive Rate (as defined by Gygi) 2*[FP/(TP+FP)]
|
54
55
|
"
|
55
56
|
exit
|
56
57
|
end
|
57
58
|
|
59
|
+
###########################################################
|
60
|
+
# I DON'T think option -e is functional yet...
|
61
|
+
###########################################################
|
62
|
+
|
58
63
|
files = ARGV.to_a
|
59
64
|
|
60
65
|
out = nil
|
61
66
|
if opt.j
|
62
67
|
out = File.open(jtplot_file, "w")
|
63
|
-
lines = ['XYData', jtplot_base, "Classification Analysis", "Num
|
68
|
+
lines = ['XYData', jtplot_base, "Classification Analysis", "Num Hits", "Precision"]
|
64
69
|
lines.each {|l| out.puts l}
|
65
70
|
end
|
66
71
|
|
67
72
|
headings = files.collect do |file|
|
68
|
-
%w(TP
|
73
|
+
%w(TP Precision).collect {|v| v + " (#{file_noext(file)})" }
|
69
74
|
end
|
70
|
-
#headings = ["# True Positives", "Precision (TP/(TP+FP))", "FP Rate 2*(FP/(TP+FP))"]
|
71
|
-
puts headings.flatten.join(delimiter)
|
72
75
|
|
73
76
|
all_arrs = []
|
74
77
|
files.each_with_index do |file,i|
|
75
78
|
sp = SpecID.new(file)
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
79
|
+
headers = [file_noext(file)]
|
80
|
+
arrs = sp.num_hits_and_ppv_for_prob(opt.p[i])
|
81
|
+
|
82
|
+
if opt.a
|
83
|
+
(num_hits, prec) = arrs
|
84
|
+
roc = ROC.new
|
85
|
+
prec_area = roc.area_under_curve(num_hits, prec)
|
86
|
+
puts "#{file} (area under curve [num_hits, precision])"
|
87
|
+
puts "Prec [#TPPrec = TP/(TP+FP)]:\t#{prec_area}"
|
83
88
|
end
|
89
|
+
|
84
90
|
all_arrs.push(*arrs)
|
85
91
|
|
86
92
|
lns = []
|
@@ -95,8 +101,12 @@ files.each_with_index do |file,i|
|
|
95
101
|
end
|
96
102
|
end
|
97
103
|
|
98
|
-
|
99
|
-
|
104
|
+
|
105
|
+
unless opt.a
|
106
|
+
puts headings.flatten.join(delimiter)
|
107
|
+
SyncEnumerator.new(*all_arrs).each do |row|
|
108
|
+
puts row.join(delimiter)
|
109
|
+
end
|
100
110
|
end
|
101
111
|
|
102
112
|
out.close if opt.j
|
data/bin/id_precision.rb
CHANGED
@@ -13,7 +13,7 @@ opts = OptionParser.new do |op|
|
|
13
13
|
op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml"
|
14
14
|
op.separator ""
|
15
15
|
op.separator "takes Bioworks 3.2 xml output files (with probabilities)"
|
16
|
-
op.separator "rank orders the probabilities and outputs
|
16
|
+
op.separator "rank orders the probabilities and outputs num hits and precision"
|
17
17
|
op.separator "Also takes gzipped (xml.gz) files labeled as such"
|
18
18
|
op.separator ""
|
19
19
|
op.separator "Outputs a comma separated value to STDOUT (.csv)"
|
@@ -50,10 +50,11 @@ tp_obj.peps = tp
|
|
50
50
|
two_lists = [tp_obj, fp_obj].map do |obj|
|
51
51
|
list = []
|
52
52
|
list.push( obj.pep_probs_by_pep_prots )
|
53
|
-
|
53
|
+
|
54
|
+
list.push( obj.pep_probs_by_bn_seq_charge )
|
54
55
|
# These each have a by_min and a by_top10
|
55
|
-
list.push(*( obj.
|
56
|
-
list.push(*( obj.
|
56
|
+
list.push(*( obj.pep_probs_by_bn_scan ) )
|
57
|
+
list.push(*( obj.pep_probs_by_bn_scan_charge ) )
|
57
58
|
list
|
58
59
|
end
|
59
60
|
|
@@ -61,19 +62,22 @@ end
|
|
61
62
|
headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"]
|
62
63
|
csv_headings = []
|
63
64
|
headings.each do |head|
|
64
|
-
csv_headings << head + ":
|
65
|
+
csv_headings << head + ": NH"
|
65
66
|
csv_headings << head + ": PR"
|
66
67
|
end
|
67
68
|
|
68
69
|
pairs = two_lists[0].zip two_lists[1]
|
69
70
|
|
70
|
-
roc =
|
71
|
+
roc = DecoyROC.new
|
71
72
|
x_y= []
|
72
73
|
area_under_curve = []
|
73
74
|
#start_x = []
|
74
75
|
#end_x = []
|
75
76
|
pairs.each do |pair|
|
76
|
-
x,y = roc.
|
77
|
+
#x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1])
|
78
|
+
(num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1])
|
79
|
+
x = num_hits
|
80
|
+
y = ppv
|
77
81
|
if $AREAS_ONLY
|
78
82
|
x.unshift 0
|
79
83
|
y.unshift 1.0
|
@@ -99,7 +103,7 @@ end
|
|
99
103
|
# X axis is the number of peptides id# (i.e., # of peps in TP db)
|
100
104
|
# Y axis is the precision = TP/(TP+FP)
|
101
105
|
|
102
|
-
|
106
|
+
puts "# NH = number of hits"
|
103
107
|
puts "# TP = true positives"
|
104
108
|
puts "# FP = false positives"
|
105
109
|
puts "# PR = precision = TP/(TP+FP)"
|
data/bin/protein_summary.rb
CHANGED
@@ -8,6 +8,7 @@ require 'spec_id'
|
|
8
8
|
|
9
9
|
#############################################################
|
10
10
|
# GLOBALS:
|
11
|
+
PRECISION_PROGRAM_BASE = 'precision'
|
11
12
|
DEF_PREFIX = "INV_"
|
12
13
|
DEF_PERCENT_FP = "5.0"
|
13
14
|
#############################################################
|
@@ -62,7 +63,7 @@ class Runner
|
|
62
63
|
background-color: #FF0000;
|
63
64
|
color: #FFFFFF
|
64
65
|
}
|
65
|
-
div.file_info, div.software, div.
|
66
|
+
div.file_info, div.software, div.fppr, div.num_proteins{
|
66
67
|
margin-left: 20px;
|
67
68
|
margin-top: 20px;
|
68
69
|
}
|
@@ -173,38 +174,38 @@ class Runner
|
|
173
174
|
end
|
174
175
|
|
175
176
|
# assumes that these are sorted on probability
|
176
|
-
#
|
177
|
-
# returns [number_of_prots,
|
178
|
-
def
|
179
|
-
|
180
|
-
|
177
|
+
# desired_fppr is a float
|
178
|
+
# returns [number_of_prots, actual_fppr]
|
179
|
+
def num_prots_above_fppr(prots, desired_fppr)
|
180
|
+
current_fppr_rate_percent = 0.0
|
181
|
+
previous_fppr_rate_percent = 0.0
|
181
182
|
current_sum_one_minus_prob = 0.0
|
182
|
-
|
183
|
-
|
183
|
+
proteins_within_fppr = 0
|
184
|
+
actual_fppr = nil
|
184
185
|
already_found = false
|
185
186
|
prot_cnt = 0
|
186
187
|
prots.each do |prot|
|
187
188
|
prot_cnt += 1
|
188
189
|
# SUM(1-probX)/#prots
|
189
190
|
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
190
|
-
|
191
|
+
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
191
192
|
|
192
|
-
if
|
193
|
-
|
194
|
-
|
193
|
+
if current_fppr_rate_percent > desired_fppr && !already_found
|
194
|
+
actual_fppr = previous_fppr_rate_percent
|
195
|
+
proteins_within_fppr = prot_cnt
|
195
196
|
already_found = true
|
196
197
|
end
|
197
|
-
|
198
|
+
previous_fppr_rate_percent = current_fppr_rate_percent
|
198
199
|
end
|
199
|
-
[
|
200
|
+
[proteins_within_fppr, actual_fppr]
|
200
201
|
end
|
201
202
|
|
202
|
-
#### #
|
203
|
+
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
203
204
|
|
204
205
|
# returns a string of the table rows
|
205
206
|
# false_positive_rate (give as a %) is the cutoff mark
|
206
|
-
# returns the number of proteins at the
|
207
|
-
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols,
|
207
|
+
# returns the number of proteins at the desired_fppr (if given)
|
208
|
+
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
|
208
209
|
prot_cnt = 0
|
209
210
|
uniq_prots.map do |prot|
|
210
211
|
tr do
|
@@ -267,18 +268,20 @@ class Runner
|
|
267
268
|
"<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
|
268
269
|
end
|
269
270
|
|
270
|
-
def proph_output(file, outfn, opt,
|
271
|
+
def proph_output(file, outfn, opt, fppr_output_as_html)
|
271
272
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
|
272
273
|
num_cols = header_anchors.size
|
273
274
|
theaders = ths(header_anchors)
|
274
275
|
|
275
276
|
root = AXML.parse_file(file)
|
276
277
|
prots = []
|
277
|
-
## find the min_prob at a
|
278
|
-
min_prob_redline = 1.01 # if no
|
278
|
+
## find the min_prob at a fppr of XX
|
279
|
+
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
|
279
280
|
|
280
|
-
if opt.c
|
281
|
+
if opt.c
|
281
282
|
actual_percent_fp = opt.c.to_f
|
283
|
+
elsif opt.cut_at
|
284
|
+
actual_percent_fp = opt.cut_at.to_f
|
282
285
|
else
|
283
286
|
actual_percent_fp = nil
|
284
287
|
end
|
@@ -289,20 +292,24 @@ class Runner
|
|
289
292
|
end
|
290
293
|
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
291
294
|
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
292
|
-
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
293
295
|
|
294
296
|
## num proteins above cutoff (if opt.c or opt.cut_at)
|
295
297
|
num_prots_html = ''
|
296
|
-
if opt.c
|
297
|
-
(num_prots,
|
298
|
-
num_prots_html = num_prots_to_html(
|
298
|
+
if opt.c || opt.cut_at
|
299
|
+
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
|
300
|
+
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
|
301
|
+
end
|
302
|
+
if opt.cut_at
|
303
|
+
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
|
299
304
|
end
|
300
305
|
|
306
|
+
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
307
|
+
|
301
308
|
table_string = table do
|
302
309
|
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
|
303
310
|
end
|
304
|
-
er_info = opt.
|
305
|
-
html_pieces = [outfn, header,
|
311
|
+
er_info = opt.precision ? error_info(file) : ""
|
312
|
+
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
|
306
313
|
print_html_pieces(*html_pieces)
|
307
314
|
end # proph_output
|
308
315
|
|
@@ -311,7 +318,7 @@ class Runner
|
|
311
318
|
"<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
|
312
319
|
end
|
313
320
|
|
314
|
-
def bioworks_output(file, outfn, opt,
|
321
|
+
def bioworks_output(file, outfn, opt, fppr_output_as_html)
|
315
322
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
316
323
|
num_cols = header_anchors.size
|
317
324
|
theaders = ths(header_anchors)
|
@@ -339,7 +346,7 @@ class Runner
|
|
339
346
|
table_string = table do
|
340
347
|
tr{theaders} + rows
|
341
348
|
end
|
342
|
-
print_html_pieces(outfn, header,
|
349
|
+
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
|
343
350
|
end # bioworks_output
|
344
351
|
|
345
352
|
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
|
@@ -358,7 +365,7 @@ class Runner
|
|
358
365
|
#puts lines ?? is this supposed to be commented out?
|
359
366
|
lines = lines.reject do |obj| obj =~ /\*{10}/ end
|
360
367
|
lines.map! do |line| "#{line}<br/>" end
|
361
|
-
"<div class=\"
|
368
|
+
"<div class=\"fppr\">
|
362
369
|
<h3>Classification Analysis</h3>
|
363
370
|
#{lines.join("\n")}
|
364
371
|
</div>"
|
@@ -366,7 +373,7 @@ class Runner
|
|
366
373
|
|
367
374
|
# transforms the output string of file_as_decoy into html
|
368
375
|
def prefix_as_decoy_to_html(string)
|
369
|
-
"<div class=\"
|
376
|
+
"<div class=\"fppr\">
|
370
377
|
<h3>Classification Analysis</h3>
|
371
378
|
</div>" +
|
372
379
|
string
|
@@ -384,21 +391,18 @@ class Runner
|
|
384
391
|
op.separator " outputs: <file>.summary.html"
|
385
392
|
op.separator ""
|
386
393
|
op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
387
|
-
op.
|
388
|
-
op.separator(" to
|
394
|
+
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
395
|
+
op.separator(" if --precision then -f is used to specify a file or prefix")
|
396
|
+
op.separator(" that indicates the false positives.")
|
389
397
|
op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
|
390
398
|
op.separator ""
|
391
|
-
op.separator "Options for
|
392
|
-
op.on("
|
393
|
-
op.separator(" type '
|
394
|
-
op.separator(" These options are passed on:")
|
395
|
-
op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
|
396
|
-
op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
|
397
|
-
op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
|
398
|
-
|
399
|
+
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
|
400
|
+
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
401
|
+
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
399
402
|
op.separator ""
|
400
403
|
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
401
|
-
op.on("-c", "--cutoff percent", "
|
404
|
+
op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
|
405
|
+
op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
|
402
406
|
end
|
403
407
|
|
404
408
|
opts.parse!
|
@@ -408,31 +412,23 @@ class Runner
|
|
408
412
|
exit
|
409
413
|
end
|
410
414
|
|
411
|
-
|
415
|
+
fppr_output_as_html = ''
|
412
416
|
files = argv.to_a
|
413
417
|
files.each do |file|
|
414
418
|
outfn = file.gsub(/\.xml$/, '.summary.html')
|
415
419
|
## False Positive Rate Calculation:
|
416
|
-
if opt.
|
420
|
+
if opt.precision
|
417
421
|
opt.o = outfn # won't actually be written over, but used
|
418
|
-
to_use_argv =
|
419
|
-
(out_string, opt
|
420
|
-
|
421
|
-
## DISABLE the opt.f (it's a filename) so it doesn't interfere with
|
422
|
-
## filtering:
|
423
|
-
opt.f = nil
|
424
|
-
|
425
|
-
fpr_output_as_html = file_as_decoy_to_html(out_string)
|
426
|
-
else
|
427
|
-
fpr_output_as_html = prefix_as_decoy_to_html(out_string)
|
428
|
-
end
|
422
|
+
to_use_argv = create_precision_argv(file, opt)
|
423
|
+
(out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
|
424
|
+
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
|
429
425
|
end
|
430
426
|
|
431
427
|
case SpecID.file_type(file)
|
432
428
|
when "protproph"
|
433
|
-
proph_output(file, outfn, opt,
|
429
|
+
proph_output(file, outfn, opt, fppr_output_as_html)
|
434
430
|
when "bioworks"
|
435
|
-
bioworks_output(file, outfn, opt,
|
431
|
+
bioworks_output(file, outfn, opt, fppr_output_as_html)
|
436
432
|
else
|
437
433
|
abort "filetype for #{file} not recognized!"
|
438
434
|
end
|
@@ -440,15 +436,12 @@ class Runner
|
|
440
436
|
|
441
437
|
end # method go
|
442
438
|
|
443
|
-
def
|
439
|
+
def create_precision_argv(file, opt)
|
444
440
|
# include only those options specific to the precision program
|
445
441
|
new_argv = [file]
|
446
442
|
if opt.f ; new_argv << '-f' << opt.f end
|
447
|
-
if opt.g ; new_argv << '-g' end
|
448
|
-
if opt.p ; new_argv << '-p' end
|
449
|
-
if opt.n ; new_argv << '-n' end
|
450
443
|
if opt.o ; new_argv << '-o' << opt.o end
|
451
|
-
new_argv
|
444
|
+
new_argv
|
452
445
|
end
|
453
446
|
|
454
447
|
end # Runner
|
data/changelog.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
|
2
|
+
1. A couple of scripts and subroutines were hashing peptides but not on the file
|
3
|
+
basename. This would result in slightly incorrect results (any time there
|
4
|
+
were overlapping scan numbers in multiple datasets, only the top one would be
|
5
|
+
chosen). The results would be correct for single runs.
|
6
|
+
|
7
|
+
Output files that could be affected:
|
8
|
+
*.top_per_scan.txt
|
9
|
+
*.all_peps_per_scan.txt
|
10
|
+
|
11
|
+
Scripts that could be affected:
|
12
|
+
script/top_hit_per_scan.rb
|
13
|
+
bin/filter_spec_id.rb
|
14
|
+
script/filter-peps.rb
|
15
|
+
bin/id_precision.rb
|
16
|
+
|
17
|
+
Subroutines that were affected:
|
18
|
+
spec_id.rb (pep_probs_by_* )
|
19
|
+
spec_id.rb (top_peps_prefilter!)
|
20
|
+
proph.rb uniq_by_seqcharge
|
21
|
+
align.rb called uniq_by_seqcharge
|
22
|
+
|
23
|
+
|
24
|
+
2. false_positive_rate.rb and protein_summary.rb (by extension) were using
|
25
|
+
number of true positives on the x axis while in reality I was plotting the
|
26
|
+
number of hits. I've updated x axis labels to reflect this change. In
|
27
|
+
addition, since the term 'false positive rate' has such a distinct definition
|
28
|
+
in classical ROC plots and binary statistics, I've decided to work primarily
|
29
|
+
in terms of precision (TP/(TP+FP)). I've purged the terms 'False Positive
|
30
|
+
Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
|
31
|
+
called the False Positive Predictive Rate (FPPR). I will probably implement
|
32
|
+
this in a future release.
|
33
|
+
|
34
|
+
|