mspire 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/lib/spec_id/precision.rb
CHANGED
@@ -2,22 +2,25 @@
|
|
2
2
|
require 'optparse'
|
3
3
|
require 'ostruct'
|
4
4
|
require 'generator'
|
5
|
-
require 'gnuplot'
|
6
5
|
require 'roc'
|
7
6
|
|
7
|
+
## silence this bad boy
|
8
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
9
|
+
require 'gnuplot'
|
10
|
+
$VERBOSE = tmp
|
11
|
+
|
8
12
|
class String
|
9
13
|
def margin
|
10
14
|
self.gsub(/^\s*\|/,'')
|
11
15
|
end
|
12
16
|
end
|
13
17
|
|
14
|
-
class
|
15
|
-
class SpecID::Precision ; end
|
18
|
+
class Prec ; end
|
16
19
|
|
17
|
-
module
|
20
|
+
module Prec::PlotHelper
|
18
21
|
|
19
22
|
PLOT_TYPE = 'XYData'
|
20
|
-
TITLE = 'Precision
|
23
|
+
TITLE = 'Precision vs. Num Hits [ Precision = Positive Predictive Value = TP/(TP+FP) ]'
|
21
24
|
XAXIS = 'Num Hits (excludes known false positives)'
|
22
25
|
EXT = '.toplot'
|
23
26
|
IMAGE_EXT = '.png'
|
@@ -26,6 +29,8 @@ module SpecID::Precision::PlotHelper
|
|
26
29
|
## CREATE the PLOT IMAGE:
|
27
30
|
to_plot = filename_noext + EXT
|
28
31
|
png = filename_noext + IMAGE_EXT
|
32
|
+
|
33
|
+
|
29
34
|
File.open(to_plot,'w') do |out|
|
30
35
|
out.puts PLOT_TYPE
|
31
36
|
out.puts filename_noext
|
@@ -55,6 +60,8 @@ module SpecID::Precision::PlotHelper
|
|
55
60
|
## CREATE the PLOT IMAGE:
|
56
61
|
to_plot = filename_noext+'.toplot'
|
57
62
|
png = filename_noext+'.png'
|
63
|
+
|
64
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
58
65
|
Gnuplot.open do |gp|
|
59
66
|
Gnuplot::Plot.new( gp ) do |plot|
|
60
67
|
plot.terminal "png noenhanced"
|
@@ -76,6 +83,7 @@ module SpecID::Precision::PlotHelper
|
|
76
83
|
end
|
77
84
|
end
|
78
85
|
end
|
86
|
+
$VERBOSE = tmp
|
79
87
|
|
80
88
|
## CREATE the HTML to load the plot:
|
81
89
|
basename_filename_noext = File.basename(filename_noext)
|
@@ -88,7 +96,7 @@ module SpecID::Precision::PlotHelper
|
|
88
96
|
|
89
97
|
end
|
90
98
|
|
91
|
-
module
|
99
|
+
module Prec::HTML
|
92
100
|
|
93
101
|
# html and body tags
|
94
102
|
def html
|
@@ -173,23 +181,22 @@ module SpecID::Precision::HTML
|
|
173
181
|
end
|
174
182
|
end # module HTML
|
175
183
|
|
176
|
-
class
|
177
|
-
include
|
184
|
+
class Prec
|
185
|
+
include Prec::PlotHelper
|
178
186
|
|
179
187
|
###########################################################
|
180
188
|
# GLOBAL SETTINGS:
|
181
|
-
DEF_PREFIX = "INV_"
|
182
189
|
DATA_PREC = 4 # decimal places of precision for ppv data
|
183
190
|
STDOUT_JTPLOT_BASE = "ppv" # if there is no outfile
|
184
191
|
###########################################################
|
185
192
|
|
186
|
-
include
|
193
|
+
include Prec::HTML
|
187
194
|
|
188
195
|
## returns an html string
|
189
196
|
def precision(argv)
|
190
197
|
opt = parse_args(argv)
|
191
198
|
files = argv.to_a
|
192
|
-
out_string =
|
199
|
+
out_string = create_precision_data(files, opt)
|
193
200
|
[out_string, opt]
|
194
201
|
end
|
195
202
|
|
@@ -270,24 +277,6 @@ Example:
|
|
270
277
|
end
|
271
278
|
|
272
279
|
|
273
|
-
# takes a comma separated list and extends the last to create an array of
|
274
|
-
# desired size
|
275
|
-
def prefixes(arg, desired_size)
|
276
|
-
arg_arr = arg.split(',')
|
277
|
-
new_arr = []
|
278
|
-
last_arg = arg_arr[0]
|
279
|
-
desired_size.times do |i|
|
280
|
-
if arg_arr[i]
|
281
|
-
new_arr[i] = arg_arr[i]
|
282
|
-
last_arg = new_arr[i]
|
283
|
-
else
|
284
|
-
new_arr[i] = last_arg
|
285
|
-
end
|
286
|
-
end
|
287
|
-
new_arr
|
288
|
-
end
|
289
|
-
|
290
|
-
|
291
280
|
## collapses arrays to one level deep so we can sync them up
|
292
281
|
def arrays_to_one_level_deep(all_arrs)
|
293
282
|
mostly_flat = []
|
@@ -352,7 +341,7 @@ Example:
|
|
352
341
|
def y_axis_label(key)
|
353
342
|
## We only take the keys for the first file, as it's assumed that the major
|
354
343
|
## labels will be identical for all of them
|
355
|
-
labels = key.first.map {|tp| tp.first }
|
344
|
+
labels = key.first.map {|tp| tp.first }.uniq
|
356
345
|
labels.join " | "
|
357
346
|
end
|
358
347
|
|
@@ -367,11 +356,14 @@ Example:
|
|
367
356
|
new_string
|
368
357
|
end
|
369
358
|
|
370
|
-
|
371
|
-
|
359
|
+
# if opt.f, then a prefix is assumed.
|
360
|
+
# if a file =~ /-prot.xml$/ then a precision plot based on probability is
|
361
|
+
# also created
|
362
|
+
def create_precision_data(files, opt)
|
363
|
+
#$stderr.puts "using prefix #{opt.f} ..."
|
372
364
|
|
373
365
|
if opt.f
|
374
|
-
prefix_arr =
|
366
|
+
prefix_arr = SpecID.extend_args(opt.f, files.size)
|
375
367
|
end
|
376
368
|
all_arrs = []
|
377
369
|
key = []
|
@@ -384,12 +376,13 @@ Example:
|
|
384
376
|
if opt.f
|
385
377
|
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
|
386
378
|
all_arrs[i] << [num_hits,ppv]
|
387
|
-
key[i] << ["Precision", ["#
|
388
|
-
|
379
|
+
key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
|
380
|
+
end
|
381
|
+
if file =~ /-prot\.xml$/
|
389
382
|
## These are just from protein prophet probabilities:
|
390
383
|
(num_hits, ppv) = sp.num_hits_and_ppv_for_protein_prophet_probabilities
|
391
384
|
all_arrs[i] << [num_hits,ppv]
|
392
|
-
key[i] << ["Precision", ["#
|
385
|
+
key[i] << ["Precision", ["# hits", "Prec (prob)"]]
|
393
386
|
end
|
394
387
|
end
|
395
388
|
|
data/lib/spec_id/proph.rb
CHANGED
@@ -5,7 +5,6 @@ require 'instance_var_set_from_hash'
|
|
5
5
|
require 'axml'
|
6
6
|
require 'spec_id'
|
7
7
|
|
8
|
-
class SpecID
|
9
8
|
class Proph
|
10
9
|
|
11
10
|
|
@@ -20,6 +19,8 @@ end
|
|
20
19
|
|
21
20
|
|
22
21
|
class ProtSummary
|
22
|
+
include SpecID
|
23
|
+
|
23
24
|
attr_writer :prots
|
24
25
|
attr_accessor :prot_groups
|
25
26
|
|
@@ -102,7 +103,8 @@ class ProtGroup
|
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
class Prot
|
106
|
+
class Prot
|
107
|
+
include SpecID::Prot
|
106
108
|
|
107
109
|
## probability and reference accessors are inherited
|
108
110
|
attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description
|
@@ -137,6 +139,7 @@ class Prot < SpecID::Prot
|
|
137
139
|
end # class Prot
|
138
140
|
|
139
141
|
class Pep
|
142
|
+
include SpecID::Pep
|
140
143
|
|
141
144
|
attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
|
142
145
|
attr_writer :arithmetic_avg_scan_by_parent_time
|
@@ -458,4 +461,3 @@ end # Prot::Parser
|
|
458
461
|
################ --END
|
459
462
|
|
460
463
|
end # Proph
|
461
|
-
end # SpecID
|
@@ -0,0 +1,459 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'axml'
|
4
|
+
require 'hash_by'
|
5
|
+
require 'optparse'
|
6
|
+
require 'ostruct'
|
7
|
+
require 'spec_id'
|
8
|
+
require 'spec_id/precision'
|
9
|
+
|
10
|
+
#############################################################
|
11
|
+
# GLOBALS:
|
12
|
+
PRECISION_PROGRAM_BASE = 'precision'
|
13
|
+
DEF_PREFIX = "INV_"
|
14
|
+
DEF_PERCENT_FP = "5.0"
|
15
|
+
#############################################################
|
16
|
+
|
17
|
+
|
18
|
+
# @TODO: add group probability title (showin all group probabilities) for protein prob
|
19
|
+
|
20
|
+
#class String
|
21
|
+
# def margin
|
22
|
+
# self.gsub(/^\s*\|/,'')
|
23
|
+
# end
|
24
|
+
#end
|
25
|
+
|
26
|
+
|
27
|
+
class ProteinSummary
|
28
|
+
module HTML
|
29
|
+
def header
|
30
|
+
%Q{<html>
|
31
|
+
<head
|
32
|
+
#{style}
|
33
|
+
</head>
|
34
|
+
<body>
|
35
|
+
<script type="text/javascript">
|
36
|
+
<!--
|
37
|
+
function toggle_vis(id) {
|
38
|
+
var e = document.getElementById(id);
|
39
|
+
if(e.style.display == 'none')
|
40
|
+
e.style.display = 'block';
|
41
|
+
else
|
42
|
+
e.style.display = 'none';
|
43
|
+
}
|
44
|
+
//-->
|
45
|
+
</script>
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
def style
|
50
|
+
'
|
51
|
+
<style type="text/css">
|
52
|
+
table {
|
53
|
+
border-width:1px;
|
54
|
+
border-color:#DDDDDD;
|
55
|
+
border-collapse: collapse;
|
56
|
+
}
|
57
|
+
td,th {
|
58
|
+
padding-top: 2px;
|
59
|
+
padding-bottom: 2px;
|
60
|
+
padding-left: 5;
|
61
|
+
padding-right: 5;
|
62
|
+
}
|
63
|
+
td.redline {
|
64
|
+
background-color: #FF0000;
|
65
|
+
color: #FFFFFF
|
66
|
+
}
|
67
|
+
div.file_info, div.software, div.fppr, div.num_proteins{
|
68
|
+
margin-left: 20px;
|
69
|
+
margin-top: 20px;
|
70
|
+
}
|
71
|
+
div.main {
|
72
|
+
margin-left: 10px;
|
73
|
+
margin-right: 10px;
|
74
|
+
margin-top: 50px;
|
75
|
+
margin-bottom: 50px;
|
76
|
+
}
|
77
|
+
div#error {
|
78
|
+
margin: 30px;
|
79
|
+
text-align:center
|
80
|
+
}
|
81
|
+
hr {color: sienna}
|
82
|
+
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
|
83
|
+
</style>
|
84
|
+
'
|
85
|
+
end
|
86
|
+
|
87
|
+
# an anchor and a title
|
88
|
+
def at(display, title)
|
89
|
+
"<a title=\"#{title}\">#{display}</a>"
|
90
|
+
end
|
91
|
+
|
92
|
+
def trailer
|
93
|
+
%q{
|
94
|
+
</body>
|
95
|
+
</html>
|
96
|
+
}
|
97
|
+
end
|
98
|
+
|
99
|
+
def tr
|
100
|
+
"|<tr>
|
101
|
+
| #{yield}
|
102
|
+
|</tr>\n".margin
|
103
|
+
end
|
104
|
+
|
105
|
+
def table
|
106
|
+
"|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">
|
107
|
+
| #{yield}
|
108
|
+
|</table></div>\n".margin
|
109
|
+
end
|
110
|
+
|
111
|
+
def tds(arr)
|
112
|
+
arr.map {|v| "<td>#{v}</td>"}.join
|
113
|
+
end
|
114
|
+
|
115
|
+
def ths(arr)
|
116
|
+
str = arr.map {|v| "<th>#{v}</th>"}.join
|
117
|
+
str << "\n"
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
|
124
|
+
class ProteinSummary
|
125
|
+
|
126
|
+
include ProteinSummary::HTML
|
127
|
+
|
128
|
+
def ref_html(gi, name)
|
129
|
+
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
|
130
|
+
end
|
131
|
+
|
132
|
+
# Takes the -prot.xml filename and grabs the png file (if available)
|
133
|
+
def error_info(prot_file_name)
|
134
|
+
img = prot_file_name.gsub('.xml', '.png')
|
135
|
+
img_bn = File.basename(img)
|
136
|
+
"<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
|
137
|
+
end
|
138
|
+
|
139
|
+
# attempts to get the NCBI gi code
|
140
|
+
def accession(name)
|
141
|
+
if (name.include? '|') && (name[0,3] == 'gi|')
|
142
|
+
name.split('|')[1]
|
143
|
+
else
|
144
|
+
name
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
def prefix_to_regex(prefix)
|
149
|
+
if prefix
|
150
|
+
/^#{Regexp.escape(prefix)}/
|
151
|
+
else
|
152
|
+
nil
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
# given a list of proteins, output a tab delimited textfile with protein
|
157
|
+
# name and the total number of peptides found
|
158
|
+
def output_peptide_counts_file(prots, filename)
|
159
|
+
File.open(filename, "w") do |fh_out|
|
160
|
+
prots.each do |prot|
|
161
|
+
fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
# filters on the false positive regex and sorts by prot probability
|
167
|
+
def filter_and_sort(uniq_prots, prefix=nil)
|
168
|
+
prefix_re = prefix_to_regex(prefix)
|
169
|
+
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
|
170
|
+
## filter on prefix
|
171
|
+
if prefix
|
172
|
+
sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
|
173
|
+
end
|
174
|
+
sorted
|
175
|
+
end
|
176
|
+
|
177
|
+
# assumes that these are sorted on probability
|
178
|
+
# desired_fppr is a float
|
179
|
+
# returns [number_of_prots, actual_fppr]
|
180
|
+
def num_prots_above_fppr(prots, desired_fppr)
|
181
|
+
current_fppr_rate_percent = 0.0
|
182
|
+
previous_fppr_rate_percent = 0.0
|
183
|
+
current_sum_one_minus_prob = 0.0
|
184
|
+
proteins_within_fppr = 0
|
185
|
+
actual_fppr = nil
|
186
|
+
already_found = false
|
187
|
+
prot_cnt = 0
|
188
|
+
prots.each do |prot|
|
189
|
+
prot_cnt += 1
|
190
|
+
# SUM(1-probX)/#prots
|
191
|
+
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
192
|
+
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
193
|
+
|
194
|
+
if current_fppr_rate_percent > desired_fppr && !already_found
|
195
|
+
actual_fppr = previous_fppr_rate_percent
|
196
|
+
proteins_within_fppr = prot_cnt
|
197
|
+
already_found = true
|
198
|
+
end
|
199
|
+
previous_fppr_rate_percent = current_fppr_rate_percent
|
200
|
+
end
|
201
|
+
[proteins_within_fppr, actual_fppr]
|
202
|
+
end
|
203
|
+
|
204
|
+
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
205
|
+
|
206
|
+
# returns a string of the table rows
|
207
|
+
# false_positive_rate (give as a %) is the cutoff mark
|
208
|
+
# returns the number of proteins at the desired_fppr (if given)
|
209
|
+
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
|
210
|
+
prot_cnt = 0
|
211
|
+
uniq_prots.map do |prot|
|
212
|
+
tr do
|
213
|
+
prot_cnt += 1
|
214
|
+
gi = accession(prot._protein_name)
|
215
|
+
tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
|
216
|
+
end
|
217
|
+
end.join
|
218
|
+
end
|
219
|
+
|
220
|
+
def print_html_pieces(file, *pieces)
|
221
|
+
File.open(file, "w") do |out|
|
222
|
+
pieces.each do |piece|
|
223
|
+
out.print piece
|
224
|
+
end
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
def file_info(file)
|
229
|
+
"<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
|
230
|
+
<br/>Last Modified: #{File.mtime(file)}
|
231
|
+
<br/>Size: #{File.size(file)/1000} KB
|
232
|
+
</div>"
|
233
|
+
end
|
234
|
+
|
235
|
+
def bioworks_script_info(obj)
|
236
|
+
version = "3.2??"
|
237
|
+
if obj.version
|
238
|
+
version = obj.version
|
239
|
+
end
|
240
|
+
script_info{"Bioworks version #{version}"}
|
241
|
+
end
|
242
|
+
|
243
|
+
def protproph_script_info
|
244
|
+
begin
|
245
|
+
where = `which xinteract`
|
246
|
+
reply = `#{where}`
|
247
|
+
rescue Exception
|
248
|
+
reply = ""
|
249
|
+
end
|
250
|
+
prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
|
251
|
+
if reply =~ /xinteract.*?\((TPP .*)\)/
|
252
|
+
prophet = $1.dup
|
253
|
+
end
|
254
|
+
script_info { "ProteinProphet from: #{prophet}" }
|
255
|
+
end
|
256
|
+
|
257
|
+
def mspire_version
|
258
|
+
string = "mspire"
|
259
|
+
begin
|
260
|
+
if `gem list --local mspire` =~ /mspire \((.*?)\)/
|
261
|
+
string << (" v" + $1)
|
262
|
+
end
|
263
|
+
rescue Exception
|
264
|
+
end
|
265
|
+
string
|
266
|
+
end
|
267
|
+
|
268
|
+
def script_info
|
269
|
+
"<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
|
270
|
+
end
|
271
|
+
|
272
|
+
def proph_output(file, outfn, opt, fppr_output_as_html)
|
273
|
+
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
|
274
|
+
num_cols = header_anchors.size
|
275
|
+
theaders = ths(header_anchors)
|
276
|
+
|
277
|
+
root = AXML.parse_file(file)
|
278
|
+
prots = []
|
279
|
+
## find the min_prob at a fppr of XX
|
280
|
+
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
|
281
|
+
|
282
|
+
if opt.c
|
283
|
+
actual_percent_fp = opt.c.to_f
|
284
|
+
elsif opt.cut_at
|
285
|
+
actual_percent_fp = opt.cut_at.to_f
|
286
|
+
else
|
287
|
+
actual_percent_fp = nil
|
288
|
+
end
|
289
|
+
root.protein_group.each do |group|
|
290
|
+
group.protein.each do |prt|
|
291
|
+
prots << prt
|
292
|
+
end
|
293
|
+
end
|
294
|
+
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
295
|
+
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
296
|
+
|
297
|
+
## num proteins above cutoff (if opt.c)
|
298
|
+
num_prots_html = ''
|
299
|
+
if opt.c || opt.cut_at
|
300
|
+
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
|
301
|
+
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
|
302
|
+
end
|
303
|
+
if opt.cut_at
|
304
|
+
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
|
305
|
+
end
|
306
|
+
|
307
|
+
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
308
|
+
|
309
|
+
table_string = table do
|
310
|
+
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
|
311
|
+
end
|
312
|
+
er_info = opt.precision ? error_info(file) : ""
|
313
|
+
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
|
314
|
+
print_html_pieces(*html_pieces)
|
315
|
+
end # proph_output
|
316
|
+
|
317
|
+
# given a list of peptide sequences creates javascript to hide/show them
|
318
|
+
def peptide_cell(prot_num, peptide_sequences)
|
319
|
+
"<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
|
320
|
+
end
|
321
|
+
|
322
|
+
# takes spec_id object
|
323
|
+
# the outfn is the output filename
|
324
|
+
# opt is an OpenStruct that holds opt.f = the false prefix
|
325
|
+
def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
|
326
|
+
fppr_output_as_html ||= ''
|
327
|
+
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
328
|
+
num_cols = header_anchors.size
|
329
|
+
theaders = ths(header_anchors)
|
330
|
+
proteins = spec_id.prots
|
331
|
+
protein_num = 0
|
332
|
+
rows = ""
|
333
|
+
prefix_re = prefix_to_regex(false_prefix)
|
334
|
+
proteins.each do |prot|
|
335
|
+
if false_prefix && prot.reference =~ prefix_re
|
336
|
+
next
|
337
|
+
end
|
338
|
+
uniq_peps = Hash.new {|h,k| h[k] = true; }
|
339
|
+
protein_num += 1
|
340
|
+
prot.peps.each do |pep|
|
341
|
+
uniq_peps[pep.sequence.split('.')[1]] = true
|
342
|
+
end
|
343
|
+
pieces = prot.reference.split(' ')
|
344
|
+
long_prot_name = pieces.shift
|
345
|
+
annotation = pieces.join(' ')
|
346
|
+
accession = prot.accession
|
347
|
+
if accession == '0' ; accession = long_prot_name end
|
348
|
+
rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
|
349
|
+
end
|
350
|
+
table_string = table do
|
351
|
+
tr{theaders} + rows
|
352
|
+
end
|
353
|
+
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer)
|
354
|
+
end # bioworks_output
|
355
|
+
|
356
|
+
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
|
357
|
+
actual_cutoff = sprintf("%.3f", actual_cutoff)
|
358
|
+
desired_cutoff = sprintf("%.3f", desired_cutoff)
|
359
|
+
"<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3>
|
360
|
+
Desired FPPR: #{desired_cutoff} %<br/>
|
361
|
+
Actual FPPR: #{actual_cutoff} %<br/>
|
362
|
+
Number of Proteins at Actual FPPR: #{num_proteins}
|
363
|
+
</div>"
|
364
|
+
end
|
365
|
+
|
366
|
+
# transforms the output string of file_as_decoy into html
|
367
|
+
def file_as_decoy_to_html(string)
|
368
|
+
lines = string.split("\n")
|
369
|
+
#puts lines ?? is this supposed to be commented out?
|
370
|
+
lines = lines.reject do |obj| obj =~ /\*{10}/ end
|
371
|
+
lines.map! do |line| "#{line}<br/>" end
|
372
|
+
"<div class=\"fppr\">
|
373
|
+
<h3>Classification Analysis</h3>
|
374
|
+
#{lines.join("\n")}
|
375
|
+
</div>"
|
376
|
+
end
|
377
|
+
|
378
|
+
# transforms the output string of file_as_decoy into html
|
379
|
+
def prefix_as_decoy_to_html(string)
|
380
|
+
"<div class=\"fppr\">
|
381
|
+
<h3>Classification Analysis</h3>
|
382
|
+
</div>" +
|
383
|
+
string
|
384
|
+
end
|
385
|
+
|
386
|
+
def create_from_command_line_args(argv)
|
387
|
+
@orig_argv = argv.dup
|
388
|
+
|
389
|
+
opt = OpenStruct.new
|
390
|
+
opt.f = DEF_PREFIX
|
391
|
+
opts = OptionParser.new do |op|
|
392
|
+
op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
|
393
|
+
op.separator " where file = bioworks -or- <run>-prot (prophet output)"
|
394
|
+
op.separator " outputs: <file>.summary.html"
|
395
|
+
op.separator ""
|
396
|
+
op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
397
|
+
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
398
|
+
op.separator(" if --precision then -f is used to specify a file or prefix")
|
399
|
+
op.separator(" that indicates the false positives.")
|
400
|
+
op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
|
401
|
+
op.separator ""
|
402
|
+
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
|
403
|
+
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
404
|
+
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
405
|
+
op.separator ""
|
406
|
+
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
407
|
+
op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
|
408
|
+
op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
|
409
|
+
end
|
410
|
+
|
411
|
+
opts.parse!(argv)
|
412
|
+
|
413
|
+
if argv.size < 1
|
414
|
+
puts opts
|
415
|
+
return
|
416
|
+
end
|
417
|
+
|
418
|
+
fppr_output_as_html = ''
|
419
|
+
files = argv.to_a
|
420
|
+
files.each do |file|
|
421
|
+
outfn = file.sub(/\.xml$/, '.summary.html')
|
422
|
+
outfn = outfn.sub(/\.srg$/, '.summary.html')
|
423
|
+
## False Positive Rate Calculation:
|
424
|
+
if opt.precision
|
425
|
+
opt.o = outfn # won't actually be written over, but used
|
426
|
+
to_use_argv = create_precision_argv(file, opt)
|
427
|
+
(out_string, opt) = Prec.new.precision(to_use_argv)
|
428
|
+
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
|
429
|
+
end
|
430
|
+
|
431
|
+
case SpecID.file_type(file)
|
432
|
+
when "protproph"
|
433
|
+
#spec_id = SpecID.new(file)
|
434
|
+
proph_output(file, outfn, opt, fppr_output_as_html)
|
435
|
+
when "bioworks"
|
436
|
+
spec_id = SpecID.new(file)
|
437
|
+
bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
|
438
|
+
else
|
439
|
+
abort "filetype for #{file} not recognized!"
|
440
|
+
end
|
441
|
+
end
|
442
|
+
|
443
|
+
end # method create_from_command_line
|
444
|
+
|
445
|
+
def create_precision_argv(file, opt)
|
446
|
+
# include only those options specific
|
447
|
+
new_argv = [file]
|
448
|
+
if opt.f ; new_argv << '-f' << opt.f end
|
449
|
+
if opt.o ; new_argv << '-o' << opt.o end
|
450
|
+
new_argv
|
451
|
+
end
|
452
|
+
|
453
|
+
end # ProteinSummary
|
454
|
+
|
455
|
+
##################################################################
|
456
|
+
# MAIN
|
457
|
+
##################################################################
|
458
|
+
|
459
|
+
|