mspire 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/lib/spec_id/precision.rb
CHANGED
@@ -2,22 +2,25 @@
|
|
2
2
|
require 'optparse'
|
3
3
|
require 'ostruct'
|
4
4
|
require 'generator'
|
5
|
-
require 'gnuplot'
|
6
5
|
require 'roc'
|
7
6
|
|
7
|
+
## silence this bad boy
|
8
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
9
|
+
require 'gnuplot'
|
10
|
+
$VERBOSE = tmp
|
11
|
+
|
8
12
|
class String
  # Returns a copy of the string in which, on every line, any leading
  # whitespace followed by a '|' has been removed. Lets multi-line string
  # literals be written with a '|' marking the real left margin.
  def margin
    gsub(/^\s*\|/, '')
  end
end
|
13
17
|
|
14
|
-
class
|
15
|
-
class SpecID::Precision ; end
|
18
|
+
class Prec ; end
|
16
19
|
|
17
|
-
module
|
20
|
+
module Prec::PlotHelper
|
18
21
|
|
19
22
|
PLOT_TYPE = 'XYData'
|
20
|
-
TITLE = 'Precision
|
23
|
+
TITLE = 'Precision vs. Num Hits [ Precision = Positive Predictive Value = TP/(TP+FP) ]'
|
21
24
|
XAXIS = 'Num Hits (excludes known false positives)'
|
22
25
|
EXT = '.toplot'
|
23
26
|
IMAGE_EXT = '.png'
|
@@ -26,6 +29,8 @@ module SpecID::Precision::PlotHelper
|
|
26
29
|
## CREATE the PLOT IMAGE:
|
27
30
|
to_plot = filename_noext + EXT
|
28
31
|
png = filename_noext + IMAGE_EXT
|
32
|
+
|
33
|
+
|
29
34
|
File.open(to_plot,'w') do |out|
|
30
35
|
out.puts PLOT_TYPE
|
31
36
|
out.puts filename_noext
|
@@ -55,6 +60,8 @@ module SpecID::Precision::PlotHelper
|
|
55
60
|
## CREATE the PLOT IMAGE:
|
56
61
|
to_plot = filename_noext+'.toplot'
|
57
62
|
png = filename_noext+'.png'
|
63
|
+
|
64
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
58
65
|
Gnuplot.open do |gp|
|
59
66
|
Gnuplot::Plot.new( gp ) do |plot|
|
60
67
|
plot.terminal "png noenhanced"
|
@@ -76,6 +83,7 @@ module SpecID::Precision::PlotHelper
|
|
76
83
|
end
|
77
84
|
end
|
78
85
|
end
|
86
|
+
$VERBOSE = tmp
|
79
87
|
|
80
88
|
## CREATE the HTML to load the plot:
|
81
89
|
basename_filename_noext = File.basename(filename_noext)
|
@@ -88,7 +96,7 @@ module SpecID::Precision::PlotHelper
|
|
88
96
|
|
89
97
|
end
|
90
98
|
|
91
|
-
module
|
99
|
+
module Prec::HTML
|
92
100
|
|
93
101
|
# html and body tags
|
94
102
|
def html
|
@@ -173,23 +181,22 @@ module SpecID::Precision::HTML
|
|
173
181
|
end
|
174
182
|
end # module HTML
|
175
183
|
|
176
|
-
class
|
177
|
-
include
|
184
|
+
class Prec
|
185
|
+
include Prec::PlotHelper
|
178
186
|
|
179
187
|
###########################################################
|
180
188
|
# GLOBAL SETTINGS:
|
181
|
-
DEF_PREFIX = "INV_"
|
182
189
|
DATA_PREC = 4 # decimal places of precision for ppv data
|
183
190
|
STDOUT_JTPLOT_BASE = "ppv" # if there is no outfile
|
184
191
|
###########################################################
|
185
192
|
|
186
|
-
include
|
193
|
+
include Prec::HTML
|
187
194
|
|
188
195
|
## returns an html string
|
189
196
|
def precision(argv)
|
190
197
|
opt = parse_args(argv)
|
191
198
|
files = argv.to_a
|
192
|
-
out_string =
|
199
|
+
out_string = create_precision_data(files, opt)
|
193
200
|
[out_string, opt]
|
194
201
|
end
|
195
202
|
|
@@ -270,24 +277,6 @@ Example:
|
|
270
277
|
end
|
271
278
|
|
272
279
|
|
273
|
-
# takes a comma separated list and extends the last to create an array of
|
274
|
-
# desired size
|
275
|
-
def prefixes(arg, desired_size)
|
276
|
-
arg_arr = arg.split(',')
|
277
|
-
new_arr = []
|
278
|
-
last_arg = arg_arr[0]
|
279
|
-
desired_size.times do |i|
|
280
|
-
if arg_arr[i]
|
281
|
-
new_arr[i] = arg_arr[i]
|
282
|
-
last_arg = new_arr[i]
|
283
|
-
else
|
284
|
-
new_arr[i] = last_arg
|
285
|
-
end
|
286
|
-
end
|
287
|
-
new_arr
|
288
|
-
end
|
289
|
-
|
290
|
-
|
291
280
|
## collapses arrays to one level deep so we can sync them up
|
292
281
|
def arrays_to_one_level_deep(all_arrs)
|
293
282
|
mostly_flat = []
|
@@ -352,7 +341,7 @@ Example:
|
|
352
341
|
# Builds the y axis label from the plot key.
# Only the key of the first file is consulted (the major labels are assumed
# to be the same for every file). The first element of each entry is taken,
# duplicates are dropped, and the remainder is joined with " | ".
def y_axis_label(key)
  major_labels = key.first.map { |entry| entry.first }
  major_labels.uniq.join(" | ")
end
|
358
347
|
|
@@ -367,11 +356,14 @@ Example:
|
|
367
356
|
new_string
|
368
357
|
end
|
369
358
|
|
370
|
-
|
371
|
-
|
359
|
+
# if opt.f, then a prefix is assumed.
|
360
|
+
# if a file =~ /-prot.xml$/ then a precision plot based on probability is
|
361
|
+
# also created
|
362
|
+
def create_precision_data(files, opt)
|
363
|
+
#$stderr.puts "using prefix #{opt.f} ..."
|
372
364
|
|
373
365
|
if opt.f
|
374
|
-
prefix_arr =
|
366
|
+
prefix_arr = SpecID.extend_args(opt.f, files.size)
|
375
367
|
end
|
376
368
|
all_arrs = []
|
377
369
|
key = []
|
@@ -384,12 +376,13 @@ Example:
|
|
384
376
|
if opt.f
|
385
377
|
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
|
386
378
|
all_arrs[i] << [num_hits,ppv]
|
387
|
-
key[i] << ["Precision", ["#
|
388
|
-
|
379
|
+
key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
|
380
|
+
end
|
381
|
+
if file =~ /-prot\.xml$/
|
389
382
|
## These are just from protein prophet probabilities:
|
390
383
|
(num_hits, ppv) = sp.num_hits_and_ppv_for_protein_prophet_probabilities
|
391
384
|
all_arrs[i] << [num_hits,ppv]
|
392
|
-
key[i] << ["Precision", ["#
|
385
|
+
key[i] << ["Precision", ["# hits", "Prec (prob)"]]
|
393
386
|
end
|
394
387
|
end
|
395
388
|
|
data/lib/spec_id/proph.rb
CHANGED
@@ -5,7 +5,6 @@ require 'instance_var_set_from_hash'
|
|
5
5
|
require 'axml'
|
6
6
|
require 'spec_id'
|
7
7
|
|
8
|
-
class SpecID
|
9
8
|
class Proph
|
10
9
|
|
11
10
|
|
@@ -20,6 +19,8 @@ end
|
|
20
19
|
|
21
20
|
|
22
21
|
class ProtSummary
|
22
|
+
include SpecID
|
23
|
+
|
23
24
|
attr_writer :prots
|
24
25
|
attr_accessor :prot_groups
|
25
26
|
|
@@ -102,7 +103,8 @@ class ProtGroup
|
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
class Prot
|
106
|
+
class Prot
|
107
|
+
include SpecID::Prot
|
106
108
|
|
107
109
|
## probability and reference accessors are inherited
|
108
110
|
attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description
|
@@ -137,6 +139,7 @@ class Prot < SpecID::Prot
|
|
137
139
|
end # class Prot
|
138
140
|
|
139
141
|
class Pep
|
142
|
+
include SpecID::Pep
|
140
143
|
|
141
144
|
attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
|
142
145
|
attr_writer :arithmetic_avg_scan_by_parent_time
|
@@ -458,4 +461,3 @@ end # Prot::Parser
|
|
458
461
|
################ --END
|
459
462
|
|
460
463
|
end # Proph
|
461
|
-
end # SpecID
|
@@ -0,0 +1,459 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'axml'
|
4
|
+
require 'hash_by'
|
5
|
+
require 'optparse'
|
6
|
+
require 'ostruct'
|
7
|
+
require 'spec_id'
|
8
|
+
require 'spec_id/precision'
|
9
|
+
|
10
|
+
#############################################################
|
11
|
+
# GLOBALS:
|
12
|
+
PRECISION_PROGRAM_BASE = 'precision'
|
13
|
+
DEF_PREFIX = "INV_"
|
14
|
+
DEF_PERCENT_FP = "5.0"
|
15
|
+
#############################################################
|
16
|
+
|
17
|
+
|
18
|
+
# @TODO: add group probability title (showing all group probabilities) for protein prob
|
19
|
+
|
20
|
+
#class String
|
21
|
+
# def margin
|
22
|
+
# self.gsub(/^\s*\|/,'')
|
23
|
+
# end
|
24
|
+
#end
|
25
|
+
|
26
|
+
|
27
|
+
class ProteinSummary
  # Helpers that build the HTML fragments of the protein summary page.
  # NOTE: #tr and #table call String#margin, which is not defined in this
  # file (presumably supplied by spec_id/precision -- verify).
  module HTML
    # Returns the opening of the page: <html>, a <head> holding the
    # stylesheet from #style, the opening <body> tag, and the toggle_vis
    # javascript function that shows/hides an element by id.
    def header
      # The <head> tag must be closed with '>'; otherwise the following
      # <style ...> opening tag is parsed as attributes of <head>.
      %Q{<html>
<head>
#{style}
</head>
<body>
<script type="text/javascript">
<!--
function toggle_vis(id) {
var e = document.getElementById(id);
if(e.style.display == 'none')
e.style.display = 'block';
else
e.style.display = 'none';
}
//-->
</script>
}
    end

    # Returns the inline stylesheet (a complete <style> element).
    def style
      '
<style type="text/css">
table {
border-width:1px;
border-color:#DDDDDD;
border-collapse: collapse;
}
td,th {
padding-top: 2px;
padding-bottom: 2px;
padding-left: 5;
padding-right: 5;
}
td.redline {
background-color: #FF0000;
color: #FFFFFF
}
div.file_info, div.software, div.fppr, div.num_proteins{
margin-left: 20px;
margin-top: 20px;
}
div.main {
margin-left: 10px;
margin-right: 10px;
margin-top: 50px;
margin-bottom: 50px;
}
div#error {
margin: 30px;
text-align:center
}
hr {color: sienna}
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
</style>
'
    end

    # an anchor and a title: +display+ is the visible text, +title+ becomes
    # the title attribute.
    def at(display, title)
      "<a title=\"#{title}\">#{display}</a>"
    end

    # Returns the closing </body> and </html> tags.
    def trailer
      %q{
</body>
</html>
}
    end

    # Wraps the value of the given block in a table row.
    def tr
      "|<tr>\n| #{yield}\n|</tr>\n".margin
    end

    # Wraps the value of the given block in the main, centered table.
    def table
      "|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">\n| #{yield}\n|</table></div>\n".margin
    end

    # Returns one <td> cell per element of +arr+, concatenated.
    def tds(arr)
      arr.map { |v| "<td>#{v}</td>" }.join
    end

    # Returns one <th> cell per element of +arr+, concatenated and followed
    # by a newline.
    def ths(arr)
      arr.map { |v| "<th>#{v}</th>" }.join + "\n"
    end
  end
end
|
122
|
+
|
123
|
+
|
124
|
+
class ProteinSummary
|
125
|
+
|
126
|
+
include ProteinSummary::HTML
|
127
|
+
|
128
|
+
# Returns a link to the NCBI entrez protein viewer for +gi+.
# +gi+ is used as the val query parameter and as the visible link text;
# +name+ becomes the title attribute.
# Both values are HTML-escaped, and the '&' separating the query parameters
# is written as '&amp;', so a name containing '"', '<', '>' or '&' cannot
# break out of the attribute it is placed in.
def ref_html(gi, name)
  escape = lambda do |value|
    value.to_s.gsub('&', '&amp;').gsub('"', '&quot;').gsub('<', '&lt;').gsub('>', '&gt;')
  end
  safe_gi = escape.call(gi)
  "<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&amp;val=#{safe_gi}\" title=\"#{escape.call(name)}\">#{safe_gi}</a>"
end
|
131
|
+
|
132
|
+
# Takes the -prot.xml filename and grabs the png file (if available)
|
133
|
+
# Takes the -prot.xml filename and returns a div holding an <img> tag for the
# matching png (same basename with a trailing '.xml' replaced by '.png').
# Only the trailing extension is replaced; a '.xml' elsewhere in the file
# name is left alone. The alt text tells the user where to put the image.
def error_info(prot_file_name)
  prot_bn = File.basename(prot_file_name)
  img_bn = prot_bn.sub(/\.xml\z/, '.png')
  "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{prot_bn} ]\"/>\n</div>"
end
|
138
|
+
|
139
|
+
# attempts to get the NCBI gi code
|
140
|
+
# attempts to get the NCBI gi code
# For a name beginning with 'gi|' the field after the first '|' is returned.
# Any other name -- and a 'gi|' name whose gi field is missing or empty --
# is returned unchanged, so the caller always gets a usable string.
def accession(name)
  return name unless name[0, 3] == 'gi|'
  gi = name.split('|')[1]
  (gi.nil? || gi.empty?) ? name : gi
end
|
147
|
+
|
148
|
+
# Turns a literal prefix into a regexp matching that prefix at the start of
# a line (the prefix is escaped, so regexp metacharacters match literally).
# Returns nil when no prefix is given.
def prefix_to_regex(prefix)
  return nil unless prefix
  Regexp.new('^' + Regexp.escape(prefix))
end
|
155
|
+
|
156
|
+
# given a list of proteins, output a tab delimited textfile with protein
|
157
|
+
# name and the total number of peptides found
|
158
|
+
# Writes one line per protein to +filename+ (overwriting it): the value of
# _protein_name and the value of _total_number_peptides, separated by a tab.
def output_peptide_counts_file(prots, filename)
  File.open(filename, "w") do |out|
    prots.each do |entry|
      fields = [entry._protein_name, entry._total_number_peptides]
      out.puts fields.join("\t")
    end
  end
end
|
165
|
+
|
166
|
+
# filters on the false positive regex and sorts by prot probability
|
167
|
+
# Orders the proteins by [_probability, parent._probability], highest first,
# and, when +prefix+ is given, drops every protein whose _protein_name
# matches the regexp built from that prefix by prefix_to_regex.
def filter_and_sort(uniq_prots, prefix=nil)
  decoy_re = prefix_to_regex(prefix)
  ranked = uniq_prots.sort_by { |prt| [prt._probability, prt.parent._probability] }.reverse
  return ranked unless prefix
  ranked.reject { |prt| prt._protein_name =~ decoy_re }
end
|
176
|
+
|
177
|
+
# assumes that these are sorted on probability
|
178
|
+
# desired_fppr is a float
|
179
|
+
# returns [number_of_prots, actual_fppr]
|
180
|
+
# Counts how many of the leading proteins can be accepted while the running
# false positive predictive rate stays at or below +desired_fppr+ (a percent).
#
# The running rate after k proteins is SUM(1 - prob) / k * 100, where prob is
# each protein's _probability converted with to_f. +prots+ is assumed to be
# sorted on probability already.
#
# Returns [number_of_prots, actual_fppr]: the number of proteins accepted and
# the rate (percent) over exactly those proteins. The protein that would push
# the rate above the cutoff is not counted, so the count and the reported
# rate describe the same set. If the cutoff is never exceeded every protein
# is counted and the final rate is returned; an empty list gives [0, 0.0].
def num_prots_above_fppr(prots, desired_fppr)
  sum_one_minus_prob = 0.0
  accepted = 0
  accepted_fppr = 0.0
  prots.each do |prot|
    sum_one_minus_prob += 1.0 - prot._probability.to_f
    fppr_with_this = (sum_one_minus_prob / (accepted + 1)) * 100
    # stop before the protein that breaks the cutoff
    break if fppr_with_this > desired_fppr
    accepted += 1
    accepted_fppr = fppr_with_this
  end
  [accepted, accepted_fppr]
end
|
203
|
+
|
204
|
+
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
205
|
+
|
206
|
+
# returns a string of the table rows
|
207
|
+
# false_positive_rate (give as a %) is the cutoff mark
|
208
|
+
# returns the number of proteins at the desired_fppr (if given)
|
209
|
+
# Returns one string holding a table row per protein, in the given order.
# Each row has: running number (from 1), probability, reference link,
# description from the first annotation, percent coverage, a show/hide cell
# with the unique stripped peptides (split on '+'), the total number of
# peptides and the percent spectrum ids.
# Only +uniq_prots+ is used; the remaining parameters are accepted but not
# read by this method.
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
  row_num = 0
  html_rows = uniq_prots.map do |prot|
    row_num += 1
    prot_name = prot._protein_name
    stripped_peps = prot._unique_stripped_peptides.split('+')
    cells = [
      row_num,
      prot._probability,
      ref_html(accession(prot_name), prot_name),
      prot.annotation.first._protein_description,
      prot._percent_coverage,
      peptide_cell(row_num, stripped_peps),
      prot._total_number_peptides,
      prot._pct_spectrum_ids
    ]
    tr { tds(cells) }
  end
  html_rows.join
end
|
219
|
+
|
220
|
+
# Writes every piece, in order and without separators, to +file+
# (the file is opened for writing, replacing any previous content).
def print_html_pieces(file, *pieces)
  File.open(file, "w") { |out| pieces.each { |chunk| out.print(chunk) } }
end
|
227
|
+
|
228
|
+
# Returns a div describing the source file: its expanded path, its
# modification time and its size in bytes integer-divided by 1000
# (labelled KB).
def file_info(file)
  full_path = File.expand_path(file)
  modified = File.mtime(file)
  size_kb = File.size(file) / 1000
  "<div class=\"file_info\"><h3>Source File Information</h3>File: #{full_path}\n<br/>Last Modified: #{modified}\n<br/>Size: #{size_kb} KB\n</div>"
end
|
234
|
+
|
235
|
+
# Returns the software information block for a Bioworks file. The version
# comes from obj.version; "3.2??" is shown when that is nil or false.
def bioworks_script_info(obj)
  version = obj.version || "3.2??"
  script_info { "Bioworks version #{version}" }
end
|
242
|
+
|
243
|
+
# Returns the software information block for a ProteinProphet file.
# Locates xinteract with `which`, runs it without arguments and looks for
# "xinteract ... (TPP ...)" in what it prints; the captured text is reported.
# This is best effort: when xinteract cannot be found or run, or prints
# nothing recognizable, "TPP (version unknown)" is reported instead.
def protproph_script_info
  prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
  begin
    xinteract = `which xinteract`.strip
    # nothing to run if `which` found no executable
    reply = xinteract.empty? ? "" : `#{xinteract}`
  rescue StandardError
    # e.g. Errno::ENOENT when the command does not exist. Signals and exit
    # requests are deliberately not swallowed.
    reply = ""
  end
  if reply =~ /xinteract.*?\((TPP .*)\)/
    prophet = $1.dup
  end
  script_info { "ProteinProphet from: #{prophet}" }
end
|
256
|
+
|
257
|
+
# Returns "mspire v<version>" using the version reported by
# `gem list --local mspire`, or just "mspire" when the gem command cannot be
# run or does not list mspire (best effort).
def mspire_version
  version =
    begin
      `gem list --local mspire`[/mspire \((.*?)\)/, 1]
    rescue StandardError
      # gem executable missing or not runnable: report no version
      nil
    end
  version ? "mspire v#{version}" : "mspire"
end
|
267
|
+
|
268
|
+
# Returns the software information div: the value of the given block, the
# mspire package version and the command line (the basename of this file
# followed by the arguments saved in @orig_argv).
def script_info
  command = [File.basename(__FILE__), *@orig_argv].join(" ")
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{command}</div>"
end
|
271
|
+
|
272
|
+
# Writes the summary page for a ProteinProphet (-prot.xml) file to +outfn+.
#
# file::  the -prot.xml file; parsed with AXML.parse_file
# outfn:: the output filename
# opt::   options object; reads opt.c, opt.cut_at, opt.f, opt.peptide_count
#         and opt.precision
# fppr_output_as_html:: html placed right after the page header
#
# Proteins from all protein groups are collected, reduced to the first one
# per _protein_name, then filtered and sorted with filter_and_sort(.., opt.f).
# When opt.c or opt.cut_at is given (opt.c wins) the number of proteins
# within that percent cutoff is reported; with opt.cut_at the table is also
# truncated to that many proteins. When opt.peptide_count is given the
# peptide counts file is written for the proteins that are reported.
def proph_output(file, outfn, opt, fppr_output_as_html)
  column_titles = [
    at('#', 'number'),
    at('prob','protein probability (for Prophet, higher is better)'),
    at('ref', 'gi number if available (or complete reference)'),
    at('annotation', 'annotation from the fasta file'),
    at('%cov', 'percent of protein sequence covered by corresponding peptides'),
    at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'),
    at('#peps', 'total number of corresponding peptides that contributed to protein probability'),
    at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')
  ]
  num_cols = column_titles.size
  theaders = ths(column_titles)

  root = AXML.parse_file(file)

  # the percent cutoff to use: opt.c takes precedence over opt.cut_at
  cutoff_source = opt.c || opt.cut_at
  actual_percent_fp = cutoff_source ? cutoff_source.to_f : nil

  all_prots = []
  root.protein_group.each do |group|
    group.protein.each { |prt| all_prots << prt }
  end
  by_name = all_prots.hash_by(:_protein_name)
  uniq_prots = by_name.map { |_name, same_name_prots| same_name_prots.first }
  shown_prots = filter_and_sort(uniq_prots, opt.f)

  ## num proteins above cutoff (if opt.c)
  num_prots_html = ''
  num_prots = nil
  if opt.c || opt.cut_at
    num_prots, actual_fppr = num_prots_above_fppr(shown_prots, actual_percent_fp)
    num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
  end
  shown_prots = shown_prots[0, num_prots] if opt.cut_at

  output_peptide_counts_file(shown_prots, opt.peptide_count) if opt.peptide_count

  table_string = table do
    tr { theaders } + table_rows(shown_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
  end
  er_info = opt.precision ? error_info(file) : ""
  print_html_pieces(outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer)
end # proph_output
|
316
|
+
|
317
|
+
# given a list of peptide sequences creates javascript to hide/show them
|
318
|
+
# given a list of peptide sequences, returns the html for a table cell:
# a link showing how many sequences there are, which toggles (via the
# toggle_vis javascript function) an initially hidden div listing them
# separated by ', '. The div's id is +prot_num+.
def peptide_cell(prot_num, peptide_sequences)
  toggle_link = "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a>"
  hidden_list = "<div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
  toggle_link + hidden_list
end
|
321
|
+
|
322
|
+
# takes spec_id object
|
323
|
+
# the outfn is the output filename
|
324
|
+
# opt is an OpenStruct that holds opt.f = the false prefix
|
325
|
+
# Writes the summary page for a Bioworks search to +outfn+.
#
# spec_id::      object whose prots are reported (also handed to
#                bioworks_script_info)
# outfn::        the output filename
# file::         the source file, used for the "Source File Information"
#                section; when nil (the default) that section is left out
# false_prefix:: proteins whose reference starts with this prefix are skipped
# fppr_output_as_html:: html placed right after the page header (nil => '')
#
# For every reported protein the first whitespace-delimited word of its
# reference is taken as the protein name and the rest as the annotation.
# The peptide cell lists each distinct peptide once, where the peptide is the
# part of pep.sequence between the first and second '.'.
def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
  fppr_output_as_html ||= ''
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
  theaders = ths(header_anchors)
  prefix_re = prefix_to_regex(false_prefix)

  protein_num = 0
  rows = []
  spec_id.prots.each do |prot|
    next if false_prefix && prot.reference =~ prefix_re
    protein_num += 1
    uniq_seqs = prot.peps.map { |pep| pep.sequence.split('.')[1] }.uniq
    pieces = prot.reference.split(' ')
    long_prot_name = pieces.shift
    annotation = pieces.join(' ')
    # an accession of '0' means there is none: link on the full name instead
    acc = prot.accession
    acc = long_prot_name if acc == '0'
    rows << tr { tds([protein_num, prot.protein_probability, ref_html(acc, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_seqs), prot.peps.size]) }
  end

  table_string = table { tr { theaders } + rows.join }
  # file is optional; file_info cannot describe a nil path
  source_info = file ? file_info(file) : ''
  print_html_pieces(outfn, header, fppr_output_as_html, source_info, bioworks_script_info(spec_id), table_string, trailer)
end # bioworks_output
|
355
|
+
|
356
|
+
# Returns the div reporting the desired cutoff, the actual cutoff and the
# number of proteins at the actual cutoff. Both cutoffs are percents and are
# shown with three decimals; a cutoff that is nil (none could be determined)
# is shown as "n/a" instead of raising.
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
  as_percent = lambda { |pct| pct.nil? ? "n/a" : sprintf("%.3f", pct) }
  "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3>\nDesired FPPR: #{as_percent.call(desired_cutoff)} %<br/>\nActual FPPR: #{as_percent.call(actual_cutoff)} %<br/>\nNumber of Proteins at Actual FPPR: #{num_proteins}\n</div>"
end
|
365
|
+
|
366
|
+
# transforms the output string of file_as_decoy into html
|
367
|
+
# Turns a multi-line text report into html: lines containing ten or more
# consecutive '*' are dropped, every remaining line gets a trailing <br/>,
# and the result is placed under a "Classification Analysis" heading inside
# a div of class fppr.
def file_as_decoy_to_html(string)
  kept = string.split("\n").reject { |line| line =~ /\*{10}/ }
  body = kept.map { |line| "#{line}<br/>" }.join("\n")
  "<div class=\"fppr\">\n<h3>Classification Analysis</h3>\n#{body}\n</div>"
end
|
377
|
+
|
378
|
+
# prepends a "Classification Analysis" heading div to the given (already html) output string
|
379
|
+
# Returns a div of class fppr holding only the "Classification Analysis"
# heading, immediately followed by +string+ (appended unchanged, after the
# closing </div>).
def prefix_as_decoy_to_html(string)
  heading = "<div class=\"fppr\">\n<h3>Classification Analysis</h3>\n</div>"
  heading + string
end
|
385
|
+
|
386
|
+
# Parses the command line and writes <file>.summary.html for every file given.
#
# argv:: the command line arguments; a copy is kept in @orig_argv for the
#        software information section, and the options are removed from
#        argv itself while parsing.
#
# With no file arguments the usage message is printed and nil is returned.
# For each file the output name is the file name with a trailing .xml (or
# .srg) replaced by .summary.html. If precision output was requested it is
# generated first with Prec#precision and passed on as html. The file is then
# dispatched on SpecID.file_type: "protproph" => proph_output,
# "bioworks" => bioworks_output, anything else aborts.
#
# The options object returned by Prec#precision is deliberately NOT used in
# place of our own: it was parsed from the reduced argument list built by
# create_precision_argv, so adopting it would drop -c, --cut_at,
# --peptide_count and the precision flag for this and every later file.
def create_from_command_line_args(argv)
  @orig_argv = argv.dup

  opt = OpenStruct.new
  opt.f = DEF_PREFIX
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
    op.separator " where file = bioworks -or- <run>-prot (prophet output)"
    op.separator " outputs: <file>.summary.html"
    op.separator ""
    op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
    # -p is documented as requesting the precision output, so it has to set
    # the flag that is actually consulted (opt.precision), not only opt.p
    op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v ; opt.precision = v }
    op.separator(" if --precision then -f is used to specify a file or prefix")
    op.separator(" that indicates the false positives.")
    op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
    op.separator ""
    op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
    op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
    op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
    op.separator ""
    op.separator "Specific to ProteinProphet (with no concatenated DB):"
    op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
    op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
  end

  opts.parse!(argv)

  if argv.size < 1
    puts opts
    return
  end

  files = argv.to_a
  files.each do |file|
    outfn = file.sub(/\.xml$/, '.summary.html')
    outfn = outfn.sub(/\.srg$/, '.summary.html')

    ## False Positive Rate Calculation:
    fppr_output_as_html = ''
    if opt.precision
      opt.o = outfn # won't actually be written over, but used
      to_use_argv = create_precision_argv(file, opt)
      # keep our own opt; only the html string is needed from Prec
      out_string, _precision_opt = Prec.new.precision(to_use_argv)
      fppr_output_as_html = prefix_as_decoy_to_html(out_string)
    end

    case SpecID.file_type(file)
    when "protproph"
      proph_output(file, outfn, opt, fppr_output_as_html)
    when "bioworks"
      spec_id = SpecID.new(file)
      bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
    else
      abort "filetype for #{file} not recognized!"
    end
  end
end # method create_from_command_line
|
444
|
+
|
445
|
+
# Builds the argument list handed to the precision program: the file,
# followed by '-f <opt.f>' and '-o <opt.o>' for whichever of those two
# options is set. No other option is passed on.
def create_precision_argv(file, opt)
  precision_argv = [file]
  precision_argv.push('-f', opt.f) if opt.f
  precision_argv.push('-o', opt.o) if opt.o
  precision_argv
end
|
452
|
+
|
453
|
+
end # ProteinSummary
|
454
|
+
|
455
|
+
##################################################################
|
456
|
+
# MAIN
|
457
|
+
##################################################################
|
458
|
+
|
459
|
+
|