mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,427 @@
|
|
1
|
+
|
2
|
+
require 'optparse'
|
3
|
+
require 'ostruct'
|
4
|
+
require 'generator'
|
5
|
+
require 'gnuplot'
|
6
|
+
require 'roc'
|
7
|
+
|
8
|
+
# Core extension used by the HTML helpers in this file so that multi-line
# string literals can be written with a visible left margin.
class String
  # Returns a copy of the string in which, at the start of every line, the
  # run of spaces/tabs up to and including the first "|" has been removed.
  #
  # Only horizontal whitespace is consumed in front of the bar.  (Matching
  # \s there would also match newlines, so a blank line directly before a
  # "|" line would be removed together with the margin.)
  def margin
    gsub(/^[ \t]*\|/, '')
  end
end
|
13
|
+
|
14
|
+
# Forward declarations: they make the constants SpecID and
# SpecID::Precision exist so that modules can be defined inside the
# SpecID::Precision namespace further down in this file.
class SpecID
end

class SpecID::Precision
end
|
16
|
+
|
17
|
+
module SpecID::Precision::PlotHelper
|
18
|
+
|
19
|
+
PLOT_TYPE = 'XYData'
|
20
|
+
TITLE = 'Precision (Positive Predictive Value)'
|
21
|
+
XAXIS = 'Num Hits (excludes known false positives)'
|
22
|
+
EXT = '.toplot'
|
23
|
+
IMAGE_EXT = '.png'
|
24
|
+
|
25
|
+
# Writes the plain-text plot description file filename_noext + EXT.
#
# File layout, one item per line: PLOT_TYPE, filename_noext, TITLE, XAXIS
# and the gnuplot-escaped y axis label, followed by three lines per data
# set: its escaped label "<file>: <label>", the x values and the y values
# (each list joined with single spaces).
#
# all_arrs::       all_arrs[i][j] is the [x_values, y_values] pair of data
#                  set j of file i
# key::            key[i][j] describes data set j of file i; k[1][1] is
#                  used as the label of the data set
# files::          file names, parallel to all_arrs and key
# filename_noext:: output path without extension
#
# y_axis_label and escape_to_gnuplot are not defined in this module; the
# including object has to provide them.
def create_to_plot_file(all_arrs, key, files, filename_noext)
  File.open(filename_noext + EXT, 'w') do |out|
    out.puts PLOT_TYPE
    out.puts filename_noext
    out.puts TITLE
    out.puts XAXIS
    out.puts escape_to_gnuplot(y_axis_label(key))
    files.each_with_index do |file, i|
      key[i].each_with_index do |k, j|
        out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
        out.puts all_arrs[i][j][0].join(' ')
        out.puts all_arrs[i][j][1].join(' ')
      end
    end
  end
end
|
47
|
+
|
48
|
+
|
49
|
+
## Renders the curves to the image file filename_noext + IMAGE_EXT through
## the Gnuplot API and returns (as a String) the html that loads that image.
## The html refers to the image by its base name only.
##
## This is a self contained module that can be swapped out for a
## completely different plotting program if desired.
##
## all_arrs::       all_arrs[i][j] is the [x_values, y_values] pair of data
##                  set j of file i
## key::            key[i][j] describes data set j of file i; k[1][1] is
##                  used in the title of the plotted line
## files::          file names, parallel to all_arrs and key
## filename_noext:: path of the image without extension
##
## y_axis_label and escape_to_gnuplot are not defined in this module; the
## including object has to provide them.
def plot_figure(all_arrs, key, files, filename_noext)

  ## CREATE the PLOT IMAGE:
  png = filename_noext + IMAGE_EXT
  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      plot.terminal "png noenhanced"
      plot.output png
      plot.title TITLE
      plot.xlabel XAXIS
      plot.ylabel escape_to_gnuplot(y_axis_label(key))
      plot.style "line 1 lt 1"
      plot.style "line 2 lt 12"
      #plot.style "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
      # the upper bound grows by 0.02 per file (presumably head room for
      # the legend -- TODO confirm)
      plot.yrange "[-0.05:#{1.05 + 0.020*files.size}]"
      files.each_with_index do |file, i|
        key[i].each_with_index do |k, j|
          dataset = Gnuplot::DataSet.new([all_arrs[i][j][0], all_arrs[i][j][1]]) do |ds|
            ds.with = "lines"
            ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
          end
          plot.data << dataset
        end
      end
    end
  end

  ## CREATE the HTML to load the plot:
  basename_filename_noext = File.basename(filename_noext)
  output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
  #output << "<caption align=\"bottom\">Additional views of this data may be obtained by using the <span class=\"code\">plot.rb</span> command on '#{to_plot}' (type <span class=\"code\">plot.rb</span> for more details). Plot generated with command: <span class=\"code\">#{plot_cmd}</span></caption>\n"
  output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
  output << "</table></div>\n"
  output
end # plot_figure
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
module SpecID::Precision::HTML
|
92
|
+
|
93
|
+
# Wraps whatever the given block returns in an <html> element; the opening
# tag, the block result and the closing tag each start on a new line.
def html
  inner = yield
  "|<html>\n|#{inner}\n|</html>\n".margin
end
|
99
|
+
|
100
|
+
# Wraps whatever the given block returns in a <body> element; the block
# result is put on its own line, preceded by one space.
def body
  inner = yield
  "|<body>\n| #{inner}\n|</body>\n".margin
end
|
105
|
+
|
106
|
+
# Returns the <head> element of the page; its only content is the result
# of #style.
def header
  css = style
  "|<head>\n| #{css}\n|</head>\n".margin
end
|
111
|
+
|
112
|
+
# Wraps whatever the given block returns in a <td> element (no added
# whitespace).
def td
  cell = yield
  "<td>#{cell}</td>"
end
|
115
|
+
|
116
|
+
|
117
|
+
# Returns the <style> element with the CSS rules of the generated report as
# a String that starts and ends with a newline.
#
# The horizontal cell padding is given as "1px": a CSS length other than 0
# needs a unit, a bare "1" is not a valid value.
def style
  '
<style type="text/css">
div#tp_table {
  text-align: center;
  margin-top: 50px;
  margin-bottom: 50px;
}
span.code {
  font-family: Courier,Monospace;
  font-size: 80%;
}
table {
  border-width:1px;
  border-color:#CCCCCC;
  border-collapse: collapse;
}
caption {
  font-size: 90%;
}
td,th {
  padding-top: 2px;
  padding-bottom: 2px;
  padding-left: 1px;
  padding-right: 1px;
}
th.small {
  font-size: 80%;
  font-weight: normal;
  padding: 1px;
}
td.redline {
  background-color: #FF0000;
  color: #FFFFFF
}
div#plot {
  margin: 30px;
  text-align:center
}
hr {color: sienna}
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
</style>
'
end
|
162
|
+
|
163
|
+
# Wraps whatever the given block returns in a centered, bordered <table>
# element; the block result is put on its own line, preceded by one space.
def table
  opening_tag = '<table border="1" align="center" style="font-size:100%">'
  inner = yield
  "|#{opening_tag}\n| #{inner}\n|</table>\n".margin
end
|
168
|
+
|
169
|
+
# Wraps whatever the given block returns in a <tr> element; the block
# result is put on its own line, preceded by one space.
def tr
  inner = yield
  "|<tr>\n| #{inner}\n|</tr>\n".margin
end
|
174
|
+
end # module HTML
|
175
|
+
|
176
|
+
class SpecID::Precision
|
177
|
+
include SpecID::Precision::PlotHelper
|
178
|
+
|
179
|
+
###########################################################
|
180
|
+
# GLOBAL SETTINGS:
|
181
|
+
DEF_PREFIX = "INV_"
|
182
|
+
DATA_PREC = 4 # decimal places of precision for ppv data
|
183
|
+
STDOUT_JTPLOT_BASE = "ppv" # if there is no outfile
|
184
|
+
###########################################################
|
185
|
+
|
186
|
+
include SpecID::Precision::HTML
|
187
|
+
|
188
|
+
## Parses the options out of argv, treats what remains of argv as the list
## of input files and hands both to prefix_as_decoy.
##
## Returns a two element array: [output string, parsed options].
def precision(argv)
  options = parse_args(argv)
  # argv is read only after parse_args has run on it
  result = prefix_as_decoy(argv.to_a, options)
  return result, options
end
|
195
|
+
|
196
|
+
# Command line entry point: runs #precision on argv and delivers its output
# string.
#
# The string is printed to standard output when the outfile option (opt.o)
# is 'STDOUT'; otherwise it is written to the file named by that option,
# which is opened in 'w' mode.
#
# argv:: the command line arguments, passed on to #precision
def run_cmd_line(argv)
  # precision returns exactly two values.  The old code also destructured a
  # third one ("file_as_decoy"); it was therefore always nil and the branch
  # that tested it could never run, so that branch is gone.
  output_string, opt = precision(argv)
  if opt.o == 'STDOUT'
    print output_string
  else
    ## open file and write to it..
    File.open(opt.o, 'w') { |fh| fh.print output_string }
  end
end
|
209
|
+
|
210
|
+
# returns the outfile with no extension
#
# opt:: the outfile setting itself (a String), not the options object:
#       either the literal 'STDOUT' or a file name
#
# 'STDOUT' yields a copy of STDOUT_JTPLOT_BASE.  For a file name the
# extension reported by File.extname is cut off; the pattern is anchored
# with \z so that only the very end of the name can match ($ would also
# match at the end of an earlier line of a multi-line string).
def outfile_noext(opt)
  if opt == 'STDOUT'
    STDOUT_JTPLOT_BASE.dup
  else
    opt.sub(/#{Regexp.escape(File.extname(opt))}\z/, '')
  end
end
|
218
|
+
|
219
|
+
# Returns file with the extension reported by File.extname removed; a name
# without extension is returned unchanged.
#
# The pattern is anchored with \z so that only the very end of the name can
# match ($ would also match at the end of an earlier line of a multi-line
# string).
def file_noext(file)
  file.sub(/#{Regexp.escape(File.extname(file))}\z/, '')
end
|
222
|
+
|
223
|
+
# Parses the command line switches out of argv (argv is modified in place:
# the recognized switches are removed, the file names stay).
#
# Returns an OpenStruct with the fields
#   o:: output file, 'STDOUT' unless -o/--outfile is given
#   f:: value of -f/--fp_data (decoy prefix(es) or decoy file), if given
#   a:: true if -a/--area is given
#   j:: true if -j/--plot_file is given
#
# If no argument is left after parsing, the usage text is printed and the
# program exits.
def parse_args(argv)
  settings = OpenStruct.new
  settings.o = 'STDOUT'
  script_name = File.basename(__FILE__)

  parser = OptionParser.new
  parser.banner = "Usage: #{script_name} [options] bioworks.xml|proph-prot.xml ..."
  [
    "",
    "Abbreviations and Definitions:",
    " TP = True Positives",
    " FP = False Positives",
    " Precision = Positive Predictive Value = [TP/(TP+FP)]",
    "",
    "Output: ",
    " 1. Decoy as separate search: PPV to STDOUT",
    " 2. Decoy proteins from concatenated database: '.html'",
    "",
    "Options:"
  ].each { |line| parser.separator line }

  parser.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") { |value| settings.f = value }
  [
    "",
    " If searched with a concatenated DB, give a PREFIX to decoy proteins.",
    " If files have different prefixes, separate with commas.",
    " If searched with a separate decoy DB, give the FILE name of decoy data",
    ""
  ].each { |line| parser.separator line }
  ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
  parser.separator ""
  parser.on("-o", "--outfile <file>", "write output to file (def: #{settings.o})") { |value| settings.o = value }
  parser.on("-a", "--area", "output area under the curve instead of the plot") { |value| settings.a = value }
  parser.on("-j", "--plot_file", "output to_plot file") { |value| settings.j = value }

  example = [
    "",
    "Example:",
    "For a search on a concatenated database where the decoy proteins have",
    "been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet",
    "output:",
    "",
    "#{script_name} -f INV_ bioworks.xml proph-prot.xml",
    "",
    ""
  ].join("\n")
  parser.on_tail(example)

  parser.parse!(argv)

  # nothing to work on: show the usage text and stop
  if argv.size < 1
    puts parser
    exit
  end

  settings
end
|
271
|
+
|
272
|
+
|
273
|
+
# takes a comma separated list and extends the last to create an array of
# desired size
#
# arg::          comma separated String, e.g. "INV_,REV_"
# desired_size:: length of the returned array
#
# Entry i of the result is item i of the list; once the list is exhausted
# its last item is repeated.  Surplus items are dropped.  An empty arg
# gives an array of nils.
def prefixes(arg, desired_size)
  given = arg.split(',')
  Array.new(desired_size) { |i| given[i] || given.last }
end
|
289
|
+
|
290
|
+
|
291
|
+
## Flattens the per-file / per-data-set nesting of all_arrs into one list
## of columns so that the columns can be walked side by side.
##
## For every data set of every file (in order) element [0] and then
## element [1] of the data set are appended to the result.
def arrays_to_one_level_deep(all_arrs)
  all_arrs.inject([]) do |columns, data_sets|
    data_sets.each { |data_set| columns.push(data_set[0], data_set[1]) }
    columns
  end
end
|
302
|
+
|
303
|
+
# Builds the lower part of the data table and returns it as a String: one
# row of small <th> cells (both minor labels of every data set, i.e.
# per_ds[1][0] and per_ds[1][1]) followed by one row per data point.
#
# all_arrs:: all_arrs[i][j] is the [x_values, y_values] pair of data set j
#            of file i; the value lists are indexed by position here, so
#            they are assumed to be Arrays
# key::      key[i][j] describes data set j of file i
#
# Floats are printed with DATA_PREC decimals, everything else with "%d".
# Columns may differ in length (e.g. files with different numbers of
# hits): the missing positions of a shorter column become empty cells
# rather than being passed to sprintf("%d", ...), which raises on nil.
# The rows are walked by index, so no SyncEnumerator is needed.
def table_cells(all_arrs, key)
  ## columns specific headings:
  all_string = ""
  heading_row = tr do
    key.map { |per_file|
      per_file.map { |per_ds|
        "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
      }.join
    }.join
  end
  all_string << heading_row

  columns = arrays_to_one_level_deep(all_arrs)
  num_rows = columns.map { |column| column.size }.max || 0
  num_rows.times do |row_index|
    data_row = tr do
      columns.map { |column|
        value = column[row_index]
        text =
          if value.nil?
            ''
          elsif value.is_a?(Float)
            sprintf("%.#{DATA_PREC}f", value)
          else
            sprintf("%d", value)
          end
        td { text }
      }.join
    end
    all_string << data_row
  end
  all_string
end
|
328
|
+
|
329
|
+
# Returns the complete data table, wrapped in <div id="tp_table">, as a
# String.  The table has three parts:
#   1. one <th> per file, spanning all columns of that file
#   2. one <th> per data set (its major label, ds.first), spanning 2 columns
#   3. the rows produced by table_cells
#
# all_arrs::       all_arrs[i][j] is the [x_values, y_values] pair of data
#                  set j of file i; the number of data sets of the first
#                  file determines the colspan used for every file
# key::            key[i][j] describes data set j of file i
# files::          file names, parallel to all_arrs and key
# filename_noext:: not used here; kept so that the signature is unchanged
#
# The header cells of part 2 are joined explicitly.  Interpolating the
# nested Array itself only produced the intended text where Array#to_s
# behaved like join.
def html_table_output(all_arrs, key, files, filename_noext)
  num_datasets_per_file = all_arrs.first.size
  num_cols_per_dataset = 2
  big_colspan = num_datasets_per_file * num_cols_per_dataset
  output = table do
    file_row = tr do
      files.map { |file| "<th colspan=\"#{big_colspan}\">#{file}</th>" }.join
    end
    major_label_row = tr do
      key.map { |arr|
        arr.map { |ds| "<th colspan=\"2\">#{ds.first}</th>" }.join
      }.join
    end
    file_row + major_label_row + table_cells(all_arrs, key)
  end
  "<div id=\"tp_table\">" + output + "</div>"
end
|
350
|
+
|
351
|
+
|
352
|
+
# Builds the y axis label: the major labels (first element of every key
# entry) of the first file, separated by " | ".
def y_axis_label(key)
  ## Only the first file's entries are consulted; the major labels are
  ## assumed to be the same for every file.
  entries_of_first_file = key.first
  major_labels = entries_of_first_file.map { |entry| entry.first }
  major_labels.join(" | ")
end
|
358
|
+
|
359
|
+
# escapes any ' chars
#
# Returns a new String in which every single quote is preceded by a
# backslash; all other characters are copied unchanged.
def escape_to_gnuplot(string)
  # Block form on purpose: in a replacement *string* the sequence \' is a
  # back-reference (text after the match), in a block result it is literal.
  string.gsub("'") { "\\'" }
end
|
369
|
+
|
370
|
+
# Computes, for every file, the number of hits and the precision (ppv) and
# returns the report as a String.
#
# files:: names of the files; each one is loaded with SpecID.new
# opt::   options object; the fields read here are
#         f:: comma separated decoy prefix(es).  If set, the values come
#             from num_hits_and_ppv_for_prob (with the prefix belonging to
#             the file), otherwise from
#             num_hits_and_ppv_for_protein_prophet_probabilities
#         o:: outfile setting, used to derive the base name of the plot
#             files
#         a:: if set, the result is a plain text "area under curve" report
#             and no plot or html is produced
#         j:: if set (and a is not), the to_plot file is written as well
#
# Without opt.a the result is a complete html page holding the plot and the
# data table; producing it also creates the plot image (see plot_figure).
#
# The "using prefix" note on $stderr is only written when a prefix was
# actually given.
def prefix_as_decoy(files, opt)
  prefix_arr = nil
  if opt.f
    $stderr.puts "using prefix #{opt.f} ..."
    prefix_arr = prefixes(opt.f, files.size)
  end
  out_noext = outfile_noext(opt.o)

  all_arrs = []
  key = []
  files.each_with_index do |file, i|
    sp = SpecID.new(file)
    num_hits, ppv =
      if opt.f
        sp.num_hits_and_ppv_for_prob(prefix_arr[i])
      else
        ## These are just from protein prophet probabilities:
        sp.num_hits_and_ppv_for_protein_prophet_probabilities
      end
    # one data set per file: [x values, y values] plus its labels
    all_arrs[i] = [[num_hits, ppv]]
    key[i] = [["Precision", ["#TP", "Prec = TP/(TP+FP)"]]]
  end

  if opt.a
    roc = ROC.new
    string = ''
    files.each_with_index do |file, i|
      string << "#{file} (area under curve)\n"
      key[i].each_with_index do |entry, j|
        # The minor labels are joined explicitly so that the text does not
        # depend on how Array#to_s renders an Array.
        string << "#{entry[0]} [#{entry[1].join}]:\t"
        string << roc.area_under_curve(all_arrs[i][j][0], all_arrs[i][j][1]).to_s << "\n"
      end
    end
    string
  else
    create_to_plot_file(all_arrs, key, files, out_noext) if opt.j
    html do
      page_head = header
      page_body = body do
        plot_figure(all_arrs, key, files, out_noext) +
          html_table_output(all_arrs, key, files, out_noext)
      end
      page_head + page_body
    end
  end
end
|
425
|
+
|
426
|
+
end # class SpecID
|
427
|
+
|
data/lib/spec_id/proph.rb
CHANGED
@@ -14,7 +14,7 @@ class Proph
|
|
14
14
|
|
15
15
|
class Parser
|
16
16
|
def root_el(file)
|
17
|
-
|
17
|
+
AXML.parse_file(file)
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
@@ -275,7 +275,7 @@ class Pep::Parser < Parser
|
|
275
275
|
|
276
276
|
## file from peptideAtlas:
|
277
277
|
search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
278
|
-
search_result_regex2 = /<search_result
|
278
|
+
search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
|
279
279
|
search_hit_regex = /<search_hit .*peptide="(\w+)" /o
|
280
280
|
|
281
281
|
peptide_h = {}
|