mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -1,476 +0,0 @@
1
-
2
- # for false_positive_rate:
3
- require 'optparse'
4
- require 'ostruct'
5
- require 'generator'
6
- require 'gnuplot'
7
- require 'roc'
8
-
9
- class String
10
- def margin
11
- self.gsub(/^\s*\|/,'')
12
- end
13
- end
14
-
15
- class SpecID
16
- class FalsePositiveRate
17
-
18
- module HTML
19
-
20
- # html and body tags
21
- def html
22
- "|<html>
23
- |#{yield}
24
- |</html>\n".margin
25
- end
26
-
27
- def body
28
- "|<body>
29
- | #{yield}
30
- |</body>\n".margin
31
- end
32
-
33
- def header
34
- "|<head>
35
- | #{style}
36
- |</head>\n".margin
37
- end
38
-
39
- def td
40
- "<td>#{yield}</td>"
41
- end
42
-
43
-
44
- def style
45
- '
46
- <style type="text/css">
47
- div#tp_table {
48
- text-align: center;
49
- margin-top: 50px;
50
- margin-bottom: 50px;
51
- }
52
- span.code {
53
- font-family: Courier,Monospace;
54
- font-size: 80%;
55
- }
56
- table {
57
- border-width:1px;
58
- border-color:#CCCCCC;
59
- border-collapse: collapse;
60
- }
61
- caption {
62
- font-size: 90%;
63
- }
64
- td,th {
65
- padding-top: 2px;
66
- padding-bottom: 2px;
67
- padding-left: 1;
68
- padding-right: 1;
69
- }
70
- th.small {
71
- font-size: 80%;
72
- font-weight: normal;
73
- padding: 1px;
74
- }
75
- td.redline {
76
- background-color: #FF0000;
77
- color: #FFFFFF
78
- }
79
- div#plot {
80
- margin: 30px;
81
- text-align:center
82
- }
83
- hr {color: sienna}
84
- body { font-size: 8pt; font-family: Arial,Helvetica,Times}
85
- </style>
86
- '
87
-
88
- end
89
-
90
- def table
91
- "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">
92
- | #{yield}
93
- |</table>\n".margin
94
- end
95
-
96
- def tr
97
- "|<tr>
98
- | #{yield}
99
- |</tr>\n".margin
100
- end
101
- end # module HTML
102
- end # class FalsePositiveRate
103
- end #class SpecID
104
-
105
-
106
-
107
- class SpecID
108
- class FalsePositiveRate
109
- ###########################################################
110
- # GLOBAL SETTINGS:
111
- DEF_PREFIX = "INV_"
112
- DATA_PREC = 4 # decimal places of precision or fpr data
113
- STDOUT_JTPLOT_BASE = "fpr" # if there is no outfile
114
- ###########################################################
115
-
116
- include SpecID::FalsePositiveRate::HTML
117
-
118
- ## returns either an ascii string (if file_as_decoy) or
119
- ## returns an html string
120
- def false_positive_rate(argv)
121
- opt = parse_args(argv)
122
- files = argv.to_a
123
- file_as_decoy = false
124
- if File.exist? opt.f
125
- file_as_decoy = true
126
- out_string = file_as_decoy(files, opt)
127
- else
128
- out_string = prefix_as_decoy(files, opt)
129
- end
130
- return [out_string, opt, file_as_decoy]
131
- end
132
-
133
- def run_cmd_line(argv)
134
- output_string, opt, file_as_decoy = false_positive_rate(argv)
135
- if file_as_decoy
136
- puts output_string
137
- else
138
- ## open file and write to it..
139
- if opt.o == 'STDOUT'
140
- print output_string
141
- else
142
- File.open(opt.o,'w') do |fh| fh.print output_string end
143
- end
144
- end
145
- end
146
-
147
- # returns the outfile with no extension
148
- def outfile_noext(opt)
149
- if opt == 'STDOUT'
150
- "#{STDOUT_JTPLOT_BASE}"
151
- else
152
- opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '')
153
- end
154
- end
155
-
156
- def file_noext(file)
157
- file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
158
- end
159
-
160
- def parse_args(argv)
161
-
162
- opt = OpenStruct.new
163
- opt.f = DEF_PREFIX
164
- opt.o = 'STDOUT'
165
- opts = OptionParser.new do |op|
166
- op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
167
- op.separator ""
168
- op.separator "Abbreviations and Definitions:"
169
- op.separator " TP = True Positives"
170
- op.separator " FP = False Positive"
171
- op.separator " FPR = False Positive Rate = [FP/(TP+FP)] (between 0 and 1)"
172
- op.separator " FPR2 = Gygi's estimation of FPR = [2*FPR]"
173
- op.separator " Precision = [TP/(TP+FP)]"
174
- op.separator ""
175
- op.separator "Output: "
176
- op.separator " 1. Decoy as separate search: FPR to STDOUT"
177
- op.separator " 2. Decoy proteins from concatenated database: 'fpr.html'"
178
- op.separator ""
179
- op.separator "Options:"
180
-
181
- op.on("-f", "--fp_data <prefix_or_file>", "PREFIX (def: #{DEF_PREFIX}) -or- decoy FILE") {|v| opt.f = v }
182
- op.separator ""
183
- op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
184
- op.separator " If files have different prefixes, separate with commas."
185
- op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
186
- op.separator ""
187
- op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
188
- ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
189
- op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
190
- op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
191
- op.separator ""
192
- op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") {|v| opt.o = v}
193
- op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
194
- op.on_tail("
195
- Examples:
196
- 1. For a search on a concatenated database where the decoy proteins have
197
- been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
198
- output:
199
-
200
- #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml
201
-
202
- 2. To determine the false positive rate of a search (and fpr2 and precision)
203
- using a normal and decoy database search, filter both the normal and decoy
204
- datasets identically, export to xml and run like this (only works for
205
- Bioworks xml export):
206
-
207
- #{File.basename(__FILE__)} -tp -f decoy_bioworks.xml bioworks.xml
208
- ")
209
- end
210
- opts.parse!(argv)
211
-
212
- if argv.size < 1
213
- puts opts
214
- exit
215
- end
216
-
217
- opt
218
- end
219
-
220
-
221
- # takes a comma separated list and extends the last to create an array of
222
- # desired size
223
- def prefixes(arg, desired_size)
224
- arg_arr = arg.split(',')
225
- new_arr = []
226
- last_arg = arg_arr[0]
227
- desired_size.times do |i|
228
- if arg_arr[i]
229
- new_arr[i] = arg_arr[i]
230
- last_arg = new_arr[i]
231
- else
232
- new_arr[i] = last_arg
233
- end
234
- end
235
- new_arr
236
- end
237
-
238
-
239
- ## collapses arrays to one level deep so we can sync them up
240
- def arrays_to_one_level_deep(all_arrs)
241
- mostly_flat = []
242
- all_arrs.each do |per_file|
243
- per_file.each do |per_style|
244
- mostly_flat << per_style[0]
245
- mostly_flat << per_style[1]
246
- end
247
- end
248
- mostly_flat
249
- end
250
-
251
- # prints rows and th for the data
252
- def table_cells(all_arrs, key)
253
- ## columns specific headings:
254
- all_string = ""
255
- all_string << tr do
256
- line = ""
257
- key.each do |per_file|
258
- per_file.each do |per_ds|
259
- line << "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
260
- end
261
- end
262
- line
263
- end
264
- mostly_flat = arrays_to_one_level_deep(all_arrs)
265
- SyncEnumerator.new(*mostly_flat).each do |row|
266
- all_string << tr do
267
- string = row.map {|it|
268
- sty="%d"
269
- if it.class == Float ; sty="%.#{DATA_PREC}f" end
270
- td{ sprintf(sty,it)}
271
- }.join
272
- end
273
- end
274
- all_string
275
- end
276
-
277
- def html_table_output(all_arrs, key, files, filename_noext)
278
- num_datasets_per_file = all_arrs.first.size
279
- num_cols_per_dataset = 2
280
- big_colspan = num_datasets_per_file * num_cols_per_dataset
281
- output = table do
282
- tr do
283
- files.map do |file|
284
- "<th colspan=\"#{big_colspan}\">#{file}</th>"
285
- end.join
286
- end +
287
- tr do
288
- key.map do |arr|
289
- arr.map do |ds|
290
- "<th colspan=\"2\">#{ds.first}</th>"
291
- end
292
- end
293
- end +
294
- table_cells(all_arrs, key)
295
- end
296
- "<div id=\"tp_table\">" + output + "</div>"
297
- end
298
-
299
-
300
- def y_axis_label(key)
301
- ## We only take the keys for the first file, as it's assumed that the major
302
- ## labels will be identical for all of them
303
- labels = key.first.map {|tp| tp.first }
304
- labels.join " | "
305
- end
306
-
307
- # escapes any ' chars
308
- def escape_to_gnuplot(string)
309
- # long way, but it works.
310
- new_string = ""
311
- string.split(//).each do |chr|
312
- if chr == "'" ; new_string << "\\" end
313
- new_string << chr
314
- end
315
- new_string
316
- end
317
-
318
-
319
-
320
- ## outputs a .toplot file based on filename_noext, creates a png file, and
321
- ## writes html to fh that will load the png file up
322
- ## This is a self contained module that can be swapped out for a
323
- ## completely different plotting program if desired.
324
- def plot_figure(all_arrs, key, files, filename_noext)
325
-
326
-
327
- ## CREATE the PLOT IMAGE:
328
- to_plot = filename_noext+'.toplot'
329
- png = filename_noext+'.png'
330
- Gnuplot.open do |gp|
331
- Gnuplot::Plot.new( gp ) do |plot|
332
- plot.terminal "png noenhanced"
333
- plot.output png
334
- plot.title "Classification Analysis"
335
- plot.xlabel 'Num True Positives'
336
- plot.ylabel escape_to_gnuplot(y_axis_label(key))
337
- plot.style "line 1 lt 1"
338
- plot.style "line 2 lt 12"
339
- #plot.style "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
340
- plot.yrange "[-0.05:#{1.0 + 0.2*files.size}]"
341
- files.each_with_index do |file,i|
342
- key[i].each_with_index do |k,j|
343
- plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
344
- ds.with = "lines"
345
- ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
346
- end
347
- end
348
- end
349
- end
350
- end
351
-
352
- =begin
353
-
354
- ## CREATE the PLOT IMAGE:
355
- to_plot = filename_noext+'.toplot'
356
- png = filename_noext+'.png'
357
- File.open(to_plot,'w') do |out|
358
- out.puts 'XYData'
359
- out.puts filename_noext
360
- out.puts "Classification Analysis"
361
- out.puts 'Num True Positives'
362
- out.puts escape_to_gnuplot(y_axis_label(key))
363
- files.each_with_index do |file,i|
364
- #p key[i]
365
- #p all_arrs[i]
366
-
367
- key[i].each_with_index do |k,j|
368
- out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
369
- out.puts all_arrs[i][j][0].join(' ')
370
- out.puts all_arrs[i][j][1].join(' ')
371
- end
372
- end
373
- end
374
- num_files = files.size
375
- if $".include? 'plotter.rb'
376
- cmd = "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
377
- plot_cmd = "plot.rb #{cmd}"
378
- Plotter.new.plot_string "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
379
- unless File.file? png
380
- abort "Fatal Error in plotting cmd=\"#{plot_cmd}\":\n#{reply}"
381
- end
382
- else
383
- warn "plotter.rb not found, not png plot image available"
384
- end
385
-
386
- =end
387
-
388
-
389
- ## CREATE the HTML to load the plot:
390
- basename_filename_noext = File.basename(filename_noext)
391
- output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
392
- #output << "<caption align=\"bottom\">Additional views of this data may be obtained by using the <span class=\"code\">plot.rb</span> command on '#{to_plot}' (type <span class=\"code\">plot.rb</span> for more details). Plot generated with command: &nbsp;&nbsp; <span class=\"code\">#{plot_cmd}</span></caption>\n"
393
- output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
394
- output << "</table></div>\n"
395
- output
396
- end # plot_figure
397
-
398
- def file_as_decoy(files, opt)
399
- bio = SpecID::Bioworks.new
400
- puts "Calculating false positive rates using '#{opt.f}' as decoy ..."
401
- fps = bio.num_prots(opt.f)
402
- out = ""
403
- files.each do |file|
404
- tps = bio.num_prots(file)
405
- out << "*****************************************************\n"
406
- out << sprintf("%-36s # TP : #{tps}\n", file)
407
- out << sprintf("%-36s # FP : #{fps}\n", opt.f)
408
- out << sprintf(" False Positive Rate [FP/(TP+FP)] : %.3f\n", fps.to_f/(tps+fps)) unless opt.n
409
- out << sprintf(" Gygi's False Positive Rate 2*[FP/(TP+FP)] : %.3f\n", 2.0*fps/(tps+fps)) if opt.g
410
- out << sprintf(" Precision [TP/(TP+FP)] : %.3f\n", tps.to_f/(tps+fps)) if opt.p
411
- out << "*****************************************************\n"
412
- end
413
- out
414
- end
415
-
416
-
417
-
418
- def prefix_as_decoy(files, opt)
419
- #puts "Calculating false positive rates using prefix #{opt.f} ..."
420
- prefix_arr = prefixes(opt.f, files.size)
421
- all_arrs = []
422
- key = []
423
- out_noext = outfile_noext(opt.o)
424
- files.each_with_index do |file,i|
425
- all_arrs[i] = []
426
- key[i] = []
427
- sp = SpecID.new(file)
428
- #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
429
- (tp, prec, fpr2) = sp.tps_and_precision_and_fpr2_times2_for_prob(prefix_arr[i])
430
- if opt.g
431
- all_arrs[i] << [tp,fpr2]
432
- key[i] << ["Gygi FPR", ["#TP", "Gygi FPR = 2*FP/(TP+FP)"]]
433
- end
434
- if opt.p
435
- all_arrs[i] << [tp,prec]
436
- key[i] << ["Prec", ["#TP", "Prec = TP/(TP+FP)"]]
437
- end
438
- unless opt.n
439
- ## Add the fpr datasets
440
- fpr = fpr2.map {|v| v/2.0}
441
- all_arrs[i] << [tp,fpr]
442
- key[i] << ["FPR", ["#TP", "FPR = FP/(TP+FP)"]]
443
- end
444
- end
445
-
446
- string = ''
447
- if opt.a
448
- roc = ROC.new
449
- #string << "***********************************************************\n"
450
- #string << "AREA UNDER CURVE:\n"
451
- key.each_with_index do |file,i|
452
- string << "#{files[i]} (area under curve)\n"
453
- key[i].each_index do |j|
454
- string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
455
- tps = all_arrs[i][j][0]
456
- oth = all_arrs[i][j][1]
457
- string << roc.area_under_curve(tps, oth).to_s << "\n"
458
- end
459
- end
460
- #string << "***********************************************************\n"
461
- else
462
- string = html do
463
- header +
464
- body do
465
- plot_figure(all_arrs, key, files, out_noext) +
466
- html_table_output(all_arrs, key, files, out_noext)
467
- end
468
- end
469
- end
470
- string
471
- end
472
-
473
- end # class FalsePositiveRate
474
- end # class SpecID
475
-
476
-
data/test/tc_gi2annot.rb DELETED
@@ -1,12 +0,0 @@
1
-
2
- require 'test/unit'
3
-
4
-
5
- class Gi2AnnotTest < Test::Unit::TestCase
6
- ROOT_DIR = File.join(File.dirname(__FILE__), '..')
7
-
8
- def test_query
9
- # @TODO: Add in some kind of skip if there is no internet connection
10
- assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", `ruby -I#{File.join(ROOT_DIR, 'lib')} #{File.join(ROOT_DIR, 'bin', 'gi2annot.rb')} 16130548`)
11
- end
12
- end