mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -1,476 +0,0 @@
1
-
2
- # for false_positive_rate:
3
- require 'optparse'
4
- require 'ostruct'
5
- require 'generator'
6
- require 'gnuplot'
7
- require 'roc'
8
-
9
# Core extension used by the HTML helpers so that multi-line string literals
# can be indented in the source and marked with a leading pipe.
class String
  # Returns a copy of the string in which every match of "start of line,
  # any run of whitespace, one pipe character" has been removed.
  # Because the whitespace run may include newlines, blank lines directly
  # preceding a piped line are removed together with the margin.
  def margin
    gsub(/^\s*\|/) { '' }
  end
end
14
-
15
class SpecID
  # Command-line oriented reporting of false positive rate (FPR), Gygi's
  # doubled FPR and precision for searches against decoy data.
  #
  # Two modes exist (chosen in #false_positive_rate):
  # * the -f option names an existing file: that file is the decoy search and
  #   a plain-text report is produced (#file_as_decoy);
  # * otherwise the -f option is a comma separated list of decoy protein
  #   prefixes and an HTML page (plot + table) or an area-under-curve listing
  #   is produced (#prefix_as_decoy).
  class FalsePositiveRate

    # Tiny string-building helpers for the HTML report.  The helpers taking a
    # block wrap whatever the block returns.  All multi-line templates go
    # through String#margin, which must be defined by the including program.
    module HTML

      # Wraps the block's value in <html> tags.
      def html
        "|<html>
        |#{yield}
        |</html>\n".margin
      end

      # Wraps the block's value in <body> tags.
      def body
        "|<body>
        | #{yield}
        |</body>\n".margin
      end

      # Returns the <head> element, containing only the stylesheet from #style.
      def header
        "|<head>
        | #{style}
        |</head>\n".margin
      end

      # Wraps the block's value in a table cell.
      def td
        "<td>#{yield}</td>"
      end

      # Returns the inline stylesheet for the report.
      # padding-left/right carry an explicit 'px' unit: a unit-less non-zero
      # length is not valid CSS.
      def style
        '
        <style type="text/css">
        div#tp_table {
          text-align: center;
          margin-top: 50px;
          margin-bottom: 50px;
        }
        span.code {
          font-family: Courier,Monospace;
          font-size: 80%;
        }
        table {
          border-width:1px;
          border-color:#CCCCCC;
          border-collapse: collapse;
        }
        caption {
          font-size: 90%;
        }
        td,th {
          padding-top: 2px;
          padding-bottom: 2px;
          padding-left: 1px;
          padding-right: 1px;
        }
        th.small {
          font-size: 80%;
          font-weight: normal;
          padding: 1px;
        }
        td.redline {
          background-color: #FF0000;
          color: #FFFFFF
        }
        div#plot {
          margin: 30px;
          text-align:center
        }
        hr {color: sienna}
        body { font-size: 8pt; font-family: Arial,Helvetica,Times}
        </style>
        '
      end

      # Wraps the block's value in a bordered, centered table.
      def table
        "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">
        | #{yield}
        |</table>\n".margin
      end

      # Wraps the block's value in a table row.
      def tr
        "|<tr>
        | #{yield}
        |</tr>\n".margin
      end
    end # module HTML

    ###########################################################
    # GLOBAL SETTINGS:
    DEF_PREFIX = "INV_"        # default decoy protein prefix
    DATA_PREC = 4              # decimal places used for Float table cells
    STDOUT_JTPLOT_BASE = "fpr" # plot base name if there is no outfile
    ###########################################################

    include SpecID::FalsePositiveRate::HTML

    # Parses +argv+ (options are removed from it in place; what remains is
    # the list of data files) and builds the report.
    #
    # Returns [out_string, opt, file_as_decoy] where out_string is plain text
    # when file_as_decoy is true and HTML (or an area listing) otherwise.
    def false_positive_rate(argv)
      opt = parse_args(argv)
      files = argv.to_a
      # An existing file given to -f means "separate decoy search".
      decoy_is_file = File.exist?(opt.f)
      out_string =
        if decoy_is_file
          file_as_decoy(files, opt)
        else
          prefix_as_decoy(files, opt)
        end
      [out_string, opt, decoy_is_file]
    end

    # Runs #false_positive_rate and emits the result: decoy-file reports are
    # always written to standard output; other output goes to standard output
    # when the outfile is 'STDOUT' and to the named file otherwise.
    def run_cmd_line(argv)
      output_string, opt, decoy_is_file = false_positive_rate(argv)
      if decoy_is_file
        puts output_string
      elsif opt.o == 'STDOUT'
        print output_string
      else
        File.open(opt.o, 'w') { |fh| fh.print output_string }
      end
    end

    # Returns the outfile name (+opt+ is the outfile String, not the option
    # struct) with no extension; 'STDOUT' maps to STDOUT_JTPLOT_BASE.
    def outfile_noext(opt)
      if opt == 'STDOUT'
        "#{STDOUT_JTPLOT_BASE}"
      else
        file_noext(opt)
      end
    end

    # Returns +file+ with its File.extname removed from the end.
    def file_noext(file)
      file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
    end

    # Parses command line options out of +argv+ (destructively) and returns
    # an OpenStruct with:
    #   f - decoy prefix list or decoy file name (default DEF_PREFIX)
    #   o - output file name (default 'STDOUT')
    #   g, p, n, a - set to true when the respective flag was given
    # Prints the usage text and exits when no file argument remains.
    def parse_args(argv)
      opt = OpenStruct.new
      opt.f = DEF_PREFIX
      opt.o = 'STDOUT'
      opts = OptionParser.new do |op|
        op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
        op.separator ""
        op.separator "Abbreviations and Definitions:"
        op.separator " TP = True Positives"
        op.separator " FP = False Positive"
        op.separator " FPR = False Positive Rate = [FP/(TP+FP)] (between 0 and 1)"
        op.separator " FPR2 = Gygi's estimation of FPR = [2*FPR]"
        op.separator " Precision = [TP/(TP+FP)]"
        op.separator ""
        op.separator "Output: "
        op.separator " 1. Decoy as separate search: FPR to STDOUT"
        op.separator " 2. Decoy proteins from concatenated database: 'fpr.html'"
        op.separator ""
        op.separator "Options:"

        op.on("-f", "--fp_data <prefix_or_file>", "PREFIX (def: #{DEF_PREFIX}) -or- decoy FILE") {|v| opt.f = v }
        op.separator ""
        op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
        op.separator " If files have different prefixes, separate with commas."
        op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
        op.separator ""
        op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
        ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
        op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
        op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
        op.separator ""
        op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") {|v| opt.o = v}
        op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
        # Free text appended to the help.  No line may start with '-' or '=',
        # otherwise OptionParser would try to read it as a switch definition.
        # The second example uses -gp (Gygi FPR + precision): there is no -t.
        op.on_tail("
  Examples:
    1. For a search on a concatenated database where the decoy proteins have
       been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
       output:

       #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml

    2. To determine the false positive rate of a search (and fpr2 and precision)
       using a normal and decoy database search, filter both the normal and decoy
       datasets identically, export to xml and run like this (only works for
       Bioworks xml export):

       #{File.basename(__FILE__)} -gp -f decoy_bioworks.xml bioworks.xml
  ")
      end
      opts.parse!(argv)

      if argv.size < 1
        puts opts
        exit
      end

      opt
    end

    # Takes a comma separated list and repeats its last entry until the
    # resulting array has +desired_size+ elements (extra entries are dropped).
    def prefixes(arg, desired_size)
      given = arg.split(',')
      last_arg = given[0]
      (0...desired_size).map do |i|
        last_arg = given[i] if given[i]
        last_arg
      end
    end

    ## Collapses [file][dataset][x_values, y_values] into a flat list of
    ## columns (x, y, x, y, ...) so that rows can be read across them.
    def arrays_to_one_level_deep(all_arrs)
      mostly_flat = []
      all_arrs.each do |per_file|
        per_file.each do |per_style|
          mostly_flat << per_style[0] << per_style[1]
        end
      end
      mostly_flat
    end

    # Returns the table rows for the data: one row of small column headings
    # (taken from key[file][dataset][1]) followed by one row per data index.
    # Float cells are printed with DATA_PREC decimals, everything else with
    # "%d".
    def table_cells(all_arrs, key)
      ## column specific headings:
      heading = tr do
        key.map do |per_file|
          per_file.map do |per_ds|
            "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
          end.join
        end.join
      end

      # Walk all columns in parallel by index.  This replaces SyncEnumerator
      # (library 'generator', unavailable on Ruby >= 1.9) and keeps its
      # behaviour: iteration runs to the longest column and shorter columns
      # contribute nil.
      columns = arrays_to_one_level_deep(all_arrs)
      num_rows = columns.map { |col| col.size }.max || 0
      rows = (0...num_rows).map do |row_i|
        tr do
          columns.map do |col|
            value = col[row_i]
            sty = (value.class == Float) ? "%.#{DATA_PREC}f" : "%d"
            td { sprintf(sty, value) }
          end.join
        end
      end

      heading + rows.join
    end

    # Returns the <div id="tp_table"> holding the data table: a row of file
    # names, a row of dataset labels (key[file][dataset][0]) and the rows
    # from #table_cells.  +filename_noext+ is accepted but not used.
    def html_table_output(all_arrs, key, files, filename_noext)
      num_datasets_per_file = all_arrs.first.size
      num_cols_per_dataset = 2
      big_colspan = num_datasets_per_file * num_cols_per_dataset

      file_row = tr do
        files.map { |file| "<th colspan=\"#{big_colspan}\">#{file}</th>" }.join
      end
      # join flattens the per-file arrays; interpolating the nested Array
      # directly only rendered correctly with Ruby 1.8's Array#to_s.
      dataset_row = tr do
        key.map do |arr|
          arr.map { |ds| "<th colspan=\"2\">#{ds.first}</th>" }
        end.join
      end

      output = table { file_row + dataset_row + table_cells(all_arrs, key) }
      "<div id=\"tp_table\">" + output + "</div>"
    end

    # Builds the y axis label from the dataset labels.
    def y_axis_label(key)
      ## We only take the keys for the first file, as it's assumed that the major
      ## labels will be identical for all of them
      key.first.map { |tp| tp.first }.join(" | ")
    end

    # Returns a copy of +string+ with a backslash placed before every '.
    def escape_to_gnuplot(string)
      # block form: the replacement is taken literally (no \' back-reference)
      string.gsub("'") { "\\'" }
    end

    ## Creates a png file named after filename_noext via Gnuplot and returns
    ## the html that will load the png file up.
    ## This is a self contained method that can be swapped out for a
    ## completely different plotting program if desired.
    def plot_figure(all_arrs, key, files, filename_noext)
      ## CREATE the PLOT IMAGE:
      png = filename_noext + '.png'
      Gnuplot.open do |gp|
        Gnuplot::Plot.new( gp ) do |plot|
          plot.terminal "png noenhanced"
          plot.output png
          plot.title "Classification Analysis"
          plot.xlabel 'Num True Positives'
          plot.ylabel escape_to_gnuplot(y_axis_label(key))
          plot.style "line 1 lt 1"
          plot.style "line 2 lt 12"
          # the upper bound grows with the number of files
          plot.yrange "[-0.05:#{1.0 + 0.2*files.size}]"
          files.each_with_index do |file,i|
            key[i].each_with_index do |k,j|
              plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
                ds.with = "lines"
                ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
              end
            end
          end
        end
      end

      ## CREATE the HTML to load the plot:
      basename_filename_noext = File.basename(filename_noext)
      output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
      output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
      output << "</table></div>\n"
      output
    end # plot_figure

    # Plain-text report for a separate decoy search: the protein count of the
    # decoy file opt.f is used as FP and the protein count of each file in
    # +files+ as TP.  Prints a progress line to standard output and returns
    # the report string.
    def file_as_decoy(files, opt)
      bio = SpecID::Bioworks.new
      puts "Calculating false positive rates using '#{opt.f}' as decoy ..."
      fps = bio.num_prots(opt.f)
      out = ""
      files.each do |file|
        tps = bio.num_prots(file)
        total = tps + fps
        out << "*****************************************************\n"
        out << sprintf("%-36s # TP : #{tps}\n", file)
        out << sprintf("%-36s # FP : #{fps}\n", opt.f)
        out << sprintf(" False Positive Rate [FP/(TP+FP)] : %.3f\n", fps.to_f/total) unless opt.n
        out << sprintf(" Gygi's False Positive Rate 2*[FP/(TP+FP)] : %.3f\n", 2.0*fps/total) if opt.g
        out << sprintf(" Precision [TP/(TP+FP)] : %.3f\n", tps.to_f/total) if opt.p
        out << "*****************************************************\n"
      end
      out
    end

    # Report for concatenated-database searches.  For every file the values
    # come from SpecID#tps_and_precision_and_fpr2_times2_for_prob called with
    # that file's decoy prefix; the datasets selected by opt.g / opt.p /
    # opt.n are collected and returned either as an area-under-curve listing
    # (opt.a) or as a complete HTML page with plot and table.
    def prefix_as_decoy(files, opt)
      prefix_arr = prefixes(opt.f, files.size)
      all_arrs = []   # [file][dataset] => [x_values, y_values]
      key = []        # [file][dataset] => [label, [x_heading, y_heading]]
      out_noext = outfile_noext(opt.o)
      files.each_with_index do |file,i|
        all_arrs[i] = []
        key[i] = []
        sp = SpecID.new(file)
        (tp, prec, fpr2) = sp.tps_and_precision_and_fpr2_times2_for_prob(prefix_arr[i])
        if opt.g
          all_arrs[i] << [tp,fpr2]
          key[i] << ["Gygi FPR", ["#TP", "Gygi FPR = 2*FP/(TP+FP)"]]
        end
        if opt.p
          all_arrs[i] << [tp,prec]
          key[i] << ["Prec", ["#TP", "Prec = TP/(TP+FP)"]]
        end
        unless opt.n
          ## Add the fpr datasets (half of the doubled value)
          fpr = fpr2.map {|v| v/2.0}
          all_arrs[i] << [tp,fpr]
          key[i] << ["FPR", ["#TP", "FPR = FP/(TP+FP)"]]
        end
      end

      if opt.a
        roc = ROC.new
        string = ''
        key.each_index do |i|
          string << "#{files[i]} (area under curve)\n"
          key[i].each_index do |j|
            # explicit join: the two headings are concatenated exactly as
            # Ruby 1.8's Array#to_s did, independent of the Ruby version
            string << "#{key[i][j][0]} [#{key[i][j][1].join}]:\t"
            tps = all_arrs[i][j][0]
            oth = all_arrs[i][j][1]
            string << roc.area_under_curve(tps, oth).to_s << "\n"
          end
        end
        string
      else
        html do
          header + body {
            plot_figure(all_arrs, key, files, out_noext) +
              html_table_output(all_arrs, key, files, out_noext)
          }
        end
      end
    end

  end # class FalsePositiveRate
end # class SpecID
475
-
476
-
data/test/tc_gi2annot.rb DELETED
@@ -1,12 +0,0 @@
1
-
2
- require 'test/unit'
3
-
4
-
5
# End-to-end check of the bin/gi2annot.rb command line script.
class Gi2AnnotTest < Test::Unit::TestCase
  ROOT_DIR = File.join(File.dirname(__FILE__), '..')

  # Runs the script (with the project's lib directory on the load path) for
  # GI number 16130548 and compares its standard output to the expected
  # annotation line.
  def test_query
    # @TODO: Add in some kind of skip if there is no internet connection
    lib_dir = File.join(ROOT_DIR, 'lib')
    script = File.join(ROOT_DIR, 'bin', 'gi2annot.rb')
    expected = 'CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n"
    assert_equal(expected, `ruby -I#{lib_dir} #{script} 16130548`)
  end
end