mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,427 @@
1
+
2
+ require 'optparse'
3
+ require 'ostruct'
4
+ require 'generator'
5
+ require 'gnuplot'
6
+ require 'roc'
7
+
8
# Re-opens String to add a helper for indented multi-line literals.
class String
  # Returns a copy of the string in which every line-leading run of
  # whitespace followed by a '|' has been deleted, so a literal may be
  # written as
  #
  #   "|first line
  #    |second line".margin
  #
  # Lines that do not start (after optional whitespace) with '|' are left
  # alone.  Because \s also matches newlines, blank lines directly in front
  # of a '|' line are removed together with the marker.
  def margin
    gsub(/^\s*\|/, '')
  end
end
13
+
14
+ class SpecID ; end
15
+ class SpecID::Precision ; end
16
+
17
+ module SpecID::Precision::PlotHelper
18
+
19
+ PLOT_TYPE = 'XYData'
20
+ TITLE = 'Precision (Positive Predictive Value)'
21
+ XAXIS = 'Num Hits (excludes known false positives)'
22
+ EXT = '.toplot'
23
+ IMAGE_EXT = '.png'
24
+
25
# Writes the plot description for all datasets to "<filename_noext>" + EXT.
#
# The file starts with five header lines (PLOT_TYPE, filename_noext, TITLE,
# XAXIS and the escaped y axis label) followed, for every dataset, by three
# lines: the escaped dataset title, the space separated x values and the
# space separated y values.
#
# all_arrs::       all_arrs[i][j] is the [x_values, y_values] pair of dataset
#                  j of file i
# key::            key[i][j] is [label, [x_heading, y_heading]] for that
#                  same dataset; y_heading is used in the dataset title
# files::          the input file names (one per entry of all_arrs / key)
# filename_noext:: output path without extension
def create_to_plot_file(all_arrs, key, files, filename_noext)
  File.open(filename_noext + EXT, 'w') do |out|
    out.puts PLOT_TYPE
    out.puts filename_noext
    out.puts TITLE
    out.puts XAXIS
    out.puts escape_to_gnuplot(y_axis_label(key))
    files.each_with_index do |file, i|
      key[i].each_with_index do |k, j|
        out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
        out.puts all_arrs[i][j][0].join(' ')
        out.puts all_arrs[i][j][1].join(' ')
      end
    end
  end
end
47
+
48
+
49
## Renders all datasets into the image "<filename_noext>" + IMAGE_EXT with
## Gnuplot and returns (as a String) the html snippet that displays that
## image.  The html refers to the image by its basename only.
## This is a self contained method that can be swapped out for a completely
## different plotting program if desired.
##
## all_arrs::       all_arrs[i][j] is the [x_values, y_values] pair of
##                  dataset j of file i
## key::            key[i][j] is [label, [x_heading, y_heading]];
##                  y_heading is used in the legend entry
## files::          the input file names
## filename_noext:: output path without extension
def plot_figure(all_arrs, key, files, filename_noext)
  ## CREATE the PLOT IMAGE:
  png = filename_noext + IMAGE_EXT
  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      plot.terminal "png noenhanced"
      plot.output png
      plot.title TITLE
      plot.xlabel XAXIS
      plot.ylabel escape_to_gnuplot(y_axis_label(key))
      plot.style "line 1 lt 1"
      plot.style "line 2 lt 12"
      # the upper bound grows with the number of files
      # (presumably to leave room for the legend -- TODO confirm)
      plot.yrange "[-0.05:#{1.05 + 0.020*files.size}]"
      files.each_with_index do |file, i|
        key[i].each_with_index do |k, j|
          plot.data << Gnuplot::DataSet.new([all_arrs[i][j][0], all_arrs[i][j][1]]) do |ds|
            ds.with = "lines"
            ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
          end
        end
      end
    end
  end

  ## CREATE the HTML to load the plot:
  image_base = File.basename(filename_noext)
  output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
  output << "<tr><td><img src=\"#{image_base}#{IMAGE_EXT}\" title=\"File #{image_base} must be in the same directory as this html.\"/></td></tr>\n"
  output << "</table></div>\n"
  output
end # plot_figure
88
+
89
+ end
90
+
91
+ module SpecID::Precision::HTML
92
+
93
# Wraps the text returned by the block in <html> ... </html>, each tag on
# its own line, and returns the result.  String#margin is applied to the
# whole text, i.e. also to the text returned by the block.
def html
  ["|<html>", "|#{yield}", "|</html>\n"].join("\n").margin
end
99
+
100
# Wraps the text returned by the block in <body> ... </body> and returns
# the result (String#margin is applied after the text has been inserted).
def body
  content = yield
  wrapped = "|<body>\n| #{content}\n|</body>\n"
  wrapped.margin
end
105
+
106
# Returns the <head> section of the page; it contains only the style sheet
# produced by #style.
def header
  ["|<head>", "| #{style}", "|</head>\n"].join("\n").margin
end
111
+
112
# Returns the value of the block (converted with to_s) wrapped in a
# table-data cell.
def td
  "<td>" + yield.to_s + "</td>"
end
115
+
116
+
117
# Returns the <style> element (as a String) that is embedded in the head
# of the generated page.
#
# The left/right cell padding is given as "1px": a non-zero length without
# a unit is invalid CSS and the declaration would be ignored.
def style
  '
<style type="text/css">
div#tp_table {
text-align: center;
margin-top: 50px;
margin-bottom: 50px;
}
span.code {
font-family: Courier,Monospace;
font-size: 80%;
}
table {
border-width:1px;
border-color:#CCCCCC;
border-collapse: collapse;
}
caption {
font-size: 90%;
}
td,th {
padding-top: 2px;
padding-bottom: 2px;
padding-left: 1px;
padding-right: 1px;
}
th.small {
font-size: 80%;
font-weight: normal;
padding: 1px;
}
td.redline {
background-color: #FF0000;
color: #FFFFFF
}
div#plot {
margin: 30px;
text-align:center
}
hr {color: sienna}
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
</style>
'
end
162
+
163
# Wraps the text returned by the block in a bordered, centered <table>
# element and returns the result (String#margin is applied after the text
# has been inserted).
def table
  rows = yield
  opening = "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">"
  [opening, "| #{rows}", "|</table>\n"].join("\n").margin
end
168
+
169
# Wraps the text returned by the block in <tr> ... </tr> and returns the
# result (String#margin is applied after the text has been inserted).
def tr
  cells = yield
  "|<tr>\n| #{cells}\n|</tr>\n".margin
end
174
+ end # module HTML
175
+
176
+ class SpecID::Precision
177
+ include SpecID::Precision::PlotHelper
178
+
179
+ ###########################################################
180
+ # GLOBAL SETTINGS:
181
+ DEF_PREFIX = "INV_"
182
+ DATA_PREC = 4 # decimal places of precision for ppv data
183
+ STDOUT_JTPLOT_BASE = "ppv" # if there is no outfile
184
+ ###########################################################
185
+
186
+ include SpecID::Precision::HTML
187
+
188
## Parses the options in argv and computes the output for the files that
## remain in argv after option parsing.
## Returns a two element array: [output_string, opt] where output_string
## is whatever #prefix_as_decoy produced and opt is the parsed options.
def precision(argv)
  opt = parse_args(argv)
  # parse_args has removed the options; what is left are the file names
  [prefix_as_decoy(argv.to_a, opt), opt]
end
195
+
196
# Command line entry point: runs #precision on argv and delivers the
# resulting string.
#
# The string is printed to standard output when the output option (opt.o)
# is 'STDOUT'; otherwise opt.o is taken as a file name and the string is
# written to that file (overwriting it).
#
# #precision returns exactly [output_string, opt], so there is no third
# value to branch on.
def run_cmd_line(argv)
  output_string, opt = precision(argv)
  if opt.o == 'STDOUT'
    print output_string
  else
    File.open(opt.o, 'w') { |fh| fh.print output_string }
  end
end
209
+
210
# Returns the base name (no extension) to use for generated files.
#
# opt:: the output option value, i.e. either the string 'STDOUT' or an
#       output file name
#
# For 'STDOUT' a new string with the content of STDOUT_JTPLOT_BASE is
# returned; otherwise the file name with its extension (as reported by
# File.extname) removed.
def outfile_noext(opt)
  return "#{STDOUT_JTPLOT_BASE}" if opt == 'STDOUT'

  extension = File.extname(opt)
  opt.sub(/#{Regexp.escape(extension)}$/, '')
end
218
+
219
# Returns file with its extension (as reported by File.extname) removed.
# A name without extension is returned unchanged.
def file_noext(file)
  extension_at_end = Regexp.new(Regexp.escape(File.extname(file)) + '$')
  file.sub(extension_at_end, '')
end
222
+
223
# Parses the command line options in argv and returns them as an
# OpenStruct.
#
# Recognized options and the attribute they set:
#   -f, --fp_data <prefix_or_file>   opt.f
#   -o, --outfile <file>             opt.o (defaults to 'STDOUT')
#   -a, --area                       opt.a
#   -j, --plot_file                  opt.j
#
# argv is modified in place: the options are removed so that only the
# remaining (file) arguments are left.  If nothing is left, the usage text
# is printed and the program exits.
def parse_args(argv)
  opt = OpenStruct.new
  opt.o = 'STDOUT'

  # help text shown between the banner and the first option
  definitions = [
    "",
    "Abbreviations and Definitions:",
    " TP = True Positives",
    " FP = False Positives",
    " Precision = Positive Predictive Value = [TP/(TP+FP)]",
    "",
    "Output: ",
    " 1. Decoy as separate search: PPV to STDOUT",
    " 2. Decoy proteins from concatenated database: '.html'",
    "",
    "Options:"
  ]
  # help text shown right after the -f option
  fp_data_notes = [
    "",
    " If searched with a concatenated DB, give a PREFIX to decoy proteins.",
    " If files have different prefixes, separate with commas.",
    " If searched with a separate decoy DB, give the FILE name of decoy data",
    "",
    ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
    ""
  ]
  example = [
    "",
    "Example:",
    "For a search on a concatenated database where the decoy proteins have",
    "been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet",
    "output:",
    "",
    "#{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml",
    "",
    ""
  ].join("\n")

  opts = OptionParser.new do |op|
    op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
    definitions.each { |text| op.separator text }
    op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") { |value| opt.f = value }
    fp_data_notes.each { |text| op.separator text }
    op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") { |value| opt.o = value }
    op.on("-a", "--area", "output area under the curve instead of the plot") { |value| opt.a = value }
    op.on("-j", "--plot_file", "output to_plot file") { |value| opt.j = value }
    op.on_tail(example)
  end
  opts.parse!(argv)

  # without at least one input file there is nothing to do
  if argv.empty?
    puts opts
    exit
  end

  opt
end
271
+
272
+
273
# Splits the comma separated list arg and returns an array of exactly
# desired_size entries: entry i is the i-th list item, and once the list
# is exhausted its last item is repeated.  Surplus list items are dropped.
# If the list is empty every entry is nil.
def prefixes(arg, desired_size)
  given = arg.split(',')
  Array.new(desired_size) { |i| given[i] || given.last }
end
289
+
290
+
291
## Collapses the per-file / per-dataset nesting of all_arrs so that the
## data columns can be walked side by side.
## For every dataset (in file order, then dataset order) its element 0 and
## its element 1 are appended to the result, which is returned.
def arrays_to_one_level_deep(all_arrs)
  all_arrs.inject([]) do |columns, per_file|
    per_file.inject(columns) do |acc, per_style|
      acc << per_style[0] << per_style[1]
    end
  end
end
302
+
303
# Returns the html for the column heading row and all data rows.
#
# all_arrs:: all_arrs[i][j] is the [x_values, y_values] pair of dataset j
#            of file i; every such array becomes one table column
# key::      key[i][j] is [label, [x_heading, y_heading]]; the two headings
#            label the two columns of that dataset
#
# Floats are printed with DATA_PREC decimal places, everything else as an
# integer.  Columns may differ in length (the files usually contain a
# different number of hits): a column that has run out of values gets an
# empty cell instead of making sprintf fail on nil.
def table_cells(all_arrs, key)
  ## column specific headings:
  headings = key.map do |per_file|
    per_file.map do |per_ds|
      "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
    end.join
  end.join

  all_string = ""
  all_string << tr { headings }

  columns = arrays_to_one_level_deep(all_arrs)
  # the longest column decides how many rows there are
  num_rows = columns.map { |column| column.size }.max || 0
  num_rows.times do |row_index|
    all_string << tr do
      columns.map do |column|
        value = column[row_index]
        text =
          case value
          when nil   then ""
          when Float then sprintf("%.#{DATA_PREC}f", value)
          else            sprintf("%d", value)
          end
        td { text }
      end.join
    end
  end
  all_string
end
328
+
329
# Returns the complete data table (wrapped in <div id="tp_table">) as an
# html string.
#
# The table consists of
# 1. one heading per file, spanning all columns of that file,
# 2. one heading per dataset (its label, key[i][j].first), spanning the
#    two columns of the dataset,
# 3. the rows produced by #table_cells.
#
# The number of datasets per file is taken from the first file.
# filename_noext is accepted for interface compatibility but not used.
#
# Both heading rows are joined into strings explicitly; interpolating the
# (nested) arrays directly only gives the intended text where Array#to_s
# concatenates the elements.
def html_table_output(all_arrs, key, files, filename_noext)
  num_datasets_per_file = all_arrs.first.size
  num_cols_per_dataset = 2
  big_colspan = num_datasets_per_file * num_cols_per_dataset

  file_headings = files.map do |file|
    "<th colspan=\"#{big_colspan}\">#{file}</th>"
  end.join
  dataset_headings = key.map do |per_file|
    per_file.map { |ds| "<th colspan=\"2\">#{ds.first}</th>" }.join
  end.join

  output = table do
    tr { file_headings } + tr { dataset_headings } + table_cells(all_arrs, key)
  end
  "<div id=\"tp_table\">" + output + "</div>"
end
350
+
351
+
352
# Builds the y axis label: the dataset labels (first element of each key
# entry) joined with " | ".
# Only the entries of the first file are used, as it's assumed that the
# major labels will be identical for all files.
def y_axis_label(key)
  key.first.map { |label, _headings| label }.join(" | ")
end
358
+
359
# Returns a copy of string in which every ' is preceded by a backslash.
# The block form of gsub is used on purpose: in a replacement *string*
# the sequence \' is special (it inserts the text after the match).
def escape_to_gnuplot(string)
  string.gsub("'") { "\\'" }
end
369
+
370
# Computes the precision data for every file and returns the report as a
# string.
#
# files:: the input file names; each is loaded with SpecID.new
# opt::   the parsed options.  Read here:
#         opt.f - comma separated decoy prefix(es); when given, the data
#                 comes from num_hits_and_ppv_for_prob(prefix), otherwise
#                 from num_hits_and_ppv_for_protein_prophet_probabilities
#         opt.o - output option, used to derive the base name of the
#                 generated plot files
#         opt.a - report the area under the curve (plain text) instead of
#                 the html page
#         opt.j - additionally write the to_plot file (html mode only)
#
# The "using prefix" notice on stderr is only printed when a prefix was
# actually given.
def prefix_as_decoy(files, opt)
  prefix_arr = nil
  if opt.f
    $stderr.puts "using prefix #{opt.f} ..."
    prefix_arr = prefixes(opt.f, files.size)
  end
  out_noext = outfile_noext(opt.o)

  all_arrs = []
  key = []
  files.each_with_index do |file, i|
    sp = SpecID.new(file)
    num_hits, ppv =
      if opt.f
        sp.num_hits_and_ppv_for_prob(prefix_arr[i])
      else
        ## These are just from protein prophet probabilities:
        sp.num_hits_and_ppv_for_protein_prophet_probabilities
      end
    # one dataset per file; both sources are labelled identically
    all_arrs[i] = [[num_hits, ppv]]
    key[i] = [["Precision", ["#TP", "Prec = TP/(TP+FP)"]]]
  end

  if opt.a
    roc = ROC.new
    string = ''
    files.each_with_index do |file, i|
      string << "#{file} (area under curve)\n"
      key[i].each_index do |j|
        string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
        num_hits = all_arrs[i][j][0]
        oth = all_arrs[i][j][1]
        string << roc.area_under_curve(num_hits, oth).to_s << "\n"
      end
    end
    string
  else
    create_to_plot_file(all_arrs, key, files, out_noext) if opt.j
    html do
      header +
        body {
          plot_figure(all_arrs, key, files, out_noext) +
            html_table_output(all_arrs, key, files, out_noext)
        }
    end
  end
end
425
+
426
+ end # class SpecID
427
+
data/lib/spec_id/proph.rb CHANGED
@@ -14,7 +14,7 @@ class Proph
14
14
 
15
15
  class Parser
16
16
  def root_el(file)
17
- XMLTree.parse_file(file)
17
+ AXML.parse_file(file)
18
18
  end
19
19
  end
20
20
 
@@ -275,7 +275,7 @@ class Pep::Parser < Parser
275
275
 
276
276
  ## file from peptideAtlas:
277
277
  search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
278
- search_result_regex2 = /<search_result spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
278
# attribute name is "spectrum" (same as in search_result_regex1)
search_result_regex2 = /<search_result spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
279
279
  search_hit_regex = /<search_hit .*peptide="(\w+)" /o
280
280
 
281
281
  peptide_h = {}