mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47):
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,427 @@
1
+
2
+ require 'optparse'
3
+ require 'ostruct'
4
+ require 'generator'
5
+ require 'gnuplot'
6
+ require 'roc'
7
+
8
# Core-class extension used by the HTML helpers in this file so that
# multi-line string literals can be indented in the source.
class String
  # Returns a copy of the string in which, at the start of every line, any
  # run of whitespace followed by a '|' has been removed.  Note that the
  # whitespace run may span preceding blank lines, since \s matches newlines.
  def margin
    gsub(/^\s*\|/, '')
  end
end
13
+
14
+ class SpecID ; end
15
+ class SpecID::Precision ; end
16
+
17
+ module SpecID::Precision::PlotHelper
18
+
19
+ PLOT_TYPE = 'XYData'
20
+ TITLE = 'Precision (Positive Predictive Value)'
21
+ XAXIS = 'Num Hits (excludes known false positives)'
22
+ EXT = '.toplot'
23
+ IMAGE_EXT = '.png'
24
+
25
+ def create_to_plot_file(all_arrs, key, files, filename_noext)
26
+ ## CREATE the PLOT IMAGE:
27
+ to_plot = filename_noext + EXT
28
+ png = filename_noext + IMAGE_EXT
29
+ File.open(to_plot,'w') do |out|
30
+ out.puts PLOT_TYPE
31
+ out.puts filename_noext
32
+ out.puts TITLE
33
+ out.puts XAXIS
34
+ out.puts escape_to_gnuplot(y_axis_label(key))
35
+ files.each_with_index do |file,i|
36
+ #p key[i]
37
+ #p all_arrs[i]
38
+
39
+ key[i].each_with_index do |k,j|
40
+ out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
41
+ out.puts all_arrs[i][j][0].join(' ')
42
+ out.puts all_arrs[i][j][1].join(' ')
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+
49
+ ## outputs a .toplot file based on filename_noext, creates a png file, and
50
+ ## writes html to fh that will load the png file up
51
+ ## This is a self contained module that can be swapped out for a
52
+ ## completely different plotting program if desired.
53
+ def plot_figure(all_arrs, key, files, filename_noext)
54
+
55
+ ## CREATE the PLOT IMAGE:
56
+ to_plot = filename_noext+'.toplot'
57
+ png = filename_noext+'.png'
58
+ Gnuplot.open do |gp|
59
+ Gnuplot::Plot.new( gp ) do |plot|
60
+ plot.terminal "png noenhanced"
61
+ plot.output png
62
+ plot.title TITLE
63
+ plot.xlabel XAXIS
64
+ plot.ylabel escape_to_gnuplot(y_axis_label(key))
65
+ plot.style "line 1 lt 1"
66
+ plot.style "line 2 lt 12"
67
+ #plot.style "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
68
+ plot.yrange "[-0.05:#{1.05 + 0.020*files.size}]"
69
+ files.each_with_index do |file,i|
70
+ key[i].each_with_index do |k,j|
71
+ plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
72
+ ds.with = "lines"
73
+ ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
79
+
80
+ ## CREATE the HTML to load the plot:
81
+ basename_filename_noext = File.basename(filename_noext)
82
+ output = "<div id=\"plot\"><table class=\"image\" align=\"center\">\n"
83
+ #output << "<caption align=\"bottom\">Additional views of this data may be obtained by using the <span class=\"code\">plot.rb</span> command on '#{to_plot}' (type <span class=\"code\">plot.rb</span> for more details). Plot generated with command: &nbsp;&nbsp; <span class=\"code\">#{plot_cmd}</span></caption>\n"
84
+ output << "<tr><td><img src=\"#{basename_filename_noext}.png\" title=\"File #{basename_filename_noext} must be in the same directory as this html.\"/></td></tr>\n"
85
+ output << "</table></div>\n"
86
+ output
87
+ end # plot_figure
88
+
89
+ end
90
+
91
+ module SpecID::Precision::HTML
92
+
93
+ # html and body tags
94
+ def html
95
+ "|<html>
96
+ |#{yield}
97
+ |</html>\n".margin
98
+ end
99
+
100
+ def body
101
+ "|<body>
102
+ | #{yield}
103
+ |</body>\n".margin
104
+ end
105
+
106
+ def header
107
+ "|<head>
108
+ | #{style}
109
+ |</head>\n".margin
110
+ end
111
+
112
+ def td
113
+ "<td>#{yield}</td>"
114
+ end
115
+
116
+
117
+ def style
118
+ '
119
+ <style type="text/css">
120
+ div#tp_table {
121
+ text-align: center;
122
+ margin-top: 50px;
123
+ margin-bottom: 50px;
124
+ }
125
+ span.code {
126
+ font-family: Courier,Monospace;
127
+ font-size: 80%;
128
+ }
129
+ table {
130
+ border-width:1px;
131
+ border-color:#CCCCCC;
132
+ border-collapse: collapse;
133
+ }
134
+ caption {
135
+ font-size: 90%;
136
+ }
137
+ td,th {
138
+ padding-top: 2px;
139
+ padding-bottom: 2px;
140
+ padding-left: 1;
141
+ padding-right: 1;
142
+ }
143
+ th.small {
144
+ font-size: 80%;
145
+ font-weight: normal;
146
+ padding: 1px;
147
+ }
148
+ td.redline {
149
+ background-color: #FF0000;
150
+ color: #FFFFFF
151
+ }
152
+ div#plot {
153
+ margin: 30px;
154
+ text-align:center
155
+ }
156
+ hr {color: sienna}
157
+ body { font-size: 8pt; font-family: Arial,Helvetica,Times}
158
+ </style>
159
+ '
160
+
161
+ end
162
+
163
+ def table
164
+ "|<table border=\"1\" align=\"center\" style=\"font-size:100%\">
165
+ | #{yield}
166
+ |</table>\n".margin
167
+ end
168
+
169
+ def tr
170
+ "|<tr>
171
+ | #{yield}
172
+ |</tr>\n".margin
173
+ end
174
+ end # module HTML
175
+
176
+ class SpecID::Precision
177
+ include SpecID::Precision::PlotHelper
178
+
179
+ ###########################################################
180
+ # GLOBAL SETTINGS:
181
+ DEF_PREFIX = "INV_"
182
+ DATA_PREC = 4 # decimal places of precision for ppv data
183
+ STDOUT_JTPLOT_BASE = "ppv" # if there is no outfile
184
+ ###########################################################
185
+
186
+ include SpecID::Precision::HTML
187
+
188
+ ## returns an html string
189
+ def precision(argv)
190
+ opt = parse_args(argv)
191
+ files = argv.to_a
192
+ out_string = prefix_as_decoy(files, opt)
193
+ [out_string, opt]
194
+ end
195
+
196
+ def run_cmd_line(argv)
197
+ output_string, opt, file_as_decoy = precision(argv)
198
+ if file_as_decoy
199
+ puts output_string
200
+ else
201
+ ## open file and write to it..
202
+ if opt.o == 'STDOUT'
203
+ print output_string
204
+ else
205
+ File.open(opt.o,'w') do |fh| fh.print output_string end
206
+ end
207
+ end
208
+ end
209
+
210
+ # returns the outfile with no extension
211
+ def outfile_noext(opt)
212
+ if opt == 'STDOUT'
213
+ "#{STDOUT_JTPLOT_BASE}"
214
+ else
215
+ opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '')
216
+ end
217
+ end
218
+
219
+ def file_noext(file)
220
+ file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
221
+ end
222
+
223
+ def parse_args(argv)
224
+
225
+ opt = OpenStruct.new
226
+ opt.o = 'STDOUT'
227
+ opts = OptionParser.new do |op|
228
+ op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
229
+ op.separator ""
230
+ op.separator "Abbreviations and Definitions:"
231
+ op.separator " TP = True Positives"
232
+ op.separator " FP = False Positives"
233
+ op.separator " Precision = Positive Predictive Value = [TP/(TP+FP)]"
234
+ op.separator ""
235
+ op.separator "Output: "
236
+ op.separator " 1. Decoy as separate search: PPV to STDOUT"
237
+ op.separator " 2. Decoy proteins from concatenated database: '.html'"
238
+ op.separator ""
239
+ op.separator "Options:"
240
+
241
+ op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
242
+ op.separator ""
243
+ op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
244
+ op.separator " If files have different prefixes, separate with commas."
245
+ op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
246
+ op.separator ""
247
+ ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
248
+ op.separator ""
249
+ op.on("-o", "--outfile <file>", "write output to file (def: #{opt.o})") {|v| opt.o = v}
250
+ op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
251
+ op.on("-j", "--plot_file", "output to_plot file") {|v| opt.j = v}
252
+ op.on_tail("
253
+ Example:
254
+ For a search on a concatenated database where the decoy proteins have
255
+ been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
256
+ output:
257
+
258
+ #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml
259
+
260
+ ")
261
+ end
262
+ opts.parse!(argv)
263
+
264
+ if argv.size < 1
265
+ puts opts
266
+ exit
267
+ end
268
+
269
+ opt
270
+ end
271
+
272
+
273
+ # takes a comma separated list and extends the last to create an array of
274
+ # desired size
275
+ def prefixes(arg, desired_size)
276
+ arg_arr = arg.split(',')
277
+ new_arr = []
278
+ last_arg = arg_arr[0]
279
+ desired_size.times do |i|
280
+ if arg_arr[i]
281
+ new_arr[i] = arg_arr[i]
282
+ last_arg = new_arr[i]
283
+ else
284
+ new_arr[i] = last_arg
285
+ end
286
+ end
287
+ new_arr
288
+ end
289
+
290
+
291
+ ## collapses arrays to one level deep so we can sync them up
292
+ def arrays_to_one_level_deep(all_arrs)
293
+ mostly_flat = []
294
+ all_arrs.each do |per_file|
295
+ per_file.each do |per_style|
296
+ mostly_flat << per_style[0]
297
+ mostly_flat << per_style[1]
298
+ end
299
+ end
300
+ mostly_flat
301
+ end
302
+
303
+ # prints rows and th for the data
304
+ def table_cells(all_arrs, key)
305
+ ## columns specific headings:
306
+ all_string = ""
307
+ all_string << tr do
308
+ line = ""
309
+ key.each do |per_file|
310
+ per_file.each do |per_ds|
311
+ line << "<th class=\"small\">#{per_ds[1][0]}</th><th class=\"small\">#{per_ds[1][1]}</th>"
312
+ end
313
+ end
314
+ line
315
+ end
316
+ mostly_flat = arrays_to_one_level_deep(all_arrs)
317
+ SyncEnumerator.new(*mostly_flat).each do |row|
318
+ all_string << tr do
319
+ string = row.map {|it|
320
+ sty="%d"
321
+ if it.class == Float ; sty="%.#{DATA_PREC}f" end
322
+ td{ sprintf(sty,it)}
323
+ }.join
324
+ end
325
+ end
326
+ all_string
327
+ end
328
+
329
+ def html_table_output(all_arrs, key, files, filename_noext)
330
+ num_datasets_per_file = all_arrs.first.size
331
+ num_cols_per_dataset = 2
332
+ big_colspan = num_datasets_per_file * num_cols_per_dataset
333
+ output = table do
334
+ tr do
335
+ files.map do |file|
336
+ "<th colspan=\"#{big_colspan}\">#{file}</th>"
337
+ end.join
338
+ end +
339
+ tr do
340
+ key.map do |arr|
341
+ arr.map do |ds|
342
+ "<th colspan=\"2\">#{ds.first}</th>"
343
+ end
344
+ end
345
+ end +
346
+ table_cells(all_arrs, key)
347
+ end
348
+ "<div id=\"tp_table\">" + output + "</div>"
349
+ end
350
+
351
+
352
+ def y_axis_label(key)
353
+ ## We only take the keys for the first file, as it's assumed that the major
354
+ ## labels will be identical for all of them
355
+ labels = key.first.map {|tp| tp.first }
356
+ labels.join " | "
357
+ end
358
+
359
+ # escapes any ' chars
360
+ def escape_to_gnuplot(string)
361
+ # long way, but it works.
362
+ new_string = ""
363
+ string.split(//).each do |chr|
364
+ if chr == "'" ; new_string << "\\" end
365
+ new_string << chr
366
+ end
367
+ new_string
368
+ end
369
+
370
+ def prefix_as_decoy(files, opt)
371
+ $stderr.puts "using prefix #{opt.f} ..."
372
+
373
+ if opt.f
374
+ prefix_arr = prefixes(opt.f, files.size)
375
+ end
376
+ all_arrs = []
377
+ key = []
378
+ out_noext = outfile_noext(opt.o)
379
+ files.each_with_index do |file,i|
380
+ all_arrs[i] = []
381
+ key[i] = []
382
+ sp = SpecID.new(file)
383
+ #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
384
+ if opt.f
385
+ (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
386
+ all_arrs[i] << [num_hits,ppv]
387
+ key[i] << ["Precision", ["#TP", "Prec = TP/(TP+FP)"]]
388
+ else
389
+ ## These are just from protein prophet probabilities:
390
+ (num_hits, ppv) = sp.num_hits_and_ppv_for_protein_prophet_probabilities
391
+ all_arrs[i] << [num_hits,ppv]
392
+ key[i] << ["Precision", ["#TP", "Prec = TP/(TP+FP)"]]
393
+ end
394
+ end
395
+
396
+ string = ''
397
+ if opt.a
398
+ roc = ROC.new
399
+ #string << "***********************************************************\n"
400
+ #string << "AREA UNDER CURVE:\n"
401
+ key.each_with_index do |file,i|
402
+ string << "#{files[i]} (area under curve)\n"
403
+ key[i].each_index do |j|
404
+ string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
405
+ num_hits = all_arrs[i][j][0]
406
+ oth = all_arrs[i][j][1]
407
+ string << roc.area_under_curve(num_hits, oth).to_s << "\n"
408
+ end
409
+ end
410
+ #string << "***********************************************************\n"
411
+ else
412
+ if opt.j
413
+ create_to_plot_file(all_arrs, key, files, out_noext)
414
+ end
415
+ string = html do
416
+ header +
417
+ body do
418
+ plot_figure(all_arrs, key, files, out_noext) +
419
+ html_table_output(all_arrs, key, files, out_noext)
420
+ end
421
+ end
422
+ end
423
+ string
424
+ end
425
+
426
+ end # class SpecID
427
+
data/lib/spec_id/proph.rb CHANGED
@@ -14,7 +14,7 @@ class Proph
14
14
 
15
15
  class Parser
16
16
  def root_el(file)
17
- XMLTree.parse_file(file)
17
+ AXML.parse_file(file)
18
18
  end
19
19
  end
20
20
 
@@ -275,7 +275,7 @@ class Pep::Parser < Parser
275
275
 
276
276
  ## file from peptideAtlas:
277
277
  search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
278
- search_result_regex2 = /<search_result spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
278
+ search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
279
279
  search_hit_regex = /<search_hit .*peptide="(\w+)" /o
280
280
 
281
281
  peptide_h = {}