mspire 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/changelog.txt +9 -0
- data/lib/spec_id.rb +22 -18
- data/lib/spec_id/filter.rb +7 -4
- data/lib/spec_id/precision.rb +5 -4
- data/lib/spec_id/protein_summary.rb +19 -12
- data/test/tc_precision.rb +4 -4
- data/test/tc_protein_summary.rb +1 -1
- data/test/tc_spec_id.rb +3 -3
- metadata +2 -2
data/Rakefile
CHANGED
@@ -140,7 +140,7 @@ tm = Time.now
|
|
140
140
|
spec = Gem::Specification.new do |s|
|
141
141
|
s.platform = Gem::Platform::RUBY
|
142
142
|
s.name = NAME
|
143
|
-
s.version = "0.2.1"
|
143
|
+
s.version = "0.2.2"
|
144
144
|
s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
|
145
145
|
s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
|
146
146
|
s.email = "jprince@icmb.utexas.edu"
|
data/changelog.txt
CHANGED
@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
|
|
40
40
|
Can read .srf files (nearly interchangeable with bioworks files)
|
41
41
|
Redid filter.rb
|
42
42
|
|
43
|
+
## version 0.2.1
|
44
|
+
|
45
|
+
minor bugfix
|
46
|
+
|
47
|
+
## version 0.2.2
|
48
|
+
|
49
|
+
made compatible with Bioworks fasta file reverser and updated tutorial.
|
50
|
+
Killed classify_by_prefix routine in favor of classify_by_false_flag which has
|
51
|
+
a prefix option
|
data/lib/spec_id.rb
CHANGED
@@ -223,13 +223,7 @@ module SpecID
|
|
223
223
|
pps
|
224
224
|
end
|
225
225
|
|
226
|
-
|
227
|
-
# (:prot|:peps)
|
228
|
-
# this may result in a duplication of some peptides if they match both
|
229
|
-
# normal and decoy proteins. In this case, the protein arrays are split,
|
230
|
-
# too, so that each points only to its breed of protein.
|
231
|
-
def classify_by_prefix(items, prefix, fp_on_match=true)
|
232
|
-
regex = /^#{Regexp.escape(prefix)}/
|
226
|
+
def classify_by_regex(items, regex, fp_on_match=true)
|
233
227
|
case items
|
234
228
|
when :prots
|
235
229
|
myproc = proc { |prt|
|
@@ -264,15 +258,21 @@ module SpecID
|
|
264
258
|
else
|
265
259
|
abort "don't recognize "
|
266
260
|
end
|
267
|
-
end
|
261
|
+
end
|
268
262
|
|
269
|
-
|
270
|
-
#
|
271
|
-
#
|
272
|
-
#
|
273
|
-
#
|
274
|
-
|
275
|
-
|
263
|
+
# returns [tp, fp] based on the protein prefix for items where items =
|
264
|
+
# (:prot|:peps)
|
265
|
+
# this may result in a duplication of some peptides if they match both
|
266
|
+
# normal and decoy proteins. In this case, the protein arrays are split,
|
267
|
+
# too, so that each points only to its breed of protein.
|
268
|
+
def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
|
269
|
+
if prefix
|
270
|
+
regex = /^#{Regexp.escape(flag)}/
|
271
|
+
else
|
272
|
+
regex = /#{Regexp.escape(flag)}/
|
273
|
+
end
|
274
|
+
classify_by_regex(items, regex, fp_on_match)
|
275
|
+
end
|
276
276
|
|
277
277
|
# Returns (match, nomatch)
|
278
278
|
# items = symbol (:prots, :peps)
|
@@ -354,10 +354,14 @@ end
|
|
354
354
|
end
|
355
355
|
|
356
356
|
# convenience method for the common task of determining precision for
|
357
|
-
# proteins (with decoy proteins found by
|
357
|
+
# proteins (with decoy proteins found by false_flag)
|
358
358
|
# returns (num_hits, precision)
|
359
|
-
def num_hits_and_ppv_for_prob(
|
360
|
-
|
359
|
+
def num_hits_and_ppv_for_prob(false_flag, prefix=false)
|
360
|
+
if prefix
|
361
|
+
regex = /^#{Regexp.escape(false_flag)}/
|
362
|
+
else
|
363
|
+
regex = /#{Regexp.escape(false_flag)}/
|
364
|
+
end
|
361
365
|
prob_proc = probability_proc
|
362
366
|
myproc = proc { |prt|
|
363
367
|
if prt.reference =~ regex ; false
|
data/lib/spec_id/filter.rb
CHANGED
@@ -200,7 +200,7 @@ class SpecID::Filter
|
|
200
200
|
new_spec_ids << spec_id
|
201
201
|
file_to_prefiltered_spec_id(prefix_or_file, opt)
|
202
202
|
else
|
203
|
-
(tps, fps) = spec_id.
|
203
|
+
(tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
|
204
204
|
fps_specid = spec_id.class.new
|
205
205
|
tps_specid = spec_id.class.new
|
206
206
|
|
@@ -339,8 +339,10 @@ class SpecID::Filter
|
|
339
339
|
op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
|
340
340
|
op.separator " if bioworks.xml, = 10^6deltamass/mass"
|
341
341
|
op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
|
342
|
-
op.on("-f", "--false a,b,c", Array, "
|
343
|
-
op.separator("
|
342
|
+
op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
|
343
|
+
op.separator(" e.g., for Bioworks: 'REVERSE'")
|
344
|
+
op.separator(" (last given will apply to remaining files)")
|
345
|
+
op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
|
344
346
|
op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
|
345
347
|
v[0] = get_cys_freq(v[0])
|
346
348
|
opt.cys = v
|
@@ -354,7 +356,8 @@ class SpecID::Filter
|
|
354
356
|
op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
|
355
357
|
op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
|
356
358
|
op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
|
357
|
-
|
359
|
+
## NEED TO IMPLEMENT THIS:
|
360
|
+
#op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
|
358
361
|
op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
|
359
362
|
end
|
360
363
|
|
data/lib/spec_id/precision.rb
CHANGED
@@ -245,11 +245,12 @@ class Prec
|
|
245
245
|
op.separator ""
|
246
246
|
op.separator "Options:"
|
247
247
|
|
248
|
-
op.on("-f", "--fp_data <prefix_or_file>", "
|
248
|
+
op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
|
249
249
|
op.separator ""
|
250
|
-
op.separator " If searched with a concatenated DB, give a
|
251
|
-
op.separator " If files have different
|
250
|
+
op.separator " If searched with a concatenated DB, give a false flag to decoy proteins."
|
251
|
+
op.separator " If files have different flags, separate with commas."
|
252
252
|
op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
|
253
|
+
op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
|
253
254
|
op.separator ""
|
254
255
|
## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
|
255
256
|
op.separator ""
|
@@ -374,7 +375,7 @@ Example:
|
|
374
375
|
sp = SpecID.new(file)
|
375
376
|
#headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
|
376
377
|
if opt.f
|
377
|
-
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
|
378
|
+
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
|
378
379
|
all_arrs[i] << [num_hits,ppv]
|
379
380
|
key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
|
380
381
|
end
|
data/lib/spec_id/protein_summary.rb
CHANGED
@@ -145,9 +145,13 @@ class ProteinSummary
|
|
145
145
|
end
|
146
146
|
end
|
147
147
|
|
148
|
-
def
|
149
|
-
if
|
150
|
-
|
148
|
+
def flag_to_regex(flag, prefix=false)
|
149
|
+
if flag
|
150
|
+
if prefix
|
151
|
+
/^#{Regexp.escape(flag)}/
|
152
|
+
else
|
153
|
+
/#{Regexp.escape(flag)}/
|
154
|
+
end
|
151
155
|
else
|
152
156
|
nil
|
153
157
|
end
|
@@ -164,12 +168,12 @@ class ProteinSummary
|
|
164
168
|
end
|
165
169
|
|
166
170
|
# filters on the false positive regex and sorts by prot probability
|
167
|
-
def filter_and_sort(uniq_prots, prefix=
|
168
|
-
|
171
|
+
def filter_and_sort(uniq_prots, flag=nil, prefix=false)
|
172
|
+
false_flag_re = flag_to_regex(flag, prefix)
|
169
173
|
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
|
170
174
|
## filter on prefix
|
171
175
|
if prefix
|
172
|
-
sorted = sorted.reject {|prot| prot._protein_name =~
|
176
|
+
sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
|
173
177
|
end
|
174
178
|
sorted
|
175
179
|
end
|
@@ -292,7 +296,7 @@ class ProteinSummary
|
|
292
296
|
end
|
293
297
|
end
|
294
298
|
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
295
|
-
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
299
|
+
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
|
296
300
|
|
297
301
|
## num proteins above cutoff (if opt.c)
|
298
302
|
num_prots_html = ''
|
@@ -322,7 +326,7 @@ class ProteinSummary
|
|
322
326
|
# takes spec_id object
|
323
327
|
# the outfn is the output filename
|
324
328
|
# opt is an OpenStruct that holds opt.f = the false prefix
|
325
|
-
def bioworks_output(spec_id, outfn, file=nil,
|
329
|
+
def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
|
326
330
|
fppr_output_as_html ||= ''
|
327
331
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
328
332
|
num_cols = header_anchors.size
|
@@ -330,9 +334,8 @@ class ProteinSummary
|
|
330
334
|
proteins = spec_id.prots
|
331
335
|
protein_num = 0
|
332
336
|
rows = ""
|
333
|
-
prefix_re = prefix_to_regex(false_prefix)
|
334
337
|
proteins.each do |prot|
|
335
|
-
if
|
338
|
+
if false_flag_re && prot.reference =~ false_flag_re
|
336
339
|
next
|
337
340
|
end
|
338
341
|
uniq_peps = Hash.new {|h,k| h[k] = true; }
|
@@ -393,7 +396,8 @@ class ProteinSummary
|
|
393
396
|
op.separator " where file = bioworks -or- <run>-prot (prophet output)"
|
394
397
|
op.separator " outputs: <file>.summary.html"
|
395
398
|
op.separator ""
|
396
|
-
op.on("-f", "--false <prefix>", "ignore proteins with
|
399
|
+
op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
400
|
+
op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
|
397
401
|
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
398
402
|
op.separator(" if --precision then -f is used to specify a file or prefix")
|
399
403
|
op.separator(" that indicates the false positives.")
|
@@ -434,7 +438,9 @@ class ProteinSummary
|
|
434
438
|
proph_output(file, outfn, opt, fppr_output_as_html)
|
435
439
|
when "bioworks"
|
436
440
|
spec_id = SpecID.new(file)
|
437
|
-
|
441
|
+
|
442
|
+
false_regex = flag_to_regex(opt.f, opt.prefix)
|
443
|
+
bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
|
438
444
|
else
|
439
445
|
abort "filetype for #{file} not recognized!"
|
440
446
|
end
|
@@ -445,6 +451,7 @@ class ProteinSummary
|
|
445
451
|
def create_precision_argv(file, opt)
|
446
452
|
# include only those options specific
|
447
453
|
new_argv = [file]
|
454
|
+
if opt.prefix ; new_argv << '--prefix' end
|
448
455
|
if opt.f ; new_argv << '-f' << opt.f end
|
449
456
|
if opt.o ; new_argv << '-o' << opt.o end
|
450
457
|
new_argv
|
data/test/tc_precision.rb
CHANGED
@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def test_basic_cat
|
25
|
-
output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
|
25
|
+
output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
26
26
|
puts output
|
27
27
|
|
28
28
|
assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
|
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def test_multiple_files
|
37
|
-
output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
|
37
|
+
output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
|
38
38
|
assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
|
39
39
|
assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
|
40
40
|
[@tf_html, @tf_png].each do |file|
|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
|
|
45
45
|
|
46
46
|
def test_area_under_curve
|
47
47
|
file = @tfiles + 'ppv_area.txt'
|
48
|
-
`#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
|
48
|
+
`#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
49
49
|
assert(File.exist?(file), "file #{file} exists")
|
50
50
|
output = IO.read(file)
|
51
51
|
assert_match(/Prec.*7.39206/, output, "consistency check")
|
52
52
|
File.unlink file
|
53
53
|
|
54
54
|
outfile = File.join(File.dirname(__FILE__), 'other.html')
|
55
|
-
`#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
|
55
|
+
`#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
56
56
|
File.unlink outfile
|
57
57
|
File.unlink File.join(File.dirname(__FILE__),'other.png')
|
58
58
|
end
|
data/test/tc_protein_summary.rb
CHANGED
@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
|
|
75
75
|
|
76
76
|
def test_proph_with_precision
|
77
77
|
#puts @cmd
|
78
|
-
runit "#{@tf_proph_cat_inv} -f INV_ --precision"
|
78
|
+
runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
|
79
79
|
html = IO.read(@tf_proph_cat_inv_summary_html)
|
80
80
|
assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
|
81
81
|
assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")
|
data/test/tc_spec_id.rb
CHANGED
@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
|
|
20
20
|
assert_equal(106, sp.prots.size)
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def test_classify_by_false_flag
|
24
24
|
file = @tfiles + "bioworks_with_INV_small.xml"
|
25
25
|
sp = SpecID.new(file)
|
26
26
|
assert_equal(19, sp.prots.size)
|
27
|
-
(tp, fp) = sp.
|
27
|
+
(tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)
|
28
28
|
assert_equal(4, fp.size, "num false pos")
|
29
29
|
assert_equal(15, tp.size, "num true pos")
|
30
30
|
end
|
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
|
|
58
58
|
(tps, ys) = roc.tps_and_ppv(tp, fp)
|
59
59
|
assert_equal(exp_tp, tps)
|
60
60
|
assert_equal(exp_fp, ys)
|
61
|
-
(num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
|
61
|
+
(num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
|
62
62
|
# @TODO: assert these guys for consistency's sake:
|
63
63
|
assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
|
64
64
|
# Consistency check only:
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: mspire
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.1
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.2
|
7
|
+
date: 2007-05-08 00:00:00 -05:00
|
8
8
|
summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
|
9
9
|
require_paths:
|
10
10
|
- lib
|