mspire 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/changelog.txt +9 -0
- data/lib/spec_id.rb +22 -18
- data/lib/spec_id/filter.rb +7 -4
- data/lib/spec_id/precision.rb +5 -4
- data/lib/spec_id/protein_summary.rb +19 -12
- data/test/tc_precision.rb +4 -4
- data/test/tc_protein_summary.rb +1 -1
- data/test/tc_spec_id.rb +3 -3
- metadata +2 -2
data/Rakefile
CHANGED
@@ -140,7 +140,7 @@ tm = Time.now
|
|
140
140
|
spec = Gem::Specification.new do |s|
|
141
141
|
s.platform = Gem::Platform::RUBY
|
142
142
|
s.name = NAME
|
143
|
-
s.version = "0.2.1"
|
143
|
+
s.version = "0.2.2"
|
144
144
|
s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
|
145
145
|
s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
|
146
146
|
s.email = "jprince@icmb.utexas.edu"
|
data/changelog.txt
CHANGED
@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
|
|
40
40
|
Can read .srf files (nearly interchangeable with bioworks files)
|
41
41
|
Redid filter.rb
|
42
42
|
|
43
|
+
## version 0.2.1
|
44
|
+
|
45
|
+
minor bugfix
|
46
|
+
|
47
|
+
## version 0.2.2
|
48
|
+
|
49
|
+
made compatible with Bioworks fasta file reverser and updated tutorial.
|
50
|
+
Killed classify_by_prefix routine in favor of classify_by_false_flag which has
|
51
|
+
a prefix option
|
data/lib/spec_id.rb
CHANGED
@@ -223,13 +223,7 @@ module SpecID
|
|
223
223
|
pps
|
224
224
|
end
|
225
225
|
|
226
|
-
|
227
|
-
# (:prot|:peps)
|
228
|
-
# this may result in a duplication of some peptides if they match both
|
229
|
-
# normal and decoy proteins. In this case, the protein arrays are split,
|
230
|
-
# too, so that each points only to its breed of protein.
|
231
|
-
def classify_by_prefix(items, prefix, fp_on_match=true)
|
232
|
-
regex = /^#{Regexp.escape(prefix)}/
|
226
|
+
def classify_by_regex(items, regex, fp_on_match=true)
|
233
227
|
case items
|
234
228
|
when :prots
|
235
229
|
myproc = proc { |prt|
|
@@ -264,15 +258,21 @@ module SpecID
|
|
264
258
|
else
|
265
259
|
abort "don't recognize "
|
266
260
|
end
|
267
|
-
end
|
261
|
+
end
|
268
262
|
|
269
|
-
|
270
|
-
#
|
271
|
-
#
|
272
|
-
#
|
273
|
-
#
|
274
|
-
|
275
|
-
|
263
|
+
# returns [tp, fp] based on the protein prefix for items where items =
|
264
|
+
# (:prot|:peps)
|
265
|
+
# this may result in a duplication of some peptides if they match both
|
266
|
+
# normal and decoy proteins. In this case, the protein arrays are split,
|
267
|
+
# too, so that each points only to its breed of protein.
|
268
|
+
def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
|
269
|
+
if prefix
|
270
|
+
regex = /^#{Regexp.escape(flag)}/
|
271
|
+
else
|
272
|
+
regex = /#{Regexp.escape(flag)}/
|
273
|
+
end
|
274
|
+
classify_by_regex(items, regex, fp_on_match)
|
275
|
+
end
|
276
276
|
|
277
277
|
# Returns (match, nomatch)
|
278
278
|
# items = symbol (:prots, :peps)
|
@@ -354,10 +354,14 @@ end
|
|
354
354
|
end
|
355
355
|
|
356
356
|
# convenience method for the common task of determining precision for
|
357
|
-
# proteins (with decoy proteins found by
|
357
|
+
# proteins (with decoy proteins found by false_flag)
|
358
358
|
# returns (num_hits, precision)
|
359
|
-
def num_hits_and_ppv_for_prob(
|
360
|
-
|
359
|
+
def num_hits_and_ppv_for_prob(false_flag, prefix=false)
|
360
|
+
if prefix
|
361
|
+
regex = /^#{Regexp.escape(false_flag)}/
|
362
|
+
else
|
363
|
+
regex = /#{Regexp.escape(false_flag)}/
|
364
|
+
end
|
361
365
|
prob_proc = probability_proc
|
362
366
|
myproc = proc { |prt|
|
363
367
|
if prt.reference =~ regex ; false
|
data/lib/spec_id/filter.rb
CHANGED
@@ -200,7 +200,7 @@ class SpecID::Filter
|
|
200
200
|
new_spec_ids << spec_id
|
201
201
|
file_to_prefiltered_spec_id(prefix_or_file, opt)
|
202
202
|
else
|
203
|
-
(tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
|
203
|
+
(tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
|
204
204
|
fps_specid = spec_id.class.new
|
205
205
|
tps_specid = spec_id.class.new
|
206
206
|
|
@@ -339,8 +339,10 @@ class SpecID::Filter
|
|
339
339
|
op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
|
340
340
|
op.separator " if bioworks.xml, = 10^6deltamass/mass"
|
341
341
|
op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
|
342
|
-
op.on("-f", "--false a,b,c", Array, "
|
343
|
-
op.separator("
|
342
|
+
op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
|
343
|
+
op.separator(" e.g., for Bioworks: 'REVERSE'")
|
344
|
+
op.separator(" (last given will apply to remaining files)")
|
345
|
+
op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
|
344
346
|
op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
|
345
347
|
v[0] = get_cys_freq(v[0])
|
346
348
|
opt.cys = v
|
@@ -354,7 +356,8 @@ class SpecID::Filter
|
|
354
356
|
op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
|
355
357
|
op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
|
356
358
|
op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
|
357
|
-
|
359
|
+
## NEED TO IMPLEMENT THIS:
|
360
|
+
#op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
|
358
361
|
op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
|
359
362
|
end
|
360
363
|
|
data/lib/spec_id/precision.rb
CHANGED
@@ -245,11 +245,12 @@ class Prec
|
|
245
245
|
op.separator ""
|
246
246
|
op.separator "Options:"
|
247
247
|
|
248
|
-
op.on("-f", "--fp_data <prefix_or_file>", "
|
248
|
+
op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
|
249
249
|
op.separator ""
|
250
|
-
op.separator " If searched with a concatenated DB, give a
|
251
|
-
op.separator " If files have different
|
250
|
+
op.separator " If searched with a concatenated DB, give a false flag to decoy proteins."
|
251
|
+
op.separator " If files have different flags, separate with commas."
|
252
252
|
op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
|
253
|
+
op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
|
253
254
|
op.separator ""
|
254
255
|
## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
|
255
256
|
op.separator ""
|
@@ -374,7 +375,7 @@ Example:
|
|
374
375
|
sp = SpecID.new(file)
|
375
376
|
#headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
|
376
377
|
if opt.f
|
377
|
-
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
|
378
|
+
(num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
|
378
379
|
all_arrs[i] << [num_hits,ppv]
|
379
380
|
key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
|
380
381
|
end
|
@@ -145,9 +145,13 @@ class ProteinSummary
|
|
145
145
|
end
|
146
146
|
end
|
147
147
|
|
148
|
-
def
|
149
|
-
if
|
150
|
-
|
148
|
+
def flag_to_regex(flag, prefix=false)
|
149
|
+
if flag
|
150
|
+
if prefix
|
151
|
+
/^#{Regexp.escape(flag)}/
|
152
|
+
else
|
153
|
+
/#{Regexp.escape(flag)}/
|
154
|
+
end
|
151
155
|
else
|
152
156
|
nil
|
153
157
|
end
|
@@ -164,12 +168,12 @@ class ProteinSummary
|
|
164
168
|
end
|
165
169
|
|
166
170
|
# filters on the false positive regex and sorts by prot probability
|
167
|
-
def filter_and_sort(uniq_prots, prefix=
|
168
|
-
|
171
|
+
def filter_and_sort(uniq_prots, flag=nil, prefix=false)
|
172
|
+
false_flag_re = flag_to_regex(flag, prefix)
|
169
173
|
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
|
170
174
|
## filter on prefix
|
171
175
|
if prefix
|
172
|
-
sorted = sorted.reject {|prot| prot._protein_name =~
|
176
|
+
sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
|
173
177
|
end
|
174
178
|
sorted
|
175
179
|
end
|
@@ -292,7 +296,7 @@ class ProteinSummary
|
|
292
296
|
end
|
293
297
|
end
|
294
298
|
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
295
|
-
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
299
|
+
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
|
296
300
|
|
297
301
|
## num proteins above cutoff (if opt.c)
|
298
302
|
num_prots_html = ''
|
@@ -322,7 +326,7 @@ class ProteinSummary
|
|
322
326
|
# takes spec_id object
|
323
327
|
# the outfn is the output filename
|
324
328
|
# opt is an OpenStruct that holds opt.f = the false prefix
|
325
|
-
def bioworks_output(spec_id, outfn, file=nil,
|
329
|
+
def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
|
326
330
|
fppr_output_as_html ||= ''
|
327
331
|
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
328
332
|
num_cols = header_anchors.size
|
@@ -330,9 +334,8 @@ class ProteinSummary
|
|
330
334
|
proteins = spec_id.prots
|
331
335
|
protein_num = 0
|
332
336
|
rows = ""
|
333
|
-
prefix_re = prefix_to_regex(false_prefix)
|
334
337
|
proteins.each do |prot|
|
335
|
-
if
|
338
|
+
if false_flag_re && prot.reference =~ false_flag_re
|
336
339
|
next
|
337
340
|
end
|
338
341
|
uniq_peps = Hash.new {|h,k| h[k] = true; }
|
@@ -393,7 +396,8 @@ class ProteinSummary
|
|
393
396
|
op.separator " where file = bioworks -or- <run>-prot (prophet output)"
|
394
397
|
op.separator " outputs: <file>.summary.html"
|
395
398
|
op.separator ""
|
396
|
-
op.on("-f", "--false <prefix>", "ignore proteins with
|
399
|
+
op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
400
|
+
op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
|
397
401
|
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
398
402
|
op.separator(" if --precision then -f is used to specify a file or prefix")
|
399
403
|
op.separator(" that indicates the false positives.")
|
@@ -434,7 +438,9 @@ class ProteinSummary
|
|
434
438
|
proph_output(file, outfn, opt, fppr_output_as_html)
|
435
439
|
when "bioworks"
|
436
440
|
spec_id = SpecID.new(file)
|
437
|
-
|
441
|
+
|
442
|
+
false_regex = flag_to_regex(opt.f, opt.prefix)
|
443
|
+
bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
|
438
444
|
else
|
439
445
|
abort "filetype for #{file} not recognized!"
|
440
446
|
end
|
@@ -445,6 +451,7 @@ class ProteinSummary
|
|
445
451
|
def create_precision_argv(file, opt)
|
446
452
|
# include only those options specific
|
447
453
|
new_argv = [file]
|
454
|
+
if opt.prefix ; new_argv << '--prefix' end
|
448
455
|
if opt.f ; new_argv << '-f' << opt.f end
|
449
456
|
if opt.o ; new_argv << '-o' << opt.o end
|
450
457
|
new_argv
|
data/test/tc_precision.rb
CHANGED
@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def test_basic_cat
|
25
|
-
output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
|
25
|
+
output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
26
26
|
puts output
|
27
27
|
|
28
28
|
assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
|
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def test_multiple_files
|
37
|
-
output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
|
37
|
+
output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
|
38
38
|
assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
|
39
39
|
assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
|
40
40
|
[@tf_html, @tf_png].each do |file|
|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
|
|
45
45
|
|
46
46
|
def test_area_under_curve
|
47
47
|
file = @tfiles + 'ppv_area.txt'
|
48
|
-
`#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
|
48
|
+
`#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
49
49
|
assert(File.exist?(file), "file #{file} exists")
|
50
50
|
output = IO.read(file)
|
51
51
|
assert_match(/Prec.*7.39206/, output, "consistency check")
|
52
52
|
File.unlink file
|
53
53
|
|
54
54
|
outfile = File.join(File.dirname(__FILE__), 'other.html')
|
55
|
-
`#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
|
55
|
+
`#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
|
56
56
|
File.unlink outfile
|
57
57
|
File.unlink File.join(File.dirname(__FILE__),'other.png')
|
58
58
|
end
|
data/test/tc_protein_summary.rb
CHANGED
@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
|
|
75
75
|
|
76
76
|
def test_proph_with_precision
|
77
77
|
#puts @cmd
|
78
|
-
runit "#{@tf_proph_cat_inv} -f INV_ --precision"
|
78
|
+
runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
|
79
79
|
html = IO.read(@tf_proph_cat_inv_summary_html)
|
80
80
|
assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
|
81
81
|
assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")
|
data/test/tc_spec_id.rb
CHANGED
@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
|
|
20
20
|
assert_equal(106, sp.prots.size)
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def test_classify_by_false_flag
|
24
24
|
file = @tfiles + "bioworks_with_INV_small.xml"
|
25
25
|
sp = SpecID.new(file)
|
26
26
|
assert_equal(19, sp.prots.size)
|
27
|
-
(tp, fp) = sp.classify_by_prefix(:prots, "INV_")
|
27
|
+
(tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)
|
28
28
|
assert_equal(4, fp.size, "num false pos")
|
29
29
|
assert_equal(15, tp.size, "num true pos")
|
30
30
|
end
|
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
|
|
58
58
|
(tps, ys) = roc.tps_and_ppv(tp, fp)
|
59
59
|
assert_equal(exp_tp, tps)
|
60
60
|
assert_equal(exp_fp, ys)
|
61
|
-
(num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
|
61
|
+
(num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
|
62
62
|
# @TODO: assert these guys for consistencies sake:
|
63
63
|
assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
|
64
64
|
# Consistency check only:
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: mspire
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.1
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.2
|
7
|
+
date: 2007-05-08 00:00:00 -05:00
|
8
8
|
summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
|
9
9
|
require_paths:
|
10
10
|
- lib
|