mspire 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -140,7 +140,7 @@ tm = Time.now
140
140
  spec = Gem::Specification.new do |s|
141
141
  s.platform = Gem::Platform::RUBY
142
142
  s.name = NAME
143
- s.version = "0.2.1"
143
+ s.version = "0.2.2"
144
144
  s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
145
145
  s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
146
146
  s.email = "jprince@icmb.utexas.edu"
data/changelog.txt CHANGED
@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
40
40
  Can read .srf files (nearly interchangeable with bioworks files)
41
41
  Redid filter.rb
42
42
 
43
+ ## version 0.2.1
44
+
45
+ minor bugfix
46
+
47
+ ## version 0.2.2
48
+
49
+ made compatible with Bioworks fasta file reverser and updated tutorial.
50
+ Killed classify_by_prefix routine in favor of classify_by_false_flag which has
51
+ a prefix option
data/lib/spec_id.rb CHANGED
@@ -223,13 +223,7 @@ module SpecID
223
223
  pps
224
224
  end
225
225
 
226
- # returns [tp, fp] based on the protein prefix for items where items =
227
- # (:prot|:peps)
228
- # this may result in a duplication of some peptides if they match both
229
- # normal and decoy proteins. In this case, the protein arrays are split,
230
- # too, so that each points only to its breed of protein.
231
- def classify_by_prefix(items, prefix, fp_on_match=true)
232
- regex = /^#{Regexp.escape(prefix)}/
226
+ def classify_by_regex(items, regex, fp_on_match=true)
233
227
  case items
234
228
  when :prots
235
229
  myproc = proc { |prt|
@@ -264,15 +258,21 @@ module SpecID
264
258
  else
265
259
  abort "don't recognize "
266
260
  end
267
- end
261
+ end
268
262
 
269
- ###### ThIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETEED...
270
- # # Returns tp, fp where each is an array of proteins where fp is determined
271
- # # by a protein's reference matching the prefix. fp is a protein matching!
272
- # def classify_prots_by_prefix(prefix)
273
- # regex = /^#{Regexp.escape(prefix)}/
274
- # classify(:prots, proc {|prot| prot.reference })
275
- # end
263
+ # returns [tp, fp] based on the protein false flag (matched as a prefix only
264
+ # when prefix is true) for items where items = (:prots|:peps)
265
+ # this may result in a duplication of some peptides if they match both
266
+ # normal and decoy proteins. In this case, the protein arrays are split,
267
+ # too, so that each points only to its breed of protein.
268
+ def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
269
+ if prefix
270
+ regex = /^#{Regexp.escape(flag)}/
271
+ else
272
+ regex = /#{Regexp.escape(flag)}/
273
+ end
274
+ classify_by_regex(items, regex, fp_on_match)
275
+ end
276
276
 
277
277
  # Returns (match, nomatch)
278
278
  # items = symbol (:prots, :peps)
@@ -354,10 +354,14 @@ end
354
354
  end
355
355
 
356
356
  # convenience method for the common task of determining precision for
357
- # proteins (with decoy proteins found by prefix)
357
+ # proteins (with decoy proteins found by false_flag)
358
358
  # returns (num_hits, precision)
359
- def num_hits_and_ppv_for_prob(fp_prefix)
360
- regex = /^#{Regexp.escape(fp_prefix)}/
359
+ def num_hits_and_ppv_for_prob(false_flag, prefix=false)
360
+ if prefix
361
+ regex = /^#{Regexp.escape(false_flag)}/
362
+ else
363
+ regex = /#{Regexp.escape(false_flag)}/
364
+ end
361
365
  prob_proc = probability_proc
362
366
  myproc = proc { |prt|
363
367
  if prt.reference =~ regex ; false
@@ -200,7 +200,7 @@ class SpecID::Filter
200
200
  new_spec_ids << spec_id
201
201
  file_to_prefiltered_spec_id(prefix_or_file, opt)
202
202
  else
203
- (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
203
+ (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
204
204
  fps_specid = spec_id.class.new
205
205
  tps_specid = spec_id.class.new
206
206
 
@@ -339,8 +339,10 @@ class SpecID::Filter
339
339
  op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
340
340
  op.separator " if bioworks.xml, = 10^6deltamass/mass"
341
341
  op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
342
- op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
343
- op.separator(" last given will apply to remaining files")
342
+ op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
343
+ op.separator(" e.g., for Bioworks: 'REVERSE'")
344
+ op.separator(" (last given will apply to remaining files)")
345
+ op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
344
346
  op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
345
347
  v[0] = get_cys_freq(v[0])
346
348
  opt.cys = v
@@ -354,7 +356,8 @@ class SpecID::Filter
354
356
  op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
355
357
  op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
356
358
  op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
357
- op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
359
+ ## NEED TO IMPLEMENT THIS:
360
+ #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
358
361
  op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
359
362
  end
360
363
 
@@ -245,11 +245,12 @@ class Prec
245
245
  op.separator ""
246
246
  op.separator "Options:"
247
247
 
248
- op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
248
+ op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
249
249
  op.separator ""
250
- op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
251
- op.separator " If files have different prefixes, separate with commas."
250
+ op.separator " If searched with a concatenated DB, give a false flag to decoy proteins."
251
+ op.separator " If files have different flags, separate with commas."
252
252
  op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
253
+ op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
253
254
  op.separator ""
254
255
  ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
255
256
  op.separator ""
@@ -374,7 +375,7 @@ Example:
374
375
  sp = SpecID.new(file)
375
376
  #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
376
377
  if opt.f
377
- (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
378
+ (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
378
379
  all_arrs[i] << [num_hits,ppv]
379
380
  key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
380
381
  end
@@ -145,9 +145,13 @@ class ProteinSummary
145
145
  end
146
146
  end
147
147
 
148
- def prefix_to_regex(prefix)
149
- if prefix
150
- /^#{Regexp.escape(prefix)}/
148
+ def flag_to_regex(flag, prefix=false)
149
+ if flag
150
+ if prefix
151
+ /^#{Regexp.escape(flag)}/
152
+ else
153
+ /#{Regexp.escape(flag)}/
154
+ end
151
155
  else
152
156
  nil
153
157
  end
@@ -164,12 +168,12 @@ class ProteinSummary
164
168
  end
165
169
 
166
170
  # filters on the false positive regex and sorts by prot probability
167
- def filter_and_sort(uniq_prots, prefix=nil)
168
- prefix_re = prefix_to_regex(prefix)
171
+ def filter_and_sort(uniq_prots, flag=nil, prefix=false)
172
+ false_flag_re = flag_to_regex(flag, prefix)
169
173
  sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
170
174
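  ## filter on the false flag regex -- NOTE(review): the guard below tests `prefix`, so a flag given without prefix is not filtered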
171
175
  if prefix
172
- sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
176
+ sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
173
177
  end
174
178
  sorted
175
179
  end
@@ -292,7 +296,7 @@ class ProteinSummary
292
296
  end
293
297
  end
294
298
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
295
- filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
299
+ filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
296
300
 
297
301
  ## num proteins above cutoff (if opt.c)
298
302
  num_prots_html = ''
@@ -322,7 +326,7 @@ class ProteinSummary
322
326
  # takes spec_id object
323
327
  # the outfn is the output filename
324
328
  # false_flag_re is a regex; proteins whose reference matches it are skipped (nil = skip none)
325
- def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
329
+ def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
326
330
  fppr_output_as_html ||= ''
327
331
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
328
332
  num_cols = header_anchors.size
@@ -330,9 +334,8 @@ class ProteinSummary
330
334
  proteins = spec_id.prots
331
335
  protein_num = 0
332
336
  rows = ""
333
- prefix_re = prefix_to_regex(false_prefix)
334
337
  proteins.each do |prot|
335
- if false_prefix && prot.reference =~ prefix_re
338
+ if false_flag_re && prot.reference =~ false_flag_re
336
339
  next
337
340
  end
338
341
  uniq_peps = Hash.new {|h,k| h[k] = true; }
@@ -393,7 +396,8 @@ class ProteinSummary
393
396
  op.separator " where file = bioworks -or- <run>-prot (prophet output)"
394
397
  op.separator " outputs: <file>.summary.html"
395
398
  op.separator ""
396
- op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
399
+ op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
400
+ op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
397
401
  op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
398
402
  op.separator(" if --precision then -f is used to specify a file or prefix")
399
403
  op.separator(" that indicates the false positives.")
@@ -434,7 +438,9 @@ class ProteinSummary
434
438
  proph_output(file, outfn, opt, fppr_output_as_html)
435
439
  when "bioworks"
436
440
  spec_id = SpecID.new(file)
437
- bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
441
+
442
+ false_regex = flag_to_regex(opt.f, opt.prefix)
443
+ bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
438
444
  else
439
445
  abort "filetype for #{file} not recognized!"
440
446
  end
@@ -445,6 +451,7 @@ class ProteinSummary
445
451
  def create_precision_argv(file, opt)
446
452
  # include only those options specific
447
453
  new_argv = [file]
454
+ if opt.prefix ; new_argv << '--prefix' end
448
455
  if opt.f ; new_argv << '-f' << opt.f end
449
456
  if opt.o ; new_argv << '-o' << opt.o end
450
457
  new_argv
data/test/tc_precision.rb CHANGED
@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
22
22
  end
23
23
 
24
24
  def test_basic_cat
25
- output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
25
+ output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
26
26
  puts output
27
27
 
28
28
  assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
34
34
  end
35
35
 
36
36
  def test_multiple_files
37
- output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
37
+ output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
38
38
  assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
39
39
  assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
40
40
  [@tf_html, @tf_png].each do |file|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
45
45
 
46
46
  def test_area_under_curve
47
47
  file = @tfiles + 'ppv_area.txt'
48
- `#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
48
+ `#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
49
49
  assert(File.exist?(file), "file #{file} exists")
50
50
  output = IO.read(file)
51
51
  assert_match(/Prec.*7.39206/, output, "consistency check")
52
52
  File.unlink file
53
53
 
54
54
  outfile = File.join(File.dirname(__FILE__), 'other.html')
55
- `#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
55
+ `#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
56
56
  File.unlink outfile
57
57
  File.unlink File.join(File.dirname(__FILE__),'other.png')
58
58
  end
@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
75
75
 
76
76
  def test_proph_with_precision
77
77
  #puts @cmd
78
- runit "#{@tf_proph_cat_inv} -f INV_ --precision"
78
+ runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
79
79
  html = IO.read(@tf_proph_cat_inv_summary_html)
80
80
  assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
81
81
  assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")
data/test/tc_spec_id.rb CHANGED
@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
20
20
  assert_equal(106, sp.prots.size)
21
21
  end
22
22
 
23
- def test_classify_by_prefix
23
+ def test_classify_by_false_flag
24
24
  file = @tfiles + "bioworks_with_INV_small.xml"
25
25
  sp = SpecID.new(file)
26
26
  assert_equal(19, sp.prots.size)
27
- (tp, fp) = sp.classify_by_prefix(:prots, "INV_")
27
+ (tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)
28
28
  assert_equal(4, fp.size, "num false pos")
29
29
  assert_equal(15, tp.size, "num true pos")
30
30
  end
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
58
58
  (tps, ys) = roc.tps_and_ppv(tp, fp)
59
59
  assert_equal(exp_tp, tps)
60
60
  assert_equal(exp_fp, ys)
61
- (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
61
+ (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
62
62
  # @TODO: assert these guys for consistencies sake:
63
63
  assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
64
64
  # Consistency check only:
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: mspire
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.1
7
- date: 2007-04-30 00:00:00 -05:00
6
+ version: 0.2.2
7
+ date: 2007-05-08 00:00:00 -05:00
8
8
  summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
9
9
  require_paths:
10
10
  - lib