mspire 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -140,7 +140,7 @@ tm = Time.now
140
140
  spec = Gem::Specification.new do |s|
141
141
  s.platform = Gem::Platform::RUBY
142
142
  s.name = NAME
143
- s.version = "0.2.1"
143
+ s.version = "0.2.2"
144
144
  s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
145
145
  s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
146
146
  s.email = "jprince@icmb.utexas.edu"
data/changelog.txt CHANGED
@@ -40,3 +40,12 @@ Added support for modifications to bioworks_to_pepxml.rb
40
40
  Can read .srf files (nearly interchangeable with bioworks files)
41
41
  Redid filter.rb
42
42
 
43
+ ## version 0.2.1
44
+
45
+ minor bugfix
46
+
47
+ ## version 0.2.2
48
+
49
+ made compatible with Bioworks fasta file reverser and updated tutorial.
50
+ Killed classify_by_prefix routine in favor of classify_by_false_flag which has
51
+ a prefix option
data/lib/spec_id.rb CHANGED
@@ -223,13 +223,7 @@ module SpecID
223
223
  pps
224
224
  end
225
225
 
226
- # returns [tp, fp] based on the protein prefix for items where items =
227
- # (:prot|:peps)
228
- # this may result in a duplication of some peptides if they match both
229
- # normal and decoy proteins. In this case, the protein arrays are split,
230
- # too, so that each points only to its breed of protein.
231
- def classify_by_prefix(items, prefix, fp_on_match=true)
232
- regex = /^#{Regexp.escape(prefix)}/
226
+ def classify_by_regex(items, regex, fp_on_match=true)
233
227
  case items
234
228
  when :prots
235
229
  myproc = proc { |prt|
@@ -264,15 +258,21 @@ module SpecID
264
258
  else
265
259
  abort "don't recognize "
266
260
  end
267
- end
261
+ end
268
262
 
269
- ###### ThIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETEED...
270
- # # Returns tp, fp where each is an array of proteins where fp is determined
271
- # # by a protein's reference matching the prefix. fp is a protein matching!
272
- # def classify_prots_by_prefix(prefix)
273
- # regex = /^#{Regexp.escape(prefix)}/
274
- # classify(:prots, proc {|prot| prot.reference })
275
- # end
263
+ # returns [tp, fp], classified by the false flag (matched only as a prefix
264
+ # when prefix is true), for items where items = (:prots|:peps)
265
+ # this may result in a duplication of some peptides if they match both
266
+ # normal and decoy proteins. In this case, the protein arrays are split,
267
+ # too, so that each points only to its breed of protein.
268
+ def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
269
+ if prefix
270
+ regex = /^#{Regexp.escape(flag)}/
271
+ else
272
+ regex = /#{Regexp.escape(flag)}/
273
+ end
274
+ classify_by_regex(items, regex, fp_on_match)
275
+ end
276
276
 
277
277
  # Returns (match, nomatch)
278
278
  # items = symbol (:prots, :peps)
@@ -354,10 +354,14 @@ end
354
354
  end
355
355
 
356
356
  # convenience method for the common task of determining precision for
357
- # proteins (with decoy proteins found by prefix)
357
+ # proteins (with decoy proteins found by false_flag)
358
358
  # returns (num_hits, precision)
359
- def num_hits_and_ppv_for_prob(fp_prefix)
360
- regex = /^#{Regexp.escape(fp_prefix)}/
359
+ def num_hits_and_ppv_for_prob(false_flag, prefix=false)
360
+ if prefix
361
+ regex = /^#{Regexp.escape(false_flag)}/
362
+ else
363
+ regex = /#{Regexp.escape(false_flag)}/
364
+ end
361
365
  prob_proc = probability_proc
362
366
  myproc = proc { |prt|
363
367
  if prt.reference =~ regex ; false
@@ -200,7 +200,7 @@ class SpecID::Filter
200
200
  new_spec_ids << spec_id
201
201
  file_to_prefiltered_spec_id(prefix_or_file, opt)
202
202
  else
203
- (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
203
+ (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix)
204
204
  fps_specid = spec_id.class.new
205
205
  tps_specid = spec_id.class.new
206
206
 
@@ -339,8 +339,10 @@ class SpecID::Filter
339
339
  op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
340
340
  op.separator " if bioworks.xml, = 10^6deltamass/mass"
341
341
  op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
342
- op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
343
- op.separator(" last given will apply to remaining files")
342
+ op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v}
343
+ op.separator(" e.g., for Bioworks: 'REVERSE'")
344
+ op.separator(" (last given will apply to remaining files)")
345
+ op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v}
344
346
  op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
345
347
  v[0] = get_cys_freq(v[0])
346
348
  opt.cys = v
@@ -354,7 +356,8 @@ class SpecID::Filter
354
356
  op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
355
357
  op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
356
358
  op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
357
- op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
359
+ ## NEED TO IMPLEMENT THIS:
360
+ #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
358
361
  op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
359
362
  end
360
363
 
@@ -245,11 +245,12 @@ class Prec
245
245
  op.separator ""
246
246
  op.separator "Options:"
247
247
 
248
- op.on("-f", "--fp_data <prefix_or_file>", "PREFIX -or- decoy FILE") {|v| opt.f = v }
248
+ op.on("-f", "--fp_data <prefix_or_file>", "flag -or- decoy FILE") {|v| opt.f = v }
249
249
  op.separator ""
250
- op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
251
- op.separator " If files have different prefixes, separate with commas."
250
+ op.separator " If searched with a concatenated DB, give a false flag to decoy proteins."
251
+ op.separator " If files have different flags, separate with commas."
252
252
  op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
253
+ op.on("--prefix", "false flag as prefix only") {|v| opt.prefix = v }
253
254
  op.separator ""
254
255
  ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
255
256
  op.separator ""
@@ -374,7 +375,7 @@ Example:
374
375
  sp = SpecID.new(file)
375
376
  #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
376
377
  if opt.f
377
- (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i])
378
+ (num_hits, ppv) = sp.num_hits_and_ppv_for_prob(prefix_arr[i], opt.prefix)
378
379
  all_arrs[i] << [num_hits,ppv]
379
380
  key[i] << ["Precision", ["# hits", "Prec (decoy)"]]
380
381
  end
@@ -145,9 +145,13 @@ class ProteinSummary
145
145
  end
146
146
  end
147
147
 
148
- def prefix_to_regex(prefix)
149
- if prefix
150
- /^#{Regexp.escape(prefix)}/
148
+ def flag_to_regex(flag, prefix=false)
149
+ if flag
150
+ if prefix
151
+ /^#{Regexp.escape(flag)}/
152
+ else
153
+ /#{Regexp.escape(flag)}/
154
+ end
151
155
  else
152
156
  nil
153
157
  end
@@ -164,12 +168,12 @@ class ProteinSummary
164
168
  end
165
169
 
166
170
  # filters on the false positive regex and sorts by prot probability
167
- def filter_and_sort(uniq_prots, prefix=nil)
168
- prefix_re = prefix_to_regex(prefix)
171
+ def filter_and_sort(uniq_prots, flag=nil, prefix=false)
172
+ false_flag_re = flag_to_regex(flag, prefix)
169
173
  sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
170
174
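  ## reject proteins whose name matches the false flag regex. NOTE(review): the guard below tests 'prefix' (now the boolean option), so a flag given without --prefix is never filtered -- confirm whether it should test false_flag_re instead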
171
175
  if prefix
172
- sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
176
+ sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
173
177
  end
174
178
  sorted
175
179
  end
@@ -292,7 +296,7 @@ class ProteinSummary
292
296
  end
293
297
  end
294
298
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
295
- filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
299
+ filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
296
300
 
297
301
  ## num proteins above cutoff (if opt.c)
298
302
  num_prots_html = ''
@@ -322,7 +326,7 @@ class ProteinSummary
322
326
  # takes spec_id object
323
327
  # the outfn is the output filename
324
328
  # false_flag_re is a regex; proteins whose reference matches it are skipped (nil = skip none)
325
- def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
329
+ def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
326
330
  fppr_output_as_html ||= ''
327
331
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
328
332
  num_cols = header_anchors.size
@@ -330,9 +334,8 @@ class ProteinSummary
330
334
  proteins = spec_id.prots
331
335
  protein_num = 0
332
336
  rows = ""
333
- prefix_re = prefix_to_regex(false_prefix)
334
337
  proteins.each do |prot|
335
- if false_prefix && prot.reference =~ prefix_re
338
+ if false_flag_re && prot.reference =~ false_flag_re
336
339
  next
337
340
  end
338
341
  uniq_peps = Hash.new {|h,k| h[k] = true; }
@@ -393,7 +396,8 @@ class ProteinSummary
393
396
  op.separator " where file = bioworks -or- <run>-prot (prophet output)"
394
397
  op.separator " outputs: <file>.summary.html"
395
398
  op.separator ""
396
- op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
399
+ op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
400
+ op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
397
401
  op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
398
402
  op.separator(" if --precision then -f is used to specify a file or prefix")
399
403
  op.separator(" that indicates the false positives.")
@@ -434,7 +438,9 @@ class ProteinSummary
434
438
  proph_output(file, outfn, opt, fppr_output_as_html)
435
439
  when "bioworks"
436
440
  spec_id = SpecID.new(file)
437
- bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
441
+
442
+ false_regex = flag_to_regex(opt.f, opt.prefix)
443
+ bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
438
444
  else
439
445
  abort "filetype for #{file} not recognized!"
440
446
  end
@@ -445,6 +451,7 @@ class ProteinSummary
445
451
  def create_precision_argv(file, opt)
446
452
  # include only those options specific
447
453
  new_argv = [file]
454
+ if opt.prefix ; new_argv << '--prefix' end
448
455
  if opt.f ; new_argv << '-f' << opt.f end
449
456
  if opt.o ; new_argv << '-o' << opt.o end
450
457
  new_argv
data/test/tc_precision.rb CHANGED
@@ -22,7 +22,7 @@ class PrecTest < Test::Unit::TestCase
22
22
  end
23
23
 
24
24
  def test_basic_cat
25
- output = `#{@cmd} -o #{@tf_html} -f SHUFF_ #{@tf_bioworks_shuff}`
25
+ output = `#{@cmd} -o #{@tf_html} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
26
26
  puts output
27
27
 
28
28
  assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
@@ -34,7 +34,7 @@ class PrecTest < Test::Unit::TestCase
34
34
  end
35
35
 
36
36
  def test_multiple_files
37
- output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
37
+ output = `#{@cmd} -o #{@tf_html} -f SHUFF_,INV_ --prefix #{@tf_bioworks_shuff} #{@tf_bioworks_esmall_xml}`
38
38
  assert_match(/<table.*<\/table>/m, IO.read(@tf_html), "has html table in it")
39
39
  assert_match(/1.*1.0000.*1.*1.0000.*0.*0.*15.*0.8667/m, IO.read(@tf_html), "has values")
40
40
  [@tf_html, @tf_png].each do |file|
@@ -45,14 +45,14 @@ class PrecTest < Test::Unit::TestCase
45
45
 
46
46
  def test_area_under_curve
47
47
  file = @tfiles + 'ppv_area.txt'
48
- `#{@cmd} -o #{file} -a -f SHUFF_ #{@tf_bioworks_shuff}`
48
+ `#{@cmd} -o #{file} -a -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
49
49
  assert(File.exist?(file), "file #{file} exists")
50
50
  output = IO.read(file)
51
51
  assert_match(/Prec.*7.39206/, output, "consistency check")
52
52
  File.unlink file
53
53
 
54
54
  outfile = File.join(File.dirname(__FILE__), 'other.html')
55
- `#{@cmd} -o #{outfile} -f SHUFF_ #{@tf_bioworks_shuff}`
55
+ `#{@cmd} -o #{outfile} -f SHUFF_ --prefix #{@tf_bioworks_shuff}`
56
56
  File.unlink outfile
57
57
  File.unlink File.join(File.dirname(__FILE__),'other.png')
58
58
  end
@@ -75,7 +75,7 @@ class ProphProtSummaryTest < Test::Unit::TestCase
75
75
 
76
76
  def test_proph_with_precision
77
77
  #puts @cmd
78
- runit "#{@tf_proph_cat_inv} -f INV_ --precision"
78
+ runit "#{@tf_proph_cat_inv} -f INV_ --prefix --precision"
79
79
  html = IO.read(@tf_proph_cat_inv_summary_html)
80
80
  assert_match(/# hits/, html, "in #{@tf_proph_cat_inv_summary_html}")
81
81
  assert_match(/2.*0\.0000/m, html, "in #{@tf_proph_cat_inv_summary_html}")
data/test/tc_spec_id.rb CHANGED
@@ -20,11 +20,11 @@ class SpecIDTest < Test::Unit::TestCase
20
20
  assert_equal(106, sp.prots.size)
21
21
  end
22
22
 
23
- def test_classify_by_prefix
23
+ def test_classify_by_false_flag
24
24
  file = @tfiles + "bioworks_with_INV_small.xml"
25
25
  sp = SpecID.new(file)
26
26
  assert_equal(19, sp.prots.size)
27
- (tp, fp) = sp.classify_by_prefix(:prots, "INV_")
27
+ (tp, fp) = sp.classify_by_false_flag(:prots, "INV_", true, true)
28
28
  assert_equal(4, fp.size, "num false pos")
29
29
  assert_equal(15, tp.size, "num true pos")
30
30
  end
@@ -58,7 +58,7 @@ class SpecIDTest < Test::Unit::TestCase
58
58
  (tps, ys) = roc.tps_and_ppv(tp, fp)
59
59
  assert_equal(exp_tp, tps)
60
60
  assert_equal(exp_fp, ys)
61
- (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_")
61
+ (num_hits, prec) = sp.num_hits_and_ppv_for_prob("INV_", true)
62
62
  # @TODO: assert these guys for consistency's sake:
63
63
  assert_in_delta_arrays([1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15], tps, 0.0000001)
64
64
  # Consistency check only:
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: mspire
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.1
7
- date: 2007-04-30 00:00:00 -05:00
6
+ version: 0.2.2
7
+ date: 2007-05-08 00:00:00 -05:00
8
8
  summary: Mass Spectrometry Proteomics Objects, Scripts, and Executables
9
9
  require_paths:
10
10
  - lib