mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/bin/gi2annot.rb CHANGED
@@ -1,13 +1,6 @@
1
1
  #!/usr/bin/ruby -w
2
2
 
3
- require 'open-uri'
4
- require 'rexml/document'
5
- require 'rexml/streamlistener'
6
-
7
- BATCH_SIZE = 500
8
-
9
- $LOG = nil
10
- $ANNOTS = []
3
+ require 'gi'
11
4
 
12
5
  if ARGV.size < 1
13
6
  puts "usage: #{File.basename(__FILE__)} <gi> ..."
@@ -15,108 +8,7 @@ if ARGV.size < 1
15
8
  end
16
9
 
17
10
 
18
- # db=
19
- # retstart=
20
- # retmax=
21
-
22
- class Listener
23
- include REXML
24
- include StreamListener
25
- def initialize
26
- @get_title = false
27
- end
28
-
29
- def tag_start(name, attributes)
30
- #puts "NAME" + name
31
- #p attributes
32
- if name == "Item" && attributes["Name"] == "Title"
33
- @get_title = true
34
- end
35
- end
36
- def text(text)
37
- #puts "TEXT: " + text + @get_title.to_s
38
- if @get_title
39
- #puts "GETTING TITLE!"
40
- $ANNOTS.push text.chomp
41
- @get_title = false
42
- end
43
- end
44
- end
45
-
46
-
47
- # Returns a list of Annotation strings
48
- def parse_etool_output(handle)
49
- listener = Listener.new
50
- parser = REXML::Parsers::StreamParser.new(handle, listener)
51
- parser.parse
52
-
53
- $ANNOTS
54
- end
55
-
56
-
57
- #$LOG = File.open("log.log", "w")
58
-
59
11
  gis = ARGV.to_a.dup
60
12
 
61
- while(true) do
62
- batch = gis.slice!(0..BATCH_SIZE)
63
- if batch.size == 0 then break end
64
- string = batch.join(",")
65
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
66
- #puts url
67
- annots = []
68
- open(url) do |handle|
69
- annots = parse_etool_output(handle)
70
- end
71
- puts annots.join("\n")
72
- end
73
-
74
-
75
- #$LOG.close
76
-
77
-
78
-
79
-
80
- =begin
81
-
82
- <?xml version="1.0" encoding="ISO-8859-1"?>
83
- <!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
84
- <eSummaryResult>
85
-
86
- <DocSum>
87
- <Id>24115498</Id>
88
- <Item Name="Caption" Type="String">NP_710008</Item>
89
- <Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
90
- <Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
91
- <Item Name="Gi" Type="Integer">24115498</Item>
92
- <Item Name="CreateDate" Type="String">2002/10/16</Item>
93
-
94
- <Item Name="UpdateDate" Type="String">2006/04/03</Item>
95
- <Item Name="Flags" Type="Integer">512</Item>
96
- <Item Name="TaxId" Type="Integer">198214</Item>
97
- <Item Name="Status" Type="String">live</Item>
98
- <Item Name="ReplacedBy" Type="String"></Item>
99
- <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
100
- </DocSum>
101
-
102
-
103
- <DocSum>
104
- <Id>434011</Id>
105
- <Item Name="Caption" Type="String">CAA24741</Item>
106
-
107
- <Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
108
- <Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
109
- <Item Name="Gi" Type="Integer">434011</Item>
110
- <Item Name="CreateDate" Type="String">1983/12/06</Item>
111
- <Item Name="UpdateDate" Type="String">2005/04/18</Item>
112
- <Item Name="Flags" Type="Integer">0</Item>
113
- <Item Name="TaxId" Type="Integer">562</Item>
114
- <Item Name="Status" Type="String">live</Item>
115
- <Item Name="ReplacedBy" Type="String"></Item>
116
-
117
- <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
118
- </DocSum>
119
-
120
- </eSummaryResult>
13
+ puts( GI.gi2annot(gis).join("\n") )
121
14
 
122
- =end
data/bin/id_class_anal.rb CHANGED
@@ -4,6 +4,7 @@ require 'spec_id'
4
4
  require 'generator'
5
5
  require 'optparse'
6
6
  require 'ostruct'
7
+ require 'roc'
7
8
 
8
9
  def file_noext(file)
9
10
  file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
@@ -21,7 +22,8 @@ jtplot_file = jtplot_base + '.toplot'
21
22
  OptionParser.new do |op|
22
23
  op.on("-p", "--prefix PREFIX", "prefix for false positive proteins") {|v| opt.p = v.split(',') }
23
24
  op.on("-j", "--jtplot", "output file '#{jtplot_file}' for jtp plotting program") {|v| opt.j = v }
24
- op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
25
+ # op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
26
+ op.on("-a", "--area", "outputs area under the curve") {|v| opt.a = v }
25
27
  end.parse!
26
28
 
27
29
  if ARGV.size < 1
@@ -32,55 +34,59 @@ if ARGV.size < 1
32
34
  probabilities) or protein_prophet-prot.xml file which has been run with
33
35
  decoy proteins.
34
36
 
35
- Outputs tp's, precision, and the false positive rate [as calculated by Gygi
36
- 2*(#mod/(#norm+#mod))]. Each of these will be in a column with a label at
37
- the top. Outputs columns (delimited by '\\t') to STDOUT.
38
- To capture to file: #{File.basename(__FILE__)} protein_file.xml > out.csv
37
+ Outputs tp's and precision.
38
+ [The false positive predictive rate (FPPR) is 1 - precision]
39
+ The two columns will be labeled at the top.
40
+ (delimited by '\\t') to STDOUT. To capture to file:
41
+ #{File.basename(__FILE__)} protein_file.xml > out.csv
39
42
 
40
- Also takes gzipped (extension: xml.gz) files.
41
-
42
43
  OPTIONS:
43
44
  <s> = string
44
45
  -p --prefix <s[,s...]> Prefix(s) by which to determine decoy proteins (default #{def_pre})
45
46
  -j --jtplot outputs #{jtplot_file} for plotting by plot.rb
46
47
  [% plot.rb -w lp --yrange n0.1:1.1 --noenhanced <file> ]
48
+ -a --area outputs area under the curve instead of tps/precision
47
49
 
48
50
  NOTE: protein prophet files not yet functional!!!
49
51
  ABBR:
50
52
  TP = True Positives
51
53
  FP = False Positives
52
54
  Prec = Precision = TP/(TP+FP)
53
- FPR = False Positive Rate (as defined by Gygi) 2*[FP/(TP+FP)]
54
55
  "
55
56
  exit
56
57
  end
57
58
 
59
+ ###########################################################
60
+ # I DON'T think option -e is functional yet...
61
+ ###########################################################
62
+
58
63
  files = ARGV.to_a
59
64
 
60
65
  out = nil
61
66
  if opt.j
62
67
  out = File.open(jtplot_file, "w")
63
- lines = ['XYData', jtplot_base, "Classification Analysis", "Num TPs", "(Prec|FPR)"]
68
+ lines = ['XYData', jtplot_base, "Classification Analysis", "Num Hits", "Precision"]
64
69
  lines.each {|l| out.puts l}
65
70
  end
66
71
 
67
72
  headings = files.collect do |file|
68
- %w(TP Prec FPR).collect {|v| v + " (#{file_noext(file)})" }
73
+ %w(TP Precision).collect {|v| v + " (#{file_noext(file)})" }
69
74
  end
70
- #headings = ["# True Positives", "Precision (TP/(TP+FP))", "FP Rate 2*(FP/(TP+FP))"]
71
- puts headings.flatten.join(delimiter)
72
75
 
73
76
  all_arrs = []
74
77
  files.each_with_index do |file,i|
75
78
  sp = SpecID.new(file)
76
- #puts sp.prots.first.respond_to?
77
- if opt.e
78
- headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", ]
79
- arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
80
- else
81
- headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
82
- arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
79
+ headers = [file_noext(file)]
80
+ arrs = sp.num_hits_and_ppv_for_prob(opt.p[i])
81
+
82
+ if opt.a
83
+ (num_hits, prec) = arrs
84
+ roc = ROC.new
85
+ prec_area = roc.area_under_curve(num_hits, prec)
86
+ puts "#{file} (area under curve [num_hits, precision])"
87
+ puts "Prec [#TPPrec = TP/(TP+FP)]:\t#{prec_area}"
83
88
  end
89
+
84
90
  all_arrs.push(*arrs)
85
91
 
86
92
  lns = []
@@ -95,8 +101,12 @@ files.each_with_index do |file,i|
95
101
  end
96
102
  end
97
103
 
98
- SyncEnumerator.new(*all_arrs).each do |row|
99
- puts row.join(delimiter)
104
+
105
+ unless opt.a
106
+ puts headings.flatten.join(delimiter)
107
+ SyncEnumerator.new(*all_arrs).each do |row|
108
+ puts row.join(delimiter)
109
+ end
100
110
  end
101
111
 
102
112
  out.close if opt.j
data/bin/id_precision.rb CHANGED
@@ -13,7 +13,7 @@ opts = OptionParser.new do |op|
13
13
  op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml"
14
14
  op.separator ""
15
15
  op.separator "takes Bioworks 3.2 xml output files (with probabilities)"
16
- op.separator "rank orders the probabilities and outputs tp's and sensitivity"
16
+ op.separator "rank orders the probabilities and outputs num hits and precision"
17
17
  op.separator "Also takes gzipped (xml.gz) files labeled as such"
18
18
  op.separator ""
19
19
  op.separator "Outputs a comma separated value to STDOUT (.csv)"
@@ -50,10 +50,11 @@ tp_obj.peps = tp
50
50
  two_lists = [tp_obj, fp_obj].map do |obj|
51
51
  list = []
52
52
  list.push( obj.pep_probs_by_pep_prots )
53
- list.push( obj.pep_probs_by_seq_charge )
53
+
54
+ list.push( obj.pep_probs_by_bn_seq_charge )
54
55
  # These each have a by_min and a by_top10
55
- list.push(*( obj.pep_probs_by_scan ) )
56
- list.push(*( obj.pep_probs_by_scan_charge ) )
56
+ list.push(*( obj.pep_probs_by_bn_scan ) )
57
+ list.push(*( obj.pep_probs_by_bn_scan_charge ) )
57
58
  list
58
59
  end
59
60
 
@@ -61,19 +62,22 @@ end
61
62
  headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"]
62
63
  csv_headings = []
63
64
  headings.each do |head|
64
- csv_headings << head + ": TP"
65
+ csv_headings << head + ": NH"
65
66
  csv_headings << head + ": PR"
66
67
  end
67
68
 
68
69
  pairs = two_lists[0].zip two_lists[1]
69
70
 
70
- roc = ROC.new
71
+ roc = DecoyROC.new
71
72
  x_y= []
72
73
  area_under_curve = []
73
74
  #start_x = []
74
75
  #end_x = []
75
76
  pairs.each do |pair|
76
- x,y = roc.tps_and_precision(pair[0], pair[1])
77
+ #x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1])
78
+ (num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1])
79
+ x = num_hits
80
+ y = ppv
77
81
  if $AREAS_ONLY
78
82
  x.unshift 0
79
83
  y.unshift 1.0
@@ -99,7 +103,7 @@ end
99
103
  # X axis is the number of peptides id# (i.e., # of peps in TP db)
100
104
  # Y axis is the precision = TP/(TP+FP)
101
105
 
102
- ## Make some legend comments at the top of the file:
106
+ puts "# NH = number of hits"
103
107
  puts "# TP = true positives"
104
108
  puts "# FP = false positives"
105
109
  puts "# PR = precision = TP/(TP+FP)"
@@ -2,4 +2,4 @@
2
2
 
3
3
  require 'spec_id'
4
4
 
5
- SpecID.false_positive_rate(ARGV)
5
+ SpecID.precision(ARGV)
@@ -8,6 +8,7 @@ require 'spec_id'
8
8
 
9
9
  #############################################################
10
10
  # GLOBALS:
11
+ PRECISION_PROGRAM_BASE = 'precision'
11
12
  DEF_PREFIX = "INV_"
12
13
  DEF_PERCENT_FP = "5.0"
13
14
  #############################################################
@@ -62,7 +63,7 @@ class Runner
62
63
  background-color: #FF0000;
63
64
  color: #FFFFFF
64
65
  }
65
- div.file_info, div.software, div.fpr, div.num_proteins{
66
+ div.file_info, div.software, div.fppr, div.num_proteins{
66
67
  margin-left: 20px;
67
68
  margin-top: 20px;
68
69
  }
@@ -173,38 +174,38 @@ class Runner
173
174
  end
174
175
 
175
176
  # assumes that these are sorted on probability
176
- # desired_fpr is a float
177
- # returns [number_of_prots, actual_fpr]
178
- def num_prots_above_fpr(prots, desired_fpr)
179
- current_fpr_rate_percent = 0.0
180
- previous_fpr_rate_percent = 0.0
177
+ # desired_fppr is a float
178
+ # returns [number_of_prots, actual_fppr]
179
+ def num_prots_above_fppr(prots, desired_fppr)
180
+ current_fppr_rate_percent = 0.0
181
+ previous_fppr_rate_percent = 0.0
181
182
  current_sum_one_minus_prob = 0.0
182
- proteins_within_fpr = 0
183
- actual_fpr = nil
183
+ proteins_within_fppr = 0
184
+ actual_fppr = nil
184
185
  already_found = false
185
186
  prot_cnt = 0
186
187
  prots.each do |prot|
187
188
  prot_cnt += 1
188
189
  # SUM(1-probX)/#prots
189
190
  current_sum_one_minus_prob += 1.0 - prot._probability.to_f
190
- current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
191
+ current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
191
192
 
192
- if current_fpr_rate_percent > desired_fpr && !already_found
193
- actual_fpr = previous_fpr_rate_percent
194
- proteins_within_fpr = prot_cnt
193
+ if current_fppr_rate_percent > desired_fppr && !already_found
194
+ actual_fppr = previous_fppr_rate_percent
195
+ proteins_within_fppr = prot_cnt
195
196
  already_found = true
196
197
  end
197
- previous_fpr_rate_percent = current_fpr_rate_percent
198
+ previous_fppr_rate_percent = current_fppr_rate_percent
198
199
  end
199
- [proteins_within_fpr, actual_fpr]
200
+ [proteins_within_fppr, actual_fppr]
200
201
  end
201
202
 
202
- #### #readable_previous_fpr_rate_percent = sprintf("%.2f", previous_fpr_rate_percent)
203
+ #### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
203
204
 
204
205
  # returns a string of the table rows
205
206
  # false_positive_rate (give as a %) is the cutoff mark
206
- # returns the number of proteins at the desired_fpr (if given)
207
- def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fpr, actual_percent_fp, peptide_count_filename=nil)
207
+ # returns the number of proteins at the desired_fppr (if given)
208
+ def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
208
209
  prot_cnt = 0
209
210
  uniq_prots.map do |prot|
210
211
  tr do
@@ -267,18 +268,20 @@ class Runner
267
268
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
268
269
  end
269
270
 
270
- def proph_output(file, outfn, opt, fpr_output_as_html)
271
+ def proph_output(file, outfn, opt, fppr_output_as_html)
271
272
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
272
273
  num_cols = header_anchors.size
273
274
  theaders = ths(header_anchors)
274
275
 
275
276
  root = AXML.parse_file(file)
276
277
  prots = []
277
- ## find the min_prob at a fpr of XX
278
- min_prob_redline = 1.01 # if no fpr is less than what they give, then all are redlined!
278
+ ## find the min_prob at a fppr of XX
279
+ min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
279
280
 
280
- if opt.c
281
+ if opt.c
281
282
  actual_percent_fp = opt.c.to_f
283
+ elsif opt.cut_at
284
+ actual_percent_fp = opt.cut_at.to_f
282
285
  else
283
286
  actual_percent_fp = nil
284
287
  end
@@ -289,20 +292,24 @@ class Runner
289
292
  end
290
293
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
291
294
  filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
292
- output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
293
295
 
294
296
  ## num proteins above cutoff (if opt.c)
295
297
  num_prots_html = ''
296
- if opt.c
297
- (num_prots, actual_fpr) = num_prots_above_fpr(filtered_sorted_prots, opt.c.to_f)
298
- num_prots_html = num_prots_to_html(opt.c.to_f, actual_fpr, num_prots)
298
+ if opt.c || opt.cut_at
299
+ (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
300
+ num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
301
+ end
302
+ if opt.cut_at
303
+ filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
299
304
  end
300
305
 
306
+ output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
307
+
301
308
  table_string = table do
302
309
  tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
303
310
  end
304
- er_info = opt.fpr ? error_info(file) : ""
305
- html_pieces = [outfn, header, fpr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
311
+ er_info = opt.precision ? error_info(file) : ""
312
+ html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
306
313
  print_html_pieces(*html_pieces)
307
314
  end # proph_output
308
315
 
@@ -311,7 +318,7 @@ class Runner
311
318
  "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
312
319
  end
313
320
 
314
- def bioworks_output(file, outfn, opt, fpr_output_as_html)
321
+ def bioworks_output(file, outfn, opt, fppr_output_as_html)
315
322
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
316
323
  num_cols = header_anchors.size
317
324
  theaders = ths(header_anchors)
@@ -339,7 +346,7 @@ class Runner
339
346
  table_string = table do
340
347
  tr{theaders} + rows
341
348
  end
342
- print_html_pieces(outfn, header, fpr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
349
+ print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
343
350
  end # bioworks_output
344
351
 
345
352
  def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
@@ -358,7 +365,7 @@ class Runner
358
365
  #puts lines ?? is this supposed to be commented out?
359
366
  lines = lines.reject do |obj| obj =~ /\*{10}/ end
360
367
  lines.map! do |line| "#{line}<br/>" end
361
- "<div class=\"fpr\">
368
+ "<div class=\"fppr\">
362
369
  <h3>Classification Analysis</h3>
363
370
  #{lines.join("\n")}
364
371
  </div>"
@@ -366,7 +373,7 @@ class Runner
366
373
 
367
374
  # transforms the output string of file_as_decoy into html
368
375
  def prefix_as_decoy_to_html(string)
369
- "<div class=\"fpr\">
376
+ "<div class=\"fppr\">
370
377
  <h3>Classification Analysis</h3>
371
378
  </div>" +
372
379
  string
@@ -384,21 +391,18 @@ class Runner
384
391
  op.separator " outputs: <file>.summary.html"
385
392
  op.separator ""
386
393
  op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
387
- op.separator(" if --fpr then -f is used to specify a file or prefix")
388
- op.separator(" to indicate false positives.")
394
+ op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
395
+ op.separator(" if --precision then -f is used to specify a file or prefix")
396
+ op.separator(" that indicates the false positives.")
389
397
  op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
390
398
  op.separator ""
391
- op.separator "Options for False Positive Rate:"
392
- op.on("--fpr", "include output of false_positive_rate.rb,") {|v| opt.fpr = v}
393
- op.separator(" type 'false_positive_rate.rb' for details")
394
- op.separator(" These options are passed on:")
395
- op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
396
- op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
397
- op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
398
-
399
+ op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
400
+ op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
401
+ op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
399
402
  op.separator ""
400
403
  op.separator "Specific to ProteinProphet (with no concatenated DB):"
401
- op.on("-c", "--cutoff percent", "displays red line at given % fpr") {|v| opt.c = v }
404
+ op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
405
+ op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
402
406
  end
403
407
 
404
408
  opts.parse!
@@ -408,31 +412,23 @@ class Runner
408
412
  exit
409
413
  end
410
414
 
411
- fpr_output_as_html = ''
415
+ fppr_output_as_html = ''
412
416
  files = argv.to_a
413
417
  files.each do |file|
414
418
  outfn = file.gsub(/\.xml$/, '.summary.html')
415
419
  ## False Positive Rate Calculation:
416
- if opt.fpr
420
+ if opt.precision
417
421
  opt.o = outfn # won't actually be written over, but used
418
- to_use_argv = create_false_positive_rate_argv(file, opt)
419
- (out_string, opt, file_as_decoy) = SpecID::FalsePositiveRate.new.false_positive_rate(to_use_argv)
420
- if file_as_decoy ## need to wrap this guy up in some html
421
- ## DISABLE the opt.f (it's a filename) so it doesn't interfere with
422
- ## filtering:
423
- opt.f = nil
424
-
425
- fpr_output_as_html = file_as_decoy_to_html(out_string)
426
- else
427
- fpr_output_as_html = prefix_as_decoy_to_html(out_string)
428
- end
422
+ to_use_argv = create_precision_argv(file, opt)
423
+ (out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
424
+ fppr_output_as_html = prefix_as_decoy_to_html(out_string)
429
425
  end
430
426
 
431
427
  case SpecID.file_type(file)
432
428
  when "protproph"
433
- proph_output(file, outfn, opt, fpr_output_as_html)
429
+ proph_output(file, outfn, opt, fppr_output_as_html)
434
430
  when "bioworks"
435
- bioworks_output(file, outfn, opt, fpr_output_as_html)
431
+ bioworks_output(file, outfn, opt, fppr_output_as_html)
436
432
  else
437
433
  abort "filetype for #{file} not recognized!"
438
434
  end
@@ -440,15 +436,12 @@ class Runner
440
436
 
441
437
  end # method go
442
438
 
443
- def create_false_positive_rate_argv(file, opt)
439
+ def create_precision_argv(file, opt)
444
440
  # include only those options specific
445
441
  new_argv = [file]
446
442
  if opt.f ; new_argv << '-f' << opt.f end
447
- if opt.g ; new_argv << '-g' end
448
- if opt.p ; new_argv << '-p' end
449
- if opt.n ; new_argv << '-n' end
450
443
  if opt.o ; new_argv << '-o' << opt.o end
451
- new_argv
444
+ new_argv
452
445
  end
453
446
 
454
447
  end # Runner
data/changelog.txt ADDED
@@ -0,0 +1,34 @@
1
+
2
+ 1. A couple of scripts and subroutines were hashing peptides but not on the file
3
+ basename. This would result in slightly incorrect results (any time there
4
+ were overlapping scan numbers in multiple datasets, only the top one would be
5
+ chosen). The results would be correct for single runs.
6
+
7
+ Output files that could be affected:
8
+ *.top_per_scan.txt
9
+ *.all_peps_per_scan.txt
10
+
11
+ Scripts that could be affected:
12
+ script/top_hit_per_scan.rb
13
+ bin/filter_spec_id.rb
14
+ script/filter-peps.rb
15
+ bin/id_precision.rb
16
+
17
+ Subroutines that were affected:
18
+ spec_id.rb (pep_probs_by_* )
19
+ spec_id.rb (top_peps_prefilter!)
20
+ proph.rb uniq_by_seqcharge
21
+ align.rb called uniq_by_seqcharge
22
+
23
+
24
+ 2. false_positive_rate.rb and protein_summary.rb (by extension) labeled the
25
+ x axis as the number of true positives, while in reality I was plotting the
26
+ number of hits. I've updated x axis labels to reflect this change. In
27
+ addition, since the term 'false positive rate' has such a distinct definition
28
+ in classical ROC plots and binary statistics, I've decided to work primarily
29
+ in terms of precision (TP/(TP+FP)). I've purged the terms 'False Positive
30
+ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
31
+ called the False Positive Predictive Rate (FPPR). I will probably implement
32
+ this in a future release.
33
+
34
+
data/lib/align.rb CHANGED
@@ -1,6 +1,5 @@
1
1
 
2
2
  require 'spec/mzxml/parser'
3
- require 'hash_by'
4
3
  require 'spec/msrun'
5
4
  require 'spec_id/proph'
6
5
  require 'vec'