mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/bin/gi2annot.rb CHANGED
@@ -1,13 +1,6 @@
1
1
  #!/usr/bin/ruby -w
2
2
 
3
- require 'open-uri'
4
- require 'rexml/document'
5
- require 'rexml/streamlistener'
6
-
7
- BATCH_SIZE = 500
8
-
9
- $LOG = nil
10
- $ANNOTS = []
3
+ require 'gi'
11
4
 
12
5
  if ARGV.size < 1
13
6
  puts "usage: #{File.basename(__FILE__)} <gi> ..."
@@ -15,108 +8,7 @@ if ARGV.size < 1
15
8
  end
16
9
 
17
10
 
18
- # db=
19
- # retstart=
20
- # retmax=
21
-
22
- class Listener
23
- include REXML
24
- include StreamListener
25
- def initialize
26
- @get_title = false
27
- end
28
-
29
- def tag_start(name, attributes)
30
- #puts "NAME" + name
31
- #p attributes
32
- if name == "Item" && attributes["Name"] == "Title"
33
- @get_title = true
34
- end
35
- end
36
- def text(text)
37
- #puts "TEXT: " + text + @get_title.to_s
38
- if @get_title
39
- #puts "GETTING TITLE!"
40
- $ANNOTS.push text.chomp
41
- @get_title = false
42
- end
43
- end
44
- end
45
-
46
-
47
- # Returns a list of Annotation strings
48
- def parse_etool_output(handle)
49
- listener = Listener.new
50
- parser = REXML::Parsers::StreamParser.new(handle, listener)
51
- parser.parse
52
-
53
- $ANNOTS
54
- end
55
-
56
-
57
- #$LOG = File.open("log.log", "w")
58
-
59
11
  gis = ARGV.to_a.dup
60
12
 
61
- while(true) do
62
- batch = gis.slice!(0..BATCH_SIZE)
63
- if batch.size == 0 then break end
64
- string = batch.join(",")
65
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
66
- #puts url
67
- annots = []
68
- open(url) do |handle|
69
- annots = parse_etool_output(handle)
70
- end
71
- puts annots.join("\n")
72
- end
73
-
74
-
75
- #$LOG.close
76
-
77
-
78
-
79
-
80
- =begin
81
-
82
- <?xml version="1.0" encoding="ISO-8859-1"?>
83
- <!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
84
- <eSummaryResult>
85
-
86
- <DocSum>
87
- <Id>24115498</Id>
88
- <Item Name="Caption" Type="String">NP_710008</Item>
89
- <Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
90
- <Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
91
- <Item Name="Gi" Type="Integer">24115498</Item>
92
- <Item Name="CreateDate" Type="String">2002/10/16</Item>
93
-
94
- <Item Name="UpdateDate" Type="String">2006/04/03</Item>
95
- <Item Name="Flags" Type="Integer">512</Item>
96
- <Item Name="TaxId" Type="Integer">198214</Item>
97
- <Item Name="Status" Type="String">live</Item>
98
- <Item Name="ReplacedBy" Type="String"></Item>
99
- <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
100
- </DocSum>
101
-
102
-
103
- <DocSum>
104
- <Id>434011</Id>
105
- <Item Name="Caption" Type="String">CAA24741</Item>
106
-
107
- <Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
108
- <Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
109
- <Item Name="Gi" Type="Integer">434011</Item>
110
- <Item Name="CreateDate" Type="String">1983/12/06</Item>
111
- <Item Name="UpdateDate" Type="String">2005/04/18</Item>
112
- <Item Name="Flags" Type="Integer">0</Item>
113
- <Item Name="TaxId" Type="Integer">562</Item>
114
- <Item Name="Status" Type="String">live</Item>
115
- <Item Name="ReplacedBy" Type="String"></Item>
116
-
117
- <Item Name="Comment" Type="String"><![CDATA[ ]]></Item>
118
- </DocSum>
119
-
120
- </eSummaryResult>
13
+ puts( GI.gi2annot(gis).join("\n") )
121
14
 
122
- =end
data/bin/id_class_anal.rb CHANGED
@@ -4,6 +4,7 @@ require 'spec_id'
4
4
  require 'generator'
5
5
  require 'optparse'
6
6
  require 'ostruct'
7
+ require 'roc'
7
8
 
8
9
  def file_noext(file)
9
10
  file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
@@ -21,7 +22,8 @@ jtplot_file = jtplot_base + '.toplot'
21
22
  OptionParser.new do |op|
22
23
  op.on("-p", "--prefix PREFIX", "prefix for false positive proteins") {|v| opt.p = v.split(',') }
23
24
  op.on("-j", "--jtplot", "output file '#{jtplot_file}' for jtp plotting program") {|v| opt.j = v }
24
- op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
25
+ # op.on("-e", "--peptides", "runs a full analysis on peptides") {|v| opt.e = v }
26
+ op.on("-a", "--area", "outputs area under the curve") {|v| opt.a = v }
25
27
  end.parse!
26
28
 
27
29
  if ARGV.size < 1
@@ -32,55 +34,59 @@ if ARGV.size < 1
32
34
  probabilities) or protein_prophet-prot.xml file which has been run with
33
35
  decoy proteins.
34
36
 
35
- Outputs tp's, precision, and the false positive rate [as calculated by Gygi
36
- 2*(#mod/(#norm+#mod))]. Each of these will be in a column with a label at
37
- the top. Outputs columns (delimited by '\\t') to STDOUT.
38
- To capture to file: #{File.basename(__FILE__)} protein_file.xml > out.csv
37
+ Outputs tp's and precision.
38
+ [The false positive predictive rate (FPPR) is 1 - precision]
39
+ The two columns will be labeled at the top.
40
+ (delimited by '\\t') to STDOUT. To capture to file:
41
+ #{File.basename(__FILE__)} protein_file.xml > out.csv
39
42
 
40
- Also takes gzipped (extension: xml.gz) files.
41
-
42
43
  OPTIONS:
43
44
  <s> = string
44
45
  -p --prefix <s[,s...]> Prefix(s) by which to determine decoy proteins (default #{def_pre})
45
46
  -j --jtplot outputs #{jtplot_file} for plotting by plot.rb
46
47
  [% plot.rb -w lp --yrange n0.1:1.1 --noenhanced <file> ]
48
+ -a --area outputs area under the curve instead of tps/precision
47
49
 
48
50
  NOTE: protein prophet files not yet functional!!!
49
51
  ABBR:
50
52
  TP = True Positives
51
53
  FP = False Positives
52
54
  Prec = Precision = TP/(TP+FP)
53
- FPR = False Positive Rate (as defined by Gygi) 2*[FP/(TP+FP)]
54
55
  "
55
56
  exit
56
57
  end
57
58
 
59
+ ###########################################################
60
+ # I DON'T think option -e is functional yet...
61
+ ###########################################################
62
+
58
63
  files = ARGV.to_a
59
64
 
60
65
  out = nil
61
66
  if opt.j
62
67
  out = File.open(jtplot_file, "w")
63
- lines = ['XYData', jtplot_base, "Classification Analysis", "Num TPs", "(Prec|FPR)"]
68
+ lines = ['XYData', jtplot_base, "Classification Analysis", "Num Hits", "Precision"]
64
69
  lines.each {|l| out.puts l}
65
70
  end
66
71
 
67
72
  headings = files.collect do |file|
68
- %w(TP Prec FPR).collect {|v| v + " (#{file_noext(file)})" }
73
+ %w(TP Precision).collect {|v| v + " (#{file_noext(file)})" }
69
74
  end
70
- #headings = ["# True Positives", "Precision (TP/(TP+FP))", "FP Rate 2*(FP/(TP+FP))"]
71
- puts headings.flatten.join(delimiter)
72
75
 
73
76
  all_arrs = []
74
77
  files.each_with_index do |file,i|
75
78
  sp = SpecID.new(file)
76
- #puts sp.prots.first.respond_to?
77
- if opt.e
78
- headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", ]
79
- arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
80
- else
81
- headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
82
- arrs = sp.tps_and_precision_and_fpr2_times2_for_prob(opt.p[i])
79
+ headers = [file_noext(file)]
80
+ arrs = sp.num_hits_and_ppv_for_prob(opt.p[i])
81
+
82
+ if opt.a
83
+ (num_hits, prec) = arrs
84
+ roc = ROC.new
85
+ prec_area = roc.area_under_curve(num_hits, prec)
86
+ puts "#{file} (area under curve [num_hits, precision])"
87
+ puts "Prec [#TPPrec = TP/(TP+FP)]:\t#{prec_area}"
83
88
  end
89
+
84
90
  all_arrs.push(*arrs)
85
91
 
86
92
  lns = []
@@ -95,8 +101,12 @@ files.each_with_index do |file,i|
95
101
  end
96
102
  end
97
103
 
98
- SyncEnumerator.new(*all_arrs).each do |row|
99
- puts row.join(delimiter)
104
+
105
+ unless opt.a
106
+ puts headings.flatten.join(delimiter)
107
+ SyncEnumerator.new(*all_arrs).each do |row|
108
+ puts row.join(delimiter)
109
+ end
100
110
  end
101
111
 
102
112
  out.close if opt.j
data/bin/id_precision.rb CHANGED
@@ -13,7 +13,7 @@ opts = OptionParser.new do |op|
13
13
  op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml"
14
14
  op.separator ""
15
15
  op.separator "takes Bioworks 3.2 xml output files (with probabilities)"
16
- op.separator "rank orders the probabilities and outputs tp's and sensitivity"
16
+ op.separator "rank orders the probabilities and outputs num hits and precision"
17
17
  op.separator "Also takes gzipped (xml.gz) files labeled as such"
18
18
  op.separator ""
19
19
  op.separator "Outputs a comma separated value to STDOUT (.csv)"
@@ -50,10 +50,11 @@ tp_obj.peps = tp
50
50
  two_lists = [tp_obj, fp_obj].map do |obj|
51
51
  list = []
52
52
  list.push( obj.pep_probs_by_pep_prots )
53
- list.push( obj.pep_probs_by_seq_charge )
53
+
54
+ list.push( obj.pep_probs_by_bn_seq_charge )
54
55
  # These each have a by_min and a by_top10
55
- list.push(*( obj.pep_probs_by_scan ) )
56
- list.push(*( obj.pep_probs_by_scan_charge ) )
56
+ list.push(*( obj.pep_probs_by_bn_scan ) )
57
+ list.push(*( obj.pep_probs_by_bn_scan_charge ) )
57
58
  list
58
59
  end
59
60
 
@@ -61,19 +62,22 @@ end
61
62
  headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"]
62
63
  csv_headings = []
63
64
  headings.each do |head|
64
- csv_headings << head + ": TP"
65
+ csv_headings << head + ": NH"
65
66
  csv_headings << head + ": PR"
66
67
  end
67
68
 
68
69
  pairs = two_lists[0].zip two_lists[1]
69
70
 
70
- roc = ROC.new
71
+ roc = DecoyROC.new
71
72
  x_y= []
72
73
  area_under_curve = []
73
74
  #start_x = []
74
75
  #end_x = []
75
76
  pairs.each do |pair|
76
- x,y = roc.tps_and_precision(pair[0], pair[1])
77
+ #x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1])
78
+ (num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1])
79
+ x = num_hits
80
+ y = ppv
77
81
  if $AREAS_ONLY
78
82
  x.unshift 0
79
83
  y.unshift 1.0
@@ -99,7 +103,7 @@ end
99
103
  # X axis is the number of peptides id# (i.e., # of peps in TP db)
100
104
  # Y axis is the precision = TP/(TP+FP)
101
105
 
102
- ## Make some legend comments at the top of the file:
106
+ puts "# NH = number of hits"
103
107
  puts "# TP = true positives"
104
108
  puts "# FP = false positives"
105
109
  puts "# PR = precision = TP/(TP+FP)"
data/bin/{false_positive_rate.rb → precision.rb} RENAMED
@@ -2,4 +2,4 @@
2
2
 
3
3
  require 'spec_id'
4
4
 
5
- SpecID.false_positive_rate(ARGV)
5
+ SpecID.precision(ARGV)
data/bin/protein_summary.rb CHANGED
@@ -8,6 +8,7 @@ require 'spec_id'
8
8
 
9
9
  #############################################################
10
10
  # GLOBALS:
11
+ PRECISION_PROGRAM_BASE = 'precision'
11
12
  DEF_PREFIX = "INV_"
12
13
  DEF_PERCENT_FP = "5.0"
13
14
  #############################################################
@@ -62,7 +63,7 @@ class Runner
62
63
  background-color: #FF0000;
63
64
  color: #FFFFFF
64
65
  }
65
- div.file_info, div.software, div.fpr, div.num_proteins{
66
+ div.file_info, div.software, div.fppr, div.num_proteins{
66
67
  margin-left: 20px;
67
68
  margin-top: 20px;
68
69
  }
@@ -173,38 +174,38 @@ class Runner
173
174
  end
174
175
 
175
176
  # assumes that these are sorted on probability
176
- # desired_fpr is a float
177
- # returns [number_of_prots, actual_fpr]
178
- def num_prots_above_fpr(prots, desired_fpr)
179
- current_fpr_rate_percent = 0.0
180
- previous_fpr_rate_percent = 0.0
177
+ # desired_fppr is a float
178
+ # returns [number_of_prots, actual_fppr]
179
+ def num_prots_above_fppr(prots, desired_fppr)
180
+ current_fppr_rate_percent = 0.0
181
+ previous_fppr_rate_percent = 0.0
181
182
  current_sum_one_minus_prob = 0.0
182
- proteins_within_fpr = 0
183
- actual_fpr = nil
183
+ proteins_within_fppr = 0
184
+ actual_fppr = nil
184
185
  already_found = false
185
186
  prot_cnt = 0
186
187
  prots.each do |prot|
187
188
  prot_cnt += 1
188
189
  # SUM(1-probX)/#prots
189
190
  current_sum_one_minus_prob += 1.0 - prot._probability.to_f
190
- current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
191
+ current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
191
192
 
192
- if current_fpr_rate_percent > desired_fpr && !already_found
193
- actual_fpr = previous_fpr_rate_percent
194
- proteins_within_fpr = prot_cnt
193
+ if current_fppr_rate_percent > desired_fppr && !already_found
194
+ actual_fppr = previous_fppr_rate_percent
195
+ proteins_within_fppr = prot_cnt
195
196
  already_found = true
196
197
  end
197
- previous_fpr_rate_percent = current_fpr_rate_percent
198
+ previous_fppr_rate_percent = current_fppr_rate_percent
198
199
  end
199
- [proteins_within_fpr, actual_fpr]
200
+ [proteins_within_fppr, actual_fppr]
200
201
  end
201
202
 
202
- #### #readable_previous_fpr_rate_percent = sprintf("%.2f", previous_fpr_rate_percent)
203
+ #### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
203
204
 
204
205
  # returns a string of the table rows
205
206
  # false_positive_rate (give as a %) is the cutoff mark
206
- # returns the number of proteins at the desired_fpr (if given)
207
- def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fpr, actual_percent_fp, peptide_count_filename=nil)
207
+ # returns the number of proteins at the desired_fppr (if given)
208
+ def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
208
209
  prot_cnt = 0
209
210
  uniq_prots.map do |prot|
210
211
  tr do
@@ -267,18 +268,20 @@ class Runner
267
268
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
268
269
  end
269
270
 
270
- def proph_output(file, outfn, opt, fpr_output_as_html)
271
+ def proph_output(file, outfn, opt, fppr_output_as_html)
271
272
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
272
273
  num_cols = header_anchors.size
273
274
  theaders = ths(header_anchors)
274
275
 
275
276
  root = AXML.parse_file(file)
276
277
  prots = []
277
- ## find the min_prob at a fpr of XX
278
- min_prob_redline = 1.01 # if no fpr is less than what they give, then all are redlined!
278
+ ## find the min_prob at a fppr of XX
279
+ min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
279
280
 
280
- if opt.c
281
+ if opt.c
281
282
  actual_percent_fp = opt.c.to_f
283
+ elsif opt.cut_at
284
+ actual_percent_fp = opt.cut_at.to_f
282
285
  else
283
286
  actual_percent_fp = nil
284
287
  end
@@ -289,20 +292,24 @@ class Runner
289
292
  end
290
293
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
291
294
  filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
292
- output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
293
295
 
294
296
  ## num proteins above cutoff (if opt.c)
295
297
  num_prots_html = ''
296
- if opt.c
297
- (num_prots, actual_fpr) = num_prots_above_fpr(filtered_sorted_prots, opt.c.to_f)
298
- num_prots_html = num_prots_to_html(opt.c.to_f, actual_fpr, num_prots)
298
+ if opt.c || opt.cut_at
299
+ (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
300
+ num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
301
+ end
302
+ if opt.cut_at
303
+ filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
299
304
  end
300
305
 
306
+ output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
307
+
301
308
  table_string = table do
302
309
  tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
303
310
  end
304
- er_info = opt.fpr ? error_info(file) : ""
305
- html_pieces = [outfn, header, fpr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
311
+ er_info = opt.precision ? error_info(file) : ""
312
+ html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
306
313
  print_html_pieces(*html_pieces)
307
314
  end # proph_output
308
315
 
@@ -311,7 +318,7 @@ class Runner
311
318
  "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
312
319
  end
313
320
 
314
- def bioworks_output(file, outfn, opt, fpr_output_as_html)
321
+ def bioworks_output(file, outfn, opt, fppr_output_as_html)
315
322
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
316
323
  num_cols = header_anchors.size
317
324
  theaders = ths(header_anchors)
@@ -339,7 +346,7 @@ class Runner
339
346
  table_string = table do
340
347
  tr{theaders} + rows
341
348
  end
342
- print_html_pieces(outfn, header, fpr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
349
+ print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
343
350
  end # bioworks_output
344
351
 
345
352
  def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
@@ -358,7 +365,7 @@ class Runner
358
365
  #puts lines ?? is this supposed to be commented out?
359
366
  lines = lines.reject do |obj| obj =~ /\*{10}/ end
360
367
  lines.map! do |line| "#{line}<br/>" end
361
- "<div class=\"fpr\">
368
+ "<div class=\"fppr\">
362
369
  <h3>Classification Analysis</h3>
363
370
  #{lines.join("\n")}
364
371
  </div>"
@@ -366,7 +373,7 @@ class Runner
366
373
 
367
374
  # transforms the output string of file_as_decoy into html
368
375
  def prefix_as_decoy_to_html(string)
369
- "<div class=\"fpr\">
376
+ "<div class=\"fppr\">
370
377
  <h3>Classification Analysis</h3>
371
378
  </div>" +
372
379
  string
@@ -384,21 +391,18 @@ class Runner
384
391
  op.separator " outputs: <file>.summary.html"
385
392
  op.separator ""
386
393
  op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
387
- op.separator(" if --fpr then -f is used to specify a file or prefix")
388
- op.separator(" to indicate false positives.")
394
+ op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
395
+ op.separator(" if --precision then -f is used to specify a file or prefix")
396
+ op.separator(" that indicates the false positives.")
389
397
  op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
390
398
  op.separator ""
391
- op.separator "Options for False Positive Rate:"
392
- op.on("--fpr", "include output of false_positive_rate.rb,") {|v| opt.fpr = v}
393
- op.separator(" type 'false_positive_rate.rb' for details")
394
- op.separator(" These options are passed on:")
395
- op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
396
- op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
397
- op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
398
-
399
+ op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
400
+ op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
401
+ op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
399
402
  op.separator ""
400
403
  op.separator "Specific to ProteinProphet (with no concatenated DB):"
401
- op.on("-c", "--cutoff percent", "displays red line at given % fpr") {|v| opt.c = v }
404
+ op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
405
+ op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
402
406
  end
403
407
 
404
408
  opts.parse!
@@ -408,31 +412,23 @@ class Runner
408
412
  exit
409
413
  end
410
414
 
411
- fpr_output_as_html = ''
415
+ fppr_output_as_html = ''
412
416
  files = argv.to_a
413
417
  files.each do |file|
414
418
  outfn = file.gsub(/\.xml$/, '.summary.html')
415
419
  ## False Positive Rate Calculation:
416
- if opt.fpr
420
+ if opt.precision
417
421
  opt.o = outfn # won't actually be written over, but used
418
- to_use_argv = create_false_positive_rate_argv(file, opt)
419
- (out_string, opt, file_as_decoy) = SpecID::FalsePositiveRate.new.false_positive_rate(to_use_argv)
420
- if file_as_decoy ## need to wrap this guy up in some html
421
- ## DISABLE the opt.f (it's a filename) so it doesn't interfere with
422
- ## filtering:
423
- opt.f = nil
424
-
425
- fpr_output_as_html = file_as_decoy_to_html(out_string)
426
- else
427
- fpr_output_as_html = prefix_as_decoy_to_html(out_string)
428
- end
422
+ to_use_argv = create_precision_argv(file, opt)
423
+ (out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
424
+ fppr_output_as_html = prefix_as_decoy_to_html(out_string)
429
425
  end
430
426
 
431
427
  case SpecID.file_type(file)
432
428
  when "protproph"
433
- proph_output(file, outfn, opt, fpr_output_as_html)
429
+ proph_output(file, outfn, opt, fppr_output_as_html)
434
430
  when "bioworks"
435
- bioworks_output(file, outfn, opt, fpr_output_as_html)
431
+ bioworks_output(file, outfn, opt, fppr_output_as_html)
436
432
  else
437
433
  abort "filetype for #{file} not recognized!"
438
434
  end
@@ -440,15 +436,12 @@ class Runner
440
436
 
441
437
  end # method go
442
438
 
443
- def create_false_positive_rate_argv(file, opt)
439
+ def create_precision_argv(file, opt)
444
440
  # include only those options specific
445
441
  new_argv = [file]
446
442
  if opt.f ; new_argv << '-f' << opt.f end
447
- if opt.g ; new_argv << '-g' end
448
- if opt.p ; new_argv << '-p' end
449
- if opt.n ; new_argv << '-n' end
450
443
  if opt.o ; new_argv << '-o' << opt.o end
451
- new_argv
444
+ new_argv
452
445
  end
453
446
 
454
447
  end # Runner
data/changelog.txt ADDED
@@ -0,0 +1,34 @@
1
+
2
+ 1. A couple of scripts and subroutines were hashing peptides but not on the file
3
+ basename. This would result in slightly incorrect results (any time there
4
+ were overlapping scan numbers in multiple datasets, only the top one would be
5
+ chosen). The results would be correct for single runs.
6
+
7
+ Output files that could be affected:
8
+ *.top_per_scan.txt
9
+ *.all_peps_per_scan.txt
10
+
11
+ Scripts that could be affected:
12
+ script/top_hit_per_scan.rb
13
+ bin/filter_spec_id.rb
14
+ script/filter-peps.rb
15
+ bin/id_precision.rb
16
+
17
+ Subroutines that were affected:
18
+ spec_id.rb (pep_probs_by_* )
19
+ spec_id.rb (top_peps_prefilter!)
20
+ proph.rb uniq_by_seqcharge
21
+ align.rb called uniq_by_seqcharge
22
+
23
+
24
+ 2. false_positive_rate.rb and protein_summary.rb (by extension) were labeling
25
+ the x axis as number of true positives while in reality I was plotting the
26
+ number of hits. I've updated x axis labels to reflect this change. In
27
+ addition, since the term 'false positive rate' has such a distinct definition
28
+ in classical ROC plots and binary statistics, I've decided to work primarily
29
+ in terms of precision (TP/(TP+FP)). I've purged the terms 'False Positive
30
+ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
31
+ called the False Positive Predictive Rate (FPPR). I will probably implement
32
+ this in a future release.
33
+
34
+
data/lib/align.rb CHANGED
@@ -1,6 +1,5 @@
1
1
 
2
2
  require 'spec/mzxml/parser'
3
- require 'hash_by'
4
3
  require 'spec/msrun'
5
4
  require 'spec_id/proph'
6
5
  require 'vec'