mspire 0.3.1 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/Rakefile +2 -2
  2. data/bin/bioworks_to_pepxml.rb +15 -3
  3. data/bin/ms_to_lmat.rb +2 -1
  4. data/bin/sqt_group.rb +26 -0
  5. data/changelog.txt +36 -0
  6. data/lib/ms/msrun.rb +3 -1
  7. data/lib/ms/parser/mzdata/dom.rb +14 -14
  8. data/lib/ms/scan.rb +3 -3
  9. data/lib/mspire.rb +1 -1
  10. data/lib/sample_enzyme.rb +39 -0
  11. data/lib/spec_id.rb +18 -0
  12. data/lib/spec_id/aa_freqs.rb +6 -9
  13. data/lib/spec_id/digestor.rb +16 -17
  14. data/lib/spec_id/mass.rb +63 -1
  15. data/lib/spec_id/parser/proph.rb +101 -2
  16. data/lib/spec_id/precision/filter.rb +3 -2
  17. data/lib/spec_id/precision/filter/cmdline.rb +3 -1
  18. data/lib/spec_id/precision/filter/output.rb +1 -0
  19. data/lib/spec_id/precision/prob.rb +88 -21
  20. data/lib/spec_id/precision/prob/cmdline.rb +28 -16
  21. data/lib/spec_id/precision/prob/output.rb +8 -2
  22. data/lib/spec_id/proph/pep_summary.rb +25 -12
  23. data/lib/spec_id/sequest.rb +28 -0
  24. data/lib/spec_id/sequest/pepxml.rb +142 -197
  25. data/lib/spec_id/sqt.rb +349 -0
  26. data/lib/spec_id/srf.rb +33 -23
  27. data/lib/validator.rb +40 -57
  28. data/lib/validator/aa.rb +3 -90
  29. data/lib/validator/aa_est.rb +112 -0
  30. data/lib/validator/cmdline.rb +163 -31
  31. data/lib/validator/decoy.rb +15 -7
  32. data/lib/validator/digestion_based.rb +5 -4
  33. data/lib/validator/q_value.rb +32 -0
  34. data/script/peps_per_bin.rb +67 -0
  35. data/script/sqt_to_meta.rb +24 -0
  36. data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
  37. data/specs/bin/fasta_shaker_spec.rb +2 -2
  38. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
  39. data/specs/bin/filter_and_validate_spec.rb +25 -6
  40. data/specs/bin/ms_to_lmat_spec.rb +2 -2
  41. data/specs/bin/prob_validate_spec.rb +5 -3
  42. data/specs/sample_enzyme_spec.rb +86 -1
  43. data/specs/spec_helper.rb +11 -9
  44. data/specs/spec_id/bioworks_spec.rb +2 -1
  45. data/specs/spec_id/precision/filter_spec.rb +5 -5
  46. data/specs/spec_id/precision/prob_spec.rb +0 -67
  47. data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
  48. data/specs/spec_id/protein_summary_spec.rb +4 -4
  49. data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
  50. data/specs/spec_id/sequest_spec.rb +38 -0
  51. data/specs/spec_id/sqt_spec.rb +111 -3
  52. data/specs/spec_id_spec.rb +2 -0
  53. data/specs/transmem/phobius_spec.rb +3 -1
  54. data/specs/transmem/toppred_spec.rb +1 -1
  55. data/specs/validator/aa_est_spec.rb +66 -0
  56. data/specs/validator/aa_spec.rb +1 -68
  57. data/specs/validator/background_spec.rb +2 -0
  58. data/specs/validator/bias_spec.rb +3 -27
  59. data/specs/validator/decoy_spec.rb +2 -2
  60. data/specs/validator/transmem_spec.rb +2 -1
  61. data/test_files/small.sqt +87 -0
  62. metadata +312 -293
data/lib/validator.rb CHANGED
@@ -1,19 +1,34 @@
1
1
 
2
2
  class Validator
3
3
 
4
+ # in the absence of digestion, does the spec_id type require pephits for
5
+ # validation?
6
+ def self.requires_pephits?(spec_id_obj)
7
+ case spec_id_obj
8
+ when Proph::ProtSummary : true
9
+ when Proph::PepSummary : true
10
+ when SQTGroup : true
11
+ else ; false
12
+ end
13
+ end
14
+
4
15
  Validator_to_string = {
5
16
  'Validator::AA' => 'badAA',
17
+ 'Validator::AAEst' => 'badAAEst',
6
18
  'Validator::Decoy' => 'decoy',
7
19
  'Validator::Transmem::Protein' => 'tmm',
8
20
  'Validator::TruePos' => 'tps',
9
21
  'Validator::Bias' => 'bias',
10
22
  'Validator::Probability' => 'prob',
23
+ 'Validator::QValue' => 'qval',
11
24
  :bad_aa => 'badAA',
25
+ :bad_aa_est => 'badAAEst',
12
26
  :decoy => 'decoy',
13
27
  :tmm => 'tmm',
14
28
  :tps => 'tps',
15
29
  :bias => 'bias',
16
30
  :prob => 'prob',
31
+ :qval => 'qval',
17
32
  }
18
33
 
19
34
  def initialize_increment
@@ -45,12 +60,12 @@ class Validator
45
60
  @increment_tps += tps.size
46
61
  @increment_fps += fps.size
47
62
  (num_tps, num_fps) =
48
- if self.respond_to?(:calc_precision_prep) # for digestion based validators
49
- (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
50
- [num_tps, num_fps]
51
- else
52
- [@increment_tps, @increment_fps]
53
- end
63
+ if self.respond_to?(:calc_precision_prep) # for digestion based validators
64
+ (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
65
+ [num_tps, num_fps]
66
+ else
67
+ [@increment_tps, @increment_fps]
68
+ end
54
69
  calc_precision(num_tps, num_fps)
55
70
  end
56
71
 
@@ -97,12 +112,16 @@ class Validator
97
112
  case val
98
113
  when Validator::TruePos
99
114
  hash.merge( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
115
+ when Validator::AAEst
116
+ %w(frequency background calculated_background).each do |cat|
117
+ hash[cat.to_sym] = val.send(cat.to_sym)
118
+ end
100
119
  when Validator::AA
101
- %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
120
+ %w(false_to_total_ratio background calculated_background).each do |cat|
102
121
  hash[cat.to_sym] = val.send(cat.to_sym)
103
122
  end
104
123
  when Validator::Decoy
105
- %w(correct_wins decoy_on_match).each do |cat|
124
+ %w(decoy_to_target_ratio correct_wins decoy_on_match).each do |cat|
106
125
  hash[cat.to_sym] = val.send(cat.to_sym)
107
126
  end
108
127
  hash[:constraint] = val.constraint.inspect if val.constraint
@@ -119,6 +138,8 @@ class Validator
119
138
  %w(prob_method).each do |cat|
120
139
  hash[cat.to_sym] = val.send(cat.to_sym)
121
140
  end
141
+ when Validator::QValue
142
+ # no params to add
122
143
  else ; raise ArgumentError, "Don't know the validator class #{val}"
123
144
  end
124
145
  klass_as_s = val.class.to_s
@@ -127,46 +148,6 @@ class Validator
127
148
  hash
128
149
  end
129
150
  end
130
-
131
- =begin
132
- ## THIS IS WITH STRINGS AS KEYS!
133
- # takes an array of validators and returns a fresh array where each has been
134
- # turned into a sensible hash (with symbols as the keys!)
135
- def self.sensible_validator_hashes(validators)
136
- validators.map do |val|
137
- hash = {}
138
- case val
139
- when Validator::TruePos
140
- hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
141
- when Validator::AA
142
- %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
143
- hash[cat] = val.send(cat.to_sym)
144
- end
145
- when Validator::Decoy
146
- %w(correct_wins decoy_on_match).each do |cat|
147
- hash[cat] = val.send(cat.to_sym)
148
- end
149
- hash['constraint'] = val.constraint.inspect if val.constraint
150
- when Validator::Bias
151
- %w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
152
- hash[cat] = val.send(cat.to_sym)
153
- end
154
- hash['file'] = val.fasta.filename
155
- when Validator::Transmem::Protein
156
- %w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
157
- hash[cat] = val.send(cat.to_sym)
158
- end
159
- when Validator::Probability
160
- else ; raise ArgumentError, "Don't know the validator class #{val}"
161
- end
162
- klass_as_s = val.class.to_s
163
- hash['type'] = Validator_to_string[klass_as_s]
164
- hash['class'] = klass_as_s
165
- hash
166
- end
167
- end
168
- =end
169
-
170
151
  end
171
152
 
172
153
  module Precision::Calculator
@@ -186,11 +167,11 @@ end
186
167
  # normal hits (which may be true or false) and the second are decoy hits.
187
168
  # edge case: if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
188
169
  module Precision::Calculator::Decoy
189
- def calc_precision(num_normal, num_decoy)
170
+ def calc_precision(num_normal, num_decoy, decoy_to_target_ratio=1.0)
190
171
  # will calculate as floats in case fractional amounts passed in for
191
172
  # whatever reason
192
173
  num_normal_f = num_normal.to_f
193
- num_true_pos = num_normal.to_f - num_decoy
174
+ num_true_pos = num_normal_f - (num_decoy.to_f / decoy_to_target_ratio)
194
175
  precision =
195
176
  if num_normal_f == 0.0
196
177
  if num_decoy.to_f > 0.0
@@ -204,11 +185,13 @@ module Precision::Calculator::Decoy
204
185
  end
205
186
  end
206
187
 
207
- require 'validator/true_pos'
208
- require 'validator/aa'
209
- require 'validator/bias'
210
- require 'validator/decoy'
211
- require 'validator/transmem'
212
- require 'validator/probability'
213
- require 'validator/prot_from_pep'
188
+ #require 'validator/true_pos'
189
+ #require 'validator/aa'
190
+ #require 'validator/aa_est'
191
+ #require 'validator/bias'
192
+ #require 'validator/decoy'
193
+ #require 'validator/transmem'
194
+ #require 'validator/probability'
195
+ #require 'validator/q_value'
196
+ #require 'validator/prot_from_pep'
214
197
 
data/lib/validator/aa.rb CHANGED
@@ -1,4 +1,3 @@
1
- require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
2
1
  require 'validator/digestion_based'
3
2
  require 'fasta'
4
3
  require 'spec_id/aa_freqs'
@@ -12,12 +11,7 @@ class Validator::AA < Validator::DigestionBased
12
11
 
13
12
  # it is a false hit if the amino acid is located in the peptide
14
13
  attr_accessor :false_if_found
15
-
16
- # if given, the frequency of the amino acid is used to estimate the false to
17
- # total ratio based on the pephits given for pephit_precision.
18
- # see Validator::AA.calc_frequency to calculate a frequency
19
- attr_accessor :frequency
20
-
14
+
21
15
  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
22
16
  :false_if_found => true,
23
17
  } )
@@ -34,21 +28,9 @@ class Validator::AA < Validator::DigestionBased
34
28
  end
35
29
  end
36
30
 
37
- # takes a fasta object and sets the frequency based on constraint.
38
- # constraint is one acceptable to initialize!
39
- # returns self
40
- def set_frequency(fasta_obj)
41
- table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
42
- @frequency = table[@constraint.to_sym]
43
- self
44
- end
45
-
46
31
  # right now only accepts single amino acids as constraints (as a string,
47
32
  # e.g. 'C', or symbol, e.g. :C)
48
33
  # options:
49
- # :frequency OR :false_to_total_ratio should be used (NOT both)
50
- # :frequency => Float, if the frequency of the amino acid is known (see
51
- # Validator::AA.calc_frequency)
52
34
  # :false_to_total_ratio => if a true digestion was already performed (see
53
35
  # Validator::AA.calc_false_to_total_ratio)
54
36
  # :false_if_found => it is a false positive if the amino acid is found.
@@ -56,80 +38,11 @@ class Validator::AA < Validator::DigestionBased
56
38
  def initialize(constraint, options={})
57
39
  @constraint = constraint.to_s
58
40
  opts = DEFAULTS.merge(options)
59
- (@frequency, @false_to_total_ratio, @false_if_found, @background) = opts.values_at(:frequency, :false_to_total_ratio, :false_if_found, :background)
60
- end
61
-
62
- # if expected is 0 then will return precision = 1.0
63
- def pephit_precision(peps)
64
- if @frequency
65
- (actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
66
- if expected == 0.0
67
- 1.0
68
- else
69
- # what's this guy ?? good for??
70
- fraction_of_expected = actual.to_f/expected
71
- pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
72
- end
73
- elsif @false_to_total_ratio
74
- super(peps)
75
- else
76
- raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
77
- end
78
- end
79
-
80
- # returns (Actual(Int), Expected(Float)) based on how many peptides have at
81
- # least one amino_acid, the frequency it is observed in background (then we
82
- # can look at the size of each peptide and determine the likelihood of
83
- # having the peptide with at least one amino acid).
84
- # amino_acid should be a string (e.g., 'C')
85
- def at_least_one(amino_acid, freq, amino_acid_seqs)
86
- one_minus_freq = 1.0 - freq
87
- probs = []
88
- actual = 0
89
- expected = 0.0
90
- amino_acid_seqs.each do |aaseq|
91
- expected += (1.0 - (one_minus_freq**aaseq.size))
92
- if aaseq.include?(amino_acid)
93
- actual += 1
94
- end
95
- end
96
- [actual, expected]
97
- end
98
-
99
-
100
- # given: (actual # with 'AA', expected # with 'AA', total#peptides,
101
- # mean_fraction_of_cysteines_true)
102
- #
103
- # PepHit('AA') = Peptide containing at least one 'AA'
104
- # # expected PepHit('AA') # observed Bad Pep ('AA')
105
- # ----------------------- proportional_to -------------------------
106
- # # total PepHits # Total Bad PepHit
107
- #
108
- # returns the precision
109
- # the background correction factor will not reduce the actual count of
110
- # peptides to < 0. One can still get negative precision scores, however,
111
- # depending on the other variables.
112
- # background is the number of peptides with the amino acid in the purest
113
- # sample over the total number of peps.
114
- #---
115
- # this is thoroughly explained in my 2007_09 presentations (inkscape)
116
- #+++
117
- def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
118
- actual = actual.to_f
119
- @calculated_background = actual / total_peps
120
- actual -= (total_peps * background)
121
- # We were doing it compared to the number expected.. but this is more
122
- # clear
123
- # actual/false_hits = expected/total_peps_passing
124
- # false_hits = (total_peps_passing * actual) / expected
125
- if actual < 0.0 ; actual = 0.0 end
126
- total_number_false = (actual * total_peps).to_f / expected
127
- #fppr = total_number_false / total_peps
128
- prec = (total_peps - total_number_false) / total_peps
41
+ (@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
129
42
  end
130
43
 
131
44
  def to_param_string
132
- "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
45
+ "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
133
46
  end
134
47
  end
135
48
 
@@ -0,0 +1,112 @@
1
+ require 'validator/aa'
2
+
3
+
4
+ class Validator ; end
5
+ class Validator::AA ; end
6
+
7
+ # A class that uses the peps given to it and a background frequency to
8
+ # calculate the false_to_total_ratio at each turn.
9
+ class Validator::AAEst < Validator::AA
10
+ attr_accessor :constraint
11
+ attr_accessor :false_if_found
12
+
13
+ # the frequency of the amino acid is used to estimate the false to
14
+ # total ratio based on the pephits given for pephit_precision.
15
+ # see Validator::AA.calc_frequency to calculate a frequency
16
+ # or use set_frequency to set from pep hits.
17
+ attr_accessor :frequency
18
+
19
+ DEFAULTS = {
20
+ :false_if_found => true
21
+ }.merge(Validator::DigestionBased::DEFAULTS) # background 0.0
22
+
23
+ # only takes a string right now for constraint
24
+ def initialize(constraint, options={})
25
+ @constraint = constraint.to_s
26
+ opts = DEFAULTS.merge(options)
27
+ (@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
28
+ end
29
+
30
+ def pephit_precision(peps)
31
+ set_false_to_total_ratio(peps)
32
+ super(peps)
33
+ end
34
+
35
+ def set_false_to_total_ratio(peps)
36
+ if peps.size > 0
37
+ expected = 0.0
38
+ peps.each do |pep|
39
+ expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
40
+ end
41
+ @false_to_total_ratio = expected / peps.size
42
+ else
43
+ @false_to_total_ratio = 1.0
44
+ end
45
+ end
46
+
47
+ def set_ongoing_false_to_total_ratio(peps)
48
+ if peps.size > 0
49
+ peps.each do |pep|
50
+ @expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
51
+ end
52
+ # @increment_total_submitted should == (@increment_tps + @increment_fps)
53
+ # since each submitted hit is counted as either a tp or an fp
54
+ @false_to_total_ratio = @expected / @increment_total_submitted
55
+ else
56
+ @false_to_total_ratio = 1.0
57
+ end
58
+ end
59
+
60
+
61
+ def to_param_string
62
+ "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
63
+ end
64
+
65
+ # takes objects responding to aaseq and sets the frequency based on
66
+ # constraint. constraint is one acceptable to initialize! returns self
67
+ def set_frequency(objs)
68
+ table = SpecID::AAFreqs.new.calculate_frequencies(objs)
69
+ @frequency = table[@constraint.to_sym]
70
+ self
71
+ end
72
+
73
+ # if adding pephits in groups at a time, the entire group does not need to be
74
+ # queried, just the individual hit. Use this OR pephit_precision (NOT
75
+ # both). The initial query to this method will begin a running tally that
76
+ # is saved by the validator.
77
+ # takes either an array or a single pephit (determined by if it is a
78
+ # SpecID::Pep)
79
+ def increment_pephits_precision(peps)
80
+ tmp = $VERBOSE; $VERBOSE = nil
81
+ unless @increment_initialized
82
+ initialize_increment
83
+ @expected = 0.0
84
+ end
85
+ $VERBOSE = tmp
86
+
87
+ to_submit =
88
+ if peps.is_a? SpecID::Pep
89
+ [peps]
90
+ else
91
+ peps
92
+ end
93
+ @increment_total_submitted += to_submit.size
94
+ (tps, fps) = partition(to_submit)
95
+ #### THIS IS THE MAGIC FOR THIS VALIDATOR:
96
+ set_ongoing_false_to_total_ratio(to_submit)
97
+
98
+ @increment_tps += tps.size
99
+ @increment_fps += fps.size
100
+ (num_tps, num_fps) =
101
+ if self.respond_to?(:calc_precision_prep) # for digestion based validators
102
+ (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
103
+ [num_tps, num_fps]
104
+ else
105
+ [@increment_tps, @increment_fps]
106
+ end
107
+ calc_precision(num_tps, num_fps)
108
+ end
109
+
110
+
111
+
112
+ end
@@ -1,14 +1,31 @@
1
1
  require 'validator'
2
2
 
3
+ require 'validator/true_pos'
4
+ require 'validator/aa'
5
+ require 'validator/aa_est'
6
+ require 'validator/bias'
7
+ require 'validator/decoy'
8
+ require 'validator/transmem'
9
+ require 'validator/probability'
10
+ require 'validator/q_value'
11
+ require 'validator/prot_from_pep'
12
+
13
+ ## these are all required just for a stupid check...
14
+ require 'spec_id/sqt'
15
+ require 'spec_id/proph/prot_summary'
16
+ require 'spec_id/proph/pep_summary'
17
+
3
18
  class Validator::Cmdline
4
19
 
5
20
  Validator_symbols_to_classes = {
6
21
  :tmm => Validator::Transmem::Protein,
7
22
  :decoy => Validator::Decoy,
8
23
  :bad_aa => Validator::AA,
24
+ :bad_aa_est => Validator::AAEst,
9
25
  :tps => Validator::TruePos,
10
26
  :bias => Validator::Bias,
11
27
  :prob => Validator::Probability,
28
+ :qval => Validator::QValue,
12
29
  }
13
30
  # was VAL_DEFAULTS
14
31
  DEFAULTS = {
@@ -24,11 +41,16 @@ class Validator::Cmdline
24
41
  {
25
42
  :hits_together => true,
26
43
  :decoy_on_match => true,
44
+ :decoy_to_target_ratio => 1.0,
27
45
  },
28
46
  :bad_aa =>
29
47
  {
30
48
  :false_if_found => true,
31
- :estimate => true,
49
+ :bkg => 0.0,
50
+ },
51
+ :bad_aa_est =>
52
+ {
53
+ :false_if_found => true,
32
54
  :bkg => 0.0,
33
55
  },
34
56
  :bias =>
@@ -39,7 +61,7 @@ class Validator::Cmdline
39
61
  :ties => true,
40
62
  }
41
63
  COMMAND_LINE = {
42
- :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
64
+ :decoy => ["--decoy /REGEXP/|FILENAME[,DTR,DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
43
65
  "FILENAME of separate search on decoys.",
44
66
  "All regular expressions must be surrounded by '/'",
45
67
  "(no extended options [trailing modifiers]).",
@@ -50,21 +72,30 @@ class Validator::Cmdline
50
72
  " --decoy '/^\\s*REVERSE/'",
51
73
  "If decoys proteins were searched in a separate file,",
52
74
  "then give the FILENAME (e.g., --decoy decoy.srg)",
75
+ "DTR = Decoy to Target Ratio (default: #{DEFAULTS[:decoy][:decoy_to_target_ratio]})",
53
76
  "DOM = *true/false, decoy on match",],
54
77
  :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
55
78
  "fasta file containing the true protein hits"],
56
79
  # may require digestion:
57
- :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
58
- "information (that is shared between them).",
80
+ :fasta => ["--fasta FASTA", "fasta file for phobius transmembrane",
81
+ "(needed if PEPS options is not false)"],
82
+ :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "[not recommended]",
83
+ "Creates the 'false/total' ratio with in silico",
84
+ "digestion. Otherwise, the 3rd-10th best hits (sorted by",
85
+ "xcorr) are used.",
86
+ "The following validators will use this",
87
+ "information (shared between them) if option given",
59
88
  "ORIG_FASTA = the fasta file used to do the run",
60
89
  "PARAMS = the params file used to do the run",],
61
90
  :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
62
91
  "PE = *true|false proteins in fasta file expected in sample",
63
92
  "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
64
- :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
93
+ :bad_aa => ["--bad_aa AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
65
94
  "AA = The amino acid (e.g., 'C')",
66
- "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
67
95
  "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
96
+ :bad_aa_est => ["--bad_aa_est AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
97
+ "AA = The amino acid (e.g., 'C')",
98
+ "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa_est][:bkg]}):",],
68
99
 
69
100
  :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
70
101
  "phobius.small:",
@@ -110,17 +141,27 @@ class Validator::Cmdline
110
141
  end
111
142
  opts[:validators].push([:prob, mthd])
112
143
  },
144
+ :qval => lambda {|ar, opts| opts[:validators].push([:qval]) },
113
145
  :decoy => lambda {|ar, opts|
114
146
  myargs = [:decoy]
115
147
  first_arg = ar[0]
116
- myargs[1] =
148
+ val_opts = {}
149
+ val_opts[:constraint] =
117
150
  if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
151
+ # cast as a regular expression if it has '/ /'
118
152
  Regexp.new(first_arg[1...-1])
119
153
  else
154
+ # assume that it is a filename
155
+ raise ArgumentError, "File does not exist: #{first_arg}\n(was this supposed to be a regular expression? if so, should be given: /#{first_arg}/)" unless File.exist?(first_arg)
120
156
  first_arg
121
157
  end
122
- myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
123
- opts[:validators].push(myargs)
158
+ val_opts[:decoy_to_target_ratio] = (ar[1] || DEFAULTS[:decoy][:decoy_to_target_ratio]).to_f
159
+ val_opts[:decoy_on_match] = self.boolean(ar[2], DEFAULTS[:decoy][:decoy_on_match])
160
+ myargs.push(val_opts)
161
+ opts[:validators].push(myargs)
162
+ },
163
+ :fasta => lambda {|arg, opts|
164
+ opts[:fasta] = Fasta.new(arg)
124
165
  },
125
166
  :digestion => lambda {|ar, opts|
126
167
  raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
@@ -138,6 +179,9 @@ class Validator::Cmdline
138
179
  else
139
180
  DEFAULTS[:bias][:bkg]
140
181
  end
182
+ if ar[3]
183
+ val_opts[:false_to_total_ratio] = ar[3].to_f
184
+ end
141
185
  myargs.push(val_opts)
142
186
  opts[:validators].push(myargs)
143
187
  },
@@ -146,16 +190,36 @@ class Validator::Cmdline
146
190
  myargs = [:bad_aa]
147
191
  myargs.push( ar[0] )
148
192
  val_opts = {}
149
- val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
150
193
  val_opts[:background] =
151
- if ar[2]
152
- ar[2].to_f
194
+ if ar[1]
195
+ ar[1].to_f
153
196
  else
154
197
  DEFAULTS[:bad_aa][:bkg]
155
198
  end
199
+ if ar[2]
200
+ val_opts[:false_to_total_ratio] = ar[2].to_f
201
+ end
156
202
  myargs.push(val_opts)
157
203
  opts[:validators].push(myargs)
158
204
  },
205
+ :bad_aa_est => lambda {|ar, opts|
206
+ ## GET the FREQUENCY
207
+ myargs = [:bad_aa_est]
208
+ myargs.push( ar[0] )
209
+ val_opts = {}
210
+ val_opts[:background] =
211
+ if ar[1]
212
+ ar[1].to_f
213
+ else
214
+ DEFAULTS[:bad_aa_est][:bkg]
215
+ end
216
+ if ar[2]
217
+ val_opts[:frequency] = ar[2].to_f
218
+ end
219
+ myargs.push(val_opts)
220
+ opts[:validators].push(myargs)
221
+ },
222
+
159
223
  :tmm => lambda {|ar, opts|
160
224
  myargs = [:tmm]
161
225
  myargs.push( ar[0] )
@@ -177,16 +241,38 @@ class Validator::Cmdline
177
241
  if ar[4] ; ar[4].to_f
178
242
  else ; DEFAULTS[:tmm][:bkg]
179
243
  end
244
+ if ar[5]
245
+ val_opts[:false_to_total_ratio] = ar[5].to_f
246
+ end
180
247
  myargs.push(val_opts)
181
248
  opts[:validators].push( myargs )
182
249
  },
250
+ :pephits => lambda {|v,opts| opts[:pephits] = SpecID.new(v) },
183
251
  :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
184
252
  :false_on_tie => lambda {|v,opts| opts[:ties] = false },
185
253
  }
186
254
 
255
+ def self.requires_pephits?(spec_id_obj)
256
+ case spec_id_obj
257
+ when Proph::ProtSummary : true
258
+ # at least currently (subject to change)
259
+ when Proph::PepSummary : true
260
+ when SQTGroup
261
+ if spec_id_obj.peps.first.respond_to?(:q_value)
262
+ # it's percolator output and we don't have other hits to use
263
+ true
264
+ else
265
+ false
266
+ end
267
+ else ; false
268
+ end
269
+ end
270
+
187
271
  # remove the keys from opts involved in validators and return an array
188
272
  # of validators
189
- def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
273
+ # postfilter is one of :top_per_scan, :top_per_aaseq,
274
+ # :top_per_aaseq_charge (of which last two are subsets of scan)
275
+ def self.prepare_validators(opts, false_on_tie, interactive, postfilter, spec_id)
190
276
  validator_args = opts[:validators]
191
277
  correct_wins = !false_on_tie
192
278
  need_false_to_total_ratio = []
@@ -199,7 +285,9 @@ class Validator::Cmdline
199
285
  case tp
200
286
  when :tmm
201
287
  val_args[1][:correct_wins] = correct_wins
202
- val_args[1][:fasta] = opts[:digestion_objects][0]
288
+ if opts.key?(:fasta)
289
+ val_args[1][:fasta] = opts[:fasta]
290
+ end
203
291
  val_args
204
292
  when :bias
205
293
  val_args[1][:correct_wins] = correct_wins
@@ -208,10 +296,10 @@ class Validator::Cmdline
208
296
  val_args = [val_args[0], correct_wins]
209
297
  val_args
210
298
  when :decoy
211
- val_args = [val_args[0], val_args[1], correct_wins]
299
+ val_args[0][:correct_wins] = correct_wins
212
300
  # don't delete the key here since we need the decoy = regexp key
213
301
  val_args
214
- else ## bad_aa and prob are represented here:
302
+ else ## bad_aa, prob, and qval are represented here:
215
303
  val_args
216
304
  end
217
305
  val = Validator_symbols_to_classes[tp].new( *val_args )
@@ -219,10 +307,12 @@ class Validator::Cmdline
219
307
  if tp == :tmm
220
308
  transmem_vals << val
221
309
  end
222
- potential_digestion_classes = /Transmem|AA|Bias/
310
+ potential_digestion_classes = /Transmem|AA|AAEst|Bias/
223
311
  if val.class.to_s =~ potential_digestion_classes
224
- if val_args[1][:estimate] == true
225
- need_frequency << val
312
+ if val.class.to_s == 'Validator::AAEst'
313
+ need_frequency.push(val) if val.frequency.nil?
314
+ elsif !(val.false_to_total_ratio.nil?)
315
+ $stderr.puts "using false_to_total_ratio: #{val.false_to_total_ratio}"
226
316
  else
227
317
  need_false_to_total_ratio << val
228
318
  end
@@ -230,20 +320,62 @@ class Validator::Cmdline
230
320
  val
231
321
  end
232
322
 
233
- if need_false_to_total_ratio.size > 0
234
- raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
235
- peps = Digestor.digest( *(opts[:digestion_objects]) )
236
- need_false_to_total_ratio.each do |val|
237
- val.set_false_to_total_ratio( peps )
238
- end
239
- end
240
- if need_frequency.size > 0
241
- raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
242
- need_frequency.each do |val|
243
- val.set_frequency( opts[:digestion_objects][0] )
323
+ if ((need_false_to_total_ratio.size > 0) or (need_frequency.size > 0))
324
+ if opts.key?(:digestion_objects)
325
+ #raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
326
+ peps = Digestor.digest( *(opts[:digestion_objects]) )
327
+ need_false_to_total_ratio.each do |val|
328
+ val.set_false_to_total_ratio( peps )
329
+ end
330
+ if need_frequency.size > 0
331
+ need_frequency.each do |val|
332
+ val.set_frequency( opts[:digestion_objects][0] )
333
+ end
334
+ end
335
+ opts.delete(:digestion_objects)
336
+ else ## do the new and improved selection of non-top hits to get false_to_total_ratios and freqs
337
+ $stderr.puts "...using pephits to calculate background ratios"
338
+ # first_index, last_index
339
+ pephits =
340
+ if opts[:pephits] ## protein prophet (since it needs to get ratios somewhere)
341
+ $stderr.puts "using --pephits"
342
+ opts[:pephits].peps
343
+ elsif requires_pephits?(spec_id)
344
+ raise ArgumentError, "with objects of class '#{spec_id.class}', one of your validators requires --pephits or --digestion"
345
+ else
346
+ $stderr.puts "using given spec_id.peps"
347
+ spec_id.peps
348
+ end
349
+
350
+ not_first_or_second_peps = Sequest.other_hits_sorted_by_xcorr(pephits, 2, 9, [:base_name, :first_scan, :charge])
351
+ pephits =
352
+ case postfilter
353
+ when :top_per_scan
354
+ $stderr.puts "using top_per_scan" ; not_first_or_second_peps
355
+ when :top_per_aaseq
356
+ # it doesn't matter which one is given since validators are
357
+ # based on amino acid sequence
358
+ $stderr.puts 'using top_per_aaseq'
359
+ not_first_or_second_peps.hash_by(:aaseq).values.map {|pep| pep.first }
360
+ when :top_per_aaseq_charge
361
+ $stderr.puts 'using top_per_aaseq_charge'
362
+ not_first_or_second_peps.hash_by(:aaseq, :charge).values.map {|pep| pep.first }
363
+ else
364
+ raise ArgumentError, "must have a valid postfilter method, yours: '#{postfilter}'"
365
+ end
366
+
367
+ need_false_to_total_ratio.each do |val|
368
+ val.set_false_to_total_ratio( pephits )
369
+ $stderr.puts "false_to_total_ratio for #{val.class.to_s}: #{val.false_to_total_ratio}"
370
+ end
371
+ if need_frequency.size > 0
372
+ need_frequency.each do |val|
373
+ $stderr.puts "Setting frequency!"
374
+ val.set_frequency( pephits )
375
+ end
376
+ end
244
377
  end
245
378
  end
246
- opts.delete(:digestion_objects)
247
379
 
248
380
  if (transmem_vals.size > 0) # and interactive ## we'd like to just run this for interactive
249
381
  # This is overkill if we are doing a single filtering job, but it