mspire 0.3.1 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/Rakefile +2 -2
  2. data/bin/bioworks_to_pepxml.rb +15 -3
  3. data/bin/ms_to_lmat.rb +2 -1
  4. data/bin/sqt_group.rb +26 -0
  5. data/changelog.txt +36 -0
  6. data/lib/ms/msrun.rb +3 -1
  7. data/lib/ms/parser/mzdata/dom.rb +14 -14
  8. data/lib/ms/scan.rb +3 -3
  9. data/lib/mspire.rb +1 -1
  10. data/lib/sample_enzyme.rb +39 -0
  11. data/lib/spec_id.rb +18 -0
  12. data/lib/spec_id/aa_freqs.rb +6 -9
  13. data/lib/spec_id/digestor.rb +16 -17
  14. data/lib/spec_id/mass.rb +63 -1
  15. data/lib/spec_id/parser/proph.rb +101 -2
  16. data/lib/spec_id/precision/filter.rb +3 -2
  17. data/lib/spec_id/precision/filter/cmdline.rb +3 -1
  18. data/lib/spec_id/precision/filter/output.rb +1 -0
  19. data/lib/spec_id/precision/prob.rb +88 -21
  20. data/lib/spec_id/precision/prob/cmdline.rb +28 -16
  21. data/lib/spec_id/precision/prob/output.rb +8 -2
  22. data/lib/spec_id/proph/pep_summary.rb +25 -12
  23. data/lib/spec_id/sequest.rb +28 -0
  24. data/lib/spec_id/sequest/pepxml.rb +142 -197
  25. data/lib/spec_id/sqt.rb +349 -0
  26. data/lib/spec_id/srf.rb +33 -23
  27. data/lib/validator.rb +40 -57
  28. data/lib/validator/aa.rb +3 -90
  29. data/lib/validator/aa_est.rb +112 -0
  30. data/lib/validator/cmdline.rb +163 -31
  31. data/lib/validator/decoy.rb +15 -7
  32. data/lib/validator/digestion_based.rb +5 -4
  33. data/lib/validator/q_value.rb +32 -0
  34. data/script/peps_per_bin.rb +67 -0
  35. data/script/sqt_to_meta.rb +24 -0
  36. data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
  37. data/specs/bin/fasta_shaker_spec.rb +2 -2
  38. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
  39. data/specs/bin/filter_and_validate_spec.rb +25 -6
  40. data/specs/bin/ms_to_lmat_spec.rb +2 -2
  41. data/specs/bin/prob_validate_spec.rb +5 -3
  42. data/specs/sample_enzyme_spec.rb +86 -1
  43. data/specs/spec_helper.rb +11 -9
  44. data/specs/spec_id/bioworks_spec.rb +2 -1
  45. data/specs/spec_id/precision/filter_spec.rb +5 -5
  46. data/specs/spec_id/precision/prob_spec.rb +0 -67
  47. data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
  48. data/specs/spec_id/protein_summary_spec.rb +4 -4
  49. data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
  50. data/specs/spec_id/sequest_spec.rb +38 -0
  51. data/specs/spec_id/sqt_spec.rb +111 -3
  52. data/specs/spec_id_spec.rb +2 -0
  53. data/specs/transmem/phobius_spec.rb +3 -1
  54. data/specs/transmem/toppred_spec.rb +1 -1
  55. data/specs/validator/aa_est_spec.rb +66 -0
  56. data/specs/validator/aa_spec.rb +1 -68
  57. data/specs/validator/background_spec.rb +2 -0
  58. data/specs/validator/bias_spec.rb +3 -27
  59. data/specs/validator/decoy_spec.rb +2 -2
  60. data/specs/validator/transmem_spec.rb +2 -1
  61. data/test_files/small.sqt +87 -0
  62. metadata +312 -293
data/lib/validator.rb CHANGED
@@ -1,19 +1,34 @@
1
1
 
2
2
  class Validator
3
3
 
4
+ # in the absence of digestion, does the spec_id type require pephits for
5
+ # validation?
6
+ def self.requires_pephits?(spec_id_obj)
7
+ case spec_id_obj
8
+ when Proph::ProtSummary : true
9
+ when Proph::PepSummary : true
10
+ when SQTGroup : true
11
+ else ; false
12
+ end
13
+ end
14
+
4
15
  Validator_to_string = {
5
16
  'Validator::AA' => 'badAA',
17
+ 'Validator::AAEst' => 'badAAEst',
6
18
  'Validator::Decoy' => 'decoy',
7
19
  'Validator::Transmem::Protein' => 'tmm',
8
20
  'Validator::TruePos' => 'tps',
9
21
  'Validator::Bias' => 'bias',
10
22
  'Validator::Probability' => 'prob',
23
+ 'Validator::QValue' => 'qval',
11
24
  :bad_aa => 'badAA',
25
+ :bad_aa_est => 'badAAEst',
12
26
  :decoy => 'decoy',
13
27
  :tmm => 'tmm',
14
28
  :tps => 'tps',
15
29
  :bias => 'bias',
16
30
  :prob => 'prob',
31
+ :qval => 'qval',
17
32
  }
18
33
 
19
34
  def initialize_increment
@@ -45,12 +60,12 @@ class Validator
45
60
  @increment_tps += tps.size
46
61
  @increment_fps += fps.size
47
62
  (num_tps, num_fps) =
48
- if self.respond_to?(:calc_precision_prep) # for digestion based validators
49
- (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
50
- [num_tps, num_fps]
51
- else
52
- [@increment_tps, @increment_fps]
53
- end
63
+ if self.respond_to?(:calc_precision_prep) # for digestion based validators
64
+ (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
65
+ [num_tps, num_fps]
66
+ else
67
+ [@increment_tps, @increment_fps]
68
+ end
54
69
  calc_precision(num_tps, num_fps)
55
70
  end
56
71
 
@@ -97,12 +112,16 @@ class Validator
97
112
  case val
98
113
  when Validator::TruePos
99
114
  hash.merge( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
115
+ when Validator::AAEst
116
+ %w(frequency background calculated_background).each do |cat|
117
+ hash[cat.to_sym] = val.send(cat.to_sym)
118
+ end
100
119
  when Validator::AA
101
- %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
120
+ %w(false_to_total_ratio background calculated_background).each do |cat|
102
121
  hash[cat.to_sym] = val.send(cat.to_sym)
103
122
  end
104
123
  when Validator::Decoy
105
- %w(correct_wins decoy_on_match).each do |cat|
124
+ %w(decoy_to_target_ratio correct_wins decoy_on_match).each do |cat|
106
125
  hash[cat.to_sym] = val.send(cat.to_sym)
107
126
  end
108
127
  hash[:constraint] = val.constraint.inspect if val.constraint
@@ -119,6 +138,8 @@ class Validator
119
138
  %w(prob_method).each do |cat|
120
139
  hash[cat.to_sym] = val.send(cat.to_sym)
121
140
  end
141
+ when Validator::QValue
142
+ # no params to add
122
143
  else ; raise ArgumentError, "Don't know the validator class #{val}"
123
144
  end
124
145
  klass_as_s = val.class.to_s
@@ -127,46 +148,6 @@ class Validator
127
148
  hash
128
149
  end
129
150
  end
130
-
131
- =begin
132
- ## THIS IS WITH STRINGS AS KEYS!
133
- # takes an array of validators and returns a fresh array where each has been
134
- # turned into a sensible hash (with symbols as the keys!)
135
- def self.sensible_validator_hashes(validators)
136
- validators.map do |val|
137
- hash = {}
138
- case val
139
- when Validator::TruePos
140
- hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
141
- when Validator::AA
142
- %w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
143
- hash[cat] = val.send(cat.to_sym)
144
- end
145
- when Validator::Decoy
146
- %w(correct_wins decoy_on_match).each do |cat|
147
- hash[cat] = val.send(cat.to_sym)
148
- end
149
- hash['constraint'] = val.constraint.inspect if val.constraint
150
- when Validator::Bias
151
- %w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
152
- hash[cat] = val.send(cat.to_sym)
153
- end
154
- hash['file'] = val.fasta.filename
155
- when Validator::Transmem::Protein
156
- %w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
157
- hash[cat] = val.send(cat.to_sym)
158
- end
159
- when Validator::Probability
160
- else ; raise ArgumentError, "Don't know the validator class #{val}"
161
- end
162
- klass_as_s = val.class.to_s
163
- hash['type'] = Validator_to_string[klass_as_s]
164
- hash['class'] = klass_as_s
165
- hash
166
- end
167
- end
168
- =end
169
-
170
151
  end
171
152
 
172
153
  module Precision::Calculator
@@ -186,11 +167,11 @@ end
186
167
  # normal hits (which may be true or false) and the second are decoy hits.
187
168
  # edge case: if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
188
169
  module Precision::Calculator::Decoy
189
- def calc_precision(num_normal, num_decoy)
170
+ def calc_precision(num_normal, num_decoy, decoy_to_target_ratio=1.0)
190
171
  # will calculate as floats in case fractional amounts passed in for
191
172
  # whatever reason
192
173
  num_normal_f = num_normal.to_f
193
- num_true_pos = num_normal.to_f - num_decoy
174
+ num_true_pos = num_normal_f - (num_decoy.to_f / decoy_to_target_ratio)
194
175
  precision =
195
176
  if num_normal_f == 0.0
196
177
  if num_decoy.to_f > 0.0
@@ -204,11 +185,13 @@ module Precision::Calculator::Decoy
204
185
  end
205
186
  end
206
187
 
207
- require 'validator/true_pos'
208
- require 'validator/aa'
209
- require 'validator/bias'
210
- require 'validator/decoy'
211
- require 'validator/transmem'
212
- require 'validator/probability'
213
- require 'validator/prot_from_pep'
188
+ #require 'validator/true_pos'
189
+ #require 'validator/aa'
190
+ #require 'validator/aa_est'
191
+ #require 'validator/bias'
192
+ #require 'validator/decoy'
193
+ #require 'validator/transmem'
194
+ #require 'validator/probability'
195
+ #require 'validator/q_value'
196
+ #require 'validator/prot_from_pep'
214
197
 
data/lib/validator/aa.rb CHANGED
@@ -1,4 +1,3 @@
1
- require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
2
1
  require 'validator/digestion_based'
3
2
  require 'fasta'
4
3
  require 'spec_id/aa_freqs'
@@ -12,12 +11,7 @@ class Validator::AA < Validator::DigestionBased
12
11
 
13
12
  # it is a false hit if the amino acid is located in the peptide
14
13
  attr_accessor :false_if_found
15
-
16
- # if given, the frequency of the amino acid is used to estimate the false to
17
- # total ratio based on the pephits given for pephit_precision.
18
- # see Validator::AA.calc_frequency to calculate a frequency
19
- attr_accessor :frequency
20
-
14
+
21
15
  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
22
16
  :false_if_found => true,
23
17
  } )
@@ -34,21 +28,9 @@ class Validator::AA < Validator::DigestionBased
34
28
  end
35
29
  end
36
30
 
37
- # takes a fasta object and sets the frequency based on constraint.
38
- # constraint is one acceptable to initialize!
39
- # returns self
40
- def set_frequency(fasta_obj)
41
- table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
42
- @frequency = table[@constraint.to_sym]
43
- self
44
- end
45
-
46
31
  # right now only accepts single amino acids as constraints (as a string,
47
32
  # e.g. 'C', or symbol, e.g. :C)
48
33
  # options:
49
- # :frequency OR :false_to_total_ratio should be used (NOT both)
50
- # :frequency => Float, if the frequency of the amino acid is known (see
51
- # Validator::AA.calc_frequency)
52
34
  # :false_to_total_ratio => if a true digestion was already performed (see
53
35
  # Validator::AA.calc_false_to_total_ratio)
54
36
  # :false_if_found => it is a false positive if the amino acid is found.
@@ -56,80 +38,11 @@ class Validator::AA < Validator::DigestionBased
56
38
  def initialize(constraint, options={})
57
39
  @constraint = constraint.to_s
58
40
  opts = DEFAULTS.merge(options)
59
- (@frequency, @false_to_total_ratio, @false_if_found, @background) = opts.values_at(:frequency, :false_to_total_ratio, :false_if_found, :background)
60
- end
61
-
62
- # if expected is 0 then will return precision = 1.0
63
- def pephit_precision(peps)
64
- if @frequency
65
- (actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
66
- if expected == 0.0
67
- 1.0
68
- else
69
- # what's this guy ?? good for??
70
- fraction_of_expected = actual.to_f/expected
71
- pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
72
- end
73
- elsif @false_to_total_ratio
74
- super(peps)
75
- else
76
- raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
77
- end
78
- end
79
-
80
- # returns (Actual(Int), Expected(Float)) based on how many peptides have at
81
- # least one amino_acid, the frequency it is observed in background (then we
82
- # can look at the size of each peptide and determine the likelihood of
83
- # having the peptide with at least one amino acid).
84
- # amino_acid should be a string (e.g., 'C')
85
- def at_least_one(amino_acid, freq, amino_acid_seqs)
86
- one_minus_freq = 1.0 - freq
87
- probs = []
88
- actual = 0
89
- expected = 0.0
90
- amino_acid_seqs.each do |aaseq|
91
- expected += (1.0 - (one_minus_freq**aaseq.size))
92
- if aaseq.include?(amino_acid)
93
- actual += 1
94
- end
95
- end
96
- [actual, expected]
97
- end
98
-
99
-
100
- # given: (actual # with 'AA', expected # with 'AA', total#peptides,
101
- # mean_fraction_of_cysteines_true)
102
- #
103
- # PepHit('AA') = Peptide containing at least one 'AA'
104
- # # expected PepHit('AA') # observed Bad Pep ('AA')
105
- # ----------------------- proportional_to -------------------------
106
- # # total PepHits # Total Bad PepHit
107
- #
108
- # returns the precision
109
- # the background correction factor will not reduce the actual count of
110
- # peptides to < 0. One can still get negative precision scores, however,
111
- # depending on the other variables.
112
- # background is the number of peptides with the amino acid in the purest
113
- # sample over the total number of peps.
114
- #---
115
- # this is thoroughly explained in my 2007_09 presentations (inkscape)
116
- #+++
117
- def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
118
- actual = actual.to_f
119
- @calculated_background = actual / total_peps
120
- actual -= (total_peps * background)
121
- # We were doing it compared to the number expected.. but this is more
122
- # clear
123
- # actual/false_hits = expected/total_peps_passing
124
- # false_hits = (total_peps_passing * actual) / expected
125
- if actual < 0.0 ; actual = 0.0 end
126
- total_number_false = (actual * total_peps).to_f / expected
127
- #fppr = total_number_false / total_peps
128
- prec = (total_peps - total_number_false) / total_peps
41
+ (@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
129
42
  end
130
43
 
131
44
  def to_param_string
132
- "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
45
+ "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
133
46
  end
134
47
  end
135
48
 
@@ -0,0 +1,112 @@
1
+ require 'validator/aa'
2
+
3
+
4
+ class Validator ; end
5
+ class Validator::AA ; end
6
+
7
+ # A class that uses the peps given to it and a background frequency to
8
+ # calculate the false_to_total_ratio at each turn.
9
+ class Validator::AAEst < Validator::AA
10
+ attr_accessor :constraint
11
+ attr_accessor :false_if_found
12
+
13
+ # the frequency of the amino acid is used to estimate the false to
14
+ # total ratio based on the pephits given for pephit_precision.
15
+ # see Validator::AA.calc_frequency to calculate a frequency
16
+ # or use set_frequency to set from pep hits.
17
+ attr_accessor :frequency
18
+
19
+ DEFAULTS = {
20
+ :false_if_found => true
21
+ }.merge(Validator::DigestionBased::DEFAULTS) # background 0.0
22
+
23
+ # only takes a string right now for constraint
24
+ def initialize(constraint, options={})
25
+ @constraint = constraint.to_s
26
+ opts = DEFAULTS.merge(options)
27
+ (@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
28
+ end
29
+
30
+ def pephit_precision(peps)
31
+ set_false_to_total_ratio(peps)
32
+ super(peps)
33
+ end
34
+
35
+ def set_false_to_total_ratio(peps)
36
+ if peps.size > 0
37
+ expected = 0.0
38
+ peps.each do |pep|
39
+ expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
40
+ end
41
+ @false_to_total_ratio = expected / peps.size
42
+ else
43
+ @false_to_total_ratio = 1.0
44
+ end
45
+ end
46
+
47
+ def set_ongoing_false_to_total_ratio(peps)
48
+ if peps.size > 0
49
+ peps.each do |pep|
50
+ @expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
51
+ end
52
+ # @increment_total_submitted should == @increment_tps and @increment_fps
53
+ # since these are either/or
54
+ @false_to_total_ratio = @expected / @increment_total_submitted
55
+ else
56
+ @false_to_total_ratio = 1.0
57
+ end
58
+ end
59
+
60
+
61
+ def to_param_string
62
+ "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
63
+ end
64
+
65
+ # takes objects responding to aaseq and sets the frequency based on
66
+ # constraint. constraint is one acceptable to initialize! returns self
67
+ def set_frequency(objs)
68
+ table = SpecID::AAFreqs.new.calculate_frequencies(objs)
69
+ @frequency = table[@constraint.to_sym]
70
+ self
71
+ end
72
+
73
+ # if adding pephits in groups at a time, the entire group does not need to be
74
+ # queried, just the individual hit. Use this OR pephits_precision (NOT
75
+ # both). The initial query to this method will begin a running tally that
76
+ # is saved by the validator.
77
+ # takes either an array or a single pephit (determined by if it is a
78
+ # SpecID::Pep)
79
+ def increment_pephits_precision(peps)
80
+ tmp = $VERBOSE; $VERBOSE = nil
81
+ unless @increment_initialized
82
+ initialize_increment
83
+ @expected = 0.0
84
+ end
85
+ $VERBOSE = tmp
86
+
87
+ to_submit =
88
+ if peps.is_a? SpecID::Pep
89
+ [peps]
90
+ else
91
+ peps
92
+ end
93
+ @increment_total_submitted += to_submit.size
94
+ (tps, fps) = partition(to_submit)
95
+ #### THIS IS THE MAGIC FOR THIS VALIDATOR:
96
+ set_ongoing_false_to_total_ratio(to_submit)
97
+
98
+ @increment_tps += tps.size
99
+ @increment_fps += fps.size
100
+ (num_tps, num_fps) =
101
+ if self.respond_to?(:calc_precision_prep) # for digestion based validators
102
+ (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
103
+ [num_tps, num_fps]
104
+ else
105
+ [@increment_tps, @increment_fps]
106
+ end
107
+ calc_precision(num_tps, num_fps)
108
+ end
109
+
110
+
111
+
112
+ end
@@ -1,14 +1,31 @@
1
1
  require 'validator'
2
2
 
3
+ require 'validator/true_pos'
4
+ require 'validator/aa'
5
+ require 'validator/aa_est'
6
+ require 'validator/bias'
7
+ require 'validator/decoy'
8
+ require 'validator/transmem'
9
+ require 'validator/probability'
10
+ require 'validator/q_value'
11
+ require 'validator/prot_from_pep'
12
+
13
+ ## these all for a stupid check...
14
+ require 'spec_id/sqt'
15
+ require 'spec_id/proph/prot_summary'
16
+ require 'spec_id/proph/pep_summary'
17
+
3
18
  class Validator::Cmdline
4
19
 
5
20
  Validator_symbols_to_classes = {
6
21
  :tmm => Validator::Transmem::Protein,
7
22
  :decoy => Validator::Decoy,
8
23
  :bad_aa => Validator::AA,
24
+ :bad_aa_est => Validator::AAEst,
9
25
  :tps => Validator::TruePos,
10
26
  :bias => Validator::Bias,
11
27
  :prob => Validator::Probability,
28
+ :qval => Validator::QValue,
12
29
  }
13
30
  # was VAL_DEFAULTS
14
31
  DEFAULTS = {
@@ -24,11 +41,16 @@ class Validator::Cmdline
24
41
  {
25
42
  :hits_together => true,
26
43
  :decoy_on_match => true,
44
+ :decoy_to_target_ratio => 1.0,
27
45
  },
28
46
  :bad_aa =>
29
47
  {
30
48
  :false_if_found => true,
31
- :estimate => true,
49
+ :bkg => 0.0,
50
+ },
51
+ :bad_aa_est =>
52
+ {
53
+ :false_if_found => true,
32
54
  :bkg => 0.0,
33
55
  },
34
56
  :bias =>
@@ -39,7 +61,7 @@ class Validator::Cmdline
39
61
  :ties => true,
40
62
  }
41
63
  COMMAND_LINE = {
42
- :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
64
+ :decoy => ["--decoy /REGEXP/|FILENAME[,DTR,DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
43
65
  "FILENAME of separate search on decoys.",
44
66
  "All regular expressions must be surrounded by '/'",
45
67
  "(no extended options [trailing modifiers]).",
@@ -50,21 +72,30 @@ class Validator::Cmdline
50
72
  " --decoy '/^\\s*REVERSE/'",
51
73
  "If decoys proteins were searched in a separate file,",
52
74
  "then give the FILENAME (e.g., --decoy decoy.srg)",
75
+ "DTR = Decoy to Target Ratio (default: #{DEFAULTS[:decoy][:decoy_to_target_ratio]})",
53
76
  "DOM = *true/false, decoy on match",],
54
77
  :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
55
78
  "fasta file containing the true protein hits"],
56
79
  # may require digestion:
57
- :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
58
- "information (that is shared between them).",
80
+ :fasta => ["--fasta FASTA", "fasta file for phobius transmembrane",
81
+ "(needed if PEPS options is not false)"],
82
+ :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "[not recommended]",
83
+ "Creates the 'false/total' ratio with in silico",
84
+ "digestion. Otherwise, the 3rd-10th best hits (sorted by",
85
+ "xcorr) are used.",
86
+ "The following validators will use this",
87
+ "information (shared between them) if option given",
59
88
  "ORIG_FASTA = the fasta file used to do the run",
60
89
  "PARAMS = the params file used to do the run",],
61
90
  :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
62
91
  "PE = *true|false proteins in fasta file expected in sample",
63
92
  "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
64
- :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
93
+ :bad_aa => ["--bad_aa AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
65
94
  "AA = The amino acid (e.g., 'C')",
66
- "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
67
95
  "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
96
+ :bad_aa_est => ["--bad_aa_est AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
97
+ "AA = The amino acid (e.g., 'C')",
98
+ "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa_est][:bkg]}):",],
68
99
 
69
100
  :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
70
101
  "phobius.small:",
@@ -110,17 +141,27 @@ class Validator::Cmdline
110
141
  end
111
142
  opts[:validators].push([:prob, mthd])
112
143
  },
144
+ :qval => lambda {|ar, opts| opts[:validators].push([:qval]) },
113
145
  :decoy => lambda {|ar, opts|
114
146
  myargs = [:decoy]
115
147
  first_arg = ar[0]
116
- myargs[1] =
148
+ val_opts = {}
149
+ val_opts[:constraint] =
117
150
  if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
151
+ # cast as a regular expression if it has '/ /'
118
152
  Regexp.new(first_arg[1...-1])
119
153
  else
154
+ # assume that it is a filename
155
+ raise ArgumentError, "File does not exist: #{first_arg}\n(was this supposed to be a regular expression? if so, should be given: /#{first_arg}/)" unless File.exist?(first_arg)
120
156
  first_arg
121
157
  end
122
- myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
123
- opts[:validators].push(myargs)
158
+ val_opts[:decoy_to_target_ratio] = (ar[1] || DEFAULTS[:decoy][:decoy_to_target_ratio]).to_f
159
+ val_opts[:decoy_on_match] = self.boolean(ar[2], DEFAULTS[:decoy][:decoy_on_match])
160
+ myargs.push(val_opts)
161
+ opts[:validators].push(myargs)
162
+ },
163
+ :fasta => lambda {|arg, opts|
164
+ opts[:fasta] = Fasta.new(arg)
124
165
  },
125
166
  :digestion => lambda {|ar, opts|
126
167
  raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
@@ -138,6 +179,9 @@ class Validator::Cmdline
138
179
  else
139
180
  DEFAULTS[:bias][:bkg]
140
181
  end
182
+ if ar[3]
183
+ val_opts[:false_to_total_ratio] = ar[3].to_f
184
+ end
141
185
  myargs.push(val_opts)
142
186
  opts[:validators].push(myargs)
143
187
  },
@@ -146,16 +190,36 @@ class Validator::Cmdline
146
190
  myargs = [:bad_aa]
147
191
  myargs.push( ar[0] )
148
192
  val_opts = {}
149
- val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
150
193
  val_opts[:background] =
151
- if ar[2]
152
- ar[2].to_f
194
+ if ar[1]
195
+ ar[1].to_f
153
196
  else
154
197
  DEFAULTS[:bad_aa][:bkg]
155
198
  end
199
+ if ar[2]
200
+ val_opts[:false_to_total_ratio] = ar[2].to_f
201
+ end
156
202
  myargs.push(val_opts)
157
203
  opts[:validators].push(myargs)
158
204
  },
205
+ :bad_aa_est => lambda {|ar, opts|
206
+ ## GET the FREQUENCY
207
+ myargs = [:bad_aa_est]
208
+ myargs.push( ar[0] )
209
+ val_opts = {}
210
+ val_opts[:background] =
211
+ if ar[1]
212
+ ar[1].to_f
213
+ else
214
+ DEFAULTS[:bad_aa_est][:bkg]
215
+ end
216
+ if ar[2]
217
+ val_opts[:frequency] = ar[2].to_f
218
+ end
219
+ myargs.push(val_opts)
220
+ opts[:validators].push(myargs)
221
+ },
222
+
159
223
  :tmm => lambda {|ar, opts|
160
224
  myargs = [:tmm]
161
225
  myargs.push( ar[0] )
@@ -177,16 +241,38 @@ class Validator::Cmdline
177
241
  if ar[4] ; ar[4].to_f
178
242
  else ; DEFAULTS[:tmm][:bkg]
179
243
  end
244
+ if ar[5]
245
+ val_opts[:false_to_total_ratio] = ar[5].to_f
246
+ end
180
247
  myargs.push(val_opts)
181
248
  opts[:validators].push( myargs )
182
249
  },
250
+ :pephits => lambda {|v,opts| opts[:pephits] = SpecID.new(v) },
183
251
  :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
184
252
  :false_on_tie => lambda {|v,opts| opts[:ties] = false },
185
253
  }
186
254
 
255
+ def self.requires_pephits?(spec_id_obj)
256
+ case spec_id_obj
257
+ when Proph::ProtSummary : true
258
+ # at least currently (subject to change)
259
+ when Proph::PepSummary : true
260
+ when SQTGroup
261
+ if spec_id_obj.peps.first.respond_to?(:q_value)
262
+ # it's percolator output and we don't have other hits to use
263
+ true
264
+ else
265
+ false
266
+ end
267
+ else ; false
268
+ end
269
+ end
270
+
187
271
  # remove the keys from opts involved in validators and return an array
188
272
  # of validators
189
- def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
273
+ # postfilter is one of :top_per_scan, :top_per_aaseq,
274
+ # :top_per_aaseq_charge (of which last two are subsets of scan)
275
+ def self.prepare_validators(opts, false_on_tie, interactive, postfilter, spec_id)
190
276
  validator_args = opts[:validators]
191
277
  correct_wins = !false_on_tie
192
278
  need_false_to_total_ratio = []
@@ -199,7 +285,9 @@ class Validator::Cmdline
199
285
  case tp
200
286
  when :tmm
201
287
  val_args[1][:correct_wins] = correct_wins
202
- val_args[1][:fasta] = opts[:digestion_objects][0]
288
+ if opts.key?(:fasta)
289
+ val_args[1][:fasta] = opts[:fasta]
290
+ end
203
291
  val_args
204
292
  when :bias
205
293
  val_args[1][:correct_wins] = correct_wins
@@ -208,10 +296,10 @@ class Validator::Cmdline
208
296
  val_args = [val_args[0], correct_wins]
209
297
  val_args
210
298
  when :decoy
211
- val_args = [val_args[0], val_args[1], correct_wins]
299
+ val_args[0][:correct_wins] = correct_wins
212
300
  # don't delete the key here since we need the decoy = regexp key
213
301
  val_args
214
- else ## bad_aa and prob are represented here:
302
+ else ## bad_aa, prob, and qval are represented here:
215
303
  val_args
216
304
  end
217
305
  val = Validator_symbols_to_classes[tp].new( *val_args )
@@ -219,10 +307,12 @@ class Validator::Cmdline
219
307
  if tp == :tmm
220
308
  transmem_vals << val
221
309
  end
222
- potential_digestion_classes = /Transmem|AA|Bias/
310
+ potential_digestion_classes = /Transmem|AA|AAEst|Bias/
223
311
  if val.class.to_s =~ potential_digestion_classes
224
- if val_args[1][:estimate] == true
225
- need_frequency << val
312
+ if val.class.to_s == 'Validator::AAEst'
313
+ need_frequency.push(val) if val.frequency.nil?
314
+ elsif !(val.false_to_total_ratio.nil?)
315
+ $stderr.puts "using false_to_total_ratio: #{val.false_to_total_ratio}"
226
316
  else
227
317
  need_false_to_total_ratio << val
228
318
  end
@@ -230,20 +320,62 @@ class Validator::Cmdline
230
320
  val
231
321
  end
232
322
 
233
- if need_false_to_total_ratio.size > 0
234
- raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
235
- peps = Digestor.digest( *(opts[:digestion_objects]) )
236
- need_false_to_total_ratio.each do |val|
237
- val.set_false_to_total_ratio( peps )
238
- end
239
- end
240
- if need_frequency.size > 0
241
- raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
242
- need_frequency.each do |val|
243
- val.set_frequency( opts[:digestion_objects][0] )
323
+ if ((need_false_to_total_ratio.size > 0) or (need_frequency.size > 0))
324
+ if opts.key?(:digestion_objects)
325
+ #raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
326
+ peps = Digestor.digest( *(opts[:digestion_objects]) )
327
+ need_false_to_total_ratio.each do |val|
328
+ val.set_false_to_total_ratio( peps )
329
+ end
330
+ if need_frequency.size > 0
331
+ need_frequency.each do |val|
332
+ val.set_frequency( opts[:digestion_objects][0] )
333
+ end
334
+ end
335
+ opts.delete(:digestion_objects)
336
+ else ## do the new and improved selection of non-top hits to get false_to_total_ratios and freqs
337
+ $stderr.puts "...using pephits to calculate background ratios"
338
+ # first_index, last_index
339
+ pephits =
340
+ if opts[:pephits] ## protein prophet (since it needs to get ratios somewhere
341
+ $stderr.puts "using --pephits"
342
+ opts[:pephits].peps
343
+ elsif requires_pephits?(spec_id)
344
+ raise ArgumentError, "with objects of class '#{spec_id.class}', one of your validators requires --pephits or --digestion"
345
+ else
346
+ $stderr.puts "using given spec_id.peps"
347
+ spec_id.peps
348
+ end
349
+
350
+ not_first_or_second_peps = Sequest.other_hits_sorted_by_xcorr(pephits, 2, 9, [:base_name, :first_scan, :charge])
351
+ pephits =
352
+ case postfilter
353
+ when :top_per_scan
354
+ $stderr.puts "using top_per_scan" ; not_first_or_second_peps
355
+ when :top_per_aaseq
356
+ # it doesn't matter which one is given since validators are
357
+ # based on amino acid sequence
358
+ $stderr.puts 'using top_per_aaseq'
359
+ not_first_or_second_peps.hash_by(:aaseq).values.map {|pep| pep.first }
360
+ when :top_per_aaseq_charge
361
+ $stderr.puts 'using top_per_aaseq_charge'
362
+ not_first_or_second_peps.hash_by(:aaseq, :charge).values.map {|pep| pep.first }
363
+ else
364
+ raise ArgumentError, "must have a valid postfilter method, yours: '#{postfilter}'"
365
+ end
366
+
367
+ need_false_to_total_ratio.each do |val|
368
+ val.set_false_to_total_ratio( pephits )
369
+ $stderr.puts "false_to_total_ratio for #{val.class.to_s}: #{val.false_to_total_ratio}"
370
+ end
371
+ if need_frequency.size > 0
372
+ need_frequency.each do |val|
373
+ $stderr.puts "Setting frequency!"
374
+ val.set_frequency( pephits )
375
+ end
376
+ end
244
377
  end
245
378
  end
246
- opts.delete(:digestion_objects)
247
379
 
248
380
  if (transmem_vals.size > 0) # and interactive ## we'd like to just run this for interactive
249
381
  # This is overkill if we are doing a single filtering job, but it