mspire 0.3.1 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
data/lib/validator.rb
CHANGED
@@ -1,19 +1,34 @@
|
|
1
1
|
|
2
2
|
class Validator
|
3
3
|
|
4
|
+
# in the absence of digestion, does the spec_id type require pephits for
|
5
|
+
# validation?
|
6
|
+
def self.requires_pephits?(spec_id_obj)
|
7
|
+
case spec_id_obj
|
8
|
+
when Proph::ProtSummary : true
|
9
|
+
when Proph::PepSummary : true
|
10
|
+
when SQTGroup : true
|
11
|
+
else ; false
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
4
15
|
Validator_to_string = {
|
5
16
|
'Validator::AA' => 'badAA',
|
17
|
+
'Validator::AAEst' => 'badAAEst',
|
6
18
|
'Validator::Decoy' => 'decoy',
|
7
19
|
'Validator::Transmem::Protein' => 'tmm',
|
8
20
|
'Validator::TruePos' => 'tps',
|
9
21
|
'Validator::Bias' => 'bias',
|
10
22
|
'Validator::Probability' => 'prob',
|
23
|
+
'Validator::QValue' => 'qval',
|
11
24
|
:bad_aa => 'badAA',
|
25
|
+
:bad_aa_est => 'badAAEst',
|
12
26
|
:decoy => 'decoy',
|
13
27
|
:tmm => 'tmm',
|
14
28
|
:tps => 'tps',
|
15
29
|
:bias => 'bias',
|
16
30
|
:prob => 'prob',
|
31
|
+
:qval => 'qval',
|
17
32
|
}
|
18
33
|
|
19
34
|
def initialize_increment
|
@@ -45,12 +60,12 @@ class Validator
|
|
45
60
|
@increment_tps += tps.size
|
46
61
|
@increment_fps += fps.size
|
47
62
|
(num_tps, num_fps) =
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
63
|
+
if self.respond_to?(:calc_precision_prep) # for digestion based validators
|
64
|
+
(num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
|
65
|
+
[num_tps, num_fps]
|
66
|
+
else
|
67
|
+
[@increment_tps, @increment_fps]
|
68
|
+
end
|
54
69
|
calc_precision(num_tps, num_fps)
|
55
70
|
end
|
56
71
|
|
@@ -97,12 +112,16 @@ class Validator
|
|
97
112
|
case val
|
98
113
|
when Validator::TruePos
|
99
114
|
hash.merge( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
|
115
|
+
when Validator::AAEst
|
116
|
+
%w(frequency background calculated_background).each do |cat|
|
117
|
+
hash[cat.to_sym] = val.send(cat.to_sym)
|
118
|
+
end
|
100
119
|
when Validator::AA
|
101
|
-
%w(
|
120
|
+
%w(false_to_total_ratio background calculated_background).each do |cat|
|
102
121
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
103
122
|
end
|
104
123
|
when Validator::Decoy
|
105
|
-
%w(correct_wins decoy_on_match).each do |cat|
|
124
|
+
%w(decoy_to_target_ratio correct_wins decoy_on_match).each do |cat|
|
106
125
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
107
126
|
end
|
108
127
|
hash[:constraint] = val.constraint.inspect if val.constraint
|
@@ -119,6 +138,8 @@ class Validator
|
|
119
138
|
%w(prob_method).each do |cat|
|
120
139
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
121
140
|
end
|
141
|
+
when Validator::QValue
|
142
|
+
# no params to add
|
122
143
|
else ; raise ArgumentError, "Don't know the validator class #{val}"
|
123
144
|
end
|
124
145
|
klass_as_s = val.class.to_s
|
@@ -127,46 +148,6 @@ class Validator
|
|
127
148
|
hash
|
128
149
|
end
|
129
150
|
end
|
130
|
-
|
131
|
-
=begin
|
132
|
-
## THIS IS WITH STRINGS AS KEYS!
|
133
|
-
# takes an array of validators and returns a fresh array where each has been
|
134
|
-
# turned into a sensible hash (with symbols as the keys!)
|
135
|
-
def self.sensible_validator_hashes(validators)
|
136
|
-
validators.map do |val|
|
137
|
-
hash = {}
|
138
|
-
case val
|
139
|
-
when Validator::TruePos
|
140
|
-
hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
|
141
|
-
when Validator::AA
|
142
|
-
%w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
|
143
|
-
hash[cat] = val.send(cat.to_sym)
|
144
|
-
end
|
145
|
-
when Validator::Decoy
|
146
|
-
%w(correct_wins decoy_on_match).each do |cat|
|
147
|
-
hash[cat] = val.send(cat.to_sym)
|
148
|
-
end
|
149
|
-
hash['constraint'] = val.constraint.inspect if val.constraint
|
150
|
-
when Validator::Bias
|
151
|
-
%w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
|
152
|
-
hash[cat] = val.send(cat.to_sym)
|
153
|
-
end
|
154
|
-
hash['file'] = val.fasta.filename
|
155
|
-
when Validator::Transmem::Protein
|
156
|
-
%w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
|
157
|
-
hash[cat] = val.send(cat.to_sym)
|
158
|
-
end
|
159
|
-
when Validator::Probability
|
160
|
-
else ; raise ArgumentError, "Don't know the validator class #{val}"
|
161
|
-
end
|
162
|
-
klass_as_s = val.class.to_s
|
163
|
-
hash['type'] = Validator_to_string[klass_as_s]
|
164
|
-
hash['class'] = klass_as_s
|
165
|
-
hash
|
166
|
-
end
|
167
|
-
end
|
168
|
-
=end
|
169
|
-
|
170
151
|
end
|
171
152
|
|
172
153
|
module Precision::Calculator
|
@@ -186,11 +167,11 @@ end
|
|
186
167
|
# normal hits (which may be true or false) and the second are decoy hits.
|
187
168
|
# edge case: if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
|
188
169
|
module Precision::Calculator::Decoy
|
189
|
-
def calc_precision(num_normal, num_decoy)
|
170
|
+
def calc_precision(num_normal, num_decoy, decoy_to_target_ratio=1.0)
|
190
171
|
# will calculate as floats in case fractional amounts passed in for
|
191
172
|
# whatever reason
|
192
173
|
num_normal_f = num_normal.to_f
|
193
|
-
num_true_pos =
|
174
|
+
num_true_pos = num_normal_f - (num_decoy.to_f / decoy_to_target_ratio)
|
194
175
|
precision =
|
195
176
|
if num_normal_f == 0.0
|
196
177
|
if num_decoy.to_f > 0.0
|
@@ -204,11 +185,13 @@ module Precision::Calculator::Decoy
|
|
204
185
|
end
|
205
186
|
end
|
206
187
|
|
207
|
-
require 'validator/true_pos'
|
208
|
-
require 'validator/aa'
|
209
|
-
require 'validator/
|
210
|
-
require 'validator/
|
211
|
-
require 'validator/
|
212
|
-
require 'validator/
|
213
|
-
require 'validator/
|
188
|
+
#require 'validator/true_pos'
|
189
|
+
#require 'validator/aa'
|
190
|
+
#require 'validator/aa_est'
|
191
|
+
#require 'validator/bias'
|
192
|
+
#require 'validator/decoy'
|
193
|
+
#require 'validator/transmem'
|
194
|
+
#require 'validator/probability'
|
195
|
+
#require 'validator/q_value'
|
196
|
+
#require 'validator/prot_from_pep'
|
214
197
|
|
data/lib/validator/aa.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
|
2
1
|
require 'validator/digestion_based'
|
3
2
|
require 'fasta'
|
4
3
|
require 'spec_id/aa_freqs'
|
@@ -12,12 +11,7 @@ class Validator::AA < Validator::DigestionBased
|
|
12
11
|
|
13
12
|
# it is a false hit if the amino acid is located in the peptide
|
14
13
|
attr_accessor :false_if_found
|
15
|
-
|
16
|
-
# if given, the frequency of the amino acid is used to estimate the false to
|
17
|
-
# total ratio based on the pephits given for pephit_precision.
|
18
|
-
# see Validator::AA.calc_frequency to calculate a frequency
|
19
|
-
attr_accessor :frequency
|
20
|
-
|
14
|
+
|
21
15
|
DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
|
22
16
|
:false_if_found => true,
|
23
17
|
} )
|
@@ -34,21 +28,9 @@ class Validator::AA < Validator::DigestionBased
|
|
34
28
|
end
|
35
29
|
end
|
36
30
|
|
37
|
-
# takes a fasta object and sets the frequency based on constraint.
|
38
|
-
# constraint is one acceptable to initialize!
|
39
|
-
# returns self
|
40
|
-
def set_frequency(fasta_obj)
|
41
|
-
table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
|
42
|
-
@frequency = table[@constraint.to_sym]
|
43
|
-
self
|
44
|
-
end
|
45
|
-
|
46
31
|
# right now only accepts single amino acids as constraints (as a string,
|
47
32
|
# e.g. 'C', or symbol, e.g. :C)
|
48
33
|
# options:
|
49
|
-
# :frequency OR :false_to_total_ratio should be used (NOT both)
|
50
|
-
# :frequency => Float, if the frequency of the amino acid is known (see
|
51
|
-
# Validator::AA.calc_frequency)
|
52
34
|
# :false_to_total_ratio => if a true digestion was already performed (see
|
53
35
|
# Validator::AA.calc_false_to_total_ratio)
|
54
36
|
# :false_if_found => it is a false positive if the amino acid is found.
|
@@ -56,80 +38,11 @@ class Validator::AA < Validator::DigestionBased
|
|
56
38
|
def initialize(constraint, options={})
|
57
39
|
@constraint = constraint.to_s
|
58
40
|
opts = DEFAULTS.merge(options)
|
59
|
-
(@
|
60
|
-
end
|
61
|
-
|
62
|
-
# if expected is 0 then will return precision = 1.0
|
63
|
-
def pephit_precision(peps)
|
64
|
-
if @frequency
|
65
|
-
(actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
|
66
|
-
if expected == 0.0
|
67
|
-
1.0
|
68
|
-
else
|
69
|
-
# what's this guy ?? good for??
|
70
|
-
fraction_of_expected = actual.to_f/expected
|
71
|
-
pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
|
72
|
-
end
|
73
|
-
elsif @false_to_total_ratio
|
74
|
-
super(peps)
|
75
|
-
else
|
76
|
-
raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
# returns (Actual(Int), Expected(Float)) based on how many peptides have at
|
81
|
-
# least one amino_acid, the frequency it is observed in background (then we
|
82
|
-
# can look at the size of each peptide and determine the likelihood of
|
83
|
-
# having the peptide with at least one amino acid).
|
84
|
-
# amino_acid should be a string (e.g., 'C')
|
85
|
-
def at_least_one(amino_acid, freq, amino_acid_seqs)
|
86
|
-
one_minus_freq = 1.0 - freq
|
87
|
-
probs = []
|
88
|
-
actual = 0
|
89
|
-
expected = 0.0
|
90
|
-
amino_acid_seqs.each do |aaseq|
|
91
|
-
expected += (1.0 - (one_minus_freq**aaseq.size))
|
92
|
-
if aaseq.include?(amino_acid)
|
93
|
-
actual += 1
|
94
|
-
end
|
95
|
-
end
|
96
|
-
[actual, expected]
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
# given: (actual # with 'AA', expected # with 'AA', total#peptides,
|
101
|
-
# mean_fraction_of_cysteines_true)
|
102
|
-
#
|
103
|
-
# PepHit('AA') = Peptide containing at least one 'AA'
|
104
|
-
# # expected PepHit('AA') # observed Bad Pep ('AA')
|
105
|
-
# ----------------------- proportional_to -------------------------
|
106
|
-
# # total PepHits # Total Bad PepHit
|
107
|
-
#
|
108
|
-
# returns the precision
|
109
|
-
# the background correction factor will not reduce the actual count of
|
110
|
-
# peptides to < 0. One can still get negative precision scores, however,
|
111
|
-
# depending on the other variables.
|
112
|
-
# background is the number of peptides with the amino acid in the purest
|
113
|
-
# sample over the total number of peps.
|
114
|
-
#---
|
115
|
-
# this is thoroughly explained in my 2007_09 presentations (inkscape)
|
116
|
-
#+++
|
117
|
-
def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
|
118
|
-
actual = actual.to_f
|
119
|
-
@calculated_background = actual / total_peps
|
120
|
-
actual -= (total_peps * background)
|
121
|
-
# We were doing it compared to the number expected.. but this is more
|
122
|
-
# clear
|
123
|
-
# actual/false_hits = expected/total_peps_passing
|
124
|
-
# false_hits = (total_peps_passing * actual) / expected
|
125
|
-
if actual < 0.0 ; actual = 0.0 end
|
126
|
-
total_number_false = (actual * total_peps).to_f / expected
|
127
|
-
#fppr = total_number_false / total_peps
|
128
|
-
prec = (total_peps - total_number_false) / total_peps
|
41
|
+
(@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
|
129
42
|
end
|
130
43
|
|
131
44
|
def to_param_string
|
132
|
-
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "
|
45
|
+
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
|
133
46
|
end
|
134
47
|
end
|
135
48
|
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'validator/aa'
|
2
|
+
|
3
|
+
|
4
|
+
class Validator ; end
|
5
|
+
class Validator::AA ; end
|
6
|
+
|
7
|
+
# A class that uses the peps given to it and a background frequency to
|
8
|
+
# calculate the false_to_total_ratio at each turn.
|
9
|
+
class Validator::AAEst < Validator::AA
|
10
|
+
attr_accessor :constraint
|
11
|
+
attr_accessor :false_if_found
|
12
|
+
|
13
|
+
# the frequency of the amino acid is used to estimate the false to
|
14
|
+
# total ratio based on the pephits given for pephit_precision.
|
15
|
+
# see Validator::AA.calc_frequency to calculate a frequency
|
16
|
+
# or use set_frequency to set from pep hits.
|
17
|
+
attr_accessor :frequency
|
18
|
+
|
19
|
+
DEFAULTS = {
|
20
|
+
:false_if_found => true
|
21
|
+
}.merge(Validator::DigestionBased::DEFAULTS) # background 0.0
|
22
|
+
|
23
|
+
# only takes a string right now for constraint
|
24
|
+
def initialize(constraint, options={})
|
25
|
+
@constraint = constraint.to_s
|
26
|
+
opts = DEFAULTS.merge(options)
|
27
|
+
(@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
|
28
|
+
end
|
29
|
+
|
30
|
+
def pephit_precision(peps)
|
31
|
+
set_false_to_total_ratio(peps)
|
32
|
+
super(peps)
|
33
|
+
end
|
34
|
+
|
35
|
+
def set_false_to_total_ratio(peps)
|
36
|
+
if peps.size > 0
|
37
|
+
expected = 0.0
|
38
|
+
peps.each do |pep|
|
39
|
+
expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
|
40
|
+
end
|
41
|
+
@false_to_total_ratio = expected / peps.size
|
42
|
+
else
|
43
|
+
@false_to_total_ratio = 1.0
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def set_ongoing_false_to_total_ratio(peps)
|
48
|
+
if peps.size > 0
|
49
|
+
peps.each do |pep|
|
50
|
+
@expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
|
51
|
+
end
|
52
|
+
# @increment_total_submitted should == @increment_tps + @increment_fps
|
53
|
+
# since these are either/or
|
54
|
+
@false_to_total_ratio = @expected / @increment_total_submitted
|
55
|
+
else
|
56
|
+
@false_to_total_ratio = 1.0
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
def to_param_string
|
62
|
+
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
|
63
|
+
end
|
64
|
+
|
65
|
+
# takes objects responding to aaseq and sets the frequency based on
|
66
|
+
# constraint. constraint is one acceptable to initialize! returns self
|
67
|
+
def set_frequency(objs)
|
68
|
+
table = SpecID::AAFreqs.new.calculate_frequencies(objs)
|
69
|
+
@frequency = table[@constraint.to_sym]
|
70
|
+
self
|
71
|
+
end
|
72
|
+
|
73
|
+
# if adding pephits in groups at a time, the entire group does not need to be
|
74
|
+
# queried, just the individual hit. Use this OR pephit_precision (NOT
|
75
|
+
# both). The initial query to this method will begin a running tally that
|
76
|
+
# is saved by the validator.
|
77
|
+
# takes either an array or a single pephit (determined by if it is a
|
78
|
+
# SpecID::Pep)
|
79
|
+
def increment_pephits_precision(peps)
|
80
|
+
tmp = $VERBOSE; $VERBOSE = nil
|
81
|
+
unless @increment_initialized
|
82
|
+
initialize_increment
|
83
|
+
@expected = 0.0
|
84
|
+
end
|
85
|
+
$VERBOSE = tmp
|
86
|
+
|
87
|
+
to_submit =
|
88
|
+
if peps.is_a? SpecID::Pep
|
89
|
+
[peps]
|
90
|
+
else
|
91
|
+
peps
|
92
|
+
end
|
93
|
+
@increment_total_submitted += to_submit.size
|
94
|
+
(tps, fps) = partition(to_submit)
|
95
|
+
#### THIS IS THE MAGIC FOR THIS VALIDATOR:
|
96
|
+
set_ongoing_false_to_total_ratio(to_submit)
|
97
|
+
|
98
|
+
@increment_tps += tps.size
|
99
|
+
@increment_fps += fps.size
|
100
|
+
(num_tps, num_fps) =
|
101
|
+
if self.respond_to?(:calc_precision_prep) # for digestion based validators
|
102
|
+
(num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
|
103
|
+
[num_tps, num_fps]
|
104
|
+
else
|
105
|
+
[@increment_tps, @increment_fps]
|
106
|
+
end
|
107
|
+
calc_precision(num_tps, num_fps)
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
end
|
data/lib/validator/cmdline.rb
CHANGED
@@ -1,14 +1,31 @@
|
|
1
1
|
require 'validator'
|
2
2
|
|
3
|
+
require 'validator/true_pos'
|
4
|
+
require 'validator/aa'
|
5
|
+
require 'validator/aa_est'
|
6
|
+
require 'validator/bias'
|
7
|
+
require 'validator/decoy'
|
8
|
+
require 'validator/transmem'
|
9
|
+
require 'validator/probability'
|
10
|
+
require 'validator/q_value'
|
11
|
+
require 'validator/prot_from_pep'
|
12
|
+
|
13
|
+
## these are all for a stupid check...
|
14
|
+
require 'spec_id/sqt'
|
15
|
+
require 'spec_id/proph/prot_summary'
|
16
|
+
require 'spec_id/proph/pep_summary'
|
17
|
+
|
3
18
|
class Validator::Cmdline
|
4
19
|
|
5
20
|
Validator_symbols_to_classes = {
|
6
21
|
:tmm => Validator::Transmem::Protein,
|
7
22
|
:decoy => Validator::Decoy,
|
8
23
|
:bad_aa => Validator::AA,
|
24
|
+
:bad_aa_est => Validator::AAEst,
|
9
25
|
:tps => Validator::TruePos,
|
10
26
|
:bias => Validator::Bias,
|
11
27
|
:prob => Validator::Probability,
|
28
|
+
:qval => Validator::QValue,
|
12
29
|
}
|
13
30
|
# was VAL_DEFAULTS
|
14
31
|
DEFAULTS = {
|
@@ -24,11 +41,16 @@ class Validator::Cmdline
|
|
24
41
|
{
|
25
42
|
:hits_together => true,
|
26
43
|
:decoy_on_match => true,
|
44
|
+
:decoy_to_target_ratio => 1.0,
|
27
45
|
},
|
28
46
|
:bad_aa =>
|
29
47
|
{
|
30
48
|
:false_if_found => true,
|
31
|
-
:
|
49
|
+
:bkg => 0.0,
|
50
|
+
},
|
51
|
+
:bad_aa_est =>
|
52
|
+
{
|
53
|
+
:false_if_found => true,
|
32
54
|
:bkg => 0.0,
|
33
55
|
},
|
34
56
|
:bias =>
|
@@ -39,7 +61,7 @@ class Validator::Cmdline
|
|
39
61
|
:ties => true,
|
40
62
|
}
|
41
63
|
COMMAND_LINE = {
|
42
|
-
:decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
|
64
|
+
:decoy => ["--decoy /REGEXP/|FILENAME[,DTR,DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
|
43
65
|
"FILENAME of separate search on decoys.",
|
44
66
|
"All regular expressions must be surrounded by '/'",
|
45
67
|
"(no extended options [trailing modifiers]).",
|
@@ -50,21 +72,30 @@ class Validator::Cmdline
|
|
50
72
|
" --decoy '/^\\s*REVERSE/'",
|
51
73
|
"If decoys proteins were searched in a separate file,",
|
52
74
|
"then give the FILENAME (e.g., --decoy decoy.srg)",
|
75
|
+
"DTR = Decoy to Target Ratio (default: #{DEFAULTS[:decoy][:decoy_to_target_ratio]})",
|
53
76
|
"DOM = *true/false, decoy on match",],
|
54
77
|
:tps => ["--tps <fasta>", "for a completely defined sample, this is the",
|
55
78
|
"fasta file containing the true protein hits"],
|
56
79
|
# may require digestion:
|
57
|
-
:
|
58
|
-
|
80
|
+
:fasta => ["--fasta FASTA", "fasta file for phobius transmembrane",
|
81
|
+
"(needed if PEPS options is not false)"],
|
82
|
+
:digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "[not recommended]",
|
83
|
+
"Creates the 'false/total' ratio with in silico",
|
84
|
+
"digestion. Otherwise, the 3rd-10th best hits (sorted by",
|
85
|
+
"xcorr) are used.",
|
86
|
+
"The following validators will use this",
|
87
|
+
"information (shared between them) if option given",
|
59
88
|
"ORIG_FASTA = the fasta file used to do the run",
|
60
89
|
"PARAMS = the params file used to do the run",],
|
61
90
|
:bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
|
62
91
|
"PE = *true|false proteins in fasta file expected in sample",
|
63
92
|
"BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
|
64
|
-
:bad_aa => ["--bad_aa AA,
|
93
|
+
:bad_aa => ["--bad_aa AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
|
65
94
|
"AA = The amino acid (e.g., 'C')",
|
66
|
-
"EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
|
67
95
|
"BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
|
96
|
+
:bad_aa_est => ["--bad_aa_est AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
|
97
|
+
"AA = The amino acid (e.g., 'C')",
|
98
|
+
"BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa_est][:bkg]}):",],
|
68
99
|
|
69
100
|
:tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
|
70
101
|
"phobius.small:",
|
@@ -110,17 +141,27 @@ class Validator::Cmdline
|
|
110
141
|
end
|
111
142
|
opts[:validators].push([:prob, mthd])
|
112
143
|
},
|
144
|
+
:qval => lambda {|ar, opts| opts[:validators].push([:qval]) },
|
113
145
|
:decoy => lambda {|ar, opts|
|
114
146
|
myargs = [:decoy]
|
115
147
|
first_arg = ar[0]
|
116
|
-
|
148
|
+
val_opts = {}
|
149
|
+
val_opts[:constraint] =
|
117
150
|
if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
|
151
|
+
# cast as a regular expression if it has '/ /'
|
118
152
|
Regexp.new(first_arg[1...-1])
|
119
153
|
else
|
154
|
+
# assume that it is a filename
|
155
|
+
raise ArgumentError, "File does not exist: #{first_arg}\n(was this supposed to be a regular expression? if so, should be given: /#{first_arg}/)" unless File.exist?(first_arg)
|
120
156
|
first_arg
|
121
157
|
end
|
122
|
-
|
123
|
-
|
158
|
+
val_opts[:decoy_to_target_ratio] = (ar[1] || DEFAULTS[:decoy][:decoy_to_target_ratio]).to_f
|
159
|
+
val_opts[:decoy_on_match] = self.boolean(ar[2], DEFAULTS[:decoy][:decoy_on_match])
|
160
|
+
myargs.push(val_opts)
|
161
|
+
opts[:validators].push(myargs)
|
162
|
+
},
|
163
|
+
:fasta => lambda {|arg, opts|
|
164
|
+
opts[:fasta] = Fasta.new(arg)
|
124
165
|
},
|
125
166
|
:digestion => lambda {|ar, opts|
|
126
167
|
raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
|
@@ -138,6 +179,9 @@ class Validator::Cmdline
|
|
138
179
|
else
|
139
180
|
DEFAULTS[:bias][:bkg]
|
140
181
|
end
|
182
|
+
if ar[3]
|
183
|
+
val_opts[:false_to_total_ratio] = ar[3].to_f
|
184
|
+
end
|
141
185
|
myargs.push(val_opts)
|
142
186
|
opts[:validators].push(myargs)
|
143
187
|
},
|
@@ -146,16 +190,36 @@ class Validator::Cmdline
|
|
146
190
|
myargs = [:bad_aa]
|
147
191
|
myargs.push( ar[0] )
|
148
192
|
val_opts = {}
|
149
|
-
val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
|
150
193
|
val_opts[:background] =
|
151
|
-
if ar[
|
152
|
-
ar[
|
194
|
+
if ar[1]
|
195
|
+
ar[1].to_f
|
153
196
|
else
|
154
197
|
DEFAULTS[:bad_aa][:bkg]
|
155
198
|
end
|
199
|
+
if ar[2]
|
200
|
+
val_opts[:false_to_total_ratio] = ar[2].to_f
|
201
|
+
end
|
156
202
|
myargs.push(val_opts)
|
157
203
|
opts[:validators].push(myargs)
|
158
204
|
},
|
205
|
+
:bad_aa_est => lambda {|ar, opts|
|
206
|
+
## GET the FREQUENCY
|
207
|
+
myargs = [:bad_aa_est]
|
208
|
+
myargs.push( ar[0] )
|
209
|
+
val_opts = {}
|
210
|
+
val_opts[:background] =
|
211
|
+
if ar[1]
|
212
|
+
ar[1].to_f
|
213
|
+
else
|
214
|
+
DEFAULTS[:bad_aa_est][:bkg]
|
215
|
+
end
|
216
|
+
if ar[2]
|
217
|
+
val_opts[:frequency] = ar[2].to_f
|
218
|
+
end
|
219
|
+
myargs.push(val_opts)
|
220
|
+
opts[:validators].push(myargs)
|
221
|
+
},
|
222
|
+
|
159
223
|
:tmm => lambda {|ar, opts|
|
160
224
|
myargs = [:tmm]
|
161
225
|
myargs.push( ar[0] )
|
@@ -177,16 +241,38 @@ class Validator::Cmdline
|
|
177
241
|
if ar[4] ; ar[4].to_f
|
178
242
|
else ; DEFAULTS[:tmm][:bkg]
|
179
243
|
end
|
244
|
+
if ar[5]
|
245
|
+
val_opts[:false_to_total_ratio] = ar[5].to_f
|
246
|
+
end
|
180
247
|
myargs.push(val_opts)
|
181
248
|
opts[:validators].push( myargs )
|
182
249
|
},
|
250
|
+
:pephits => lambda {|v,opts| opts[:pephits] = SpecID.new(v) },
|
183
251
|
:tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
|
184
252
|
:false_on_tie => lambda {|v,opts| opts[:ties] = false },
|
185
253
|
}
|
186
254
|
|
255
|
+
def self.requires_pephits?(spec_id_obj)
|
256
|
+
case spec_id_obj
|
257
|
+
when Proph::ProtSummary : true
|
258
|
+
# at least currently (subject to change)
|
259
|
+
when Proph::PepSummary : true
|
260
|
+
when SQTGroup
|
261
|
+
if spec_id_obj.peps.first.respond_to?(:q_value)
|
262
|
+
# it's percolator output and we don't have other hits to use
|
263
|
+
true
|
264
|
+
else
|
265
|
+
false
|
266
|
+
end
|
267
|
+
else ; false
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
187
271
|
# remove the keys from opts involved in validators and return an array
|
188
272
|
# of validators
|
189
|
-
|
273
|
+
# postfilter is one of :top_per_scan, :top_per_aaseq,
|
274
|
+
# :top_per_aaseq_charge (of which last two are subsets of scan)
|
275
|
+
def self.prepare_validators(opts, false_on_tie, interactive, postfilter, spec_id)
|
190
276
|
validator_args = opts[:validators]
|
191
277
|
correct_wins = !false_on_tie
|
192
278
|
need_false_to_total_ratio = []
|
@@ -199,7 +285,9 @@ class Validator::Cmdline
|
|
199
285
|
case tp
|
200
286
|
when :tmm
|
201
287
|
val_args[1][:correct_wins] = correct_wins
|
202
|
-
|
288
|
+
if opts.key?(:fasta)
|
289
|
+
val_args[1][:fasta] = opts[:fasta]
|
290
|
+
end
|
203
291
|
val_args
|
204
292
|
when :bias
|
205
293
|
val_args[1][:correct_wins] = correct_wins
|
@@ -208,10 +296,10 @@ class Validator::Cmdline
|
|
208
296
|
val_args = [val_args[0], correct_wins]
|
209
297
|
val_args
|
210
298
|
when :decoy
|
211
|
-
val_args
|
299
|
+
val_args[0][:correct_wins] = correct_wins
|
212
300
|
# don't delete the key here since we need the decoy = regexp key
|
213
301
|
val_args
|
214
|
-
else ## bad_aa and
|
302
|
+
else ## bad_aa, prob, and qval are represented here:
|
215
303
|
val_args
|
216
304
|
end
|
217
305
|
val = Validator_symbols_to_classes[tp].new( *val_args )
|
@@ -219,10 +307,12 @@ class Validator::Cmdline
|
|
219
307
|
if tp == :tmm
|
220
308
|
transmem_vals << val
|
221
309
|
end
|
222
|
-
potential_digestion_classes = /Transmem|AA|Bias/
|
310
|
+
potential_digestion_classes = /Transmem|AA|AAEst|Bias/
|
223
311
|
if val.class.to_s =~ potential_digestion_classes
|
224
|
-
if
|
225
|
-
need_frequency
|
312
|
+
if val.class.to_s == 'Validator::AAEst'
|
313
|
+
need_frequency.push(val) if val.frequency.nil?
|
314
|
+
elsif !(val.false_to_total_ratio.nil?)
|
315
|
+
$stderr.puts "using false_to_total_ratio: #{val.false_to_total_ratio}"
|
226
316
|
else
|
227
317
|
need_false_to_total_ratio << val
|
228
318
|
end
|
@@ -230,20 +320,62 @@ class Validator::Cmdline
|
|
230
320
|
val
|
231
321
|
end
|
232
322
|
|
233
|
-
if need_false_to_total_ratio.size > 0
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
323
|
+
if ((need_false_to_total_ratio.size > 0) or (need_frequency.size > 0))
|
324
|
+
if opts.key?(:digestion_objects)
|
325
|
+
#raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
|
326
|
+
peps = Digestor.digest( *(opts[:digestion_objects]) )
|
327
|
+
need_false_to_total_ratio.each do |val|
|
328
|
+
val.set_false_to_total_ratio( peps )
|
329
|
+
end
|
330
|
+
if need_frequency.size > 0
|
331
|
+
need_frequency.each do |val|
|
332
|
+
val.set_frequency( opts[:digestion_objects][0] )
|
333
|
+
end
|
334
|
+
end
|
335
|
+
opts.delete(:digestion_objects)
|
336
|
+
else ## do the new and improved selection of non-top hits to get false_to_total_ratios and freqs
|
337
|
+
$stderr.puts "...using pephits to calculate background ratios"
|
338
|
+
# first_index, last_index
|
339
|
+
pephits =
|
340
|
+
if opts[:pephits] ## protein prophet (since it needs to get ratios somewhere
|
341
|
+
$stderr.puts "using --pephits"
|
342
|
+
opts[:pephits].peps
|
343
|
+
elsif requires_pephits?(spec_id)
|
344
|
+
raise ArgumentError, "with objects of class '#{spec_id.class}', one of your validators requires --pephits or --digestion"
|
345
|
+
else
|
346
|
+
$stderr.puts "using given spec_id.peps"
|
347
|
+
spec_id.peps
|
348
|
+
end
|
349
|
+
|
350
|
+
not_first_or_second_peps = Sequest.other_hits_sorted_by_xcorr(pephits, 2, 9, [:base_name, :first_scan, :charge])
|
351
|
+
pephits =
|
352
|
+
case postfilter
|
353
|
+
when :top_per_scan
|
354
|
+
$stderr.puts "using top_per_scan" ; not_first_or_second_peps
|
355
|
+
when :top_per_aaseq
|
356
|
+
# it doesn't matter which one is given since validators are
|
357
|
+
# based on amino acid sequence
|
358
|
+
$stderr.puts 'using top_per_aaseq'
|
359
|
+
not_first_or_second_peps.hash_by(:aaseq).values.map {|pep| pep.first }
|
360
|
+
when :top_per_aaseq_charge
|
361
|
+
$stderr.puts 'using top_per_aaseq_charge'
|
362
|
+
not_first_or_second_peps.hash_by(:aaseq, :charge).values.map {|pep| pep.first }
|
363
|
+
else
|
364
|
+
raise ArgumentError, "must have a valid postfilter method, yours: '#{postfilter}'"
|
365
|
+
end
|
366
|
+
|
367
|
+
need_false_to_total_ratio.each do |val|
|
368
|
+
val.set_false_to_total_ratio( pephits )
|
369
|
+
$stderr.puts "false_to_total_ratio for #{val.class.to_s}: #{val.false_to_total_ratio}"
|
370
|
+
end
|
371
|
+
if need_frequency.size > 0
|
372
|
+
need_frequency.each do |val|
|
373
|
+
$stderr.puts "Setting frequency!"
|
374
|
+
val.set_frequency( pephits )
|
375
|
+
end
|
376
|
+
end
|
244
377
|
end
|
245
378
|
end
|
246
|
-
opts.delete(:digestion_objects)
|
247
379
|
|
248
380
|
if (transmem_vals.size > 0) # and interactive ## we'd like to just run this for interactive
|
249
381
|
# This is overkill if we are doing a single filtering job, but it
|