mspire 0.3.1 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
data/lib/validator.rb
CHANGED
@@ -1,19 +1,34 @@
|
|
1
1
|
|
2
2
|
class Validator
|
3
3
|
|
4
|
+
# in the absence of digestion, does the spec_id type require pephits for
|
5
|
+
# validation?
|
6
|
+
def self.requires_pephits?(spec_id_obj)
|
7
|
+
case spec_id_obj
|
8
|
+
when Proph::ProtSummary : true
|
9
|
+
when Proph::PepSummary : true
|
10
|
+
when SQTGroup : true
|
11
|
+
else ; false
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
4
15
|
Validator_to_string = {
|
5
16
|
'Validator::AA' => 'badAA',
|
17
|
+
'Validator::AAEst' => 'badAAEst',
|
6
18
|
'Validator::Decoy' => 'decoy',
|
7
19
|
'Validator::Transmem::Protein' => 'tmm',
|
8
20
|
'Validator::TruePos' => 'tps',
|
9
21
|
'Validator::Bias' => 'bias',
|
10
22
|
'Validator::Probability' => 'prob',
|
23
|
+
'Validator::QValue' => 'qval',
|
11
24
|
:bad_aa => 'badAA',
|
25
|
+
:bad_aa_est => 'badAAEst',
|
12
26
|
:decoy => 'decoy',
|
13
27
|
:tmm => 'tmm',
|
14
28
|
:tps => 'tps',
|
15
29
|
:bias => 'bias',
|
16
30
|
:prob => 'prob',
|
31
|
+
:qval => 'qval',
|
17
32
|
}
|
18
33
|
|
19
34
|
def initialize_increment
|
@@ -45,12 +60,12 @@ class Validator
|
|
45
60
|
@increment_tps += tps.size
|
46
61
|
@increment_fps += fps.size
|
47
62
|
(num_tps, num_fps) =
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
63
|
+
if self.respond_to?(:calc_precision_prep) # for digestion based validators
|
64
|
+
(num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
|
65
|
+
[num_tps, num_fps]
|
66
|
+
else
|
67
|
+
[@increment_tps, @increment_fps]
|
68
|
+
end
|
54
69
|
calc_precision(num_tps, num_fps)
|
55
70
|
end
|
56
71
|
|
@@ -97,12 +112,16 @@ class Validator
|
|
97
112
|
case val
|
98
113
|
when Validator::TruePos
|
99
114
|
hash.merge( {:correct_wins => val.correct_wins, :file => val.fasta.filename } )
|
115
|
+
when Validator::AAEst
|
116
|
+
%w(frequency background calculated_background).each do |cat|
|
117
|
+
hash[cat.to_sym] = val.send(cat.to_sym)
|
118
|
+
end
|
100
119
|
when Validator::AA
|
101
|
-
%w(
|
120
|
+
%w(false_to_total_ratio background calculated_background).each do |cat|
|
102
121
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
103
122
|
end
|
104
123
|
when Validator::Decoy
|
105
|
-
%w(correct_wins decoy_on_match).each do |cat|
|
124
|
+
%w(decoy_to_target_ratio correct_wins decoy_on_match).each do |cat|
|
106
125
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
107
126
|
end
|
108
127
|
hash[:constraint] = val.constraint.inspect if val.constraint
|
@@ -119,6 +138,8 @@ class Validator
|
|
119
138
|
%w(prob_method).each do |cat|
|
120
139
|
hash[cat.to_sym] = val.send(cat.to_sym)
|
121
140
|
end
|
141
|
+
when Validator::QValue
|
142
|
+
# no params to add
|
122
143
|
else ; raise ArgumentError, "Don't know the validator class #{val}"
|
123
144
|
end
|
124
145
|
klass_as_s = val.class.to_s
|
@@ -127,46 +148,6 @@ class Validator
|
|
127
148
|
hash
|
128
149
|
end
|
129
150
|
end
|
130
|
-
|
131
|
-
=begin
|
132
|
-
## THIS IS WITH STRINGS AS KEYS!
|
133
|
-
# takes an array of validators and returns a fresh array where each has been
|
134
|
-
# turned into a sensible hash (with symbols as the keys!)
|
135
|
-
def self.sensible_validator_hashes(validators)
|
136
|
-
validators.map do |val|
|
137
|
-
hash = {}
|
138
|
-
case val
|
139
|
-
when Validator::TruePos
|
140
|
-
hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
|
141
|
-
when Validator::AA
|
142
|
-
%w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
|
143
|
-
hash[cat] = val.send(cat.to_sym)
|
144
|
-
end
|
145
|
-
when Validator::Decoy
|
146
|
-
%w(correct_wins decoy_on_match).each do |cat|
|
147
|
-
hash[cat] = val.send(cat.to_sym)
|
148
|
-
end
|
149
|
-
hash['constraint'] = val.constraint.inspect if val.constraint
|
150
|
-
when Validator::Bias
|
151
|
-
%w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
|
152
|
-
hash[cat] = val.send(cat.to_sym)
|
153
|
-
end
|
154
|
-
hash['file'] = val.fasta.filename
|
155
|
-
when Validator::Transmem::Protein
|
156
|
-
%w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
|
157
|
-
hash[cat] = val.send(cat.to_sym)
|
158
|
-
end
|
159
|
-
when Validator::Probability
|
160
|
-
else ; raise ArgumentError, "Don't know the validator class #{val}"
|
161
|
-
end
|
162
|
-
klass_as_s = val.class.to_s
|
163
|
-
hash['type'] = Validator_to_string[klass_as_s]
|
164
|
-
hash['class'] = klass_as_s
|
165
|
-
hash
|
166
|
-
end
|
167
|
-
end
|
168
|
-
=end
|
169
|
-
|
170
151
|
end
|
171
152
|
|
172
153
|
module Precision::Calculator
|
@@ -186,11 +167,11 @@ end
|
|
186
167
|
# normal hits (which may be true or false) and the second are decoy hits.
|
187
168
|
# edge case: if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
|
188
169
|
module Precision::Calculator::Decoy
|
189
|
-
def calc_precision(num_normal, num_decoy)
|
170
|
+
def calc_precision(num_normal, num_decoy, decoy_to_target_ratio=1.0)
|
190
171
|
# will calculate as floats in case fractional amounts passed in for
|
191
172
|
# whatever reason
|
192
173
|
num_normal_f = num_normal.to_f
|
193
|
-
num_true_pos =
|
174
|
+
num_true_pos = num_normal_f - (num_decoy.to_f / decoy_to_target_ratio)
|
194
175
|
precision =
|
195
176
|
if num_normal_f == 0.0
|
196
177
|
if num_decoy.to_f > 0.0
|
@@ -204,11 +185,13 @@ module Precision::Calculator::Decoy
|
|
204
185
|
end
|
205
186
|
end
|
206
187
|
|
207
|
-
require 'validator/true_pos'
|
208
|
-
require 'validator/aa'
|
209
|
-
require 'validator/
|
210
|
-
require 'validator/
|
211
|
-
require 'validator/
|
212
|
-
require 'validator/
|
213
|
-
require 'validator/
|
188
|
+
#require 'validator/true_pos'
|
189
|
+
#require 'validator/aa'
|
190
|
+
#require 'validator/aa_est'
|
191
|
+
#require 'validator/bias'
|
192
|
+
#require 'validator/decoy'
|
193
|
+
#require 'validator/transmem'
|
194
|
+
#require 'validator/probability'
|
195
|
+
#require 'validator/q_value'
|
196
|
+
#require 'validator/prot_from_pep'
|
214
197
|
|
data/lib/validator/aa.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
|
2
1
|
require 'validator/digestion_based'
|
3
2
|
require 'fasta'
|
4
3
|
require 'spec_id/aa_freqs'
|
@@ -12,12 +11,7 @@ class Validator::AA < Validator::DigestionBased
|
|
12
11
|
|
13
12
|
# it is a false hit if the amino acid is located in the peptide
|
14
13
|
attr_accessor :false_if_found
|
15
|
-
|
16
|
-
# if given, the frequency of the amino acid is used to estimate the false to
|
17
|
-
# total ratio based on the pephits given for pephit_precision.
|
18
|
-
# see Validator::AA.calc_frequency to calculate a frequency
|
19
|
-
attr_accessor :frequency
|
20
|
-
|
14
|
+
|
21
15
|
DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
|
22
16
|
:false_if_found => true,
|
23
17
|
} )
|
@@ -34,21 +28,9 @@ class Validator::AA < Validator::DigestionBased
|
|
34
28
|
end
|
35
29
|
end
|
36
30
|
|
37
|
-
# takes a fasta object and sets the frequency based on constraint.
|
38
|
-
# constraint is one acceptable to initialize!
|
39
|
-
# returns self
|
40
|
-
def set_frequency(fasta_obj)
|
41
|
-
table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
|
42
|
-
@frequency = table[@constraint.to_sym]
|
43
|
-
self
|
44
|
-
end
|
45
|
-
|
46
31
|
# right now only accepts single amino acids as constraints (as a string,
|
47
32
|
# e.g. 'C', or symbol, e.g. :C)
|
48
33
|
# options:
|
49
|
-
# :frequency OR :false_to_total_ratio should be used (NOT both)
|
50
|
-
# :frequency => Float, if the frequency of the amino acid is known (see
|
51
|
-
# Validator::AA.calc_frequency)
|
52
34
|
# :false_to_total_ratio => if a true digestion was already performed (see
|
53
35
|
# Validator::AA.calc_false_to_total_ratio)
|
54
36
|
# :false_if_found => it is a false positive if the amino acid is found.
|
@@ -56,80 +38,11 @@ class Validator::AA < Validator::DigestionBased
|
|
56
38
|
def initialize(constraint, options={})
|
57
39
|
@constraint = constraint.to_s
|
58
40
|
opts = DEFAULTS.merge(options)
|
59
|
-
(@
|
60
|
-
end
|
61
|
-
|
62
|
-
# if expected is 0 then will return precision = 1.0
|
63
|
-
def pephit_precision(peps)
|
64
|
-
if @frequency
|
65
|
-
(actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
|
66
|
-
if expected == 0.0
|
67
|
-
1.0
|
68
|
-
else
|
69
|
-
# what's this guy ?? good for??
|
70
|
-
fraction_of_expected = actual.to_f/expected
|
71
|
-
pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
|
72
|
-
end
|
73
|
-
elsif @false_to_total_ratio
|
74
|
-
super(peps)
|
75
|
-
else
|
76
|
-
raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
# returns (Actual(Int), Expected(Float)) based on how many peptides have at
|
81
|
-
# least one amino_acid, the frequency it is observed in background (then we
|
82
|
-
# can look at the size of each peptide and determine the likelihood of
|
83
|
-
# having the peptide with at least one amino acid).
|
84
|
-
# amino_acid should be a string (e.g., 'C')
|
85
|
-
def at_least_one(amino_acid, freq, amino_acid_seqs)
|
86
|
-
one_minus_freq = 1.0 - freq
|
87
|
-
probs = []
|
88
|
-
actual = 0
|
89
|
-
expected = 0.0
|
90
|
-
amino_acid_seqs.each do |aaseq|
|
91
|
-
expected += (1.0 - (one_minus_freq**aaseq.size))
|
92
|
-
if aaseq.include?(amino_acid)
|
93
|
-
actual += 1
|
94
|
-
end
|
95
|
-
end
|
96
|
-
[actual, expected]
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
# given: (actual # with 'AA', expected # with 'AA', total#peptides,
|
101
|
-
# mean_fraction_of_cysteines_true)
|
102
|
-
#
|
103
|
-
# PepHit('AA') = Peptide containing at least one 'AA'
|
104
|
-
# # expected PepHit('AA') # observed Bad Pep ('AA')
|
105
|
-
# ----------------------- proportional_to -------------------------
|
106
|
-
# # total PepHits # Total Bad PepHit
|
107
|
-
#
|
108
|
-
# returns the precision
|
109
|
-
# the background correction factor will not reduce the actual count of
|
110
|
-
# peptides to < 0. One can still get negative precision scores, however,
|
111
|
-
# depending on the other variables.
|
112
|
-
# background is the number of peptides with the amino acid in the purest
|
113
|
-
# sample over the total number of peps.
|
114
|
-
#---
|
115
|
-
# this is thoroughly explained in my 2007_09 presentations (inkscape)
|
116
|
-
#+++
|
117
|
-
def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
|
118
|
-
actual = actual.to_f
|
119
|
-
@calculated_background = actual / total_peps
|
120
|
-
actual -= (total_peps * background)
|
121
|
-
# We were doing it compared to the number expected.. but this is more
|
122
|
-
# clear
|
123
|
-
# actual/false_hits = expected/total_peps_passing
|
124
|
-
# false_hits = (total_peps_passing * actual) / expected
|
125
|
-
if actual < 0.0 ; actual = 0.0 end
|
126
|
-
total_number_false = (actual * total_peps).to_f / expected
|
127
|
-
#fppr = total_number_false / total_peps
|
128
|
-
prec = (total_peps - total_number_false) / total_peps
|
41
|
+
(@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
|
129
42
|
end
|
130
43
|
|
131
44
|
def to_param_string
|
132
|
-
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "
|
45
|
+
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
|
133
46
|
end
|
134
47
|
end
|
135
48
|
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'validator/aa'
|
2
|
+
|
3
|
+
|
4
|
+
class Validator ; end
|
5
|
+
class Validator::AA ; end
|
6
|
+
|
7
|
+
# A class that uses the peps given to it and a background frequency to
|
8
|
+
# calculate the false_to_total_ratio at each turn.
|
9
|
+
class Validator::AAEst < Validator::AA
|
10
|
+
attr_accessor :constraint
|
11
|
+
attr_accessor :false_if_found
|
12
|
+
|
13
|
+
# the frequency of the amino acid is used to estimate the false to
|
14
|
+
# total ratio based on the pephits given for pephit_precision.
|
15
|
+
# see Validator::AA.calc_frequency to calculate a frequency
|
16
|
+
# or use set_frequency to set from pep hits.
|
17
|
+
attr_accessor :frequency
|
18
|
+
|
19
|
+
DEFAULTS = {
|
20
|
+
:false_if_found => true
|
21
|
+
}.merge(Validator::DigestionBased::DEFAULTS) # background 0.0
|
22
|
+
|
23
|
+
# only takes a string right now for constraint
|
24
|
+
def initialize(constraint, options={})
|
25
|
+
@constraint = constraint.to_s
|
26
|
+
opts = DEFAULTS.merge(options)
|
27
|
+
(@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
|
28
|
+
end
|
29
|
+
|
30
|
+
def pephit_precision(peps)
|
31
|
+
set_false_to_total_ratio(peps)
|
32
|
+
super(peps)
|
33
|
+
end
|
34
|
+
|
35
|
+
def set_false_to_total_ratio(peps)
|
36
|
+
if peps.size > 0
|
37
|
+
expected = 0.0
|
38
|
+
peps.each do |pep|
|
39
|
+
expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
|
40
|
+
end
|
41
|
+
@false_to_total_ratio = expected / peps.size
|
42
|
+
else
|
43
|
+
@false_to_total_ratio = 1.0
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def set_ongoing_false_to_total_ratio(peps)
|
48
|
+
if peps.size > 0
|
49
|
+
peps.each do |pep|
|
50
|
+
@expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
|
51
|
+
end
|
52
|
+
# @increment_total_submitted should == @increment_tps + @increment_fps
|
53
|
+
# since these are either/or
|
54
|
+
@false_to_total_ratio = @expected / @increment_total_submitted
|
55
|
+
else
|
56
|
+
@false_to_total_ratio = 1.0
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
def to_param_string
|
62
|
+
"aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
|
63
|
+
end
|
64
|
+
|
65
|
+
# takes objects responding to aaseq and sets the frequency based on
|
66
|
+
# constraint. constraint is one acceptable to initialize! returns self
|
67
|
+
def set_frequency(objs)
|
68
|
+
table = SpecID::AAFreqs.new.calculate_frequencies(objs)
|
69
|
+
@frequency = table[@constraint.to_sym]
|
70
|
+
self
|
71
|
+
end
|
72
|
+
|
73
|
+
# if adding pephits in groups at a time, the entire group does not need to be
|
74
|
+
# queried, just the individual hit. Use this OR pephits_precision (NOT
|
75
|
+
# both). The initial query to this method will begin a running tally that
|
76
|
+
# is saved by the validator.
|
77
|
+
# takes either an array or a single pephit (determined by if it is a
|
78
|
+
# SpecID::Pep)
|
79
|
+
def increment_pephits_precision(peps)
|
80
|
+
tmp = $VERBOSE; $VERBOSE = nil
|
81
|
+
unless @increment_initialized
|
82
|
+
initialize_increment
|
83
|
+
@expected = 0.0
|
84
|
+
end
|
85
|
+
$VERBOSE = tmp
|
86
|
+
|
87
|
+
to_submit =
|
88
|
+
if peps.is_a? SpecID::Pep
|
89
|
+
[peps]
|
90
|
+
else
|
91
|
+
peps
|
92
|
+
end
|
93
|
+
@increment_total_submitted += to_submit.size
|
94
|
+
(tps, fps) = partition(to_submit)
|
95
|
+
#### THIS IS THE MAGIC FOR THIS VALIDATOR:
|
96
|
+
set_ongoing_false_to_total_ratio(to_submit)
|
97
|
+
|
98
|
+
@increment_tps += tps.size
|
99
|
+
@increment_fps += fps.size
|
100
|
+
(num_tps, num_fps) =
|
101
|
+
if self.respond_to?(:calc_precision_prep) # for digestion based validators
|
102
|
+
(num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
|
103
|
+
[num_tps, num_fps]
|
104
|
+
else
|
105
|
+
[@increment_tps, @increment_fps]
|
106
|
+
end
|
107
|
+
calc_precision(num_tps, num_fps)
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
end
|
data/lib/validator/cmdline.rb
CHANGED
@@ -1,14 +1,31 @@
|
|
1
1
|
require 'validator'
|
2
2
|
|
3
|
+
require 'validator/true_pos'
|
4
|
+
require 'validator/aa'
|
5
|
+
require 'validator/aa_est'
|
6
|
+
require 'validator/bias'
|
7
|
+
require 'validator/decoy'
|
8
|
+
require 'validator/transmem'
|
9
|
+
require 'validator/probability'
|
10
|
+
require 'validator/q_value'
|
11
|
+
require 'validator/prot_from_pep'
|
12
|
+
|
13
|
+
## these are all for a stupid check...
|
14
|
+
require 'spec_id/sqt'
|
15
|
+
require 'spec_id/proph/prot_summary'
|
16
|
+
require 'spec_id/proph/pep_summary'
|
17
|
+
|
3
18
|
class Validator::Cmdline
|
4
19
|
|
5
20
|
Validator_symbols_to_classes = {
|
6
21
|
:tmm => Validator::Transmem::Protein,
|
7
22
|
:decoy => Validator::Decoy,
|
8
23
|
:bad_aa => Validator::AA,
|
24
|
+
:bad_aa_est => Validator::AAEst,
|
9
25
|
:tps => Validator::TruePos,
|
10
26
|
:bias => Validator::Bias,
|
11
27
|
:prob => Validator::Probability,
|
28
|
+
:qval => Validator::QValue,
|
12
29
|
}
|
13
30
|
# was VAL_DEFAULTS
|
14
31
|
DEFAULTS = {
|
@@ -24,11 +41,16 @@ class Validator::Cmdline
|
|
24
41
|
{
|
25
42
|
:hits_together => true,
|
26
43
|
:decoy_on_match => true,
|
44
|
+
:decoy_to_target_ratio => 1.0,
|
27
45
|
},
|
28
46
|
:bad_aa =>
|
29
47
|
{
|
30
48
|
:false_if_found => true,
|
31
|
-
:
|
49
|
+
:bkg => 0.0,
|
50
|
+
},
|
51
|
+
:bad_aa_est =>
|
52
|
+
{
|
53
|
+
:false_if_found => true,
|
32
54
|
:bkg => 0.0,
|
33
55
|
},
|
34
56
|
:bias =>
|
@@ -39,7 +61,7 @@ class Validator::Cmdline
|
|
39
61
|
:ties => true,
|
40
62
|
}
|
41
63
|
COMMAND_LINE = {
|
42
|
-
:decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
|
64
|
+
:decoy => ["--decoy /REGEXP/|FILENAME[,DTR,DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
|
43
65
|
"FILENAME of separate search on decoys.",
|
44
66
|
"All regular expressions must be surrounded by '/'",
|
45
67
|
"(no extended options [trailing modifiers]).",
|
@@ -50,21 +72,30 @@ class Validator::Cmdline
|
|
50
72
|
" --decoy '/^\\s*REVERSE/'",
|
51
73
|
"If decoys proteins were searched in a separate file,",
|
52
74
|
"then give the FILENAME (e.g., --decoy decoy.srg)",
|
75
|
+
"DTR = Decoy to Target Ratio (default: #{DEFAULTS[:decoy][:decoy_to_target_ratio]})",
|
53
76
|
"DOM = *true/false, decoy on match",],
|
54
77
|
:tps => ["--tps <fasta>", "for a completely defined sample, this is the",
|
55
78
|
"fasta file containing the true protein hits"],
|
56
79
|
# may require digestion:
|
57
|
-
:
|
58
|
-
|
80
|
+
:fasta => ["--fasta FASTA", "fasta file for phobius transmembrane",
|
81
|
+
"(needed if PEPS options is not false)"],
|
82
|
+
:digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "[not recommended]",
|
83
|
+
"Creates the 'false/total' ratio with in silico",
|
84
|
+
"digestion. Otherwise, the 3rd-10th best hits (sorted by",
|
85
|
+
"xcorr) are used.",
|
86
|
+
"The following validators will use this",
|
87
|
+
"information (shared between them) if option given",
|
59
88
|
"ORIG_FASTA = the fasta file used to do the run",
|
60
89
|
"PARAMS = the params file used to do the run",],
|
61
90
|
:bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
|
62
91
|
"PE = *true|false proteins in fasta file expected in sample",
|
63
92
|
"BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
|
64
|
-
:bad_aa => ["--bad_aa AA,
|
93
|
+
:bad_aa => ["--bad_aa AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
|
65
94
|
"AA = The amino acid (e.g., 'C')",
|
66
|
-
"EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
|
67
95
|
"BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],
|
96
|
+
:bad_aa_est => ["--bad_aa_est AA,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
|
97
|
+
"AA = The amino acid (e.g., 'C')",
|
98
|
+
"BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa_est][:bkg]}):",],
|
68
99
|
|
69
100
|
:tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
|
70
101
|
"phobius.small:",
|
@@ -110,17 +141,27 @@ class Validator::Cmdline
|
|
110
141
|
end
|
111
142
|
opts[:validators].push([:prob, mthd])
|
112
143
|
},
|
144
|
+
:qval => lambda {|ar, opts| opts[:validators].push([:qval]) },
|
113
145
|
:decoy => lambda {|ar, opts|
|
114
146
|
myargs = [:decoy]
|
115
147
|
first_arg = ar[0]
|
116
|
-
|
148
|
+
val_opts = {}
|
149
|
+
val_opts[:constraint] =
|
117
150
|
if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
|
151
|
+
# cast as a regular expression if it has '/ /'
|
118
152
|
Regexp.new(first_arg[1...-1])
|
119
153
|
else
|
154
|
+
# assume that it is a filename
|
155
|
+
raise ArgumentError, "File does not exist: #{first_arg}\n(was this supposed to be a regular expression? if so, should be given: /#{first_arg}/)" unless File.exist?(first_arg)
|
120
156
|
first_arg
|
121
157
|
end
|
122
|
-
|
123
|
-
|
158
|
+
val_opts[:decoy_to_target_ratio] = (ar[1] || DEFAULTS[:decoy][:decoy_to_target_ratio]).to_f
|
159
|
+
val_opts[:decoy_on_match] = self.boolean(ar[2], DEFAULTS[:decoy][:decoy_on_match])
|
160
|
+
myargs.push(val_opts)
|
161
|
+
opts[:validators].push(myargs)
|
162
|
+
},
|
163
|
+
:fasta => lambda {|arg, opts|
|
164
|
+
opts[:fasta] = Fasta.new(arg)
|
124
165
|
},
|
125
166
|
:digestion => lambda {|ar, opts|
|
126
167
|
raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
|
@@ -138,6 +179,9 @@ class Validator::Cmdline
|
|
138
179
|
else
|
139
180
|
DEFAULTS[:bias][:bkg]
|
140
181
|
end
|
182
|
+
if ar[3]
|
183
|
+
val_opts[:false_to_total_ratio] = ar[3].to_f
|
184
|
+
end
|
141
185
|
myargs.push(val_opts)
|
142
186
|
opts[:validators].push(myargs)
|
143
187
|
},
|
@@ -146,16 +190,36 @@ class Validator::Cmdline
|
|
146
190
|
myargs = [:bad_aa]
|
147
191
|
myargs.push( ar[0] )
|
148
192
|
val_opts = {}
|
149
|
-
val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:est])
|
150
193
|
val_opts[:background] =
|
151
|
-
if ar[
|
152
|
-
ar[
|
194
|
+
if ar[1]
|
195
|
+
ar[1].to_f
|
153
196
|
else
|
154
197
|
DEFAULTS[:bad_aa][:bkg]
|
155
198
|
end
|
199
|
+
if ar[2]
|
200
|
+
val_opts[:false_to_total_ratio] = ar[2].to_f
|
201
|
+
end
|
156
202
|
myargs.push(val_opts)
|
157
203
|
opts[:validators].push(myargs)
|
158
204
|
},
|
205
|
+
:bad_aa_est => lambda {|ar, opts|
|
206
|
+
## GET the FREQUENCY
|
207
|
+
myargs = [:bad_aa_est]
|
208
|
+
myargs.push( ar[0] )
|
209
|
+
val_opts = {}
|
210
|
+
val_opts[:background] =
|
211
|
+
if ar[1]
|
212
|
+
ar[1].to_f
|
213
|
+
else
|
214
|
+
DEFAULTS[:bad_aa_est][:bkg]
|
215
|
+
end
|
216
|
+
if ar[2]
|
217
|
+
val_opts[:frequency] = ar[2].to_f
|
218
|
+
end
|
219
|
+
myargs.push(val_opts)
|
220
|
+
opts[:validators].push(myargs)
|
221
|
+
},
|
222
|
+
|
159
223
|
:tmm => lambda {|ar, opts|
|
160
224
|
myargs = [:tmm]
|
161
225
|
myargs.push( ar[0] )
|
@@ -177,16 +241,38 @@ class Validator::Cmdline
|
|
177
241
|
if ar[4] ; ar[4].to_f
|
178
242
|
else ; DEFAULTS[:tmm][:bkg]
|
179
243
|
end
|
244
|
+
if ar[5]
|
245
|
+
val_opts[:false_to_total_ratio] = ar[5].to_f
|
246
|
+
end
|
180
247
|
myargs.push(val_opts)
|
181
248
|
opts[:validators].push( myargs )
|
182
249
|
},
|
250
|
+
:pephits => lambda {|v,opts| opts[:pephits] = SpecID.new(v) },
|
183
251
|
:tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
|
184
252
|
:false_on_tie => lambda {|v,opts| opts[:ties] = false },
|
185
253
|
}
|
186
254
|
|
255
|
+
def self.requires_pephits?(spec_id_obj)
|
256
|
+
case spec_id_obj
|
257
|
+
when Proph::ProtSummary : true
|
258
|
+
# at least currently (subject to change)
|
259
|
+
when Proph::PepSummary : true
|
260
|
+
when SQTGroup
|
261
|
+
if spec_id_obj.peps.first.respond_to?(:q_value)
|
262
|
+
# it's percolator output and we don't have other hits to use
|
263
|
+
true
|
264
|
+
else
|
265
|
+
false
|
266
|
+
end
|
267
|
+
else ; false
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
187
271
|
# remove the keys from opts involved in validators and return an array
|
188
272
|
# of validators
|
189
|
-
|
273
|
+
# postfilter is one of :top_per_scan, :top_per_aaseq,
|
274
|
+
# :top_per_aaseq_charge (of which last two are subsets of scan)
|
275
|
+
def self.prepare_validators(opts, false_on_tie, interactive, postfilter, spec_id)
|
190
276
|
validator_args = opts[:validators]
|
191
277
|
correct_wins = !false_on_tie
|
192
278
|
need_false_to_total_ratio = []
|
@@ -199,7 +285,9 @@ class Validator::Cmdline
|
|
199
285
|
case tp
|
200
286
|
when :tmm
|
201
287
|
val_args[1][:correct_wins] = correct_wins
|
202
|
-
|
288
|
+
if opts.key?(:fasta)
|
289
|
+
val_args[1][:fasta] = opts[:fasta]
|
290
|
+
end
|
203
291
|
val_args
|
204
292
|
when :bias
|
205
293
|
val_args[1][:correct_wins] = correct_wins
|
@@ -208,10 +296,10 @@ class Validator::Cmdline
|
|
208
296
|
val_args = [val_args[0], correct_wins]
|
209
297
|
val_args
|
210
298
|
when :decoy
|
211
|
-
val_args
|
299
|
+
val_args[0][:correct_wins] = correct_wins
|
212
300
|
# don't delete the key here since we need the decoy = regexp key
|
213
301
|
val_args
|
214
|
-
else ## bad_aa and
|
302
|
+
else ## bad_aa, prob, and qval are represented here:
|
215
303
|
val_args
|
216
304
|
end
|
217
305
|
val = Validator_symbols_to_classes[tp].new( *val_args )
|
@@ -219,10 +307,12 @@ class Validator::Cmdline
|
|
219
307
|
if tp == :tmm
|
220
308
|
transmem_vals << val
|
221
309
|
end
|
222
|
-
potential_digestion_classes = /Transmem|AA|Bias/
|
310
|
+
potential_digestion_classes = /Transmem|AA|AAEst|Bias/
|
223
311
|
if val.class.to_s =~ potential_digestion_classes
|
224
|
-
if
|
225
|
-
need_frequency
|
312
|
+
if val.class.to_s == 'Validator::AAEst'
|
313
|
+
need_frequency.push(val) if val.frequency.nil?
|
314
|
+
elsif !(val.false_to_total_ratio.nil?)
|
315
|
+
$stderr.puts "using false_to_total_ratio: #{val.false_to_total_ratio}"
|
226
316
|
else
|
227
317
|
need_false_to_total_ratio << val
|
228
318
|
end
|
@@ -230,20 +320,62 @@ class Validator::Cmdline
|
|
230
320
|
val
|
231
321
|
end
|
232
322
|
|
233
|
-
if need_false_to_total_ratio.size > 0
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
323
|
+
if ((need_false_to_total_ratio.size > 0) or (need_frequency.size > 0))
|
324
|
+
if opts.key?(:digestion_objects)
|
325
|
+
#raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
|
326
|
+
peps = Digestor.digest( *(opts[:digestion_objects]) )
|
327
|
+
need_false_to_total_ratio.each do |val|
|
328
|
+
val.set_false_to_total_ratio( peps )
|
329
|
+
end
|
330
|
+
if need_frequency.size > 0
|
331
|
+
need_frequency.each do |val|
|
332
|
+
val.set_frequency( opts[:digestion_objects][0] )
|
333
|
+
end
|
334
|
+
end
|
335
|
+
opts.delete(:digestion_objects)
|
336
|
+
else ## do the new and improved selection of non-top hits to get false_to_total_ratios and freqs
|
337
|
+
$stderr.puts "...using pephits to calculate background ratios"
|
338
|
+
# first_index, last_index
|
339
|
+
pephits =
|
340
|
+
if opts[:pephits] ## protein prophet (since it needs to get ratios somewhere
|
341
|
+
$stderr.puts "using --pephits"
|
342
|
+
opts[:pephits].peps
|
343
|
+
elsif requires_pephits?(spec_id)
|
344
|
+
raise ArgumentError, "with objects of class '#{spec_id.class}', one of your validators requires --pephits or --digestion"
|
345
|
+
else
|
346
|
+
$stderr.puts "using given spec_id.peps"
|
347
|
+
spec_id.peps
|
348
|
+
end
|
349
|
+
|
350
|
+
not_first_or_second_peps = Sequest.other_hits_sorted_by_xcorr(pephits, 2, 9, [:base_name, :first_scan, :charge])
|
351
|
+
pephits =
|
352
|
+
case postfilter
|
353
|
+
when :top_per_scan
|
354
|
+
$stderr.puts "using top_per_scan" ; not_first_or_second_peps
|
355
|
+
when :top_per_aaseq
|
356
|
+
# it doesn't matter which one is given since validators are
|
357
|
+
# based on amino acid sequence
|
358
|
+
$stderr.puts 'using top_per_aaseq'
|
359
|
+
not_first_or_second_peps.hash_by(:aaseq).values.map {|pep| pep.first }
|
360
|
+
when :top_per_aaseq_charge
|
361
|
+
$stderr.puts 'using top_per_aaseq_charge'
|
362
|
+
not_first_or_second_peps.hash_by(:aaseq, :charge).values.map {|pep| pep.first }
|
363
|
+
else
|
364
|
+
raise ArgumentError, "must have a valid postfilter method, yours: '#{postfilter}'"
|
365
|
+
end
|
366
|
+
|
367
|
+
need_false_to_total_ratio.each do |val|
|
368
|
+
val.set_false_to_total_ratio( pephits )
|
369
|
+
$stderr.puts "false_to_total_ratio for #{val.class.to_s}: #{val.false_to_total_ratio}"
|
370
|
+
end
|
371
|
+
if need_frequency.size > 0
|
372
|
+
need_frequency.each do |val|
|
373
|
+
$stderr.puts "Setting frequency!"
|
374
|
+
val.set_frequency( pephits )
|
375
|
+
end
|
376
|
+
end
|
244
377
|
end
|
245
378
|
end
|
246
|
-
opts.delete(:digestion_objects)
|
247
379
|
|
248
380
|
if (transmem_vals.size > 0) # and interactive ## we'd like to just run this for interactive
|
249
381
|
# This is overkill if we are doing a single filtering job, but it
|