mspire 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'validator'
|
2
|
+
require 'vec'
|
3
|
+
require 'enumerator'
|
4
|
+
|
5
|
+
class Validator ; end
|
6
|
+
class Validator::Background
|
7
|
+
|
8
|
+
attr_accessor :data
|
9
|
+
|
10
|
+
# Creates a background estimator.
# data:: the series of values to analyse; optional, it can also be assigned
#        afterwards through the data accessor.
def initialize(data = nil)
  @data = data
end
|
13
|
+
|
14
|
+
# Overwrites every NaN entry of +vec+ with 0, in place. Despite the name no
# element is removed, so the length of +vec+ is unchanged.
# Returns whatever vec.each_with_index returns.
def delete_nan!(vec)
  vec.each_with_index { |value, pos| vec[pos] = 0 if value.nan? }
end
|
21
|
+
|
22
|
+
# Estimates a background value from @data.
#
# The data is copied into a VecD, NaNs are zeroed, and a 9-point transform
# computes (stdev_factor * sample_stats[1]) + spread for each window
# (sample_stats[1] is presumably the standard deviation -- TODO confirm).
# That curve is averaged over 9 points, +chim+ is applied (presumably a
# derivative -- TODO confirm), and the last index where the result equals 0
# anchors the window in which the minimum of the data is returned.
#
# stdev_factor::    multiplier applied to sample_stats[1]
# stdev_points::    NOTE(review): accepted but never used; both window
#                   sizes are hard-coded to 9
# min_window_pre::  positions before the anchor included in the window
# min_window_post:: positions after the anchor included in the window
def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
  series = VecD[*@data]
  delete_nan!(series)
  noise = series.transform(9) { |win| (stdev_factor * win.sample_stats[1]) + win.spread }
  smoothed_noise = noise.transform(9) { |win| win.avg }
  anchor = index_of_last_0(smoothed_noise.chim)
  min_in_window(series, anchor, min_window_pre, min_window_post)
end
|
31
|
+
|
32
|
+
# not really working right currently (original author's note)
#
# Applies +chim+ to @data (NaNs zeroed first), takes absolute values, averages
# them over +avg_points+, applies +chim+ again, and returns the minimum of the
# data in a window around the last index where that result equals 0.
#
# avg_points::      window size handed to transform for the averaging
# min_window_pre::  positions before the anchor included in the window
# min_window_post:: positions after the anchor included in the window
def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
  series = VecD[*@data]
  delete_nan!(series)
  slopes = series.chim
  # keep magnitudes only
  slopes.each_with_index { |slope, pos| slopes[pos] = slope.abs }
  averaged = slopes.transform(avg_points) { |win| win.avg }
  anchor = index_of_last_0(averaged.chim)
  min_in_window(series, anchor, min_window_pre, min_window_post)
end
|
43
|
+
|
44
|
+
# Returns the index of the last element of +vec+ that equals 0, or nil when
# no element does.
def index_of_last_0(vec)
  found = nil
  vec.each_with_index { |value, pos| found = pos if value == 0 }
  found
end
|
53
|
+
|
54
|
+
# Returns the smallest value of +vec+ inside the window that starts +pre+
# positions before +index+ and ends +post+ positions after it. The window is
# clipped to the bounds of +vec+.
def min_in_window(vec, index, pre, post)
  first = [index - pre, 0].max
  last = [index + post, vec.size - 1].min
  vec[first..last].min
end
|
63
|
+
|
64
|
+
# very simple, should work (original author's note)
#
# Smooths @data (NaNs zeroed first) by averaging over +points+ values and
# returns the minimum of the smoothed series between +start+ and +stop+
# (inclusive indices).
#
# start::  first index of the range searched
# stop::   last index of the range searched
# points:: window size handed to transform; previously this argument was
#          ignored and 3 was always used (3 is still the default)
def min_mesa(start, stop, points=3)
  series = VecD[*@data]
  delete_nan!(series)
  smoothed = series.transform(points) { |win| win.avg }
  smoothed[start..stop].min
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'validator'
|
2
|
+
require 'validator/digestion_based'
|
3
|
+
|
4
|
+
# class for any generic kind of bias. For instance, a list of high abundance
|
5
|
+
# proteins we would expect to see, or a list of low abundance proteins we
|
6
|
+
# would not expect to see, or proteins that have been filtered out in some
|
7
|
+
# way, etc.
|
8
|
+
class Validator::Bias < Validator::DigestionBased
|
9
|
+
include Precision::Calculator
|
10
|
+
|
11
|
+
# a fasta object (by default containing proteins expected to be in the
|
12
|
+
# sample [see proteins_expected to modify that behavior])
|
13
|
+
attr_reader :fasta
|
14
|
+
|
15
|
+
# correct_wins means that only a single protein from a pep.aaseq must match
|
16
|
+
# the fasta object for the pep hit to be considered valid. Otherwise, all
|
17
|
+
# must be a match (logic negated by proteins_expected)
|
18
|
+
attr_accessor :correct_wins
|
19
|
+
|
20
|
+
# proteins_expected==true means we expect to see the proteins in the sample
|
21
|
+
# proteins_expected==false means we do not expect to see these proteins in
|
22
|
+
# the sample
|
23
|
+
attr_accessor :proteins_expected
|
24
|
+
|
25
|
+
# a hash made by taking each fasta reference in fasta_object, (everything
|
26
|
+
# until a space) and setting the value to true. It can be queried with the
|
27
|
+
# start of a fasta sequence
|
28
|
+
attr_accessor :short_reference_hash
|
29
|
+
|
30
|
+
# Option defaults: everything from Validator::DigestionBased::DEFAULTS plus
# the two switches specific to this validator.
DEFAULTS = Validator::DigestionBased::DEFAULTS.merge(
  :proteins_expected => true,
  :correct_wins => true
)
|
34
|
+
|
35
|
+
# fasta_object:: the fasta object holding the proteins of interest; it must
#                respond to +prots+ and be accepted by
#                make_short_reference_hash
# options (t = true, f = false, '*' = default):
#   :proteins_expected    => *t/f  we expect to see the fasta proteins in
#                                  our hit list
#   :correct_wins         => *t/f  a single peptide hit from one of these
#                                  proteins constitutes a true positive
#   :background           => Float between 0.0 and 1.0 (*0.0)
#   :false_to_total_ratio => Float (*1.0, inherited from
#                                  Validator::DigestionBased::DEFAULTS)
def initialize(fasta_object, options={})
  settings = DEFAULTS.merge(options)
  @proteins_expected = settings[:proteins_expected]
  @correct_wins = settings[:correct_wins]
  @background = settings[:background]
  @false_to_total_ratio = settings[:false_to_total_ratio]
  @fasta = fasta_object
  # NOTE(review): despite the name this holds an Array of references
  @header_split_hash = @fasta.prots.map { |prot| prot.reference }
  @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
end
|
49
|
+
|
50
|
+
# Builds a lookup table keyed by the +first_entry+ of every protein yielded
# by fasta_object.each; every value is true.
def self.make_short_reference_hash(fasta_object)
  lookup = {}
  fasta_object.each { |prot| lookup[prot.first_entry] = true }
  lookup
end
|
57
|
+
|
58
|
+
# Splits +peps+ into [true_positives, false_positives].
#
# A protein "is listed" when its first_entry is a key of
# @short_reference_hash. With @proteins_expected the listed proteins are the
# correct ones, otherwise the unlisted ones are. With @correct_wins a peptide
# is a true positive when any of its proteins is correct; without it every
# one of its proteins must be correct.
def partition(peps)
  # does a listed protein (true) or an unlisted one (false) decide the test?
  listed_decides = @proteins_expected ? @correct_wins : !@correct_wins

  deciding, remainder = peps.partition do |pep|
    pep.prots.any? do |pepprot|
      listed = @short_reference_hash.key?(pepprot.first_entry)
      listed_decides ? listed : !listed
    end
  end

  # without correct_wins the "any" test above found the false positives
  @correct_wins ? [deciding, remainder] : [remainder, deciding]
end
|
88
|
+
|
89
|
+
# pephit_precision is done through inheritance
|
90
|
+
|
91
|
+
# Returns a one-line, human-readable summary of this validator's settings.
def to_param_string
  "abundance={fasta=#{@fasta.filename}, proteins_expected=#{@proteins_expected}, correct_wins=#{@correct_wins}, background=#{@background}}"
end
|
94
|
+
|
95
|
+
end
|
@@ -0,0 +1,260 @@
|
|
1
|
+
require 'validator'
|
2
|
+
|
3
|
+
class Validator::Cmdline
|
4
|
+
|
5
|
+
# Maps each validator's option symbol to the class that implements it.
Validator_symbols_to_classes = {
  :tmm    => Validator::Transmem::Protein,
  :decoy  => Validator::Decoy,
  :bad_aa => Validator::AA,
  :tps    => Validator::TruePos,
  :bias   => Validator::Bias,
  :prob   => Validator::Probability
}
|
13
|
+
# Default values for the validator command-line options, grouped by validator
# symbol (was VAL_DEFAULTS). The :tmm validator also takes a file argument,
# which has no default.
DEFAULTS = {
  :tmm    => { :min_num_tmm_seqs => 1, :expect_soluble => true, :no_include_tm_peps => 0.8, :bkg => 0.0 },
  :decoy  => { :hits_together => true, :decoy_on_match => true },
  :bad_aa => { :false_if_found => true, :estimate => true, :bkg => 0.0 },
  :bias   => { :bkg => 0.0, :proteins_expected => true },
  :ties   => true
}
|
41
|
+
# Option-parser argument lists, keyed by option symbol. Each value starts
# with the switch specification, optionally followed by Array (the argument
# is a comma-separated list), then the lines of help text.
COMMAND_LINE = {
  :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
    "FILENAME of separate search on decoys.",
    "All regular expressions must be surrounded by '/'",
    "(no extended options [trailing modifiers]).",
    "e.g., a run using concatenated reversed proteins that",
    "includes 'REVERSE' in the fasta heading:",
    " --decoy /REVERSE/",
    "Anything fancier should be quoted:",
    " --decoy '/^\\s*REVERSE/'",
    "If decoys proteins were searched in a separate file,",
    "then give the FILENAME (e.g., --decoy decoy.srg)",
    "DOM = *true/false, decoy on match",],
  :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
    "fasta file containing the true protein hits"],
  # may require digestion:
  :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
    "information (that is shared between them).",
    "ORIG_FASTA = the fasta file used to do the run",
    "PARAMS = the params file used to do the run",],
  :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
    "PE = *true|false proteins in fasta file expected in sample",
    "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
  :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
    "AA = The amino acid (e.g., 'C')",
    "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
    "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],

  :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
    "phobius.small:",
    "http://phobius.cgb.ki.se/",
    "(select 'Short' output, and save output as file)",
    "toppred.out:",
    "http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html",
    "(output 'toppred.out' in 'New' or 'Xml' format)",
    "MIN = Int, minimum number transmembrane seqs (def: #{DEFAULTS[:tmm][:min_num_tmm_seqs]})",
    "SOL = true|false, this is a soluble fraction( def: #{DEFAULTS[:tmm][:expect_soluble]})",
    "PEPS = Float | false, don't consider tm peps (>= fraction",
    " tm content) (false skips) (def: #{DEFAULTS[:tmm][:no_include_tm_peps]})",
    "BKG = Float , background contaminating insoluble (def: #{DEFAULTS[:tmm][:bkg]})"],

  # VALIDATION MODIFIERS
  # The flag turns ties into false hits (its handler sets opts[:ties] to
  # false), so the help text must say "incorrect", not "correct".
  :false_on_tie => ["--false_on_tie", "if peptide belongs to correct AND incorrect proteins",
    "it will be counted as incorrect (default: correct)"],

}
|
88
|
+
|
89
|
+
# Converts a command-line string to a boolean.
# arg::     'true' gives true, 'false' gives false
# default:: returned for any other value of +arg+ (including nil)
def self.boolean(arg, default)
  return true if 'true' == arg
  return false if 'false' == arg
  default
end
|
96
|
+
|
97
|
+
# Handlers for the validator command-line options, keyed by option symbol.
# Each lambda receives the parsed option argument (+ar+) and the options hash
# (+opts+). Validator options append [symbol, *constructor_args] to
# opts[:validators]; :digestion and :false_on_tie set other keys of +opts+.
PrepArgs = {
  # ar: nil (or false), 'nsp' or 'init'; anything else raises ArgumentError
  :prob => lambda {|ar, opts|
    mthd =
      if !ar || ar == 'nsp'
        :probability
      elsif ar == 'init'
        :initial_probability
      else
        raise ArgumentError, "--prob [arg], optional arg can only be 'nsp' or 'init'!"
      end
    opts[:validators].push([:prob, mthd])
  },
  # ar: [constraint, decoy_on_match]; a constraint wrapped in '/' becomes a
  # Regexp, anything else is passed through as given (a filename)
  :decoy => lambda {|ar, opts|
    first_arg = ar[0]
    constraint =
      if first_arg[0,1] == '/' && first_arg[-1,1] == '/'
        Regexp.new(first_arg[1...-1])
      else
        first_arg
      end
    decoy_on_match = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
    opts[:validators].push([:decoy, constraint, decoy_on_match])
  },
  # ar: [fasta filename, sequest params filename]
  :digestion => lambda {|ar, opts|
    raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
    opts[:digestion] = ar.dup
    opts[:digestion_objects] = [Fasta.new(ar[0]), Sequest::Params.new(ar[1])]
  },
  # ar: [fasta filename, proteins_expected, background]
  :bias => lambda {|ar, opts|
    val_opts = {}
    fasta = Fasta.new(ar[0])
    val_opts[:proteins_expected] = self.boolean(ar[1], DEFAULTS[:bias][:proteins_expected])
    val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bias][:bkg]
    opts[:validators].push([:bias, fasta, val_opts])
  },
  # ar: [amino acid, estimate, background]
  :bad_aa => lambda {|ar, opts|
    val_opts = {}
    # the default lives under :estimate; the former lookup of :est always
    # produced nil
    val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:estimate])
    val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bad_aa][:bkg]
    opts[:validators].push([:bad_aa, ar[0], val_opts])
  },
  # ar: [transmembrane file, min tm seqs, soluble, tm peptide cutoff, background]
  :tmm => lambda {|ar, opts|
    val_opts = {}
    val_opts[:min_num_tms] = ar[1] ? ar[1].to_i : DEFAULTS[:tmm][:min_num_tmm_seqs]
    val_opts[:soluble_fraction] = self.boolean(ar[2], DEFAULTS[:tmm][:expect_soluble])
    val_opts[:no_include_tm_peps] =
      if !ar[3]
        DEFAULTS[:tmm][:no_include_tm_peps]
      elsif ar[3] == 'false'
        false
      else
        ar[3].to_f
      end
    val_opts[:background] = ar[4] ? ar[4].to_f : DEFAULTS[:tmm][:bkg]
    opts[:validators].push([:tmm, ar[0], val_opts])
  },
  :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
  :false_on_tie => lambda {|v,opts| opts[:ties] = false },
}
|
186
|
+
|
187
|
+
# Instantiates the validators described by opts[:validators] and returns
# them as an array. Also removes :digestion_objects from +opts+.
#
# opts::         options hash; each entry of opts[:validators] is
#                [symbol, *constructor_args]. The leading symbol is shifted
#                off each entry as it is processed.
# false_on_tie:: when true, validators are built with correct_wins == false
# interactive::  currently unused
# spec_id::      must respond to +peps+; only used when a :tmm validator is
#                requested
#
# Raises ArgumentError when a validator needs the --digestion objects and
# they are missing. For :tmm this is checked before anything is modified
# (it used to fail with a NoMethodError on nil instead).
def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
  validator_args = opts[:validators]
  correct_wins = !false_on_tie
  digestion_required = "requires --digestion fasta,params argument!"

  # :tmm takes its fasta object from the digestion objects
  if validator_args.any? { |args| args.first == :tmm } && !opts[:digestion_objects]
    raise ArgumentError, digestion_required
  end

  # validators whose class name matches need digestion-derived data
  potential_digestion_classes = /Transmem|AA|Bias/
  need_false_to_total_ratio = []
  need_frequency = []
  transmem_vals = []

  validators = validator_args.map do |args|
    tp = args.shift
    val_args = args.dup # shallow copy: nested option hashes are shared
    case tp
    when :tmm
      val_args[1][:correct_wins] = correct_wins
      val_args[1][:fasta] = opts[:digestion_objects][0]
    when :bias
      val_args[1][:correct_wins] = correct_wins
    when :tps
      val_args = [val_args[0], correct_wins]
    when :decoy
      # don't delete the key here since we need the decoy = regexp key
      val_args = [val_args[0], val_args[1], correct_wins]
    end
    # bad_aa and prob pass their arguments through untouched

    val = Validator_symbols_to_classes[tp].new(*val_args)

    # make some lists of validators based on pre-processing needs:
    transmem_vals << val if tp == :tmm
    if val.class.to_s =~ potential_digestion_classes
      if val_args[1][:estimate] == true
        need_frequency << val
      else
        need_false_to_total_ratio << val
      end
    end
    val
  end

  unless need_false_to_total_ratio.empty?
    raise ArgumentError, digestion_required unless opts.key?(:digestion_objects)
    peps = Digestor.digest(*opts[:digestion_objects])
    need_false_to_total_ratio.each { |val| val.set_false_to_total_ratio(peps) }
  end
  unless need_frequency.empty?
    raise ArgumentError, digestion_required unless opts.key?(:digestion_objects)
    need_frequency.each { |val| val.set_frequency(opts[:digestion_objects][0]) }
  end
  opts.delete(:digestion_objects)

  # This is overkill if we are doing a single filtering job, but it ensures
  # that it works in all the ways it is used (prob uses it too). Should be
  # refactored eventually.
  transmem_vals.each do |val|
    val.transmem_status_hash = val.create_transmem_status_hash(spec_id.peps)
  end

  validators
end
|
259
|
+
|
260
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'validator'
|
2
|
+
|
3
|
+
class Validator::Decoy < Validator
|
4
|
+
include Precision::Calculator::Decoy
|
5
|
+
|
6
|
+
attr_accessor :constraint
|
7
|
+
|
8
|
+
attr_accessor :decoy_on_match
|
9
|
+
attr_accessor :correct_wins
|
10
|
+
|
11
|
+
attr_accessor :last_pep_was_decoy
|
12
|
+
|
13
|
+
attr_accessor :increment_normal
|
14
|
+
attr_accessor :increment_decoy
|
15
|
+
attr_accessor :increment_total_submitted
|
16
|
+
|
17
|
+
attr_reader :normal_peps_just_submitted
|
18
|
+
|
19
|
+
# constraint::     what protein references are matched against (see partition)
# decoy_on_match:: true when a protein matching +constraint+ is a decoy,
#                  false when a match marks a normal protein
# correct_wins::   true when a peptide shared by normal and decoy proteins
#                  counts as normal
def initialize(constraint=nil, decoy_on_match = true, correct_wins = true)
  @constraint = constraint
  @decoy_on_match = decoy_on_match
  @correct_wins = correct_wins
end
|
24
|
+
|
25
|
+
# Returns [normal, decoy].
#
# Each protein reference of a peptide is matched against @constraint.
# With @decoy_on_match a match marks a decoy protein, otherwise it marks a
# normal one. With @correct_wins a peptide is normal as soon as one of its
# proteins is normal; without it every one of its proteins must be normal.
def partition(peps)
  matches = lambda { |prot| prot.reference.match(@constraint) }
  peps.partition do |pep|
    if @decoy_on_match
      # matches are decoys: normal unless all (or, if fp wins, any) match
      @correct_wins ? !pep.prots.all?(&matches) : !pep.prots.any?(&matches)
    elsif @correct_wins
      pep.prots.any?(&matches)
    else
      pep.prots.all?(&matches)
    end
  end
end
|
49
|
+
|
50
|
+
# Resets the running counters used by increment_pephits_precision and marks
# them as initialized.
def initialize_increment
  @increment_normal = @increment_decoy = @increment_total_submitted = 0
  @increment_initialized = true
end
|
56
|
+
|
57
|
+
|
58
|
+
# Adds +peps+ (an array of peptides, or a single SpecID::Pep) to the running
# normal/decoy totals and returns calc_precision over those totals.
# Does not deal in separate_peps right now!
# The normal peptides of this call are kept in normal_peps_just_submitted.
def increment_pephits_precision(peps)
  # silence the warning for reading the not-yet-set @increment_initialized
  saved_verbose = $VERBOSE
  $VERBOSE = nil
  initialize_increment unless @increment_initialized
  $VERBOSE = saved_verbose

  batch = peps.is_a?(SpecID::Pep) ? [peps] : peps

  @increment_total_submitted += batch.size
  normal_hits, decoy_hits = partition(batch)
  @normal_peps_just_submitted = normal_hits
  @increment_normal += normal_hits.size
  @increment_decoy += decoy_hits.size
  calc_precision(@increment_normal, @increment_decoy)
end
|
79
|
+
|
80
|
+
# Returns calc_precision(normal count, decoy count).
# peps::          normal hits when +separate_peps+ is given; otherwise all
#                 hits, which are split with partition
# separate_peps:: decoy hits from a separate search (optional)
def pephit_precision(peps, separate_peps=nil)
  return calc_precision(peps.size, separate_peps.size) if separate_peps

  normal_hits, decoy_hits = partition(peps)
  calc_precision(normal_hits.size, decoy_hits.size)
end
|
88
|
+
|
89
|
+
# Returns a one-line, human-readable summary of this validator's settings.
# The constraint is shown with #inspect, or left blank when it is not set.
def to_param_string
  shown_constraint = constraint ? constraint.inspect : ''
  "decoy={constraint=#{shown_constraint}, decoy_on_match=#{@decoy_on_match}, correct_wins=#{@correct_wins}}"
end
|
92
|
+
end
|
93
|
+
|
94
|
+
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'validator'
|
2
|
+
require 'fasta'
|
3
|
+
require 'spec_id/sequest/params'
|
4
|
+
|
5
|
+
# objects of this class can calculate pephit_precision given an array of
|
6
|
+
# SpecID::Pep objects using the pephit_precision method.
|
7
|
+
class Validator::DigestionBased < Validator
|
8
|
+
# Option defaults shared by the digestion-based validators.
DEFAULTS = { :false_to_total_ratio => 1.0, :background => 0.0 }
|
12
|
+
|
13
|
+
# the number of tps
|
14
|
+
attr_accessor :increment_tps
|
15
|
+
# the number of fps
|
16
|
+
attr_accessor :increment_fps
|
17
|
+
|
18
|
+
# the total peptides submitted to the validator (regardless of tp, fp, or
|
19
|
+
# nil)
|
20
|
+
attr_accessor :increment_total_submitted
|
21
|
+
|
22
|
+
# the ratio of false hits to total peptides in the fasta file
|
23
|
+
attr_accessor :false_to_total_ratio
|
24
|
+
|
25
|
+
# the false_to_total_ratio calculated (but not applied)
|
26
|
+
attr_reader :calculated_background
|
27
|
+
|
28
|
+
# For a sample with no false hits in it, (under defaults) this is the
|
29
|
+
# fraction of peptides with the constraint over the total number of peptides
|
30
|
+
# from which these hits are derived.
|
31
|
+
attr_accessor :background
|
32
|
+
|
33
|
+
|
34
|
+
# Returns the precision for +peps+.
# Expects the including class to define a partition method (returning
# [tps, fps]) and a @background; the raw counts are adjusted by
# calc_precision_prep before being handed to calc_precision.
def pephit_precision(peps)
  true_hits, false_hits = partition(peps)
  adjusted_tps, adjusted_fps = calc_precision_prep(true_hits.size, false_hits.size)
  calc_precision(adjusted_tps, adjusted_fps)
end
|
41
|
+
|
42
|
+
# Returns [adjusted_tps, total_false].
#
# The false count is first adjusted for @background, then scaled up with
# @false_to_total_ratio to estimate how many hits are really incorrect:
#   FALSE/TOTAL  = FALSE(found)/TOTAL(found)
#   TOTAL(found) = FALSE(found) / (FALSE/TOTAL)
# The partition algorithm may drop peptides (transmembrane ones, under
# certain options), so the true count is taken relative to the number of
# peptides that actually passed the partition.
def calc_precision_prep(num_tps, num_fps)
  passed_partition = num_tps + num_fps
  background_adjusted_fps = adjust_fps_for_background(num_tps, num_fps, @background)
  estimated_false = background_adjusted_fps / @false_to_total_ratio
  [passed_partition.to_f - estimated_false, estimated_false]
end
|
58
|
+
|
59
|
+
# Sets @false_to_total_ratio to the fraction of +peps+ that partition puts
# in its second (false) group, and returns self.
# Assumes partition returns (tps, fps).
def set_false_to_total_ratio(peps)
  true_hits, false_hits = partition(peps)
  total = true_hits.size + false_hits.size
  @false_to_total_ratio = false_hits.size.to_f / total
  self
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
class Validator::Probability
|
3
|
+
|
4
|
+
attr_accessor :prob_method
|
5
|
+
|
6
|
+
# prob_method:: name of the method sent to each object to obtain its
#               probability
def initialize(prob_method = :probability)
  @prob_method = prob_method
end
|
9
|
+
|
10
|
+
# Returns 1 - SUM(1 - prob)/#objs, where prob is the result of sending
# @prob_method to each object; returns 1.0 when +objs+ is empty.
# objs should respond to the method named by @prob_method.
def precision(objs)
  return 1.0 if objs.size == 0

  doubt = 0.0
  objs.each { |obj| doubt += 1.0 - obj.send(@prob_method) }
  1.0 - (doubt / objs.size)
end
|
25
|
+
|
26
|
+
|
27
|
+
# Adds +objs+ (an array, or a single SpecID::Pep / SpecID::Prot) to the
# running totals and returns 1 - SUM(1 - prob)/#objs over everything
# submitted so far.
# objs should respond to the method named by @prob_method.
# These should be added from high probability (1.0) to low (0.0).
def increment_precision(objs)
  objs = [objs] if objs.is_a?(SpecID::Pep) || objs.is_a?(SpecID::Prot)

  @total_objs = (@total_objs || 0) + objs.size
  @current_sum_one_minus_prob ||= 0.0
  objs.each do |obj|
    @current_sum_one_minus_prob += 1.0 - obj.send(@prob_method)
  end
  1.0 - (@current_sum_one_minus_prob / @total_objs)
end
|
43
|
+
|
44
|
+
|
45
|
+
alias_method :pephit_precision, :precision
|
46
|
+
alias_method :prothit_precision, :precision
|
47
|
+
alias_method :increment_pephits_precision, :increment_precision
|
48
|
+
end
|