mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
require 'vec'
|
|
3
|
+
require 'enumerator'
|
|
4
|
+
|
|
5
|
+
class Validator ; end
|
|
6
|
+
class Validator::Background
|
|
7
|
+
|
|
8
|
+
attr_accessor :data
|
|
9
|
+
|
|
10
|
+
# Keeps a reference to the series the background estimators work on.
# data:: values later splatted into a VecD by the estimator methods
#        (defaults to nil)
def initialize(data = nil)
  @data = data
end
|
|
13
|
+
|
|
14
|
+
# Overwrites every NaN entry of vec with 0, in place (nothing is actually
# removed, despite the name). Returns whatever vec.each_with_index returns.
def delete_nan!(vec)
  vec.each_with_index { |value, pos| vec[pos] = 0 if value.nan? }
end
|
|
21
|
+
|
|
22
|
+
# Background estimate taken from @data:
#   1. NaNs are zeroed (delete_nan!)
#   2. a 9-point windowed transform computes
#      (stdev_factor * sample_stats[1]) + spread for each window
#      (sample_stats[1] is presumably the standard deviation -- confirm in 'vec')
#   3. that series is averaged over another 9-point window
#   4. the last index where the 'chim' of the averaged series equals 0 is located
#      (the original code treats the chim output as derivatives)
#   5. the minimum of the data around that index is returned
#
# NOTE(review): stdev_points is accepted but never used; both windows are
# fixed at 9 points.
def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
  values = VecD[*@data]
  delete_nan!(values)
  noise = values.transform(9) { |win| (stdev_factor * win.sample_stats[1]) + win.spread }
  averaged_noise = noise.transform(9) { |win| win.avg }
  anchor = index_of_last_0(averaged_noise.chim)
  min_in_window(values, anchor, min_window_pre, min_window_post)
end
|
|
31
|
+
|
|
32
|
+
# not really working right currently
#
# Background estimate from @data based on the 'chim' series of the data
# (used as derivatives by the original code): the absolute values of that
# series are averaged over avg_points-wide windows, the last index where the
# chim of the averaged series equals 0 is located, and the minimum of the
# (NaN-zeroed) data around that index is returned.
def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
  values = VecD[*@data]
  delete_nan!(values)
  slopes = values.chim
  # absolute value, in place
  slopes.each_with_index { |slope, pos| slopes[pos] = slope.abs }
  averaged = slopes.transform(avg_points) { |win| win.avg }
  anchor = index_of_last_0(averaged.chim)
  min_in_window(values, anchor, min_window_pre, min_window_post)
end
|
|
43
|
+
|
|
44
|
+
# Returns the index of the last element of vec that == 0, or nil when no
# element is zero.
def index_of_last_0(vec)
  found = nil
  vec.each_with_index { |value, pos| found = pos if value == 0 }
  found
end
|
|
53
|
+
|
|
54
|
+
# Returns the minimum of vec over the window running from (index - pre) to
# (index + post), inclusive. The window is clipped to the valid index range
# of vec (0 .. vec.size - 1).
def min_in_window(vec, index, pre, post)
  first = [index - pre, 0].max
  final = [index + post, vec.size - 1].min
  vec[first..final].min
end
|
|
63
|
+
|
|
64
|
+
# very simple, should work
#
# Smooths the (NaN-zeroed) data with a points-wide averaging window and
# returns the minimum of the smoothed series between indices start and stop
# (inclusive).
# start::  first index of the region to inspect
# stop::   last index of the region to inspect
# points:: width handed to VecD#transform for the averaging window (default 3)
def min_mesa(start, stop, points=3)
  data_vec = VecD[*@data]
  delete_nan!(data_vec)
  # honour the caller's window width (it used to be hard-coded to 3)
  smoothed = data_vec.transform(points) {|v| v.avg }
  smoothed[start..stop].min
end
|
|
71
|
+
|
|
72
|
+
end
|
|
73
|
+
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
require 'validator/digestion_based'
|
|
3
|
+
|
|
4
|
+
# class for any generic kind of bias. For instance, a list of high abundance
|
|
5
|
+
# proteins we would expect to see, or a list of low abundance proteins we
|
|
6
|
+
# would not expect to see, or proteins that have been filtered out in some
|
|
7
|
+
# way, etc.
|
|
8
|
+
class Validator::Bias < Validator::DigestionBased
|
|
9
|
+
include Precision::Calculator
|
|
10
|
+
|
|
11
|
+
# a fasta object (by default containing proteins expected to be in the
|
|
12
|
+
# sample [see proteins_expected to modify that behavior])
|
|
13
|
+
attr_reader :fasta
|
|
14
|
+
|
|
15
|
+
# correct_wins means that only a single protein from a pep.aaseq must match
|
|
16
|
+
# the fasta object for the pep hit to be considered valid. Otherwise, all
|
|
17
|
+
# must be a match (logic negated by proteins_expected)
|
|
18
|
+
attr_accessor :correct_wins
|
|
19
|
+
|
|
20
|
+
# proteins_expected==true means we expect to see the proteins in the sample
|
|
21
|
+
# proteins_expected==false means we do not expect to see these proteins in
|
|
22
|
+
# the sample
|
|
23
|
+
attr_accessor :proteins_expected
|
|
24
|
+
|
|
25
|
+
# a hash made by taking each fasta reference in fasta_object, (everything
|
|
26
|
+
# until a space) and setting the value to true. It can be queried with the
|
|
27
|
+
# start of a fasta sequence
|
|
28
|
+
attr_accessor :short_reference_hash
|
|
29
|
+
|
|
30
|
+
# Option defaults: everything in Validator::DigestionBased::DEFAULTS plus the
# two bias-specific switches.
DEFAULTS = Validator::DigestionBased::DEFAULTS.merge(
  :proteins_expected => true,
  :correct_wins => true
)
|
|
34
|
+
|
|
35
|
+
# fasta_object:: the fasta object holding the proteins of interest; must
#                respond to prots and each
# options (merged over DEFAULTS; t = true, f = false, '*' = default):
#   :proteins_expected    => *t/f we expect to see the fasta proteins in our hit list
#   :correct_wins         => *t/f a single peptide hit from one of these proteins
#                            constitutes a true positive
#   :background           => Float between 0.0 and 1.0 (*0.0)
#   :false_to_total_ratio => Float (*1.0)
def initialize(fasta_object, options={})
  opts = DEFAULTS.merge(options)
  @proteins_expected    = opts[:proteins_expected]
  @correct_wins         = opts[:correct_wins]
  @background           = opts[:background]
  @false_to_total_ratio = opts[:false_to_total_ratio]
  @fasta = fasta_object
  # NOTE: despite its name this is an Array of the protein references
  @header_split_hash = @fasta.prots.map {|prot| prot.reference }
  @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
end
|
|
49
|
+
|
|
50
|
+
# Builds a lookup hash whose keys are the first_entry of every protein
# yielded by fasta_object.each; every value is true.
def self.make_short_reference_hash(fasta_object)
  lookup = {}
  fasta_object.each { |prot| lookup[prot.first_entry] = true }
  lookup
end
|
|
57
|
+
|
|
58
|
+
# Splits peps into [true_positives, false_positives].
#
# A protein "is listed" when its first_entry is a key of
# @short_reference_hash. Depending on the flags a peptide is a true positive
# when:
#   proteins_expected  correct_wins   true positive if ...
#   true               true           any of its proteins is listed
#   true               false          all of its proteins are listed
#   false              true           any of its proteins is NOT listed
#   false              false          none of its proteins is listed
def partition(peps)
  # which kind of protein (listed / unlisted) is probed for with any?
  probe_listed = @proteins_expected ? @correct_wins : !@correct_wins

  hits, rest = peps.partition do |pep|
    pep.prots.any? do |pepprot|
      listed = @short_reference_hash.key?( pepprot.first_entry )
      probe_listed ? listed : !listed
    end
  end

  # when ties go to the false side, the probe above selected the false
  # positives, so the two groups trade places
  @correct_wins ? [hits, rest] : [rest, hits]
end
|
|
88
|
+
|
|
89
|
+
# pephit_precision is done through inheritance
|
|
90
|
+
|
|
91
|
+
# One-line textual summary of this validator's settings (fasta filename,
# proteins_expected, correct_wins and background).
def to_param_string
  settings = ["{fasta=#{@fasta.filename}", "proteins_expected=#{@proteins_expected}", "correct_wins=#{@correct_wins}", "background=#{@background}}"]
  "abundance=" + settings.join(", ")
end
|
|
94
|
+
|
|
95
|
+
end
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
|
|
3
|
+
class Validator::Cmdline
|
|
4
|
+
|
|
5
|
+
# Maps the validator type symbols used in opts[:validators] to the class
# that gets instantiated for them.
Validator_symbols_to_classes = {
  :tmm    => Validator::Transmem::Protein,
  :decoy  => Validator::Decoy,
  :bad_aa => Validator::AA,
  :tps    => Validator::TruePos,
  :bias   => Validator::Bias,
  :prob   => Validator::Probability,
}
|
|
13
|
+
# was VAL_DEFAULTS
# Default values for the optional parts of each validator's command-line
# argument, keyed by validator symbol, plus the default tie handling.
DEFAULTS = {
  # (the first --tmm argument is the file; it has no default)
  :tmm    => { :min_num_tmm_seqs => 1, :expect_soluble => true, :no_include_tm_peps => 0.8, :bkg => 0.0 },
  :decoy  => { :hits_together => true, :decoy_on_match => true },
  :bad_aa => { :false_if_found => true, :estimate => true, :bkg => 0.0 },
  :bias   => { :bkg => 0.0, :proteins_expected => true },
  :ties   => true,
}
|
|
41
|
+
# Per-option argument lists: the switch specification, an optional coercion
# class (Array) and the help-text lines. (They have the shape of
# OptionParser#on arguments -- presumably that is how callers use them.)
COMMAND_LINE = {
  :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
    "FILENAME of separate search on decoys.",
    "All regular expressions must be surrounded by '/'",
    "(no extended options [trailing modifiers]).",
    "e.g., a run using concatenated reversed proteins that",
    "includes 'REVERSE' in the fasta heading:",
    " --decoy /REVERSE/",
    "Anything fancier should be quoted:",
    " --decoy '/^\\s*REVERSE/'",
    "If decoys proteins were searched in a separate file,",
    "then give the FILENAME (e.g., --decoy decoy.srg)",
    "DOM = *true/false, decoy on match",],
  :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
    "fasta file containing the true protein hits"],
  # may require digestion:
  :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
    "information (that is shared between them).",
    "ORIG_FASTA = the fasta file used to do the run",
    "PARAMS = the params file used to do the run",],
  :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
    "PE = *true|false proteins in fasta file expected in sample",
    "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
  :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
    "AA = The amino acid (e.g., 'C')",
    "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
    "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],

  :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
    "phobius.small:",
    "http://phobius.cgb.ki.se/",
    "(select 'Short' output, and save output as file)",
    "toppred.out:",
    "http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html",
    "(output 'toppred.out' in 'New' or 'Xml' format)",
    "MIN = Int, minimum number transmembrane seqs (def: #{DEFAULTS[:tmm][:min_num_tmm_seqs]})",
    "SOL = true|false, this is a soluble fraction( def: #{DEFAULTS[:tmm][:expect_soluble]})",
    "PEPS = Float | false, don't consider tm peps (>= fraction",
    " tm content) (false skips) (def: #{DEFAULTS[:tmm][:no_include_tm_peps]})",
    "BKG = Float , background contaminating insoluble (def: #{DEFAULTS[:tmm][:bkg]})"],


  # VALIDATION MODIFIERS
  # (giving this flag turns ties off, so a peptide shared between correct
  # and incorrect proteins no longer counts as correct)
  :false_on_tie => ["--false_on_tie", "if peptide belongs to correct AND incorrect proteins",
    "it will be counted as incorrect"],

}
|
|
88
|
+
|
|
89
|
+
# Converts the strings 'true' / 'false' to the matching boolean; any other
# argument (including nil) yields default.
def self.boolean(arg, default)
  return true if 'true' == arg
  return false if 'false' == arg
  default
end
|
|
96
|
+
|
|
97
|
+
# Maps each command-line option to a lambda taking (parsed_argument, opts).
# Validator options push an entry of the form [type_symbol, *constructor_args]
# onto opts[:validators]; :digestion and :false_on_tie set other keys of opts.
PrepArgs = {
  # ar is nil (no argument given), 'nsp' or 'init'
  :prob => lambda {|ar, opts|
    mthd =
      if !ar || ar == 'nsp'
        :probability
      elsif ar == 'init'
        :initial_probability
      else
        raise ArgumentError, "--prob [arg], optional arg can only be 'nsp' or 'init'!"
      end
    opts[:validators].push([:prob, mthd])
  },
  # ar = [regexp_or_filename, decoy_on_match]
  :decoy => lambda {|ar, opts|
    myargs = [:decoy]
    first_arg = ar[0]
    # a first argument wrapped in slashes is a regular expression;
    # anything else is passed on untouched (a filename)
    myargs[1] =
      if first_arg[0,1] == '/' && first_arg[-1,1] == '/'
        Regexp.new(first_arg[1...-1])
      else
        first_arg
      end
    myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
    opts[:validators].push(myargs)
  },
  # ar = [fasta_filename, params_filename]
  :digestion => lambda {|ar, opts|
    raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
    opts[:digestion] = ar.dup
    opts[:digestion_objects] = [Fasta.new(ar[0]), Sequest::Params.new(ar[1])]
  },
  # ar = [fasta_filename, proteins_expected, background]
  :bias => lambda {|ar, opts|
    myargs = [:bias]
    myargs.push( Fasta.new(ar[0]) )
    val_opts = {}
    val_opts[:proteins_expected] = self.boolean(ar[1], DEFAULTS[:bias][:proteins_expected])
    val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bias][:bkg]
    myargs.push(val_opts)
    opts[:validators].push(myargs)
  },
  # ar = [amino_acid, estimate, background]
  :bad_aa => lambda {|ar, opts|
    myargs = [:bad_aa]
    myargs.push( ar[0] )
    val_opts = {}
    # the default lives under :estimate (there is no :est key in DEFAULTS)
    val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:estimate])
    val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bad_aa][:bkg]
    myargs.push(val_opts)
    opts[:validators].push(myargs)
  },
  # ar = [tm_file, min_num_tms, soluble_fraction, no_include_tm_peps, background]
  :tmm => lambda {|ar, opts|
    myargs = [:tmm]
    myargs.push( ar[0] )
    val_opts = {}
    val_opts[:min_num_tms] = ar[1] ? ar[1].to_i : DEFAULTS[:tmm][:min_num_tmm_seqs]
    val_opts[:soluble_fraction] = self.boolean(ar[2], DEFAULTS[:tmm][:expect_soluble])
    val_opts[:no_include_tm_peps] =
      if ar[3]
        # the literal string 'false' switches the filter off
        ar[3] == 'false' ? false : ar[3].to_f
      else
        DEFAULTS[:tmm][:no_include_tm_peps]
      end
    val_opts[:background] = ar[4] ? ar[4].to_f : DEFAULTS[:tmm][:bkg]
    myargs.push(val_opts)
    opts[:validators].push( myargs )
  },
  :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
  :false_on_tie => lambda {|v,opts| opts[:ties] = false },
}
|
|
186
|
+
|
|
187
|
+
# Builds the validator objects described by opts[:validators] and returns
# them as an array.
#
# opts::         option hash; reads :validators (entries of the form
#                [type_symbol, *constructor_args]) and :digestion_objects
#                ([fasta, params]). :digestion_objects is deleted from opts
#                before returning, and the type symbol is shifted off each
#                :validators entry.
# false_on_tie:: when true, validators are built with correct_wins = false
# interactive::  currently unused
# spec_id::      object responding to peps; used to build the transmem
#                status hash of every :tmm validator
#
# Raises ArgumentError when a validator that needs the digestion objects is
# requested but opts has no :digestion_objects.
def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
  missing_digestion = "requires --digestion fasta,params argument!"
  correct_wins = !false_on_tie
  need_false_to_total_ratio = []
  need_frequency = []
  transmem_vals = []

  validators = opts[:validators].map do |args|
    tp = args.shift
    val_args = args.dup # shallow copy: option hashes are still shared with opts
    val_args =
      case tp
      when :tmm
        # fail with the intended message instead of NoMethodError on nil[0]
        raise ArgumentError, missing_digestion if opts[:digestion_objects].nil?
        val_args[1][:correct_wins] = correct_wins
        val_args[1][:fasta] = opts[:digestion_objects][0]
        val_args
      when :bias
        val_args[1][:correct_wins] = correct_wins
        val_args
      when :tps
        [val_args[0], correct_wins]
      when :decoy
        # the constraint (regexp or filename) stays in place
        [val_args[0], val_args[1], correct_wins]
      else ## bad_aa and prob are represented here:
        val_args
      end
    val = Validator_symbols_to_classes[tp].new( *val_args )

    # make some lists of validators based on pre-processing needs:
    transmem_vals << val if tp == :tmm
    if val.class.to_s =~ /Transmem|AA|Bias/
      if val_args[1][:estimate] == true
        need_frequency << val
      else
        need_false_to_total_ratio << val
      end
    end
    val
  end

  if need_false_to_total_ratio.size > 0
    raise ArgumentError, missing_digestion if !opts.key?(:digestion_objects)
    peps = Digestor.digest( *(opts[:digestion_objects]) )
    need_false_to_total_ratio.each {|val| val.set_false_to_total_ratio( peps ) }
  end
  if need_frequency.size > 0
    raise ArgumentError, missing_digestion if !opts.key?(:digestion_objects)
    need_frequency.each {|val| val.set_frequency( opts[:digestion_objects][0] ) }
  end
  opts.delete(:digestion_objects)

  # This is overkill if we are doing a single filtering job, but it
  # ensures that it works in all the ways I'm doing it.  Should
  # refactor eventually !!  (we'd like to just run this for interactive,
  # but prob uses it too!)
  transmem_vals.each do |val|
    val.transmem_status_hash = val.create_transmem_status_hash(spec_id.peps)
  end
  validators
end
|
|
259
|
+
|
|
260
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
|
|
3
|
+
class Validator::Decoy < Validator
|
|
4
|
+
include Precision::Calculator::Decoy
|
|
5
|
+
|
|
6
|
+
attr_accessor :constraint
|
|
7
|
+
|
|
8
|
+
attr_accessor :decoy_on_match
|
|
9
|
+
attr_accessor :correct_wins
|
|
10
|
+
|
|
11
|
+
attr_accessor :last_pep_was_decoy
|
|
12
|
+
|
|
13
|
+
attr_accessor :increment_normal
|
|
14
|
+
attr_accessor :increment_decoy
|
|
15
|
+
attr_accessor :increment_total_submitted
|
|
16
|
+
|
|
17
|
+
attr_reader :normal_peps_just_submitted
|
|
18
|
+
|
|
19
|
+
# constraint::     what protein references are matched against (see partition)
# decoy_on_match:: true when a matching reference marks a decoy protein
# correct_wins::   true when ties are resolved in favour of the normal side
def initialize(constraint=nil, decoy_on_match = true, correct_wins = true)
  @constraint, @decoy_on_match, @correct_wins = constraint, decoy_on_match, correct_wins
end
|
|
24
|
+
|
|
25
|
+
# Returns [normal, decoy].
#
# A protein "matches" when prot.reference.match(@constraint) is truthy.
# A peptide lands in the first (normal) group when:
#   decoy_on_match  correct_wins   condition on the peptide's proteins
#   true            true           at least one does not match
#   true            false          none matches
#   false           true           at least one matches
#   false           false          all match
def partition(peps)
  is_match = lambda { |prot| prot.reference.match(@constraint) }
  peps.partition do |pep|
    if @decoy_on_match
      if @correct_wins
        pep.prots.any? { |prot| !is_match.call(prot) }
      else # fp wins
        pep.prots.all? { |prot| !is_match.call(prot) }
      end
    elsif @correct_wins
      pep.prots.any? { |prot| is_match.call(prot) }
    else
      pep.prots.all? { |prot| is_match.call(prot) }
    end
  end
end
|
|
49
|
+
|
|
50
|
+
# Zeroes the running counters used by increment_pephits_precision and
# flags them as initialized.
def initialize_increment
  @increment_normal = @increment_decoy = @increment_total_submitted = 0
  @increment_initialized = true
end
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# does not deal in separate_peps right now!!
# will take an array or single peptide
#
# Adds the submitted peptide(s) to the running normal/decoy totals and
# returns calc_precision of the cumulative counts. The normal peptides of
# this call are kept in @normal_peps_just_submitted.
def increment_pephits_precision(peps)
  # warnings are switched off around the first-use check (presumably to hide
  # the "instance variable not initialized" warning -- confirm)
  saved_verbose = $VERBOSE
  $VERBOSE = nil
  initialize_increment unless @increment_initialized
  $VERBOSE = saved_verbose

  batch = peps.is_a?(SpecID::Pep) ? [peps] : peps

  @increment_total_submitted += batch.size
  normal, decoy = partition(batch)
  @normal_peps_just_submitted = normal
  @increment_normal += normal.size
  @increment_decoy += decoy.size
  calc_precision(@increment_normal, @increment_decoy)
end
|
|
79
|
+
|
|
80
|
+
# Precision of peps. When separate_peps is given, peps are taken as the
# normal hits and separate_peps as the decoy hits; otherwise peps is split
# with partition.
def pephit_precision(peps, separate_peps=nil)
  return calc_precision(peps.size, separate_peps.size) if separate_peps

  normal_hits, decoy_hits = partition(peps)
  calc_precision(normal_hits.size, decoy_hits.size)
end
|
|
88
|
+
|
|
89
|
+
# One-line textual summary of this validator's settings; the constraint is
# shown via inspect, or left blank when there is none.
def to_param_string
  shown = constraint ? constraint.inspect : ''
  settings = ["{constraint=#{shown}", "decoy_on_match=#{@decoy_on_match}", "correct_wins=#{@correct_wins}}"]
  "decoy=" + settings.join(", ")
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
require 'fasta'
|
|
3
|
+
require 'spec_id/sequest/params'
|
|
4
|
+
|
|
5
|
+
# objects of this class can calculate pephit_precision given an array of
|
|
6
|
+
# SpecID::Pep objects using the pephit_precision method.
|
|
7
|
+
class Validator::DigestionBased < Validator
|
|
8
|
+
# Default option values for digestion-based validators.
DEFAULTS = { :false_to_total_ratio => 1.0, :background => 0.0 }
|
|
12
|
+
|
|
13
|
+
# the number of tps
|
|
14
|
+
attr_accessor :increment_tps
|
|
15
|
+
# the number of fps
|
|
16
|
+
attr_accessor :increment_fps
|
|
17
|
+
|
|
18
|
+
# the total peptides submitted to the validator (regardless of tp, fp, or
|
|
19
|
+
# nil)
|
|
20
|
+
attr_accessor :increment_total_submitted
|
|
21
|
+
|
|
22
|
+
# the ratio of false hits to total peptides in the fasta file
|
|
23
|
+
attr_accessor :false_to_total_ratio
|
|
24
|
+
|
|
25
|
+
# the false_to_total_ratio calculated (but not applied)
|
|
26
|
+
attr_reader :calculated_background
|
|
27
|
+
|
|
28
|
+
# For a sample with no false hits in it, (under defaults) this is the
|
|
29
|
+
# fraction of peptides with the constraint over the total number of peptides
|
|
30
|
+
# from which these hits are derived.
|
|
31
|
+
attr_accessor :background
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# expects that classes define a partition method, and a @background
#
# Partitions peps into true/false positives, adjusts the two counts with
# calc_precision_prep and returns calc_precision of the adjusted counts.
def pephit_precision(peps)
  true_hits, false_hits = partition(peps)
  adjusted_tps, adjusted_fps = calc_precision_prep(true_hits.size, false_hits.size)
  calc_precision(adjusted_tps, adjusted_fps)
end
|
|
41
|
+
|
|
42
|
+
# returns [num_tps, num_fps]
#
# The observed false-positive count is first corrected for @background
# (adjust_fps_for_background) and then scaled up by @false_to_total_ratio:
#
#   FALSE/TOTAL  = FALSE(found)/TOTAL(found)
#   TOTAL(found) = FALSE(found) / (FALSE/TOTAL)
#
# The true-positive estimate is whatever remains of the peptides that came
# out of partition (partition may drop peptides under certain options, so
# that lower total is the one used).
def calc_precision_prep(num_tps, num_fps)
  passing = num_tps + num_fps
  background_adjusted = adjust_fps_for_background(num_tps, num_fps, @background)
  estimated_false = background_adjusted / @false_to_total_ratio
  [passing.to_f - estimated_false, estimated_false]
end
|
|
58
|
+
|
|
59
|
+
# returns self
# assumes partition returns (tps, fps)
#
# Sets @false_to_total_ratio to the fraction of peps that partition puts
# in the false-positive group.
def set_false_to_total_ratio(peps)
  true_hits, false_hits = partition(peps)
  total = true_hits.size + false_hits.size
  @false_to_total_ratio = false_hits.size.to_f / total
  self
end
|
|
66
|
+
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
|
|
2
|
+
class Validator::Probability
|
|
3
|
+
|
|
4
|
+
attr_accessor :prob_method
|
|
5
|
+
|
|
6
|
+
# prob_method:: name of the method sent to each object to obtain its
#               probability (defaults to :probability)
def initialize(prob_method = :probability)
  @prob_method = prob_method
end
|
|
9
|
+
|
|
10
|
+
# objs should respond to the configured @prob_method
#
# Returns 1.0 - SUM(1 - prob) / #objs, or 1.0 when objs is empty.
def precision(objs)
  return 1.0 if objs.size == 0

  doubt = 0.0
  objs.each { |obj| doubt += 1.0 - obj.send(@prob_method) }
  1.0 - (doubt / objs.size)
end
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# objs should respond to the configured @prob_method
# These should be added from high probability(1.0) to low (0.0)
#
# Accepts a single SpecID::Pep / SpecID::Prot or a collection of objects,
# adds them to the running totals and returns the cumulative precision
# 1.0 - SUM(1 - prob) / #objs over everything submitted so far.
def increment_precision(objs)
  objs = [objs] if objs.is_a?(SpecID::Pep) || objs.is_a?(SpecID::Prot)

  @total_objs ||= 0
  @current_sum_one_minus_prob ||= 0.0

  @total_objs += objs.size
  objs.each { |obj| @current_sum_one_minus_prob += 1.0 - obj.send(@prob_method) }
  1.0 - (@current_sum_one_minus_prob / @total_objs)
end
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
alias_method :pephit_precision, :precision
|
|
46
|
+
alias_method :prothit_precision, :precision
|
|
47
|
+
alias_method :increment_pephits_precision, :increment_precision
|
|
48
|
+
end
|