mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'group_by'
|
|
5
|
+
require 'shuffle'
|
|
6
|
+
|
|
7
|
+
# calculates protein hit precision based on peptide precision
|
|
8
|
+
class Validator::ProtFromPep < Validator
|
|
9
|
+
|
|
10
|
+
# calculate protein precision based on the number of false peptides
|
|
11
|
+
# returns the precision based on the number of proteins *completely false*
|
|
12
|
+
# calculates the worst precision by assuming that proteins with the fewest
|
|
13
|
+
# peptides are all false (before prots with more pephits)
|
|
14
|
+
# note that this approaches the worst, but is not guaranteed to be worst
|
|
15
|
+
# unless each pephit maps to a single protein hit.
|
|
16
|
+
# [worst, normal_mean, normal_stddev]
|
|
17
|
+
# options
|
|
18
|
+
# :num_its_normal => Integer, # num iterations for normal (d: 10)
|
|
19
|
+
# :num_its_worstcase => Integer, # num iterations for worstcase (d: 10)
|
|
20
|
+
#
|
|
21
|
+
def prothit_precision(peps, num_false_pephits, opts={})
  # Returns [worst, normal_mean, normal_stdev] by delegating to
  # worstcase_prothit_precision and normal_prothit_precision.
  #
  # The iteration counts default to 10.  The caller's opts hash is only
  # read, never written, so a shared or frozen hash can be passed safely.
  num_its_worstcase = opts[:num_its_worstcase] || 10
  num_its_normal = opts[:num_its_normal] || 10

  worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => num_its_worstcase)
  # with :num_its_normal => 1 the callee returns a bare number, in which
  # case normal_stdev ends up nil
  (normal_mean, normal_stdev) = normal_prothit_precision(peps, num_false_pephits, :num_its => num_its_normal)
  [worst, normal_mean, normal_stdev]
end
|
|
29
|
+
|
|
30
|
+
# returns an array of the number of peptide hits in each protein
|
|
31
|
+
def num_peps_per_protein(peps)
  # Tally one hit per (peptide, protein) pairing, keyed by the protein's
  # reference, and hand back just the tallies.
  tally = peps.inject(Hash.new(0)) do |counts, peptide|
    peptide.prots.each {|protein| counts[protein.reference] += 1 }
    counts
  end
  tally.values
end
|
|
40
|
+
|
|
41
|
+
# returns the worstcase precision. This assumes that every small protein
|
|
42
|
+
# with the fewest peptide hits is completely 'filled' with incorrect hits in
|
|
43
|
+
# preference to any higher hit protein.
|
|
44
|
+
# Where each peptide hit maps to a single protein, this is guaranteed to be
|
|
45
|
+
# worst-case. If this doesn't hold, there are some extreme cases where a
|
|
46
|
+
# poorer precision could be generated, but this is still probably fairly
|
|
47
|
+
# close. Thus, a slightly different answer may be generated each time.
|
|
48
|
+
# ...variation is produced by shuffling the order of the proteins from which
|
|
49
|
+
# peptides are removed within groups of proteins having the same number of
|
|
50
|
+
# peptides.
|
|
51
|
+
# This method does NOT require that the prothits be updated to reflect only
|
|
52
|
+
# those pephits being passed in.
|
|
53
|
+
#
|
|
54
|
+
# validator.worstcase_prothit_precision(peps, 14, :num_its => 1) # => 0.232111
|
|
55
|
+
#
|
|
56
|
+
# options:
|
|
57
|
+
# :num_its => Integer (default: 10) number of times to run (finds minimum)
|
|
58
|
+
# :one_prot_per_pep => true | *false assumes each peptide maps to a
|
|
59
|
+
# single protein
|
|
60
|
+
def worstcase_prothit_precision(peps, num_false_pephits, opts = {})
  # Estimates the worst-case protein precision: false peptide hits are
  # removed from the proteins with the fewest hits first, and the fraction
  # of proteins that still own at least one peptide is reported.  The
  # minimum over :num_its runs (default 10) is returned.
  #
  # NOTE: reseeds the global random generator with a fixed seed so that
  # repeated calls give the same answer.
  num_its = opts[:num_its] || 10

  # End cases that need no simulation.
  return 1.0 if num_false_pephits == 0
  return 0.0 if num_false_pephits >= peps.size

  if opts[:one_prot_per_pep]
    return worstcase_prothit_precision_by_numbers(num_peps_per_protein(peps), num_false_pephits)
  end

  # Index the hits both ways: protein -> its peptides (input order) and
  # peptide -> its proteins.
  peps_by_prot = Hash.new {|h,k| h[k] = [] }
  prots_by_pep = Hash.new {|h,k| h[k] = [] }
  peps.each do |pep|
    pep.prots.each do |prot|
      peps_by_prot[prot] << pep
      prots_by_pep[pep] << prot
    end
  end
  tot_num_prots = peps_by_prot.size

  # Proteins grouped by number of hits, fewest first.  The grouping is the
  # same for every run; only the order inside a group is reshuffled.
  groups_by_size = peps_by_prot.keys.group_by {|prot| peps_by_prot[prot].size }.sort

  srand( 777 )
  precision_sample = (0...num_its).map do
    remaining_peps = prots_by_pep.keys.to_set
    still_to_remove = num_false_pephits
    catch(:removed_enough) do
      groups_by_size.each do |_size, prots_of_same_size|
        # shuffle a copy, lazily, so each run starts from the same base order
        shuffled = prots_of_same_size.dup
        shuffled.shuffle!
        shuffled.each do |prot|
          peps_by_prot[prot].each do |pep|
            # a peptide shared between proteins can only be removed once
            next unless remaining_peps.delete?(pep)
            still_to_remove -= 1
            throw :removed_enough if still_to_remove == 0
          end
        end
      end
    end
    # every protein still reachable from a surviving peptide counts as a hit
    surviving_prots = remaining_peps.inject(Set.new) do |protset, pep|
      protset.merge(prots_by_pep[pep])
    end
    surviving_prots.size.to_f / tot_num_prots
  end
  precision_sample.min
end
|
|
142
|
+
|
|
143
|
+
# returns the precision of the worst possible outcome
|
|
144
|
+
def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
  # Spend the false hits on the smallest proteins first; a protein counts as
  # completely false only if the budget covers all of its hits.
  budget = num_false_pephits
  wiped_out = 0
  num_peps_per_prot.sort.each do |hits|
    budget -= hits
    wiped_out += 1 unless budget < 0
    break unless budget > 0
  end
  total = num_peps_per_prot.size
  (total - wiped_out).to_f / total
end
|
|
158
|
+
|
|
159
|
+
# normal as in a standard normal distribution of peptide hits per protein
|
|
160
|
+
# they are distributed randomly and the precision is assumed to take on a
|
|
161
|
+
# standard normal distribution.
|
|
162
|
+
# num_peps_per_protein is an array of the number of peptides per protein hit
|
|
163
|
+
# (these are the true hits)
|
|
164
|
+
# assumes that the number follows a gaussian distribution (binomial
|
|
165
|
+
# distributions tend toward gaussians, I believe, at large N)
|
|
166
|
+
# returns [mean_precision, stdev_precision]
|
|
167
|
+
# options:
|
|
168
|
+
# :num_its => Integer (default: 10)
|
|
169
|
+
#
|
|
170
|
+
# if num_iterations is set at 1, then only the precision will be returned
|
|
171
|
+
# though random, the same seed is always used to start this process, meaning
|
|
172
|
+
# that the same results will be produced on consecutive attempts.
|
|
173
|
+
#
|
|
174
|
+
# validator.normal_prothit_precision(peps, 13, :num_its => 1) # -> 0.95433
|
|
175
|
+
# validator.normal_prothit_precision(peps, 13, :num_its => 2) # -> [0.92002, 1.2223]
|
|
176
|
+
def normal_prothit_precision( peps, num_false_pephits, opts={})
  # Estimates protein precision when num_false_pephits peptide hits, chosen
  # at random, are false: a protein survives as long as at least one of its
  # peptides is among the remaining hits.
  #
  # Returns a single precision when :num_its is 1, otherwise whatever
  # sample_stats returns for the per-iteration precisions.
  #
  # NOTE: reseeds the global random generator with a fixed seed (even on the
  # early returns below) so that repeated calls give the same answer.
  num_iterations = opts[:num_its] || 10
  srand( 38272 )
  single = (num_iterations == 1)

  # End cases that need no simulation.
  if num_false_pephits == 0
    return(single ? 1.0 : [1.0, 0.0])
  elsif num_false_pephits >= peps.size
    return(single ? 0.0 : [0.0, 0.0])
  end

  base_indices = (0...(peps.size)).to_a
  all_prots = peps.inject(Set.new) {|protset, pep| protset.merge(pep.prots) }
  tot_num_prots = all_prots.size

  precision_sample = (0...num_iterations).map do
    # Array#map without a block is not a copy on current rubies (it yields
    # an Enumerator), so take an explicit copy before shuffling in place.
    shuffled_indices = base_indices.dup
    shuffled_indices.shuffle!
    # the first num_false_pephits shuffled positions are the false hits
    good_indices = shuffled_indices[num_false_pephits..-1]
    still_remaining = peps.values_at(*good_indices).inject(Set.new) do |protset, pep|
      protset.merge(pep.prots)
    end
    still_remaining.size.to_f / tot_num_prots
  end

  if single
    precision_sample.shift
  else
    sample_stats(precision_sample)
  end
end
|
|
233
|
+
end
|
|
234
|
+
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
require 'validator/digestion_based'
|
|
3
|
+
require 'transmem'
|
|
4
|
+
require 'fasta'
|
|
5
|
+
require 'spec_id/digestor'
|
|
6
|
+
require 'spec_id/sequest/params'
|
|
7
|
+
require 'spec_id/sequest/pepxml'
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
module Validator::Transmem ; end
|
|
11
|
+
|
|
12
|
+
# objects of this class can calculate pephit_precision given an array of
|
|
13
|
+
# SpecID::Pep objects using the pephit_precision method.
|
|
14
|
+
class Validator::Transmem::Protein < Validator::DigestionBased
|
|
15
|
+
include Precision::Calculator
|
|
16
|
+
|
|
17
|
+
# a hash keyed by index reference which is true if >= min_num_tms
|
|
18
|
+
attr_accessor :transmem_by_ti_key
|
|
19
|
+
attr_accessor :transmem_index
|
|
20
|
+
|
|
21
|
+
# min_num_tms: Integer (1...), the min # certain transmembrane segments to
|
|
22
|
+
# consider the protein a transmembrane protein
|
|
23
|
+
attr_reader :min_num_tms
|
|
24
|
+
|
|
25
|
+
# soluble_fraction: *true/false
|
|
26
|
+
attr_accessor :soluble_fraction
|
|
27
|
+
|
|
28
|
+
# correct_wins: *true/false,
|
|
29
|
+
# if the peptide is found in some proteins that are transmembrane and some
|
|
30
|
+
# that are not, then if soluble_fraction==true, this peptide will be
|
|
31
|
+
# considered non-transmembrane. If soluble_fraction==false, then this
|
|
32
|
+
# will be considered transmembrane.
|
|
33
|
+
attr_accessor :correct_wins
|
|
34
|
+
|
|
35
|
+
# no_include_tm_peps: false or Float (0.0-1.0), peptides that have a
|
|
36
|
+
# fraction of amino acids that fall inside transmembrane sequences greater
|
|
37
|
+
# than or equal to the value of the argument will not be considered in the final
|
|
38
|
+
# calculation of peptide hit precision. (A transmembrane segment is
|
|
39
|
+
# likely to have very different properties than the rest of the peptides,
|
|
40
|
+
# so the assumption of equally flyable peptides is broken unless these are
|
|
41
|
+
# removed) nil or false will skip this filter. A reasonable value is
|
|
42
|
+
# probably 0.7.
|
|
43
|
+
attr_accessor :no_include_tm_peps
|
|
44
|
+
|
|
45
|
+
# if nil, then this will be calculated when pephit_precision is called.
|
|
46
|
+
attr_accessor :transmem_status_hash
|
|
47
|
+
|
|
48
|
+
# the file used (toppred or phobius file)
|
|
49
|
+
attr_accessor :transmem_file
|
|
50
|
+
|
|
51
|
+
DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :min_num_tms => 1, :soluble_fraction => true, :correct_wins => true, :no_include_tm_peps => false, :transmem_status_hash => nil} )
|
|
52
|
+
|
|
53
|
+
# expects a toppred.out file (see transmem/toppred)
|
|
54
|
+
# (or another type of transmembrane prediction file, e.g. phobius output)
|
|
55
|
+
# fasta_obj is a Fasta object.
|
|
56
|
+
# sequest_params_obj is a Sequest::Params object.
|
|
57
|
+
# OPTIONS:
|
|
58
|
+
# (see Validator::Transmem::Protein::DEFAULTS for defaults)
|
|
59
|
+
#
|
|
60
|
+
# no_include_tm_peps: *false
|
|
61
|
+
#
|
|
62
|
+
# NOTE: if fasta_obj and sequest_params_obj are not passed in then
|
|
63
|
+
# 'false_to_total_ratio' must be set later.
|
|
64
|
+
def initialize(a_transmem_file, options={})
  # Stores the prediction file, layers the given options over the class
  # DEFAULTS, then builds the transmembrane index and the per-key
  # true/false table derived from it.
  @transmem_file = a_transmem_file
  settings = self.class::DEFAULTS.merge(options)

  @min_num_tms = settings[:min_num_tms]
  @soluble_fraction = settings[:soluble_fraction]
  @correct_wins = settings[:correct_wins]
  @no_include_tm_peps = settings[:no_include_tm_peps]
  @background = settings[:background]
  @transmem_status_hash = settings[:transmem_status_hash]
  @false_to_total_ratio = settings[:false_to_total_ratio]

  # the fasta object (nil when not supplied) is handed straight to the index
  @transmem_index = TransmemIndex.new(@transmem_file, settings[:fasta])
  # a hash by index key => true/false (depending on min_num_tms)
  @transmem_by_ti_key = create_transmem_by_ti_key_hash(@transmem_index, @min_num_tms)
end
|
|
75
|
+
|
|
76
|
+
# Designates each protein as transmembrane or not depending on :min_num_tms
|
|
77
|
+
# The hash is keyed by the TransmemIndex key.
|
|
78
|
+
def create_transmem_by_ti_key_hash(transmem_index, min_num_tms)
  # Maps every key of the index's num_certain_index to true when its count
  # reaches min_num_tms and to false otherwise.
  status_by_key = {}
  transmem_index.num_certain_index.each do |ti_key, certain_count|
    status_by_key[ti_key] = (certain_count >= min_num_tms)
  end
  status_by_key
end
|
|
90
|
+
|
|
91
|
+
# returns a hash where each protein (and peptide if given peps) is indexed
|
|
92
|
+
# with itself with true/false/nil depending on transmembrane status. If
|
|
93
|
+
# given peptides, and :no_include_tm_peps is not false, will also set the
|
|
94
|
+
# attribute for peptides.
|
|
95
|
+
# the attribute (:no_include_tm_peps)
|
|
96
|
+
# NOTE: if given a list of peptides, this implementation will not overwrite a
|
|
97
|
+
# protein if it already has a true/false for transmem. This is so that a
|
|
98
|
+
# lookup does not have to be performed if the value is already defined as
|
|
99
|
+
# the assumption is that many peptides will point to the same protein.
|
|
100
|
+
def create_transmem_status_hash(peps)
  # Builds one lookup table holding the status of every protein reachable
  # from peps and, when @no_include_tm_peps is set, of every peptide too.
  peps.inject({}) do |status, pep|
    pep.prots.each do |prot|
      # many peptides point at the same protein: look each one up only once
      next if status.key?(prot)
      ti_key = @transmem_index.reference_to_key(prot.reference)
      status[prot] = @transmem_by_ti_key[ti_key]
    end
    status[pep] = pep_is_transmem?(pep) if @no_include_tm_peps
    status
  end
end
|
|
115
|
+
|
|
116
|
+
# sets the false_to_total_ratio and returns self for chaining.
|
|
117
|
+
# peps will usually be the peptides created by calling:
|
|
118
|
+
# peps = Digestor.digest( fasta_obj, sequest_params_obj )
|
|
119
|
+
def set_false_to_total_ratio(peps)
  # Partitions peps with a status hash built just for them and records the
  # share of false positives among all partitioned peptides.
  true_hits, false_hits = partition(peps, create_transmem_status_hash(peps))
  num_false = false_hits.size
  @false_to_total_ratio = num_false.to_f / (true_hits.size + num_false)
  self
end
|
|
125
|
+
|
|
126
|
+
def pephit_precision(peps)
  # Build the status hash from these peptides only if none is set yet, then
  # defer to the superclass implementation.
  @transmem_status_hash ||= create_transmem_status_hash(peps)
  super(peps)
end
|
|
132
|
+
|
|
133
|
+
# regardless of transmembrane status of proteins peptide belongs to, asks
|
|
134
|
+
# what the avg overlap is with transmembrane sequences.
|
|
135
|
+
def pep_is_transmem?(pep)
  # Averages the overlap fraction reported by the index over those proteins
  # for which it gives an answer.  Returns nil when none does, otherwise
  # whether the average reaches @no_include_tm_peps.
  fractions = []
  pep.prots.each do |prot|
    ti_key = @transmem_index.reference_to_key(prot.reference)
    overlap = @transmem_index.avg_overlap(ti_key, pep.aaseq, :fraction)
    fractions << overlap if overlap
  end
  return nil if fractions.empty?

  total = fractions.inject(0.0) {|sum, fraction| sum + fraction }
  (total / fractions.size) >= @no_include_tm_peps
end
|
|
154
|
+
|
|
155
|
+
# each peptide must have prots and the prots must respond true/false to
|
|
156
|
+
# the 'transmem' method
|
|
157
|
+
# if given a hash, it will override the @transmem_status_hash
|
|
158
|
+
def partition(peps, transmem_status_hash=nil)
  # Splits peps into [true_positives, false_positives] according to the
  # transmembrane status (true / false / nil) of the proteins each peptide
  # maps to.  A status hash passed as an argument takes precedence over
  # @transmem_status_hash; with neither, statuses are looked up through
  # @transmem_index and @transmem_by_ti_key.
  tm_hash = transmem_status_hash || @transmem_status_hash

  status_of =
    if tm_hash
      lambda {|prot| tm_hash[prot] }
    else
      lambda {|prot| @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)] }
    end

  # With :no_include_tm_peps set, keep only the peptides whose own status is
  # explicitly false (true and nil are both dropped).
  my_peps =
    if @no_include_tm_peps
      if tm_hash
        peps.reject {|pep| tm_hash[pep] != false }
      else
        peps.reject {|pep| pep_is_transmem?(pep) != false }
      end
    else
      peps
    end

  # The insoluble fraction is computed as the soluble fraction with
  # correct_wins inverted, followed by a swap of the two result lists.
  soluble = @soluble_fraction
  correct_wins = soluble ? @correct_wins : !@correct_wins

  # The "winning" status decides a peptide as soon as one of its proteins
  # has it: non-transmembrane (false) when correct wins, transmembrane
  # (true) otherwise.
  winning = correct_wins ? false : true
  losing = !winning

  tp = []
  fp = []
  my_peps.each do |pep|
    saw_winning = false
    saw_losing = false
    pep.prots.each do |prot|
      status = status_of.call(prot)
      if status == winning
        saw_winning = true
        break
      elsif status == losing
        saw_losing = true
      end
    end
    # peptides whose proteins all have nil status land in neither list
    next unless saw_winning || saw_losing
    pep_is_tm = saw_winning ? winning : losing
    # before any swap: transmembrane => false positive, otherwise true positive
    (pep_is_tm ? fp : tp) << pep
  end

  soluble ? [tp, fp] : [fp, tp]
end
|
|
270
|
+
|
|
271
|
+
end
|
|
272
|
+
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require 'validator'
|
|
2
|
+
|
|
3
|
+
class Validator::TruePos < Validator
|
|
4
|
+
include Precision::Calculator
|
|
5
|
+
attr_reader :fasta
|
|
6
|
+
attr_accessor :correct_wins
|
|
7
|
+
|
|
8
|
+
# correct_wins means that only a single protein from a pep.aaseq must match
|
|
9
|
+
# the fasta object for the pep hit to be considered valid. Otherwise, all
|
|
10
|
+
# must be a match
|
|
11
|
+
def initialize(fasta_obj, correct_wins = true)
  # Keeps the fasta object and caches the header of each of its proteins
  # for the matching done in partition.
  @correct_wins = correct_wins
  @fasta = fasta_obj
  @fasta_headers = fasta_obj.prots.map {|entry| entry.header }
end
|
|
16
|
+
|
|
17
|
+
def partition(peps)
  # Splits peps into [matching, non_matching].  A protein matches when its
  # reference occurs inside any cached fasta header; with correct_wins one
  # matching protein is enough, otherwise every protein must match.
  in_fasta = lambda do |pepprot|
    @fasta_headers.any? {|header| header.include? pepprot.reference }
  end

  if @correct_wins
    peps.partition {|pep| pep.prots.any? {|pepprot| in_fasta.call(pepprot) } }
  else
    peps.partition {|pep| pep.prots.all? {|pepprot| in_fasta.call(pepprot) } }
  end
end
|
|
36
|
+
|
|
37
|
+
def pephit_precision(peps)
  # Precision from the sizes of the two groups produced by partition.
  hits, misses = partition(peps)
  calc_precision(hits.size, misses.size)
end
|
|
41
|
+
|
|
42
|
+
def to_param_string
  # One-line summary of this validator's settings.
  fields = ["{fasta=#{@fasta.filename}", "correct_wins=#{@correct_wins}}"]
  "true_positives(tps)=" + fields.join(", ")
end
|
|
45
|
+
|
|
46
|
+
end
|