mspire 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
@@ -0,0 +1,623 @@
|
|
1
|
+
require 'sort_by_attributes'
|
2
|
+
require 'validator'
|
3
|
+
require 'spec_id'
|
4
|
+
require 'merge_deep'
|
5
|
+
require 'spec_id/precision/filter/interactive'
|
6
|
+
require 'spec_id/precision/filter/output'
|
7
|
+
|
8
|
+
|
9
|
+
# Generic filter that dispatches to a filtering method by name.
# This class never assigns @method or @opts itself; they must be set
# elsewhere (e.g. by a subclass initializer) before #filter is called.
class Filter

  # Calls the method named by @method on group and returns its result.
  # When @opts is set, its elements are splatted in after group.
  def filter(group)
    return send(@method, group) unless @opts
    send(@method, group, *@opts)
  end

  # In-place variant: replaces the contents of group with whatever #filter
  # let through, and returns the result of group.replace.
  def filter!(group)
    passing = filter(group)
    group.replace(passing)
  end
end
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
# we have to require this after we setup our defaults hash
|
29
|
+
# require 'filter/spec_id/cmdline'
|
30
|
+
|
31
|
+
class SpecID::Precision::Filter
|
32
|
+
# Default options for filter_and_validate. The options hash handed to
# filter_and_validate is deep-merged over these values.
FV_DEFAULTS = {
  # cutoffs passed to the standard sequest filter
  :sequest =>
  {
    :xcorr1 => 1.0,    # minimum xcorr, +1 charge state
    :xcorr2 => 1.5,    # minimum xcorr, +2 charge state
    :xcorr3 => 2.0,    # minimum xcorr, +3 charge state
    :deltacn => 0.1,   # minimum delta cn
    :ppm => 1000,      # maximum parts per million
    :include_deltacnstar => true,  # keep top hits that have no 2nd hit
  },


  # output
  :proteins => false,  # true also reports protein hits and their precision
  :output => [],       # list of [format, filename-or-nil] pairs

  # general
  :top_hit_by => :xcorr,            # or :probability
  :postfilter => :top_per_scan,     # or :top_per_aaseq, :top_per_aaseq_charge
  :prefilter => false,              # true runs a top-hit prefilter first
  :hits_together => true,           # filter target and decoy hits as one set

  # These are also defaulted in the commandline because they are necessary
  # for the validators... could this introduce conflicts somehow?
  :decoy_on_match => true,
  :ties => true,

  # UNLISTED FOR NOW:
  # passed as :include_ties to the top-hit prefilter / postfilters
  :include_ties_in_top_hit_prefilter => true,
  :include_ties_in_top_hit_postfilter => false,
}
|
63
|
+
|
64
|
+
require 'spec_id/precision/filter/cmdline'
|
65
|
+
|
66
|
+
# Command-line entry point: parses args with CmdlineParser and hands the
# result to filter_and_validate on a fresh SpecID::Precision::Filter.
# When parsing yields no spec_id object, prints the option parser and
# returns nil; otherwise returns the answer from filter_and_validate.
def filter_and_validate_cmdline(args)
  spec_id_obj, options, option_parser = CmdlineParser.new.parse(args)
  if spec_id_obj.nil?
    puts option_parser
    return
  end
  SpecID::Precision::Filter.new.filter_and_validate(spec_id_obj, options)
end
|
74
|
+
|
75
|
+
# # output_array has doublets of [format, handle]
|
76
|
+
# # answer is the answer one gets out of filter_and_validate
|
77
|
+
# def output(answer, output_array)
|
78
|
+
# output_array.each do |format, handle|
|
79
|
+
# SpecID::Precision::Filter::Output.new(format, handle)
|
80
|
+
# end
|
81
|
+
# end
|
82
|
+
|
83
|
+
# Very high level method that takes simple parameters.
|
84
|
+
# spec_id may be a filename or a SpecID object (containing peps)
|
85
|
+
# Default values may be queried from SpecID::Precision::Filter::FV_DEFAULTS
|
86
|
+
# Returns a structured hash:
|
87
|
+
# Fl = Float ; Ar = Array
|
88
|
+
# { :params => <Hash of filtering params>,
|
89
|
+
# :pephits => <Ar of pephits>,
|
90
|
+
# :pephits_precision => [<array of precision>]
|
91
|
+
# # if :proteins => true
|
92
|
+
# :prothits => <Array of prothits>,
|
93
|
+
# :prothits_precision => [ Array of hashes where each hash =
|
94
|
+
# { :worst => Fl, :normal => Fl,
|
95
|
+
# :normal_stdev => Fl } ]
|
96
|
+
# }
|
97
|
+
#
|
98
|
+
# NOTE: Brackets [] indicate an Array! The Bar '|' indicates another option.
|
99
|
+
# The asterisk '*' is the default option.
|
100
|
+
#
|
101
|
+
# :sequest => {
|
102
|
+
# :xcorr1 -> >= (xcorr +1 charge state)
|
103
|
+
# :xcorr2 -> >= (xcorr +2 charge state)
|
104
|
+
# :xcorr3 -> >= (xcorr +3 charge state)
|
105
|
+
# :deltacn -> >= (delta cn)
|
106
|
+
# :ppm -> <= parts per million (Float)
|
107
|
+
# :include_deltacnstar => *true | false include deltacn (given at 1.1) of
|
108
|
+
# top hit with no 2nd hit
|
109
|
+
#
|
110
|
+
# }
|
111
|
+
# OUTPUT:
|
112
|
+
# :proteins => true | *false gives proteins (and validation)
|
113
|
+
# :output => [[format, FILENAME=nil],...] formats to output filtering results.
|
114
|
+
# can be used multiple times
|
115
|
+
# FILENAME is the filename to use
|
116
|
+
# if nil, then outputs to $stdout
|
117
|
+
# valid formats are:
|
118
|
+
# :text_table (default)
|
119
|
+
# :yaml (need to implement)
|
120
|
+
# :protein_summary (need to implement)
|
121
|
+
# :html_table (need to implement)
|
122
|
+
# default value =>
|
123
|
+
# [[:text_table,nil]]
|
124
|
+
#
|
125
|
+
# VALIDATION:
|
126
|
+
# :validators => [Array] objects that respond to pephit_precision
|
127
|
+
# usually of base class Validator
|
128
|
+
# NOTE: if you have decoy peptides, you MUST have
|
129
|
+
# a Validator::Decoy object to separate them out.
|
130
|
+
# NOTE: if transmem validator passed in, the
|
131
|
+
# proteins in spec_id must already be granted
|
132
|
+
# transmem status!
|
133
|
+
#
|
134
|
+
#
|
135
|
+
# OTHER:
|
136
|
+
# :top_hit_by -> *:xcorr | :probability
|
137
|
+
# probabilities only in bioworks.xml files right now (if
|
138
|
+
# they were calculated).
|
139
|
+
# :postfilter -> *:top_per_scan | :top_per_aaseq | :top_per_aaseq_charge
|
140
|
+
# :top_per_scan hashes by filename + scan
|
141
|
+
# :top_per_aaseq hashes by top_per_scan + aaseq
|
142
|
+
# :top_per_aaseq_charge hashes by top_per_aaseq + charge
|
143
|
+
# :prefilter -> true | *false Takes top hit per file+scan+charge
|
144
|
+
# :interactive => interactive_object
|
145
|
+
# # should behave like this:
|
146
|
+
# # interactive_object.filter_args(currentopts) -> args_for_filtering | nil (done)
|
147
|
+
#
|
148
|
+
# # interactive_object.passing(final_answer)
|
149
|
+
|
150
|
+
# The defaults for filter_and_validate
|
151
|
+
|
152
|
+
def filter_and_validate(spec_id_obj, options={})
  # Overview of what this method does, in order:
  #   1. merge options over FV_DEFAULTS
  #   2. build the peptide sets (target and decoy together or separate)
  #   3. optionally run a top-hit prefilter on each set
  #   4. loop (once, or repeatedly when :interactive is given):
  #      sequest filter -> top-hit postfilter(s) -> split out decoys ->
  #      validator precision -> optional protein precision -> print outputs
  #   5. close the outputs and return the last answer hash
  # Returns the final answer hash; it is nil if an interactive session ends
  # before completing a single round.
  #
  # NOTE:
  # This is a fairly complicated method. The complication comes in doing
  # top hit filters on separate/cat searches, where we want them to be either
  # together or separate. I opt for fewer conversions between the two, but
  # that means keeping track of more things...

  opts = FV_DEFAULTS.merge_deep(options)

  spec_id = spec_id_obj

  peps = spec_id.peps
  # NOTE(review): filename is not referenced again in this method
  filename = spec_id.filename

  #######################################
  # DEFAULTS:
  # keys the interactive object may change between rounds, their typed
  # shortcuts, and the procs used to cast what the user typed
  interactive_changing_keys = [:xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar, :postfilter]
  interactive_shortcut_map = {
    :xcorr1 => 'x1',
    :xcorr2 => 'x2',
    :xcorr3 => 'x3',
    :deltacn => 'dcn',
    :ppm => 'ppm',
    :include_deltacnstar => 'dcns',
    :postfilter => 'pf',
  }
  to_float = proc {|x| x.to_f}
  # strings starting with t/f (any case) or real booleans; anything else -> nil
  to_bool = proc do |x|
    case x
    when /^t/io
      true
    when /^f/io
      false
    when true
      true
    when false
      false
    else
      nil
    end
  end
  # 's' / 'a' / 'ac' shortcuts, or an already-cast Symbol; anything else -> nil
  to_postfilter = proc do |x|
    case x
    when 's'
      :top_per_scan
    when 'a'
      :top_per_aaseq
    when 'ac'
      :top_per_aaseq_charge
    when Symbol
      x
    end
  end
  casting_map = {
    :xcorr1 => to_float,
    :xcorr2 => to_float,
    :xcorr3 => to_float,
    :deltacn => to_float,
    :ppm => to_float,
    :include_deltacnstar => to_bool,
    :postfilter => to_postfilter,
  }

  # output:
  # NOTE: BOOLEANS that are by default false do not need a default!!
  # They will yield false on key lookup if no key or false!
  # BOOLEANS that by default are true should be queried like this
  # !(opts[:<option>] == false)

  # open up each of the files for writing
  # NOTE(review): when opts[:output] is nil, outputs is never assigned a
  # value; the outputs.each call inside the loop below is not guarded the
  # way the closing call at the end of the method is.
  if opts[:output]
    outputs = opts[:output].map do |format, where|
      if where == nil
        where = $stdout
      end
      SpecID::Precision::Filter::Output.new(format, where)
    end
  end

  # attributes each postfilter groups hits by
  postfilters_per_hash = {
    :top_per_scan => [:base_name, :first_scan],
    :top_per_aaseq => [:aaseq], # first by top_per_scan, then this guy
    :top_per_aaseq_charge => [:aaseq, :charge], # first by top_per_scan, then this one
  }

  # sort_by_attributes arguments for each :top_hit_by choice
  top_hit_by__to_sort_by = {
    :xcorr => [:xcorr, {:down=> [:xcorr]}],
    :probability => [:probability, (spec_id.hi_prob_best ? {:down=> [:probability]} : {})],
  }
  sort_by_att_opts = top_hit_by__to_sort_by[opts[:top_hit_by]]
  # NOTE(review): the :include_ties key used here (and for the postfilters
  # below) must match the key the :top_hit filter method reads -- verify.
  opts_for_top_hit_prefilter = {
    :per => [:base_name, :first_scan, :charge],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_prefilter]
  }
  # PRIVATE DEFAULTS:
  # prefix added to decoy protein references when two files are merged, and
  # the regexp used to split them apart again
  merge_prefix = 'DECOY_'
  unmerge_regexp = /^DECOY_/

  #######################################


  # opts_decoy = opts[:decoy]



  # if we have a Validator::Decoy object, we will use its defaults to split
  # peptides.
  decoy_validator =
    if opts[:validators]
      decoy_vals = opts[:validators].select {|v| v.class == Validator::Decoy }
      if decoy_vals.size == 0
        nil
      elsif decoy_vals.size == 1
        decoy_vals.first
      else
        raise ArgumentError, "can only have one Validator::Decoy object"
      end

      ### suck out the relevant parameters
      #sep_params = [:decoy_on_match, :correct_wins].inject({}) do |hash,k|
      #  hash[k] = decoy_validator.send(k)
      #  hash
      #end
    else
      nil
    end

  # set only when target and decoy hits travel through the filters as one
  # set and must be partitioned afterwards
  decoy_validator_to_split_with = nil

  # pep_sets is [all_peps] when hits are together (or there is no decoy),
  # and [target_peps, decoy_peps] when they are kept separate
  pep_sets =
    if decoy_validator
      if decoy_validator.constraint.is_a?(Regexp)
        if opts[:hits_together]
          decoy_validator_to_split_with = decoy_validator
          [peps]
        else
          (target, decoy) = decoy_validator.partition(peps)
          #(target, decoy) = SpecID.classify_by_prot(peps, opts_decoy, sep_params[:decoy_on_match], sep_params[:correct_wins])
          [target, decoy]
        end
      elsif decoy_validator.constraint.is_a?(String)  ## a Filename
        decoy_peps = SpecID.new(decoy_validator.constraint).peps

        if opts[:hits_together]
          # we fake that the protein sets are together
          decoy_validator_to_split_with = Validator::Decoy.new(unmerge_regexp)
          # NOTE(review): this rewrites prt.reference in place for every
          # pep; a protein object shared by several peps would presumably
          # be prefixed more than once -- confirm.
          decoy_peps.each do |pep|
            pep.prots.each {|prt| prt.reference = merge_prefix + prt.reference }
          end
          [peps + decoy_peps] # wrap them so we get the target out
        else
          [peps, decoy_peps]
        end
      else
        raise ArgumentError, "Decoy::Validator#constraint must be a Regexp or valid SpecID file"
      end
    else
      [peps] # no decoy
    end

  if opts[:proteins]
    protein_validator = Validator::ProtFromPep.new
  end

  ### TOP HITS PREFILTER  < < TOP_HITS_TOGETHER > >
  ###########################
  # TOP HITS FILTER:
  ###########################
  # REALLY, this guy only exists for speed and memory consumption
  # If we prefilter, we don't have to filter as many hits in every
  # interactive round. I'd leave this guy out if I were doing only a
  # sequest filter. (I should compare results with this filter and w/o)
  # This guy is very tricky since we need to consider whether they are to be
  # run together or separately and not do more work than we need
  # get passed_target for any case (and passed_decoy if opts[:decoy])


  top_hit_prefilter = SpecID::Precision::Filter::Peps.new(:top_hit, opts_for_top_hit_prefilter) if opts[:prefilter]

  if top_hit_prefilter
    pep_sets.map! do |pep_set|
      top_hit_prefilter.filter(pep_set)
    end
  end

  # prepare the per-scan top hit postfilter; its arguments do not change
  # between interactive rounds, so it is built once, outside the loop
  top_per_scan_postfilter = SpecID::Precision::Filter::Peps.new(:top_hit,
    :per => postfilters_per_hash[:top_per_scan],
    :by => sort_by_att_opts,
    :include_ties => opts[:include_ties_in_top_hit_postfilter])



  # Prepare to loop
  # Give interactive help once here if necessary
  interactive = opts[:interactive]
  if interactive
    ARGV.clear
    interactive.out(interactive.interactive_help(interactive_changing_keys, interactive_shortcut_map)) if interactive.verbose
  end

  # the loop is for if we are interactive
  final_answer = nil
  loop do

    if interactive  #interactive
      # a bit of a hack, but we shove on the postfilter param to modulate
      opts[:sequest][:postfilter] = opts[:postfilter]
      response = interactive.filter_args(opts[:sequest], interactive_changing_keys, interactive_shortcut_map, casting_map)
      opts[:postfilter] = opts[:sequest].delete(:postfilter)
      # a nil response means the interactive session is done
      break if response == nil
    end

    # prepare our secondary top hit filter:
    # since we are now modulating this guy, we need to create it fresh every
    # time

    sub_postfilter =
      if opts[:postfilter] == :top_per_scan
        nil
      else
        postfilter_per_args = postfilters_per_hash[opts[:postfilter]]
        SpecID::Precision::Filter::Peps.new(:top_hit,
          :per => postfilter_per_args,
          :by => sort_by_att_opts,
          :include_ties => opts[:include_ties_in_top_hit_postfilter]
        )
      end

    # NOTE(review): map is called without a block here; depending on the
    # Ruby version this is an Array copy or an Enumerator. The .map with a
    # block a few lines down works on either.
    pep_sets_to_be_filtered = pep_sets.map

    ### SEQUEST  < EITHER >
    ###########################
    # SEQUEST FILTER:
    ###########################
    # This guy is immune to the trickiness of top hits, so we just filter
    # separately since validation is best done without decoys (except decoy)
    sequest_args = opts[:sequest].values_at( :xcorr1, :xcorr2, :xcorr3, :deltacn, :ppm, :include_deltacnstar )
    sequest_filter = SpecID::Precision::Filter::Peps.new(:standard_sequest_filter, *sequest_args)

    pep_sets_filtered = pep_sets_to_be_filtered.map do |pep_set|
      sequest_filter.filter(pep_set)
    end

    ### FINAL HIT PER SCAN  < < TOP_HITS_TOGETHER > >
    ##########################
    # FINAL HIT PER SCAN
    ##########################
    # Why not just do the top hit filter in the top hits pre filter before?
    # Good question. Answer: We may have instances when the top hit (by
    # xcorr) has some other poorer attribute than the hit at the other charge.
    # In this case, we'd end up with no passing peptide.
    # Also, the xcorr filter is per charge, so we may filter out the higher
    # scoring peptide hit even though the other would pass based on its charge
    # state, etc., etc....
    # ###################################################
    # NOTE THIS WELL:
    # IF IT IS SUPPOSED TO be separate it's *ALREADY* separate, if together it's
    # *ALREADY* together!!!!
    # the implication is that we don't need to do any merging or
    # separating before we do this last filter!!!!
    # ###################################################

    # TODO: We need to add this guy in!
    #if opts[:uniq_aa]
    #  pep_sets_filtered.map do |pep_set|
    #  end
    #end

    # per-scan top hit first (in place), then the finer postfilter if any
    pep_sets_filtered.map! do |pep_set|
      top_per_scan_postfilter.filter!(pep_set)
      if sub_postfilter
        sub_postfilter.filter!(pep_set)
      else
        pep_set
      end
    end

    normal_post_filtered_peps = pep_sets_filtered.first

    # separate the decoy's out if they are together
    if decoy_validator_to_split_with  # only set if opts[:hits_together]!!
      (target, decoy) = decoy_validator_to_split_with.partition(normal_post_filtered_peps)
      pep_sets_filtered = [target, decoy]
    end

    ### VALIDATION  < SEPARATE >
    pephit_precision_array = get_pephit_precision(opts[:validators], *pep_sets_filtered) if opts[:validators]

    final_answer = {
      :params => opts,
      :pephits => pep_sets_filtered.first,
    }
    if pephit_precision_array
      final_answer[:pephits_precision] = pephit_precision_array
    end

    if opts[:proteins]
      # NOTE(review): pephit_precision_array is nil when no validators were
      # given, and the helper below calls .map on it.
      protein_precision_array = peptide_precision_to_protein_precision(protein_validator, normal_post_filtered_peps, pephit_precision_array)
      # this could be factored out (since we do it in protein_precision)

      # merge the final prots into a unique set:
      final_answer[:prothits] = normal_post_filtered_peps.inject(Set.new) do |protset, pep|
        protset.merge(pep.prots)
      end
      final_answer[:prothits_precision] = protein_precision_array
    end

    ## output the output
    outputs.each {|output| output.print(final_answer) }

    if interactive
      interactive.passing(opts, final_answer)
    end

    # a non-interactive run makes exactly one pass
    if !interactive
      break
    end
  end
  # Close the filehandles
  outputs.each { |output| output.close } if opts[:output]
  final_answer
end
|
478
|
+
|
479
|
+
# Converts peptide-level precisions into protein-level precision estimates.
#
#   protein_validator       - object responding to
#                             prothit_precision(peps, num_false)
#   peps                    - the peptide hits the precisions refer to
#   peptide_precision_array - Array of peptide precisions
#   round_num_false         - name of the method used to round the estimated
#                             number of false peptide hits to a whole number
#                             (default :ceil)
#
# Returns an Array, parallel to peptide_precision_array, of hashes with
# these keys:
#   :worst => worstcase protein precision
#   :normal => estimation by binomial/gaussian method (optimistic)
#   :normal_stdev => the stdev of the normal method
def peptide_precision_to_protein_precision(protein_validator, peps, peptide_precision_array, round_num_false=:ceil)
  labels = [:worst, :normal, :normal_stdev]
  peptide_precision_array.map do |precision|
    # honour the caller's rounding choice (this used to be hard-coded to
    # ceil, which made the round_num_false argument a no-op)
    num_false = ((1.0 - precision) * peps.size).send(round_num_false)
    reply = protein_validator.prothit_precision(peps, num_false)
    hash = {}
    labels.zip(reply) do |label, answer|
      hash[label] = answer
    end
    hash
  end
end
|
495
|
+
|
496
|
+
# Asks every validator for its peptide-hit precision and returns the
# answers as an Array in validator order.
# peps are expected to be already separated from the decoys; decoy_peps is
# handed only to validators whose class is exactly Validator::Decoy.
# grant_transmem_status is accepted but not used here.
def get_pephit_precision(validators, peps, decoy_peps=nil, grant_transmem_status=false)
  validators.map do |validator|
    args = [peps]
    args << decoy_peps if validator.class == Validator::Decoy
    validator.pephit_precision(*args)
  end
end
|
508
|
+
end
|
509
|
+
|
510
|
+
class SpecID::Precision::Filter::Peps < Filter
|
511
|
+
|
512
|
+
# meth is the name of the filtering method to call. Static options that
# will be reused on every call may be given after it; they are stored and
# passed along each time the filter runs.
# BEWARE: stored options are always used at filter time. If you need
# per-call arguments, make a new, blank filter and pass in your args
# at filter time.
# @opts is nil (not an empty Array) when no options are given.
def initialize(meth=nil, *opts)
  @method = meth
  @opts = opts.empty? ? nil : opts
end
|
525
|
+
|
526
|
+
# Passes the top peptide hit(s) within each group of peps.
#
# peps are grouped with hash_by on the :per attributes; each group is
# ordered best-to-worst with sort_by_attributes using :by, and only the
# leading hit (or leading tied hits) is kept.
# Assumes that all attributes are already cast properly: Float, Integer, etc.
# Returns a new Array holding the top hits.
#
# opts
#     :per => Array of attributes e.g. [:first_scan, :charge] # TODO: allow lambda
#     :by => an array for sort_by_attributes
#            e.g. [:xcorr, :deltacn, :ppm, {:down => [:xcorr, :deltacn]}]
#     :ties => *false | true | :as_array
#         false - one top hit is selected by random (by sorting)
#         true - all ties are included in final answer
#         :as_array - ties are included as an array (a lone top hit is
#                     still pushed bare, not wrapped)
#     :include_ties => same meaning as :ties; consulted only when :ties is
#                      not given
def top_hit(peps, opts = {})
  per_array = opts[:per]
  by = opts[:by]

  # Callers in this file pass :include_ties while this method historically
  # read only :ties, so the setting was silently dropped. Accept both; an
  # explicit :ties wins.
  ties = opts.key?(:ties) ? opts[:ties] : opts[:include_ties]
  as_array = (ties == :as_array)

  # attributes that decide a tie: :by minus its trailing sort-direction hash
  atts = (by.last.is_a?(Hash) ? by[0...-1] : by.dup) if ties

  # get the top peptide per group (equivalent to .out files)
  top_peps = []
  peps.hash_by(*per_array).values.each do |group|
    best_to_worst = group.sort_by_attributes(*by)

    unless ties
      top_peps << best_to_worst.first
      next
    end

    best_hit = best_to_worst.first
    top_hit_vals = atts.map {|att| best_hit.send(att) }

    # hits are sorted best first, so everything tying with the best hit is
    # a leading run; stop at the first hit that differs
    tying_peps = []
    best_to_worst.each do |pep|
      same = (0...atts.size).all? {|i| pep.send(atts[i]) == top_hit_vals[i] }
      break unless same
      tying_peps << pep
    end

    if as_array && tying_peps.size != 1
      top_peps.push( tying_peps )
    else
      top_peps.push( *tying_peps )
    end
  end
  top_peps
end
|
600
|
+
|
601
|
+
# Selects the peps passing the standard sequest cutoffs and returns them
# as a new collection (the result of peps.select).
# A pep passes when all of these hold:
#   deltacn >= deltacn cutoff
#   xcorr >= x1, x2 or x3 for charge 1, 2 or 3 respectively
#       (no other charge state passes)
#   ppm <= ppm cutoff
# and, unless include_deltacnstar is set, its deltacn is not above 1.0.
def standard_sequest_filter(peps, x1,x2,x3,deltacn,ppm,include_deltacnstar=true)
  peps.select do |pep|
    dcn = pep.deltacn
    z = pep.charge

    next false unless dcn >= deltacn

    xcorr_ok = (z == 1 && pep.xcorr >= x1) ||
               (z == 2 && pep.xcorr >= x2) ||
               (z == 3 && pep.xcorr >= x3)
    next false unless xcorr_ok
    next false unless pep.ppm <= ppm

    # hits with deltacn above 1.0 survive only when deltacnstar is included
    include_deltacnstar || !(dcn > 1.0)
  end
end
|
621
|
+
|
622
|
+
end
|
623
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module SpecID ; end
module SpecID::Precision ; end

# Shared output behaviour: holds a format name and a handle to write to.
# #print dispatches to a method named after the format, which is not
# defined in this module and must be supplied by whatever mixes it in.
module SpecID::Precision::Output

  # format is the name (as symbol) of the formatting method to call.
  # handle_or_file is either a filename (String), which is opened for
  # writing and closed again by #close, or an already open handle, which
  # is used as is and never closed here.
  def initialize(format, handle_or_file)
    @need_to_close = handle_or_file.is_a?(String)
    @handle = @need_to_close ? File.open(handle_or_file, 'w') : handle_or_file
    @format = format
  end

  # writes answer by calling the format method with (handle, answer)
  # returns self
  def print(answer)
    send(@format, @handle, answer)
    self
  end

  # returns a copy of hash in which every Symbol key is a String; values
  # that are Hashes are converted recursively, other values are untouched
  def self.symbol_keys_to_string(hash)
    hash.inject({}) do |converted, (key, value)|
      value = symbol_keys_to_string(value) if value.is_a?(Hash)
      converted[key.is_a?(Symbol) ? key.to_s : key] = value
      converted
    end
  end

  # TODO: implement recursively, this has just grown and grown terribly
  # (currently just the inspect string of hash)
  def hash_as_string(hash)
    hash.inspect
  end

  # closes the handle only if this object opened it from a filename
  def close
    @handle.close if @need_to_close
  end

end
|