mspire 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/validator.rb
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
|
2
|
+
# Base class for validators that estimate the precision of a set of peptide
# hits.  The incremental methods here rely on +partition+ and
# +calc_precision+ (and, optionally, +calc_precision_prep+), none of which
# are defined in this class; they must be supplied elsewhere.
class Validator

  # Maps validator class names (as strings) and shorthand symbols to the
  # short label stored under :type by sensible_validator_hashes.
  Validator_to_string = {
    'Validator::AA' => 'badAA',
    'Validator::Decoy' => 'decoy',
    'Validator::Transmem::Protein' => 'tmm',
    'Validator::TruePos' => 'tps',
    'Validator::Bias' => 'bias',
    'Validator::Probability' => 'prob',
    :bad_aa => 'badAA',
    :decoy => 'decoy',
    :tmm => 'tmm',
    :tps => 'tps',
    :bias => 'bias',
    :prob => 'prob',
  }

  # Resets the running tallies kept by increment_pephits_precision.
  def initialize_increment
    @increment_tps = 0
    @increment_fps = 0
    @increment_total_submitted = 0
    @increment_initialized = true
  end

  # if adding pephits in groups at a time, the entire group does not need to be
  # queried, just the individual hit.  Use this OR pephits_precision (NOT
  # both).  The initial query to this method will begin a running tally that
  # is saved by the validator.
  # takes either an array or a single pephit (determined by if it is a
  # SpecID::Pep)
  # Returns whatever calc_precision returns for the running tallies.
  def increment_pephits_precision(peps)
    # defined? sidesteps the "instance variable not initialized" warning
    # without having to switch the global $VERBOSE off and on again.
    initialize_increment unless defined?(@increment_initialized) && @increment_initialized

    to_submit = peps.is_a?(SpecID::Pep) ? [peps] : peps
    @increment_total_submitted += to_submit.size
    (tps, fps) = partition(to_submit)
    @increment_tps += tps.size
    @increment_fps += fps.size
    (num_tps, num_fps) =
      if respond_to?(:calc_precision_prep) # for digestion based validators
        calc_precision_prep(@increment_tps, @increment_fps)
      else
        [@increment_tps, @increment_fps]
      end
    calc_precision(num_tps, num_fps)
  end

  # returns an adjusted false positive count (a float not to drop below 0.0)
  # based on a background of 'false'-false positive hits to total hits.  Also
  # sets the @calculated_background attribute (the observed fraction of false
  # hits; 0.0 when there are no hits at all, so that NaN is never stored).
  # Accepts floats or ints
  def adjust_fps_for_background(num_tps, num_fps, background)
    num_fps = num_fps.to_f
    total_peps = num_tps + num_fps
    @calculated_background = (total_peps == 0.0) ? 0.0 : (num_fps / total_peps)
    num_fps -= (total_peps * background)
    (num_fps < 0.0) ? 0.0 : num_fps
  end

  # copied from libjtp: vec
  # returns [mean, std_dev] of the numbers in array.  The variance is divided
  # by (n - 1), or by 1 when there are fewer than two values.
  def sample_stats(array)
    len = array.size
    sum = 0.0
    sum_sq = 0.0
    array.each do |val|
      sum += val
      sum_sq += val * val
    end
    variance = (sum_sq - ((sum * sum) / len)) / ((len > 1) ? (len - 1) : 1)
    # on occasion, a very small negative number occurs
    std_dev = (variance < 0.0) ? 0.0 : Math.sqrt(variance)
    [sum / len, std_dev]
  end

  # takes an array of validators and returns a fresh array where each has been
  # turned into a sensible hash (with symbols as the keys!).  Every hash also
  # carries :type (the label from Validator_to_string) and :class (the class
  # name as a string).
  # Raises ArgumentError for a validator of an unrecognized class.
  def self.sensible_validator_hashes(validators)
    validators.map do |val|
      hash = {}
      # copies each named reader's value into hash under the symbol key
      copy = lambda do |names|
        names.each {|cat| hash[cat.to_sym] = val.send(cat.to_sym) }
      end
      case val
      when Validator::TruePos
        # assign directly: a bare Hash#merge returns a new hash and would
        # leave +hash+ without these entries
        hash[:correct_wins] = val.correct_wins
        hash[:file] = val.fasta.filename
      when Validator::AA
        copy.call(%w(frequency false_to_total_ratio background calculated_background))
      when Validator::Decoy
        copy.call(%w(correct_wins decoy_on_match))
        hash[:constraint] = val.constraint.inspect if val.constraint
      when Validator::Bias
        copy.call(%w(correct_wins proteins_expected background calculated_background false_to_total_ratio))
        hash[:file] = val.fasta.filename
      when Validator::Transmem::Protein
        copy.call(%w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file))
      when Validator::Probability
        copy.call(%w(prob_method))
      else
        raise ArgumentError, "Don't know the validator class #{val}"
      end
      klass_as_s = val.class.to_s
      hash[:type] = Validator_to_string[klass_as_s]
      hash[:class] = klass_as_s
      hash
    end
  end

end
|
171
|
+
|
172
|
+
module Precision::Calculator
  # Precision under the assumption that every hit in the first group is true
  # and every hit in the second group is false: true / (true + false).
  # When both counts are zero the precision is reported as 1.0.
  def calc_precision(num_true_hits, num_false_hits)
    true_hits = num_true_hits.to_f
    false_hits = num_false_hits.to_f
    return 1.0 if (true_hits == 0.0) && (false_hits == 0.0)
    true_hits / (true_hits + false_hits)
  end
end
|
184
|
+
|
185
|
+
# Precision for decoy-style searches: the first count is the normal hits
# (a mix of true and false) and the second is the decoy hits, giving
# (normal - decoy) / normal.
# edge case: with zero normal hits the result is 0.0 if there are any decoy
# hits and 1.0 otherwise.
module Precision::Calculator::Decoy
  def calc_precision(num_normal, num_decoy)
    # work in floats in case fractional amounts are passed in
    normal = num_normal.to_f
    true_pos = normal - num_decoy
    if normal == 0.0
      (num_decoy.to_f > 0.0) ? 0.0 : 1.0
    else
      true_pos / normal
    end
  end
end
|
206
|
+
|
207
|
+
require 'validator/true_pos'
|
208
|
+
require 'validator/aa'
|
209
|
+
require 'validator/bias'
|
210
|
+
require 'validator/decoy'
|
211
|
+
require 'validator/transmem'
|
212
|
+
require 'validator/probability'
|
213
|
+
require 'validator/prot_from_pep'
|
214
|
+
|
data/lib/xml.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
module XML
  # matches a duration body that carries an hour and/or minute designator
  HourMinuteMatch = /[MH]/o

  # Converts an xs:duration style string restricted to its time part
  # ('PT...') into a Float number of seconds, e.g.
  #   'PT1.223434S' => 1.223434
  #   'PT1H2M3.5S'  => 3723.5
  #   'PT5M'        => 300.0
  # doesn't support year month, etc, yet: any string that does not start
  # with 'PT' aborts the program with a message.
  def self.duration_to_seconds(string)
    unless string[0,2] == 'PT'
      abort 'need to include support for other durations'
    end
    rest = string[2..-1]
    # usually it will be this 'PT1.223434S':
    return rest[0...-1].to_f if rest !~ HourMinuteMatch

    # Sum every <number><designator> pair.  Accumulating into a float from
    # the start means a duration without a seconds part still yields a
    # number instead of nil.
    rest.scan(/([0-9.]+)([HMS])/).inject(0.0) do |total_secs, pair|
      amount, unit = pair
      case unit
      when 'H' then total_secs + (amount.to_i * 3600)
      when 'M' then total_secs + (amount.to_i * 60)
      else total_secs + amount.to_f
      end
    end
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
# Mixin plus module-level helpers for choosing among several XML parsing
# back ends and dispatching a parse to the chosen one.
module XMLStyleParser
  @done_once = nil

  # back ends in order of preference
  Parser_precedence = %w(AXML LibXML XMLParser Regexp REXML)
  # currently AXML requires 'xmlparser' to be installed.... (may not always be
  # the case...)
  # Patterns identifying the required file of each optional back end (names
  # without an entry here are always considered available).
  File_required = {'AXML' => /^axml/, 'LibXML' => /^xml\/libxml/, 'XMLParser' => /^xmlparser/}

  # the method that the parser will call on the given file at parse!
  attr_accessor :method

  # parses the given file by sending (file, opts) to @method.
  # Raises NoMethodError when @method is unset or the receiver does not
  # respond to it.
  def parse(file, opts={})
    if @method && respond_to?(@method)
      send(@method, file, opts)
    else
      raise NoMethodError, "Parser of class #{self.class} can't parse #{@method} yet"
    end
  end

  # Attempts (once) to require the optional XML libraries; a library that is
  # not installed is silently skipped.  Always returns true.
  # XMLParser and xml/libxml are incompatible, so if xmlparser is available,
  # libxml will not be loaded (XMLParser#parse is clobbered by
  # XML::Parser#parse [don't ask me why])
  def self.require_parsers
    return true if @done_once

    have_xmlparser = false
    begin
      require 'xmlparser'
      puts "Loaded XMLParser" if $VERBOSE
      have_xmlparser = true
    rescue LoadError
      # optional dependency: absence is fine
    end

    begin
      require 'axml'
      puts "Loaded AXML" if $VERBOSE
    rescue LoadError
      # optional dependency: absence is fine
    end

    unless have_xmlparser
      begin
        require 'xml/libxml'
        puts "Loaded xml/libxml" if $VERBOSE
        ################################################################
        # IMPORTANT!
        # This magic line makes the parser behave like it ought to!!
        XML::Parser.default_keep_blanks = false
        ################################################################
      rescue LoadError
        # optional dependency: absence is fine
      end
    end

    @done_once = true
  end

  # true if some entry of $" (the loaded features) matches pattern.  The
  # patterns in File_required are anchored at the start of the name as given
  # to require, but $" may hold absolute paths, so every path suffix that
  # begins at a directory boundary is tested as well.
  def self.feature_loaded?(pattern)
    $".any? do |feature|
      parts = feature.to_s.split('/')
      (0...parts.size).any? {|i| parts[i..-1].join('/') =~ pattern }
    end
  end

  # returns a fresh array of parser names (in the order of Parser_precedence)
  # leaving out those whose File_required pattern matches no loaded feature
  def self.available_xml_parsers
    require_parsers
    Parser_precedence.reject do |name|
      pattern = File_required[name]
      pattern && !feature_loaded?(pattern)
    end
  end

  ## appends downcase to each parser type here and tries to require it
  # (as base_dir/<name>).
  # returns all those that were required without a load error
  def self.require_parse_files(base_dir)
    XMLStyleParser.available_xml_parsers.select do |v|
      to_require = base_dir + '/' + v.downcase
      begin
        require to_require
        true
      rescue LoadError
        false
      end
    end
  end

  # seeks, among the constants of const named after an available parser, the
  # first (by Parser_precedence) that has the public instance method
  # +method+ and returns it.
  # Raises NoMethodError when there is none.
  def self.choose_parser(const, method)
    # Look only at constants reachable through const itself; const_defined?
    # may also report top-level constants such as ::Regexp.
    own_names = const.constants.map {|c| c.to_s }
    candidates = available_xml_parsers.select {|v| own_names.include?(v) }
    chosen = candidates.map {|v| const.const_get(v) }.find do |subclass|
      subclass.public_method_defined? method
    end
    unless chosen
      raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
    end
    chosen
  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# Convenience mixin: instantiate a parser class found under the including
# object's class, feed it some input, and return the result of calling a
# report method on it.
module XMLParserWrapper
  # Reads the whole of +file+ and hands the contents to
  # parse_and_report_string.
  def parse_and_report(file, const, report_method=:report)
    contents = IO.read(file)
    parse_and_report_string(contents, const, report_method)
  end

  # Builds a new instance of the constant +const+ (looked up on self.class),
  # calls parse(string) on it and returns the value of sending it
  # +report_method+.
  def parse_and_report_string(string, const, report_method=:report)
    worker = self.class.const_get(const).new
    worker.parse(string)
    worker.send(report_method)
  end

  # Same as parse_and_report_string, but passes +io+ to the parser's parse.
  def parse_and_report_io(io, const, report_method=:report)
    worker = self.class.const_get(const).new
    worker.parse(io)
    worker.send(report_method)
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# Reads one or more tab delimited files of (prob, file:seq:charge, T/F)
# triplets, computes number-of-hits vs. precision for each, and writes the
# curves to <base>.to_plot and <base>.csv.

require 'roc'
require 'optparse'
require 'generator'

$decoy = false
$base = "precision_vs_numhits"

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} smriti.csv ..."
  op.separator ""
  op.separator "smriti.csv = (tab delimited) prob, file:seq:charge, T/F"
  op.separator ""
  op.on("--decoy", "'F' indicates this is a decoy") {|v| $decoy = true }
  op.on("-o", "--outfile <filename>", "base outfile name (#{$base})") {|v| $base = v}
end

opts.parse!

if ARGV.size <= 0
  puts opts
  exit
end

files = ARGV.to_a

# one [numhits, precision] pair of arrays per input file
xys = files.map do |file|
  # lines beginning with '#' are comments
  triplets = IO.readlines(file).reject {|v| v =~ /^#/ }.map do |line|
    line.chomp.split("\t")
  end

  abort "no data in #{file}" if triplets.empty?

  # check that they're all OK:
  triplets.each do |trip|
    abort "bad triplet" if trip.size != 3
  end

  # figure out the ordering (and correct if necessary): when the first value
  # is larger than the last, values are negated before being handed on
  higher_better = triplets[0][0].to_f > triplets.last[0].to_f

  doublets = triplets.map do |trip|
    value = trip[0].to_f
    value *= -1 if higher_better
    [value, (trip[2] == 'T')]
  end

  roc = ROC.new

  (tps, fps) = roc.doublets_to_separate(doublets)

  (numhits, precision) =
    if $decoy
      DecoyROC.new.pred_and_ppv(tps, fps)
    else
      roc.numhits_and_ppv(doublets)
    end
  [numhits, precision]
end

# Series labels: the file name without its extension (whatever its length).
labels = files.map {|file| file.chomp(File.extname(file)) }

# Every curve is anchored at (0 hits, precision 1).  Copies are made so the
# arrays in xys are left alone.
curves = xys.map do |x, y|
  [x.dup.unshift(0), y.dup.unshift(1)]
end

## PLOT TO to_plot
File.open( $base + ".to_plot", 'w') do |fh|
  fh.puts "XYData"
  fh.puts $base
  fh.puts "precision vs. num hits"
  fh.puts "num hits"
  fh.puts "precision"
  labels.zip(curves) do |label, curve|
    (x, y) = curve
    fh.puts label
    fh.puts x.join(" ")
    fh.puts y.join(" ")
  end
end

File.open( $base + ".csv", 'w') do |fh|
  columns = []
  labels.zip(curves) do |label, curve|
    (x, y) = curve
    columns << (["#Hits: #{label}"] + x) << (["Precision: #{label}"] + y)
  end
  # walk the columns in parallel, one output row per index
  SyncEnumerator.new(*columns).each do |row|
    fh.puts row.join("\t")
  end
end
|
95
|
+
|
96
|
+
|
97
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/ruby

# Prints, for each file given on the command line, every gradient program
# found in it as a table of time points.

require 'optparse'
require 'table'

# NOTE(review): confirm this require path is still valid for the current
# library layout.
require 'spec/gradient_program'

delimiter = "\t"
table_format = false
opts = OptionParser.new do |op|
  op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
  op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
    case v
    when 'space'  then delimiter = " "
    when 'tab'    then delimiter = "\t"
    when 'format' then table_format = true
    else abort "don't recognize #{v}"
    end
  end
end

opts.parse!

if ARGV.empty?
  puts opts
  exit
end

# file name => the gradient programs read from that file
sets_of_tables = {}
ARGV.each do |file|
  File.open(file) {|fh| sets_of_tables[file] = GradientProgram.all_from_handle(fh) }
end

col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]

sets_of_tables.each do |file, tables|
  puts "FILE: #{file}"
  tables.each do |gp|
    puts "PUMP_TYPE: #{gp.pump_type}"
    # one row per time point: time, the percentages, then the flow rate
    data = gp.time_points.map do |tp|
      row = [tp.time, *tp.percentages]
      row.push(tp.flow_rate)
    end
    table = Table.new(data, nil, col_labels.dup)
    puts(table_format ? table.to_formatted_string : table.to_s(delimiter))
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/ruby

# Stream-parses each given XML file, collecting the protein_name,
# probability and total_number_peptides attributes of every <protein>
# element, and writes them tab delimited to output.csv.

require 'rexml/document'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
  puts "outputs a .csv file"
  exit
end

# One <protein> element: its name, probability (pi) and peptide count (ni).
class Protein
  attr_accessor :name, :pi, :ni
  def initialize(name, pi, ni)
    @name, @pi, @ni = name, pi, ni
  end
end

# REXML stream listener that records every <protein> start tag.
class Listener
  attr_accessor :proteins

  def initialize
    @proteins = []
  end

  def tag_start(name, attrs)
    if name == "protein"
      protein = Protein.new( attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
      @proteins.push( protein )
    end
  end

  # every other stream callback is deliberately ignored
  def method_missing(*args) ; end

end

# The output file is opened once for the whole run so that the rows of every
# input file end up in it (reopening it per file with 'w' would keep only
# the rows of the last file).
File.open("output.csv", 'w') do |out|
  ARGV.each do |file|
    listener = Listener.new
    # block form closes the input handle when parsing is done
    File.open(file) do |io|
      REXML::Document.parse_stream(io, listener)
    end
    # descending by probability, then peptide count, then name
    listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
      out.puts [protein.name, protein.pi, protein.ni].join("\t")
    end
  end
end
|
data/script/mzXML2timeIndex.rb
CHANGED
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# Reads peptide hits from a spec id file, groups them by
# file + aaseq + charge, and prints (sorted) one line per group: the lowest
# probability in the group, the group key joined with ':', and T or F.

require 'spec_id'
require 'fasta'
require 'optparse'

$top = false
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
  op.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
  op.separator "hashes on file+aaseq+charge"
  op.on("-t", "--top", "only top peptide (by prob) per scan+charge") do
    $top = true
  end
end

opts.parse!

if ARGV.size < 2
  puts opts.to_s
  exit
end

specid_file = ARGV.shift
file_or_prefix = ARGV.shift

specid = SpecID.new(specid_file)

# Either the object read from an existing fasta file, or the argument
# itself used as a plain string to look for inside protein references.
indicator =
  if File.exist? file_or_prefix
    Fasta.new.read_file(file_or_prefix)
  else
    file_or_prefix
  end

# returns an array containing the min prob peptides (in case of a tie)
def lowest_peps(ar)
  min_prob = ar.map {|pep| pep.probability.to_f }.min
  ar.select {|v| v.probability.to_f == min_prob }
end

peps = specid.peps
if $top
  # keep only the lowest-probability hit(s) of each file + scan
  top_by_scan = []
  peps.hash_by(:base_name, :first_scan).each do |k,v|
    top_by_scan.concat( lowest_peps(v) )
  end
  peps = top_by_scan
end

results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
  low_peps = lowest_peps(v)
  # every protein reference carried by any of the tied lowest hits
  all_prot_references = []
  low_peps.each do |pep|
    all_prot_references.concat( pep.prots.map {|prot| prot.reference } )
  end
  all_prot_references.uniq!
  is_true =
    if indicator.is_a? Fasta
      all_prot_references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      # true unless every reference contains the indicator string
      !(all_prot_references.all? {|ref| ref.include?( indicator )})
    end
  # all of low_peps share the same (minimum) probability
  [low_peps.first.probability.to_f, k, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end
|
75
|
+
|
76
|
+
=begin
|
77
|
+
# ORIGINAL CODE
|
78
|
+
peps = specid.peps
|
79
|
+
if $top
|
80
|
+
peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
|
81
|
+
v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
|
86
|
+
min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
87
|
+
references = min_pep.prots.map {|v| v.reference }.uniq
|
88
|
+
is_true =
|
89
|
+
if indicator.is_a? Fasta
|
90
|
+
references.any? do |ref|
|
91
|
+
indicator.included_in_header?(ref)
|
92
|
+
end
|
93
|
+
else
|
94
|
+
!(references.all? {|ref| ref.include?( indicator )})
|
95
|
+
end
|
96
|
+
[min_pep.probability.to_f, k, is_true]
|
97
|
+
end
|
98
|
+
|
99
|
+
results.sort.each do |result|
|
100
|
+
report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
|
101
|
+
puts report.join("\t")
|
102
|
+
end
|
103
|
+
=end
|