mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/validator.rb
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
|
|
2
|
+
class Validator
|
|
3
|
+
|
|
4
|
+
# Short labels for validators. Each label can be looked up either by the
# validator's class name (a String) or by its shorthand Symbol.
Validator_to_string = {
  # keyed by class name
  'Validator::AA'                => 'badAA',
  'Validator::Decoy'             => 'decoy',
  'Validator::Transmem::Protein' => 'tmm',
  'Validator::TruePos'           => 'tps',
  'Validator::Bias'              => 'bias',
  'Validator::Probability'       => 'prob',
  # keyed by shorthand symbol
  :bad_aa => 'badAA',
  :decoy  => 'decoy',
  :tmm    => 'tmm',
  :tps    => 'tps',
  :bias   => 'bias',
  :prob   => 'prob'
}
|
|
18
|
+
|
|
19
|
+
# Zeroes the running tallies kept for increment_pephits_precision and marks
# them as initialized.
def initialize_increment
  @increment_tps = @increment_fps = @increment_total_submitted = 0
  @increment_initialized = true
end
|
|
25
|
+
|
|
26
|
+
# if adding pephits in groups at a time, the entire group does not need to be
|
|
27
|
+
# queried, just the individual hit. Use this OR pephits_precision (NOT
|
|
28
|
+
# both). The initial query to this method will begin a running tally that
|
|
29
|
+
# is saved by the validator.
|
|
30
|
+
# takes either an array or a single pephit (determined by if it is a
|
|
31
|
+
# SpecID::Pep)
|
|
32
|
+
def increment_pephits_precision(peps)
  # Warnings are switched off around the first read of
  # @increment_initialized (presumably to silence the "instance variable
  # not initialized" warning under -w).
  saved_verbosity = $VERBOSE
  $VERBOSE = nil
  initialize_increment unless @increment_initialized
  $VERBOSE = saved_verbosity

  # accept a lone pephit as well as a collection of them
  batch = peps.is_a?(SpecID::Pep) ? [peps] : peps
  @increment_total_submitted += batch.size

  true_hits, false_hits = partition(batch)
  @increment_tps += true_hits.size
  @increment_fps += false_hits.size

  if respond_to?(:calc_precision_prep) # for digestion based validators
    prepped_tps, prepped_fps = calc_precision_prep(@increment_tps, @increment_fps)
    calc_precision(prepped_tps, prepped_fps)
  else
    calc_precision(@increment_tps, @increment_fps)
  end
end
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# returns an adjusted false positive rate (a float not to drop below 0.0)
|
|
59
|
+
# based on a background of 'false'-false positive hits to total hits. Also
|
|
60
|
+
# sets the @calculated_background attribute. Accepts floats or ints
|
|
61
|
+
def adjust_fps_for_background(num_tps, num_fps, background)
  false_hits = num_fps.to_f
  all_hits = num_tps + false_hits
  # observed ratio of false hits to all hits, kept for later reporting
  @calculated_background = false_hits / all_hits
  # remove the share of hits attributed to background; never report < 0.0
  adjusted = false_hits - (all_hits.to_f * background)
  adjusted < 0.0 ? 0.0 : adjusted
end
|
|
69
|
+
|
|
70
|
+
# copied from libjtp: vec
|
|
71
|
+
# returns the mean and std_dev
|
|
72
|
+
# Returns [mean, std_dev] for the numbers in array. The deviation uses an
# (n - 1) divisor when more than one value is given, otherwise a divisor of 1.
def sample_stats(array)
  count = array.size
  total = 0.0
  total_of_squares = 0.0
  array.each do |value|
    total += value
    total_of_squares += value * value
  end

  divisor = (count > 1) ? (count - 1) : 1
  variance = (total_of_squares - ((total * total) / count)) / divisor

  # on occasion, a very small negative number occurs
  spread = (variance < 0.0) ? 0.0 : Math.sqrt(variance)

  [total.to_f / count, spread]
end
|
|
91
|
+
|
|
92
|
+
# takes an array of validators and returns a fresh array where each has been
|
|
93
|
+
# turned into a sensible hash (with symbols as the keys!)
|
|
94
|
+
def self.sensible_validator_hashes(validators)
  # Builds one hash per validator (symbol keys). Besides the class-specific
  # attributes, every hash carries :type (the short label from
  # Validator_to_string) and :class (the class name as a String).
  # Raises ArgumentError for a validator of an unrecognized class.
  validators.map do |val|
    hash = {}
    case val
    when Validator::TruePos
      # FIX: this branch used the non-mutating Hash#merge and threw the
      # result away, so these two entries never reached the output.
      hash[:correct_wins] = val.correct_wins
      hash[:file] = val.fasta.filename
    when Validator::AA
      # (the list previously named false_to_total_ratio twice)
      %w(frequency false_to_total_ratio background calculated_background).each do |cat|
        hash[cat.to_sym] = val.send(cat.to_sym)
      end
    when Validator::Decoy
      %w(correct_wins decoy_on_match).each do |cat|
        hash[cat.to_sym] = val.send(cat.to_sym)
      end
      hash[:constraint] = val.constraint.inspect if val.constraint
    when Validator::Bias
      %w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
        hash[cat.to_sym] = val.send(cat.to_sym)
      end
      hash[:file] = val.fasta.filename
    when Validator::Transmem::Protein
      %w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
        hash[cat.to_sym] = val.send(cat.to_sym)
      end
    when Validator::Probability
      %w(prob_method).each do |cat|
        hash[cat.to_sym] = val.send(cat.to_sym)
      end
    else
      raise ArgumentError, "Don't know the validator class #{val}"
    end
    klass_as_s = val.class.to_s
    hash[:type] = Validator_to_string[klass_as_s]
    hash[:class] = klass_as_s
    hash
  end
end
|
|
130
|
+
|
|
131
|
+
=begin
|
|
132
|
+
## THIS IS WITH STRINGS AS KEYS!
|
|
133
|
+
# takes an array of validators and returns a fresh array where each has been
|
|
134
|
+
# turned into a sensible hash (with strings as the keys!)
|
|
135
|
+
def self.sensible_validator_hashes(validators)
|
|
136
|
+
validators.map do |val|
|
|
137
|
+
hash = {}
|
|
138
|
+
case val
|
|
139
|
+
when Validator::TruePos
|
|
140
|
+
hash.merge( {'correct_wins' => val.correct_wins, 'file' => val.fasta.filename } )
|
|
141
|
+
when Validator::AA
|
|
142
|
+
%w(frequency false_to_total_ratio background calculated_background false_to_total_ratio).each do |cat|
|
|
143
|
+
hash[cat] = val.send(cat.to_sym)
|
|
144
|
+
end
|
|
145
|
+
when Validator::Decoy
|
|
146
|
+
%w(correct_wins decoy_on_match).each do |cat|
|
|
147
|
+
hash[cat] = val.send(cat.to_sym)
|
|
148
|
+
end
|
|
149
|
+
hash['constraint'] = val.constraint.inspect if val.constraint
|
|
150
|
+
when Validator::Bias
|
|
151
|
+
%w(correct_wins proteins_expected background calculated_background false_to_total_ratio).each do |cat|
|
|
152
|
+
hash[cat] = val.send(cat.to_sym)
|
|
153
|
+
end
|
|
154
|
+
hash['file'] = val.fasta.filename
|
|
155
|
+
when Validator::Transmem::Protein
|
|
156
|
+
%w(false_to_total_ratio min_num_tms soluble_fraction correct_wins no_include_tm_peps background calculated_background transmem_file).each do |cat|
|
|
157
|
+
hash[cat] = val.send(cat.to_sym)
|
|
158
|
+
end
|
|
159
|
+
when Validator::Probability
|
|
160
|
+
else ; raise ArgumentError, "Don't know the validator class #{val}"
|
|
161
|
+
end
|
|
162
|
+
klass_as_s = val.class.to_s
|
|
163
|
+
hash['type'] = Validator_to_string[klass_as_s]
|
|
164
|
+
hash['class'] = klass_as_s
|
|
165
|
+
hash
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
=end
|
|
169
|
+
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
module Precision::Calculator
|
|
173
|
+
# calculates precision by the assumption that the first group are all true
|
|
174
|
+
# hits and the second are all false hits
|
|
175
|
+
# (0,0) is returned as 1.0
|
|
176
|
+
def calc_precision(num_true_hits, num_false_hits)
  true_hits = num_true_hits.to_f
  false_hits = num_false_hits.to_f
  # no hits at all counts as perfect precision
  return 1.0 if true_hits == 0.0 && false_hits == 0.0
  true_hits / (true_hits + false_hits)
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
# will calculate precision for groups of proteins where the first group are
|
|
186
|
+
# normal hits (which may be true or false) and the second are decoy hits.
|
|
187
|
+
# edge case: if num_normal.to_f == 0.0 then if num_decoy.to_f > 0 ; 0, else 1
|
|
188
|
+
module Precision::Calculator::Decoy
|
|
189
|
+
def calc_precision(num_normal, num_decoy)
  # work in floats in case fractional amounts are passed in
  normal_hits = num_normal.to_f
  # each decoy hit is taken to stand for one false hit among the normals
  estimated_true = num_normal.to_f - num_decoy
  if normal_hits == 0.0
    # edge case: no normal hits -> 0.0 if any decoys were seen, else 1.0
    (num_decoy.to_f > 0.0) ? 0.0 : 1.0
  else
    estimated_true / normal_hits
  end
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
require 'validator/true_pos'
|
|
208
|
+
require 'validator/aa'
|
|
209
|
+
require 'validator/bias'
|
|
210
|
+
require 'validator/decoy'
|
|
211
|
+
require 'validator/transmem'
|
|
212
|
+
require 'validator/probability'
|
|
213
|
+
require 'validator/prot_from_pep'
|
|
214
|
+
|
data/lib/xml.rb
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
|
|
2
|
+
module XML
  HourMinuteMatch = /[MH]/o

  # Converts a time-only duration string such as 'PT1.223434S' or
  # 'PT1H2M3.5S' into a Float number of seconds.
  #
  # Only strings starting with 'PT' are supported (no year, month, etc.
  # yet); anything else aborts the program with a message, as before.
  def self.duration_to_seconds(string)
    case string[0,2]
    when 'PT'
      rest = string[2..-1]
      # usually it will be this 'PT1.223434S':
      if rest !~ HourMinuteMatch
        rest[0...-1].to_f
      else
        digits = ''
        whole_secs = 0
        seconds_part = 0.0
        rest.split('').each do |let|
          case let
          when 'H'
            whole_secs += digits.to_i * 3600
            digits = ''
          when 'M'
            whole_secs += digits.to_i * 60
            digits = ''
          when 'S'
            seconds_part = digits.to_f
            digits = ''
          else
            digits << let
          end
        end
        # FIX: the total used to be assigned only when an 'S' component was
        # seen, so e.g. 'PT5M' or 'PT1H30M' returned nil.
        whole_secs.to_f + seconds_part
      end
    else
      abort 'need to include support for other durations'
    end
  end
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
|
|
2
|
+
module XMLStyleParser
|
|
3
|
+
@done_once = nil
|
|
4
|
+
|
|
5
|
+
Parser_precedence = %w(AXML LibXML XMLParser Regexp REXML)
|
|
6
|
+
# currently AXML requires 'xmlparser' to be installed.... (may not always be
|
|
7
|
+
# the case...)
|
|
8
|
+
File_required = {'AXML' => /^axml/, 'LibXML' => /^xml\/libxml/, 'XMLParser' => /^xmlparser/}
|
|
9
|
+
|
|
10
|
+
# the method that the parser will call on the given file at parse!
|
|
11
|
+
attr_accessor :method
|
|
12
|
+
|
|
13
|
+
# parses the given file by sending to @method
|
|
14
|
+
def parse(file, opts={})
  # the parser must publicly implement whatever @method names
  unless respond_to? @method
    raise NoMethodError, "Parser of class #{self.class} can't parse #{@method} yet"
  end
  send(@method, file, opts)
end
|
|
21
|
+
|
|
22
|
+
# XMLParser and xml/libxml are incompatible, so if xmlparser is available,
|
|
23
|
+
# libxml will not be loaded (XMLParser#parse is clobbered by
|
|
24
|
+
# XML::Parser#parse [don't ask me why])
|
|
25
|
+
def self.require_parsers
  # Tries to load the optional xml libraries once; a library that is not
  # installed is silently skipped. Always returns true.
  unless @done_once
    xmlparser_loaded =
      begin
        require 'xmlparser'
        puts "Loaded XMLParser" if $VERBOSE
        true
      rescue LoadError
        false
      end

    begin
      require 'axml'
      puts "Loaded AXML" if $VERBOSE
    rescue LoadError
    end

    # libxml is only attempted when xmlparser could not be loaded
    unless xmlparser_loaded
      begin
        require 'xml/libxml'
        puts "Loaded xml/libxml" if $VERBOSE
        ################################################################
        # IMPORTANT!
        # This magic line makes the parser behave like it ought to!!
        XML::Parser.default_keep_blanks = false
        ################################################################
      rescue LoadError
      end
    end
  end
  @done_once = true
end
|
|
56
|
+
|
|
57
|
+
# returns an array of strings depending on File_required (in the order of
|
|
58
|
+
# Parser_precedence)
|
|
59
|
+
def self.available_xml_parsers
  require_parsers
  # Drop every parser whose required file (per File_required) does not show
  # up among the loaded features; parsers without an entry are always kept.
  Parser_precedence.reject do |parser_name|
    pattern = File_required[parser_name]
    pattern && !$".any? {|req_file| req_file.match(pattern) }
  end
end
|
|
69
|
+
|
|
70
|
+
## appends downcase to each parser type here and tries to require it
|
|
71
|
+
# returns all those that were required without a load error
|
|
72
|
+
def self.require_parse_files(base_dir)
  # keep only the parser names whose "<base_dir>/<downcased name>" file loads
  XMLStyleParser.available_xml_parsers.select do |parser_name|
    path = base_dir + '/' + parser_name.downcase
    loaded = true
    begin
      require path
    rescue LoadError
      loaded = false
    end
    loaded
  end
end
|
|
83
|
+
|
|
84
|
+
# seeks a subclass that has the public_method @method
|
|
85
|
+
def self.choose_parser(const, method)
  # Candidates are the available parsers, in precedence order, that are
  # defined as constants under const.
  candidates = available_xml_parsers.select {|name| const.const_defined?(name) }
  subclasses = candidates.map {|name| const.const_get(name) }

  # the first one that publicly implements the requested method wins
  chosen = subclasses.find {|subclass| subclass.public_method_defined? method }
  return chosen if chosen

  raise NoMethodError, "No parser of class #{const} can parse :#{method}\n** Is 'axml' (or another xml parser) installed and working? **"
end
|
|
104
|
+
|
|
105
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# Mixin that builds a parser from a constant nested under the including
# object's class, feeds it some input and returns the result of calling
# report_method on the parser.
module XMLParserWrapper
  # reads the whole file into a string and parses that
  def parse_and_report(file, const, report_method=:report)
    contents = IO.read(file)
    parse_and_report_string(contents, const, report_method)
  end

  # parses the given string
  def parse_and_report_string(string, const, report_method=:report)
    parser_class = self.class.const_get(const)
    handler = parser_class.new
    handler.parse(string)
    handler.send(report_method)
  end

  # hands the io object itself to the parser
  def parse_and_report_io(io, const, report_method=:report)
    parser_class = self.class.const_get(const)
    handler = parser_class.new
    handler.parse(io)
    handler.send(report_method)
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#!/usr/bin/ruby -w
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
require 'roc'
|
|
5
|
+
require 'optparse'
|
|
6
|
+
require 'generator'
|
|
7
|
+
|
|
8
|
+
# Reads one or more tab delimited files of (prob, file:seq:charge, T/F) and
# writes <base>.to_plot and <base>.csv holding precision vs. number of hits
# for each input file.

$decoy = false
$base = "precision_vs_numhits"

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} smriti.csv ..."
  op.separator ""
  op.separator "smriti.csv = (tab delimited) prob, file:seq:charge, T/F"
  op.separator ""
  op.on("--decoy", "'F' indicates this is a decoy") {|v| $decoy = true }
  op.on("-o", "--outfile <filename>", "base outfile name (#{$base})") {|v| $base = v}
end

opts.parse!

if ARGV.size <= 0
  puts opts
  exit
end

files = ARGV.to_a

# Series labels: the file name without its extension.
# FIX: the old pattern /\.[^\.]$/ only matched one-character extensions, so
# names such as "run.csv" were never stripped.
labels = files.map {|file| file.sub(/\.[^.\/\\]+\z/, '') }

xys = files.map do |file|
  triplets = IO.readlines(file).reject{|v| v =~ /^#/}.map do |line|
    line.chomp.split("\t")
  end

  # an input without data rows used to fail on nil further down
  abort "no data rows in #{file}" if triplets.empty?

  # check that they're all OK:
  triplets.each do |trip|
    if trip.size != 3 ; abort "bad triplet" end
  end

  # figure out the ordering (and correct if necessary): when the first value
  # is larger than the last, values are negated so that they ascend
  higher_better = triplets[0][0].to_f > triplets.last[0].to_f

  doublets = triplets.map do |trip|
    value = trip[0].to_f
    value *= -1 if higher_better
    [value, (trip[2] == 'T')]
  end

  roc = ROC.new

  (tps, fps) = roc.doublets_to_separate(doublets)

  (x, y) =
    if $decoy
      DecoyROC.new.pred_and_ppv(tps, fps)
    else
      roc.numhits_and_ppv(doublets)
    end
  [x,y]
end


## PLOT TO to_plot
File.open( $base + ".to_plot", 'w') do |fh|
  fh.puts "XYData"
  fh.puts $base
  fh.puts "precision vs. num hits"
  fh.puts "num hits"
  fh.puts "precision"
  labels.zip(xys) do |label,xy|
    (x,y) = xy
    # NOTE(review): unshift modifies the arrays held in xys, so the (0, 1)
    # starting point also ends up in the .csv written below; kept as-is.
    x.unshift(0)
    y.unshift(1)
    fh.puts label
    fh.puts x.join(" ")
    fh.puts y.join(" ")
  end
end

File.open( $base + ".csv", 'w') do |fh|
  columns = []
  labels.zip(xys) do |label,xy|
    (x,y) = xy
    x.unshift("#Hits: #{label}")
    y.unshift("Precision: #{label}")
    columns << x << y
  end
  SyncEnumerator.new(*columns).each do |row|
    fh.puts row.join("\t")
  end
end
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'table'
|
|
5
|
+
|
|
6
|
+
require 'spec/gradient_program'
|
|
7
|
+
|
|
8
|
+
# Prints the gradient program tables found in each given .meth file.

delimiter = "\t"
table_format = false
opts = OptionParser.new do |op|
  op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
  op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
    case v
    when 'space'
      delimiter = " "
    when 'tab'
      delimiter = "\t"
    when 'format'
      table_format = true
    else
      abort "don't recognize #{v}"
    end
  end
end

opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

# read every file first, then report
sets_of_tables = {}
ARGV.each do |file|
  File.open(file) {|fh| sets_of_tables[file] = GradientProgram.all_from_handle(fh) }
end

col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]

sets_of_tables.each do |file, tables|
  puts "FILE: #{file}"
  tables.each do |gp|
    puts "PUMP_TYPE: #{gp.pump_type}"
    # one row per time point: time, the percentages, then the flow rate
    rows = gp.time_points.map do |tp|
      [tp.time, *(tp.percentages)] << tp.flow_rate
    end
    table = Table.new(rows, nil, col_labels.dup)
    puts(table_format ? table.to_formatted_string : table.to_s(delimiter))
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'rexml/document'
|
|
4
|
+
|
|
5
|
+
# without arguments, just show how to call the script
if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
  puts "outputs a .csv file"
  exit
end
|
|
10
|
+
|
|
11
|
+
# Plain record for one protein entry: its name plus the two values stored
# as pi and ni.
class Protein
  attr_accessor :name, :pi, :ni

  def initialize(name, pi, ni)
    @name = name
    @pi = pi
    @ni = ni
  end
end
|
|
17
|
+
|
|
18
|
+
# Stream listener that collects a Protein for every "protein" start tag and
# ignores every other callback it receives.
class Listener
  attr_accessor :proteins

  def initialize
    @proteins = []
  end

  # name is the tag name, attrs its attributes keyed by attribute name
  def tag_start(name, attrs)
    return unless name == "protein"
    probability = attrs['probability'].to_f
    num_peptides = attrs['total_number_peptides'].to_i
    @proteins.push( Protein.new(attrs['protein_name'], probability, num_peptides) )
  end

  # any other callback is accepted and does nothing
  def method_missing(*args) ; end

end
|
|
35
|
+
|
|
36
|
+
# Writes name, pi and ni (tab delimited) for the proteins of every input
# file, each file's proteins sorted in descending order of (pi, ni, name).
#
# FIX: "output.csv" used to be reopened in 'w' mode for every input file, so
# only the rows of the last file survived. It is now opened once. The input
# file is also closed once it has been parsed.
File.open("output.csv", 'w') do |out|
  ARGV.each do |file|
    listener = Listener.new
    File.open(file) do |io|
      REXML::Document.parse_stream(io, listener)
    end
    listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
      out.puts [protein.name, protein.pi, protein.ni].join("\t")
    end
  end
end
|
data/script/mzXML2timeIndex.rb
CHANGED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/ruby -w
|
|
2
|
+
|
|
3
|
+
require 'spec_id'
|
|
4
|
+
require 'fasta'
|
|
5
|
+
require 'optparse'
|
|
6
|
+
|
|
7
|
+
# command line handling and input loading

$top = false
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
  op.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
  op.separator "hashes on file+aaseq+charge"
  op.on("-t", "--top", "only top peptide (by prob) per scan+charge") { $top = true }
end

opts.parse!

# both positional arguments are required
if ARGV.size < 2
  puts opts.to_s
  exit
end

specid_file = ARGV.shift
file_or_prefix = ARGV.shift

specid = SpecID.new(specid_file)

# an existing file is read as a fasta file; anything else is kept as a
# plain prefix string
indicator = File.exist?(file_or_prefix) ? Fasta.new.read_file(file_or_prefix) : file_or_prefix
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# returns an array containing the min prob peptides (in case of a tie)
|
|
38
|
+
def lowest_peps(ar)
  # the smallest probability (compared as floats) found in ar ...
  lowest = ar.min {|a,b| a.probability.to_f <=> b.probability.to_f }
  threshold = lowest.probability.to_f
  # ... and every peptide sharing it
  ar.select {|pep| pep.probability.to_f == threshold }
end
|
|
42
|
+
|
|
43
|
+
# Builds one [probability, key, T/F] row per file+aaseq+charge group and
# prints the rows, sorted, as tab delimited text.
#
# FIXES relative to the previous version of this section:
#  * without --top, top_by_scan was nil and hash_by was called on it;
#  * all_prots was initialised but all_prot_references (undefined) was used;
#  * min_pep was referenced although its assignment was commented out.
peps = specid.peps
if $top
  # keep only the lowest-probability peptide(s) of every scan
  top_by_scan = []
  peps.hash_by(:base_name, :first_scan).each do |k,v|
    top_by_scan.push( *lowest_peps(v) )
  end
  peps = top_by_scan
end

results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
  low_peps = lowest_peps(v)

  # protein references of all the tied lowest-probability peptides
  all_prot_references = []
  low_peps.each do |pep|
    all_prot_references.push( *(pep.prots.map {|prot| prot.reference }) )
  end
  all_prot_references.uniq!

  is_true =
    if indicator.is_a? Fasta
      # true when any reference is found in the fasta headers
      all_prot_references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      # prefix mode: false only when every reference contains the prefix
      !(all_prot_references.all? {|ref| ref.include?( indicator )})
    end

  # all of low_peps share the same (lowest) probability
  [low_peps.first.probability.to_f, k, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end
|
|
75
|
+
|
|
76
|
+
=begin
|
|
77
|
+
# ORIGINAL CODE
|
|
78
|
+
peps = specid.peps
|
|
79
|
+
if $top
|
|
80
|
+
peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
|
|
81
|
+
v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
|
|
86
|
+
min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
|
87
|
+
references = min_pep.prots.map {|v| v.reference }.uniq
|
|
88
|
+
is_true =
|
|
89
|
+
if indicator.is_a? Fasta
|
|
90
|
+
references.any? do |ref|
|
|
91
|
+
indicator.included_in_header?(ref)
|
|
92
|
+
end
|
|
93
|
+
else
|
|
94
|
+
!(references.all? {|ref| ref.include?( indicator )})
|
|
95
|
+
end
|
|
96
|
+
[min_pep.probability.to_f, k, is_true]
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
results.sort.each do |result|
|
|
100
|
+
report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
|
|
101
|
+
puts report.join("\t")
|
|
102
|
+
end
|
|
103
|
+
=end
|