mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/transmem.rb
DELETED
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# A TransmemIndex is a hash-like object keyed by fasta reference whose values
# are structured hashes holding transmembrane information.
#
# The instance methods below rely on the including/extending object answering
# each, key? and [], and on each value exposing (some of) the keys :aaseq,
# :transmembrane_segments (entries with 1-based :start and :stop) and
# :num_certain_transmembrane_segments.
module TransmemIndex

  # Sniffs the format of a prediction output file by scanning it line by line
  # and stopping at the first recognized line.
  #
  # file:: path of the file to inspect
  #
  # Returns :phobius, :toppred, or nil when no line is recognized.
  def self.filetype(file)
    File.foreach(file) do |line|
      case line
      # 'SEQENCE' is spelled that way on purpose: presumably it matches the
      # header line emitted by Phobius -- verify before "correcting" it.
      when /SEQENCE/
        return :phobius
      # Header-less Phobius output: picked up if there is a protein without
      # tm or signal peptide.  Runs of spaces are tolerated so that column
      # padding does not defeat the match.
      when / +0 +0 +i/
        return :phobius
      when /Algorithm specific parameters/ # toppred, new text format
        return :toppred
      when /<parameters>/ # toppred, XML format
        return :toppred
      end
    end
    nil
  end

  # Placeholder: needs to be subclassed or written.  Returns nil as it stands.
  def reference_to_key(reference)
  end

  # Factory: builds the index object matching the detected file type.
  #
  # file::  toppred or Phobius output file
  # fasta:: optional fasta object, handed on to Phobius::Index only (Phobius
  #         objects can use it to update their hash for methods like
  #         avg_overlap)
  #
  # Raises ArgumentError when the file type is not recognized.
  def self.new(file, fasta=nil)
    case x = filetype(file)
    when :toppred
      require 'transmem/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmem/phobius'
      Phobius::Index.new(file, fasta)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # Returns a hash of key -> number of certain transmembrane segments
  # (0 when the entry carries no such value).
  def num_certain_index
    counts = {}
    each do |key, info|
      counts[key] = info[:num_certain_transmembrane_segments] || 0
    end
    counts
  end

  # Average number of amino acids of +sequence+ that overlap transmembrane
  # segments of the protein stored under +key+ (averaged over every
  # occurrence of +sequence+ within that protein).
  #
  # tp:: :number (default) for the raw average, :fraction to divide the
  #      average by sequence.size
  #
  # Returns nil if there is no protein by that key (which happens on
  # occasion) and 0.0 if no overlap counts could be computed.
  def avg_overlap(key, sequence, tp=:number)
    return nil unless key?(key)
    counts = num_transmem_aa(self[key], sequence)
    return 0.0 if counts.empty?
    mean = counts.inject(0) { |sum, n| sum + n }.to_f / counts.size
    tp == :fraction ? (mean / sequence.size) : mean
  end

  # Returns an array (usually of length 1) with, for each occurrence of
  # +sequence+ in tmhash[:aaseq], the number of amino acids lying inside
  # transmembrane spanning segments.
  #
  # Returns an empty array when tmhash has no :transmembrane_segments key.
  def num_transmem_aa(tmhash, sequence)
    return [] unless tmhash.key?(:transmembrane_segments)
    # segments are stored 1-based; convert to 0-based inclusive ranges
    ranges = tmhash[:transmembrane_segments].map do |tmseg|
      Range.new(tmseg[:start] - 1, tmseg[:stop] - 1)
    end
    num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
  end

  # For every non-overlapping occurrence of +substring+ inside
  # +full_sequence+ (searched left to right), returns how many of its
  # characters fall inside any of +ranges+.
  #
  # ranges:: 0-indexed, inclusive spans, i.e. full_sequence[start..stop]
  #
  # Returns an array with one count per occurrence; empty when +ranges+ or
  # +substring+ is empty (an empty substring would otherwise never advance
  # the search position).
  def num_overlapping_chars(full_sequence, ranges, substring)
    return [] if ranges.empty? || substring.empty?

    width = substring.size
    occurrences = []
    pos = 0
    while (hit = full_sequence.index(substring, pos))
      occurrences << Range.new(hit, hit + width - 1)
      pos = hit + width
    end

    occurrences.map do |occ|
      ranges.inject(0) do |total, tm|
        # Intersection length of two inclusive ranges.  This also counts a
        # segment that lies strictly inside the occurrence, a case that is
        # missed when only the occurrence's end points are tested.
        shared = [occ.last, tm.last].min - [occ.first, tm.first].max + 1
        shared > 0 ? total + shared : total
      end
    end
  end

end
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
#substring_ranges = full_sequence.enum_for(:scan, substring).map do
|
|
154
|
-
# (ofirst, olast) = $~.offset(0)
|
|
155
|
-
# Range.new(ofirst, olast - 1)
|
|
156
|
-
# end
|
|
157
|
-
|
data/lib/validator/aa.rb
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
require 'validator/digestion_based'
|
|
2
|
-
require 'fasta'
|
|
3
|
-
require 'spec_id/aa_freqs'
|
|
4
|
-
|
|
5
|
-
# Validator constraining the aaseq attribute of peptides (the bare amino acid
# sequence).  Works by calculating amino acid frequencies in the fasta file
# used.
class Validator::AA < Validator::DigestionBased
  include Precision::Calculator

  # the amino acid being looked for, stored as a String
  attr_accessor :constraint

  # when true, a peptide containing the amino acid is a false hit
  attr_accessor :false_if_found

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge(:false_if_found => true)

  # Splits peps on whether their aaseq contains the constraint.
  # returns tp, fp
  def partition(peps)
    with_aa, without_aa = peps.partition { |pep| pep.aaseq.include?(@constraint) }
    @false_if_found ? [without_aa, with_aa] : [with_aa, without_aa]
  end

  # right now only accepts single amino acids as constraints (as a string,
  # e.g. 'C', or symbol, e.g. :C)
  # options:
  #   :false_to_total_ratio => if a true digestion was already performed (see
  #                            Validator::AA.calc_false_to_total_ratio)
  #   :false_if_found => it is a false positive if the amino acid is found.
  #   :background => the background level of amino acid Float
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    settings = DEFAULTS.merge(options)
    @false_to_total_ratio = settings[:false_to_total_ratio]
    @false_if_found = settings[:false_if_found]
    @background = settings[:background]
  end

  # One-line textual summary of this validator's parameters; an unset
  # background is reported as 0.0.
  def to_param_string
    bkg = @background || 0.0
    "aminoacid(bad_aa)={constraint=#{@constraint}, false_to_total_ratio=#{@false_to_total_ratio}, bkg=#{bkg}}"
  end
end
|
|
48
|
-
|
data/lib/validator/aa_est.rb
DELETED
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
require 'validator/aa'
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class Validator ; end
|
|
5
|
-
class Validator::AA ; end
|
|
6
|
-
|
|
7
|
-
# A class that uses the peps given to it and a background frequency to
# calculate the false_to_total_ratio at each turn.
#
# NOTE(review): initialize_increment, calc_precision and (optionally)
# calc_precision_prep are not defined in this class; presumably they come
# from a superclass or mixin -- confirm there.
class Validator::AAEst < Validator::AA
  attr_accessor :constraint
  attr_accessor :false_if_found

  # the frequency of the amino acid is used to estimate the false to
  # total ratio based on the pephits given for pephit_precision.
  # see Validator::AA.calc_frequency to calculate a frequency
  # or use set_frequency to set from pep hits.
  attr_accessor :frequency

  # Values from Validator::DigestionBased::DEFAULTS win on a key clash because
  # they are merged last.
  DEFAULTS = {
    :false_if_found => true
  }.merge(Validator::DigestionBased::DEFAULTS) # background 0.0

  # only takes a string right now for constraint (anything answering to_s)
  # options read: :frequency, :false_if_found, :background
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    opts = DEFAULTS.merge(options)
    @frequency, @false_if_found, @background = opts.values_at(:frequency, :false_if_found, :background)
  end

  # Re-estimates the false to total ratio from peps, then defers to the
  # inherited pephit_precision.
  def pephit_precision(peps)
    set_false_to_total_ratio(peps)
    super(peps)
  end

  # Sets @false_to_total_ratio to the mean, over peps, of the probability that
  # a peptide of that length contains the amino acid at least once:
  # 1 - (1 - frequency)**length.  With no peps the ratio is set to 1.0.
  def set_false_to_total_ratio(peps)
    if peps.size > 0
      expected = peps.inject(0.0) do |acc, pep|
        acc + (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
      end
      @false_to_total_ratio = expected / peps.size
    else
      @false_to_total_ratio = 1.0
    end
  end

  # Running-tally variant of set_false_to_total_ratio: adds the expectation of
  # each pep to @expected and divides by @increment_total_submitted.
  # Both must already be set (see increment_pephits_precision).
  # With no peps the ratio is set to 1.0.
  def set_ongoing_false_to_total_ratio(peps)
    if peps.size > 0
      peps.each do |pep|
        @expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
      end
      # @increment_total_submitted should == @increment_tps and @increment_fps
      # since these are either/or
      @false_to_total_ratio = @expected / @increment_total_submitted
    else
      @false_to_total_ratio = 1.0
    end
  end

  # One-line textual summary of this validator's parameters; an unset
  # background is reported as 0.0.
  def to_param_string
    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
  end

  # takes objects responding to aaseq and sets the frequency based on
  # constraint.  constraint is one acceptable to initialize!  returns self
  def set_frequency(objs)
    table = SpecID::AAFreqs.new.calculate_frequencies(objs)
    @frequency = table[@constraint.to_sym]
    self
  end

  # if adding pephits in groups at a time, the entire group does not need to be
  # queried, just the individual hit.  Use this OR pephits_precision (NOT
  # both).  The initial query to this method will begin a running tally that
  # is saved by the validator.
  # takes either an array or a single pephit (determined by if it is a
  # SpecID::Pep)
  #
  # Returns the result of calc_precision on the running tp/fp tallies.
  def increment_pephits_precision(peps)
    # Warnings are silenced while the (possibly never assigned) flag is read.
    # The ensure restores $VERBOSE even when initialize_increment raises, so
    # a failure cannot leave warnings switched off for the whole process.
    saved_verbose = $VERBOSE
    $VERBOSE = nil
    begin
      unless @increment_initialized
        initialize_increment
        @expected = 0.0
      end
    ensure
      $VERBOSE = saved_verbose
    end

    to_submit = peps.is_a?(SpecID::Pep) ? [peps] : peps
    @increment_total_submitted += to_submit.size
    tps, fps = partition(to_submit)
    #### THIS IS THE MAGIC FOR THIS VALIDATOR:
    set_ongoing_false_to_total_ratio(to_submit)

    @increment_tps += tps.size
    @increment_fps += fps.size
    num_tps, num_fps =
      if respond_to?(:calc_precision_prep) # for digestion based validators
        calc_precision_prep(@increment_tps, @increment_fps)
      else
        [@increment_tps, @increment_fps]
      end
    calc_precision(num_tps, num_fps)
  end

end
|
data/lib/validator/background.rb
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
require 'validator'
|
|
2
|
-
require 'vec'
|
|
3
|
-
require 'enumerator'
|
|
4
|
-
|
|
5
|
-
class Validator ; end

# Estimates a background value from the numeric series held in +data+.
#
# The methods that build a VecD rely on its transform, chim, avg, spread and
# sample_stats methods (VecD is not defined in this file).
class Validator::Background

  # the raw series the estimates are computed from
  attr_accessor :data

  def initialize(data=nil)
    @data = data
  end

  # Despite the name nothing is removed: every NaN entry of vec is
  # overwritten in place with 0.  Entries must answer nan?.
  # Returns whatever vec.each_with_index returns.
  def delete_nan!(vec)
    vec.each_with_index do |value, i|
      vec[i] = 0 if value.nan?
    end
  end

  # Transforms the data (window 9) into stdev_factor * sample_stats[1] plus
  # spread, smooths that (window 9), takes the chim of the smoothed series,
  # and returns the minimum of the data in the window around the last index
  # at which that result is 0.
  #
  # NOTE(review): stdev_points is accepted but never used -- both window
  # sizes are hard-coded to 9.  Left as is because honouring the default of
  # 15 would change the results for existing callers.
  def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    stdev_transform = data_vec.transform(9) { |vec| (stdev_factor * vec.sample_stats[1]) + vec.spread }
    smoothed_stdev = stdev_transform.transform(9) { |vec| vec.avg }
    last_0_index = index_of_last_0(smoothed_stdev.chim)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # Shells out to the external `graph` program with the values of vec.
  def plot(vec)
    `graph #{vec.join(" ")} -a -T X`
  end

  # not really working right currently
  #
  # Takes the absolute values of data.chim, averages them over avg_points,
  # and returns the minimum of the data in the window around the last index
  # at which the chim of that average is 0.
  def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    drvs = data_vec.chim
    # absolute value
    drvs.each_with_index { |x, i| drvs[i] = x.abs }
    mv_avg = drvs.transform(avg_points) { |v| v.avg }
    last_0_index = index_of_last_0(mv_avg.chim)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # Returns the index of the last element of vec equal to 0, or nil when
  # there is none.
  def index_of_last_0(vec)
    found = nil
    vec.each_with_index do |value, i|
      found = i if value == 0
    end
    found
  end

  # returns the minimum value of vec between index - pre and index + post
  # (inclusive), clamped to the bounds of vec.
  # index must be an Integer: a nil index (e.g. from index_of_last_0 finding
  # no zero) raises NoMethodError.
  def min_in_window(vec, index, pre, post)
    start = [index - pre, 0].max
    stop = [index + post, vec.size - 1].min
    vec[start..stop].min
  end

  # very simple, should work
  #
  # Smooths the data with a moving average over +points+ values and returns
  # the minimum of the smoothed series between start and stop (inclusive).
  def min_mesa(start, stop, points=3)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    # honour the caller's window size rather than a fixed 3
    smoothed = data_vec.transform(points) { |v| v.avg }
    smoothed[start..stop].min
  end

end
|
|
77
|
-
|
data/lib/validator/bias.rb
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
require 'validator'
|
|
2
|
-
require 'validator/digestion_based'
|
|
3
|
-
|
|
4
|
-
# Validator for any generic kind of bias: for instance a list of high
# abundance proteins we would expect to see, a list of low abundance proteins
# we would not expect to see, proteins that have been filtered out in some
# way, etc.
class Validator::Bias < Validator::DigestionBased
  include Precision::Calculator

  # a fasta object (by default containing proteins expected to be in the
  # sample [see proteins_expected to modify that behavior])
  attr_reader :fasta

  # correct_wins means that only a single protein of a pep hit must match
  # the fasta object for the pep hit to be considered valid.  Otherwise, all
  # must be a match (logic negated by proteins_expected)
  attr_accessor :correct_wins

  # proteins_expected==true means we expect to see the proteins in the sample
  # proteins_expected==false means we do not expect to see these proteins in
  # the sample
  attr_accessor :proteins_expected

  # a hash whose keys are the first_entry of each protein in the fasta object
  # and whose values are all true.  It is queried with the first_entry of a
  # pep hit's proteins.
  attr_accessor :short_reference_hash

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge(
    :proteins_expected => true,
    :correct_wins => true
  )

  # options:
  #   (t = true, f = false, '*'= default)
  #   :proteins_expected => *t/f  we expect to see the fasta proteins in our hit list
  #   :correct_wins => *t/f  a single peptide hit from one of these proteins
  #                          constitutes a true positive
  #   :background => Float (*0.0-1.0)
  #   :false_to_total_ratio => Float (*nil by default)
  def initialize(fasta_object, options={})
    settings = DEFAULTS.merge(options)
    @proteins_expected = settings[:proteins_expected]
    @correct_wins = settings[:correct_wins]
    @background = settings[:background]
    @false_to_total_ratio = settings[:false_to_total_ratio]
    @fasta = fasta_object
    # despite the name this is an array of the proteins' references
    @header_split_hash = @fasta.prots.map { |prot| prot.reference }
    @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
  end

  # Builds a lookup hash mapping the first_entry of every protein yielded by
  # fasta_object.each to true.
  def self.make_short_reference_hash(fasta_object)
    lookup = {}
    fasta_object.each { |prot| lookup[prot.first_entry] = true }
    lookup
  end

  # Splits peps into [true positives, false positives] according to whether
  # the proteins of each pep appear in short_reference_hash, as steered by
  # proteins_expected and correct_wins.
  def partition(peps)
    # true  => a pep is selected when ANY of its proteins is listed
    # false => a pep is selected when ANY of its proteins is NOT listed
    select_listed = @proteins_expected ? @correct_wins : !@correct_wins

    selected, remainder = peps.partition do |pep|
      pep.prots.any? do |pepprot|
        listed = @short_reference_hash.key?(pepprot.first_entry)
        select_listed ? listed : !listed
      end
    end

    # without correct_wins the selected group is the false positives
    @correct_wins ? [selected, remainder] : [remainder, selected]
  end

  # pephit_precision is done through inheritance

  # One-line textual summary of this validator's parameters.
  def to_param_string
    "abundance={fasta=#{@fasta.filename}, proteins_expected=#{@proteins_expected}, correct_wins=#{@correct_wins}, background=#{@background}}"
  end

end
|