mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
require 'transmem'
|
|
2
|
+
require 'xml_style_parser'
|
|
3
|
+
|
|
4
|
+
# Forward declaration: the TopPred:: namespaced classes and modules defined
# below (TopPred::Index, TopPred::Parser, ...) need the TopPred constant to
# exist before they can be opened.
class TopPred ; end
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TopPred::Index < Hash
  include TransmemIndex

  # We need to match whatever function toppred uses to generate identifiers
  # if we want derivative processes to be fast and accurate.
  #
  # Takes the part of +reference+ that precedes the first space (the whole
  # string when there is no space) and replaces every character that is not
  # 0-9, a-z or A-Z with an underscore.
  # Returns nil when +reference+ is nil.
  def reference_to_key(reference)
    return nil unless reference
    cut = reference.index(' ')
    token = cut ? reference[0...cut] : reference
    token.gsub(/[^0-9a-zA-Z]/,'_')
  end

  # Fills this hash from +file+.  Only kind == :default is supported (it
  # delegates to TopPred.default_index); any other kind aborts the process.
  def initialize(file, kind=:default)
    unless kind == :default
      abort "can't do #{kind}"
    end
    TopPred.default_index(file, self)
  end

  # This class will probably change its interface some in the future
  # That's the web portal
  # http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
  # How to run:
  # uncheck 'Produce hydrophobicity graph image (-g)'
  # choose 'Xml' or 'New: new text' output
  # type in your email, then hit 'Run toppred'
end
|
|
48
|
+
|
|
49
|
+
class TopPred
  include TransmemIndex

  # Returns the default index: detects the file type of +file+, builds the
  # matching parser and lets it populate +index+ (a fresh Hash by default).
  def self.default_index(file, index={})
    detected_type = TopPred::Parser.filetype(file)
    parser = TopPred::Parser.new(detected_type)
    parser.file_to_index(file, index)
  end

end
|
|
58
|
+
|
|
59
|
+
module TopPred::Parser
  # Looks only at the first line of +file+.
  # returns :xml or :text (nil when the first line matches neither)
  def self.filetype(file)
    first_line = File.open(file) {|fh| fh.gets }
    case first_line
    when /<\?xml version.*>/ then :xml
    when /Algorithm specific/ then :text
    else nil
    end
  end

  # parser_type = :xml or :text
  # Returns a new parser object of the matching kind; aborts the process for
  # any other parser_type.
  def self.new(parser_type=:xml)
    if parser_type == :xml
      TopPred::Parser_XML.new
    elsif parser_type == :text
      TopPred::Parser_Text.new
    else
      abort "don't recognize parser type: #{parser_type}"
    end
  end

  # Opens +file+ and hands the open handle to to_index (supplied by the
  # including parser) together with +index+.
  def file_to_index(file, index={})
    File.open(file) do |handle|
      to_index(handle, index)
    end
  end

  # where each segment = [prob, first, last] and aaseq is a string.
  # Each segment may also be a hash with keys :start, :stop (and
  # :probability); in that case the key :aaseq is added.
  # first/last are '1' indexed.
  # Returns segments (modified in place) where each is
  # [prob, first, last, aaseq] or the hash described above.
  def add_sequences_to_segments(segments, aaseq)
    # the first element decides how every segment is treated
    array_style = segments.first.is_a?(Array)
    segments.each do |seg|
      if array_style
        seg.push( aaseq[seg[1] - 1, (seg[2] - seg[1]) + 1] )
      else
        seg[:aaseq] = aaseq[seg[:start] - 1, (seg[:stop] - seg[:start]) + 1]
      end
    end
    segments
  end

end
|
|
117
|
+
|
|
118
|
+
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  # Asks XMLStyleParser.choose_parser for a parser class, instantiates it and
  # remembers +meth+ on that instance so that #parse knows what to call.
  #
  # The method name used to be stored in an instance variable of this module
  # instead of the returned parser, which left the parser's @method nil and
  # made #parse raise.
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    parser.instance_variable_set(:@method, meth)
    parser
  end

  # Calls the method chosen at construction time with +file+.  Falls back to
  # :to_index (the default of Parser_XML.new) when the parser was built
  # without going through Parser_XML.new.
  def parse(file)
    send(@method || :to_index, file)
  end
end
|
|
132
|
+
|
|
133
|
+
class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser

=begin
YAL010C:
  num_putative_transmembrane_segments: 1
  aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
  best_structure_probability: 1.0
  transmembrane_segments:
  - aaseq: SLGAEFWLGLVSLSPGCSTTL
    stop: 252
    start: 232
    probability: 1.0
  num_certain_transmembrane_segments: 1
  num_found: 2
=end

  # Reads a toppred XML document from +io+ and adds one record per
  # <toppred> element to +index+, keyed by the 'id' attribute of its
  # <sequence> child.  Returns the index.
  # Aborts the process when the document does not have the expected shape.
  def to_index(io, index = {})
    get_root_node_from_io(io) do |root_n|

      abort if root_n.name != 'toppreds'
      root_n.find('child::toppred').each do |entry_n|
        record = {}
        seq_n = entry_n.find_first('child::sequence')
        index[seq_n['id']] = record
        record[:aaseq] = seq_n.content.gsub(/[\s\n]/,'')
        # the declared size must agree with the sequence actually read
        abort if record[:aaseq].size != seq_n['size'].to_i
        summary_n = seq_n.find_first('following-sibling::tmsummary')

        found = summary_n['segments'].to_i
        record[:num_found] = found
        next unless found > 0

        certain_count = 0
        putative_count = 0
        summary_n.find('child::segment').each do |segment_n|
          abort if segment_n.name != 'segment'
          if segment_n['type'] == 'certain'
            certain_count += 1
          else # putative
            putative_count += 1
          end
        end
        record[:num_putative_transmembrane_segments] = putative_count
        record[:num_certain_transmembrane_segments] = certain_count

        topologies_n = summary_n.next
        abort if topologies_n.name != 'topologies'
        # get the top probability topology:
        best_topology_n = topologies_n.find('child::topology').to_a.max do |a,b|
          a['prob'].to_f <=> b['prob'].to_f
        end
        tmsegments = []
        best_topology_n.find('child::tmsegment').each do |tmsegment_n|
          ## WARNING! it appears the probability is broken on xml output!!
          tmsegments << {
            :start => tmsegment_n['start'].to_i,
            :stop => tmsegment_n['stop'].to_i,
            :probability => tmsegment_n['prob'].to_f,
          }
        end
        add_sequences_to_segments(tmsegments, record[:aaseq])
        record[:transmembrane_segments] = tmsegments
      end
    end
    index
  end

end
|
|
204
|
+
|
|
205
|
+
class TopPred::Parser_Text
  include TopPred::Parser


  # returns a hash structure in this form: {identifier => {aaseq => String,
  # num_found: Int, num_certain_transmembrane_segments => Int,
  # num_putative_transmembrane_segments => Int, best_structure_probability =>
  # Float, transmembrane_segments => [probability => Float, start => Int, stop
  # => Int, aaseq => String] } }
  def to_index(io, index={})
    record = nil

    io.each do |line|
      if line =~ /^Sequence : (.*?) +\(/
        # a new protein record starts here
        identifier = $1.dup
        index[identifier] = {}
        record = index[identifier]
        record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, record)
      elsif line =~ /^HEADER\s+START\s+STOP/
        # structures belong to the most recently started record
        best = top_structure( read_structures(io) )
        record[:best_structure_probability] = best[:probability]
        record[:transmembrane_segments] = best[:tm]
        add_sequences_to_segments(record[:transmembrane_segments], record[:aaseq])
        segment_arrays_to_hashes(record[:transmembrane_segments])
      end
    end
    index
  end

  private

  # returns a list of all structures given a filehandle starting just after
  # the first "HEADER START STOP ..." line.
  # Note that the line following each structure is consumed in order to check
  # whether another structure starts there.
  def read_structures(fh)
    collected = [ read_structure(fh) ]
    until fh.eof?
      following = fh.readline
      break unless following =~ /^HEADER\s+START\s+STOP/
      collected.push( read_structure(fh) )
    end
    collected
  end

  # returns a hash with key :probability and key :tm contains an array of
  # arrays: [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    # the third whitespace separated field of the first line is the probability
    first_line = fh.readline
    {
      :probability => first_line.split(/\s+/)[2].to_f,
      :tm => read_segments(fh),
    }
  end

  # returns an array of arrays of transmembrane segments: [prob(Float),
  # start(Int), stop(Int)]
  # returns after seeing '//'
  def read_segments(fh)
    terminator = %r{//}
    found = []
    fh.each do |line|
      if line =~ /^TRANSMEM/
        # fields: header, start, stop, len, prob
        fields = line.split(/\s+/)
        found << [fields[4].to_f, fields[1].to_i, fields[2].to_i]
      elsif line =~ terminator
        break
      end
    end
    found
  end

  # returns the top probability structure (first on tie)
  def top_structure(list)
    best = list.first
    best_prob = list.first[:probability]
    list.each do |candidate|
      next unless candidate[:probability] > best_prob
      best = candidate
      best_prob = candidate[:probability]
    end
    best
  end

  # Concatenates lines from +fh+ until a line without any word character or
  # '*' is reached (that line is consumed) and returns the joined string.
  def read_aaseq(fh)
    residues = ''
    fh.each do |line|
      line.chomp!
      break unless line =~ /[\w\*]/
      residues << line
    end
    residues
  end

  # Replaces (in place) each [prob, start, stop, aaseq] array of +list+ with
  # the equivalent hash.
  def segment_arrays_to_hashes(list)
    list.map! do |ar|
      prob, first, last, seq = ar[0], ar[1], ar[2], ar[3]
      { :probability => prob, :start => first, :stop => last, :aaseq => seq }
    end
  end

  # returns [certain, putative]
  # expects first line to be a tm segment
  # stops at the first line that has no fields at all
  def num_certain_putative(fh)
    tally = { 'Certain' => 0, 'Putative' => 0 }
    fh.each do |line|
      label = line.chomp.split(/\s+/).last
      break unless label
      tally[label] += 1 if tally.key?(label)
    end
    [tally['Certain'], tally['Putative']]
  end

  # Reads the "Found: N segments" line and, when N is not zero, the helix
  # table that follows, storing the counts in +rec+.
  def read_segment_summary(fh, rec)
    fh.each do |line|
      case line
      when /Found: (.*?) segments/
        rec[:num_found] = $1.to_i
        break if rec[:num_found] == 0
      when /Helix\s+Begin/
        counts = num_certain_putative(fh)
        rec[:num_certain_transmembrane_segments] = counts[0]
        rec[:num_putative_transmembrane_segments] = counts[1]
        break
      end
    end
  end
end
|
|
346
|
+
|
|
347
|
+
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # Parses +io+ with XML::Parser and calls +block+ with the document's root
  # node.  Returns whatever the block returns.
  def get_root_node_from_io(io, &block)
    # turn off warnings because this doesn't seem to work:
    # XML::Parser.default_load_external_dtd = false
    # (There is a warning about not finding DTD)
    saved_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      doc = XML::Parser.io(io).parse
      block.call(doc.root)
    ensure
      # reset the warning level of XML::Parser even when parsing or the
      # block raises, so the global setting never stays switched off
      XML::Parser.default_warnings = saved_warnings
    end
  end
end
|
|
361
|
+
|
|
362
|
+
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # Parses +io+ with ::AXML and calls +block+ with the result of the parse.
  def get_root_node_from_io(io, &block)
    block.call( ::AXML.parse(io) )
  end
end
|
|
368
|
+
|
data/lib/transmem.rb
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
|
|
2
|
+
# A transmemIndex is a hash that takes a fasta reference as key and returns
|
|
3
|
+
# a structured hash containing the transmembrane information.
|
|
4
|
+
module TransmemIndex

  # Scans +file+ line by line until a recognizable marker is found.
  # returns :toppred or :phobius (nil when no marker is found)
  def self.filetype(file)
    tp = nil
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/ # (sic) matched literally, exactly as originally written
          tp = :phobius
          break
        when / 0\s+0\s+i/
          tp = :phobius # if they don't have the headers,
                        # this will pick it up if they have a
                        # single prot without tm or signal peptide.
                        # (\s+ so that the amount of column padding does
                        # not matter)
          break
        when /Algorithm specific parameters/
          tp = :toppred # New text
          break
        when /<parameters>/
          tp = :toppred # XML
          break
        end
      end
    end
    tp
  end

  # Placeholder that returns nil.
  def reference_to_key(reference)
    # needs to be subclassed or written
  end

  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  # Raises ArgumentError when the file type is not recognized.
  def self.new(file, fasta=nil)
    case x = filetype(file)
    when :toppred
      # required lazily, only once the file type is known
      require 'transmem/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmem/phobius'
      # warn "WARNING: You have NO fasta object with Phobius based TransmemIndex! (which needs one to do proper indexing!)" unless fasta
      Phobius::Index.new(file, fasta)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when a record has no such count)
  def num_certain_index
    hash = {}
    self.each do |k,v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # tp = :number or :fraction which is the fraction of the sequence size
  # returns the average number of overlapping amino acids with transmembrane
  # segments (averaged over every occurrence of sequence in the protein)
  # returns 0.0 if there is nothing to average
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless self.key?(key)
    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.size == 0
    avg_num = numbers.inject(0) {|memo,num| memo + num }.to_f / numbers.size
    if tp == :fraction
      avg_num / sequence.size
    else
      avg_num
    end
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # assumes that tmhash has the key :transmembrane_segments (with 1 indexed
  # :start and :stop) and the key :aaseq
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    if tmhash.key? :transmembrane_segments
      ranges = tmhash[:transmembrane_segments].map do |tmseg|
        Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
      end
      num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
    else
      []
    end
  end

  # For every (non-overlapping, left to right) occurrence of substring in
  # full_sequence, returns the number of its characters that fall inside the
  # given ranges; the result has one entry per occurrence.
  # ranges should be 0 indexed!!!
  # the span includes the 'stop' position i.e., full_sequence[start..stop]
  # returns an empty array if there are no ranges or substring is empty
  def num_overlapping_chars(full_sequence, ranges, substring)
    # an empty substring matches at every position without ever advancing,
    # so it has to be rejected before the search loop
    return [] if ranges.size == 0 || substring.size == 0

    substring_ranges = []
    pos = 0
    slen = substring.size
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i+slen-1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      overlap = 0
      ranges.each do |tm|
        # plain interval intersection; this also covers an occurrence that
        # completely contains the transmembrane range
        lo = [sb.first, tm.first].max
        hi = [sb.last, tm.last].min
        overlap += (hi - lo + 1) if hi >= lo
      end
      overlap
    end
  end

end
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
#substring_ranges = full_sequence.enum_for(:scan, substring).map do
|
|
154
|
+
# (ofirst, olast) = $~.offset(0)
|
|
155
|
+
# Range.new(ofirst, olast - 1)
|
|
156
|
+
# end
|
|
157
|
+
|
data/lib/validator/aa.rb
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
require 'validator' # I'm not sure why I need this declaration here when I include it in the following digestion_based declaration??? (but I get a name error if I don't)
|
|
2
|
+
require 'validator/digestion_based'
|
|
3
|
+
require 'fasta'
|
|
4
|
+
require 'spec_id/aa_freqs'
|
|
5
|
+
|
|
6
|
+
# Constraints on aaseq attribute of peptides (the bare amino acid sequence)
|
|
7
|
+
# works by calculating amino acid frequencies in the fasta file used.
|
|
8
|
+
class Validator::AA < Validator::DigestionBased
  include Precision::Calculator

  # the amino acid being looked for, stored as a String (e.g. 'C')
  attr_accessor :constraint

  # it is a false hit if the amino acid is located in the peptide
  attr_accessor :false_if_found

  # if given, the frequency of the amino acid is used to estimate the false to
  # total ratio based on the pephits given for pephit_precision.
  # see set_frequency to calculate a frequency from a fasta object
  attr_accessor :frequency

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
    :false_if_found => true,
  } )

  # returns tp, fp
  # (peptides containing the constraint are the fp when @false_if_found is
  # set, otherwise they are the tp)
  def partition(peps)
    (found, not_found) = peps.partition do |pep|
      pep.aaseq.include?(@constraint)
    end
    if @false_if_found
      [not_found, found]
    else
      [found, not_found]
    end
  end

  # takes a fasta object and sets the frequency based on constraint.
  # constraint is one acceptable to initialize!
  # returns self
  def set_frequency(fasta_obj)
    table = SpecID::AAFreqs.new.calculate_frequencies(fasta_obj)
    @frequency = table[@constraint.to_sym]
    self
  end

  # right now only accepts single amino acids as constraints (as a string,
  # e.g. 'C', or symbol, e.g. :C)
  # options:
  #   :frequency OR :false_to_total_ratio should be used (NOT both)
  #   :frequency => Float, if the frequency of the amino acid is known (see
  #                 set_frequency)
  #   :false_to_total_ratio => if a true digestion was already performed (see
  #                 Validator::AA.calc_false_to_total_ratio)
  #   :false_if_found => it is a false positive if the amino acid is found.
  #   :background => the background level of amino acid Float
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    opts = DEFAULTS.merge(options)
    (@frequency, @false_to_total_ratio, @false_if_found, @background) = opts.values_at(:frequency, :false_to_total_ratio, :false_if_found, :background)
  end

  # Uses @frequency when it is set, otherwise defers to the superclass when
  # @false_to_total_ratio is set.
  # if expected is 0 then will return precision = 1.0
  # raises ArgumentError when neither @frequency nor @false_to_total_ratio is
  # defined
  def pephit_precision(peps)
    if @frequency
      (actual, expected) = at_least_one(@constraint, @frequency, peps.map {|v| v.aaseq })
      if expected == 0.0
        1.0
      else
        pephit_precision_from_actual_and_expected(actual, expected, peps.size, @background)
      end
    elsif @false_to_total_ratio
      super(peps)
    else
      raise ArgumentError, "@frequency or @false_to_total_ratio must be defined!"
    end
  end

  # returns (Actual(Int), Expected(Float)) based on how many peptides have at
  # least one amino_acid, the frequency it is observed in background (then we
  # can look at the size of each peptide and determine the likelihood of
  # having the peptide with at least one amino acid).
  # amino_acid should be a string (e.g., 'C')
  def at_least_one(amino_acid, freq, amino_acid_seqs)
    one_minus_freq = 1.0 - freq
    actual = 0
    expected = 0.0
    amino_acid_seqs.each do |aaseq|
      # chance that a peptide of this length holds at least one amino_acid
      expected += (1.0 - (one_minus_freq**aaseq.size))
      if aaseq.include?(amino_acid)
        actual += 1
      end
    end
    [actual, expected]
  end


  # given: (actual # with 'AA', expected # with 'AA', total#peptides,
  #         mean_fraction_of_cysteines_true)
  #
  # PepHit('AA') = Peptide containing at least one 'AA'
  #
  #   # expected PepHit('AA')                      # observed Bad Pep ('AA')
  #   -----------------------   proportional_to   -------------------------
  #   # total PepHits                              # Total Bad PepHit
  #
  # returns the precision
  # the background correction factor will not reduce the actual count of
  # peptides to < 0.  One can still get negative precision scores, however,
  # depending on the other variables.
  # background is the number of peptides with the amino acid in the purest
  # sample over the total number of peps.  A nil background is treated as 0.0
  # (the same way to_param_string reports it).
  # Also records the uncorrected actual/total ratio in @calculated_background.
  #---
  # this is thoroughly explained in my 2007_09 presentations (inkscape)
  #+++
  def pephit_precision_from_actual_and_expected(actual, expected, total_peps, background=DEFAULTS[:background])
    background ||= 0.0
    actual = actual.to_f
    @calculated_background = actual / total_peps
    actual -= (total_peps * background)
    # We were doing it compared to the number expected.. but this is more
    # clear
    # actual/false_hits = expected/total_peps_passing
    # false_hits = (total_peps_passing * actual) / expected
    actual = 0.0 if actual < 0.0
    total_number_false = (actual * total_peps).to_f / expected
    #fppr = total_number_false / total_peps
    (total_peps - total_number_false) / total_peps
  end

  # returns a one line description of the parameters of this validator
  def to_param_string
    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
  end
end
|
|
135
|
+
|