ms-error_rate 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +14 -0
- data/.gitmodules +9 -0
- data/History +16 -0
- data/LICENSE +2 -0
- data/Rakefile +52 -0
- data/VERSION +1 -1
- data/lib/ms/error_rate/decoy.rb +27 -0
- data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
- data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
- data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
- data/lib/ms/error_rate/qvalue.rb +93 -0
- data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
- data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
- data/lib/ms/error_rate/sbv.rb +111 -0
- data/lib/ms/error_rate.rb +9 -0
- data/lib/ms/ident.rb +125 -0
- data/lib/support/sort_by_attributes.rb +51 -0
- data/lib/transmembrane/phobius.rb +136 -0
- data/lib/transmembrane/toppred.rb +368 -0
- data/lib/transmembrane.rb +157 -0
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/expert_addition.rb +26 -0
- data/script/expert_list.rb +53 -0
- data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
- data/script/minimal_protein_set.rb +366 -0
- data/script/unique_seq_stats.rb +72 -0
- metadata +66 -14
data/lib/ms/ident.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'ms/fasta'
|
2
|
+
require 'ms/in_silico/digester'
|
3
|
+
|
4
|
+
module Ms
|
5
|
+
module Ident
|
6
|
+
|
7
|
+
# captures (group 1) the accession that sits between "IPI:" and the next "|"
IPI_RE = /IPI:([\w\d\.]+)\|/
# captures (group 1) the identifier that sits between "gi|" and the next "|"
# (the pipe after "gi" has to be escaped, otherwise it is regexp alternation
# and the pattern matches any bare "gi" without capturing anything)
GI_RE = /gi\|([\w\d\.]+)\|/

# the twenty standard amino acids
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
12
|
+
|
13
|
+
# default options for peptide_centric_db; caller-supplied opts are merged over these
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 1, :min_length => 8, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
14
|
+
|
15
|
+
# Digests every protein in fasta_file and writes a peptide centric db: one
# "<peptide>: <id>-<id>-..." line per peptide, listing the id of each protein
# whose digestion produced that peptide.
#
# creates a temporary digestion file (<base>.msd_clvg<missed_cleavages>.peptides)
# that contains all peptides digesting with certain missed_cleavages (i.e.,
# min_length is not applied to this file but on the final peptide centric db).
# The final file is named like the digestion file, with the added
# '.min_aaseq<Integer>.yml' in place of '.peptides'.
#
# opts (merged over DEFAULT_PEPTIDE_CENTRIC_DB):
#   :missed_cleavages             handed to enzyme.digest
#   :min_length                   shortest peptide kept in the final file
#   :enzyme                       object responding to digest(sequence, missed_cleavages)
#   :id_regexp                    regexp whose first capture is the protein id; when
#                                 nil it is asked of Ms::Fasta based on the file type
#   :remove_digestion_file        unlink the temporary digestion file at the end
#   :cleave_initiator_methionine  for proteins starting with "M", also emit each
#                                 leading peptide without its first residue
#   :expand_aa                    hash like {'X' => STANDARD_AA} (see expand_peptides)
#
# raises RuntimeError when no id_regexp is given and none can be determined
def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)

  unless id_regexp
    id_regexp = Ms::Fasta.id_regexp(Ms::Fasta.filetype(fasta_file))
    raise RuntimeError, "fasta file type not recognized, supply id_regexp" unless id_regexp
  end

  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  # Regexp.union escapes each key and, unlike a hand-built "[...]" character
  # class, does not raise when expand_aa is an empty hash
  letters_to_expand_re = Regexp.union(expand_aa.keys) if expand_aa

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if cleave_initiator_methionine && (prot.sequence[0,1] == "M")
          # each peptide found at the beginning of the protein sequence is
          # added a second time without its first residue
          leading = peptides.select {|pep| prot.sequence[0,pep.size] == pep }
          peptides.push(*leading.map {|pep| pep[1..-1] })
        end
        if expand_aa
          peptides = peptides.map do |pep|
            (pep =~ letters_to_expand_re) ? expand_peptides(pep, expand_aa) : pep
          end.flatten
        end
        # one line per protein: first word of the header, a tab, then the peptides
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE


  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  # peptide => [protein ids]
  hash = Hash.new {|h,k| h[k] = [] }
  IO.foreach(digestion_file) do |line|
    # chomp (not chomp!): the bang form returns nil when nothing is removed
    (prot, *peps) = line.chomp.split(/\s+/)
    id = prot.match(id_regexp)[1]
    peps.each do |pep|
      hash[pep] << id if pep.size >= min_length
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{final_outfile} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |k,v|
      out.puts( "#{k}: #{v.join('-')}" )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  File.unlink(digestion_file) if remove_digestion_file
end
|
102
|
+
|
103
|
+
# does combinatorial expansion of all letters requesting it: every position
# of peptide whose letter is a key of expand_aa is replaced by each of the
# substitutes listed under that key.
# expand_aa is hash like: {'X'=>STANDARD_AA}
# returns an array of peptide strings ([peptide] itself when nothing expands)
def self.expand_peptides(peptide, expand_aa)
  # [position, letter] for each residue that has to be substituted
  targets = []
  peptide.split('').each_with_index do |residue, pos|
    targets << [pos, residue] if expand_aa.key?(residue)
  end

  # substitute one position at a time, multiplying the variants at each step
  targets.inject([peptide]) do |variants, (pos, letter)|
    variants.map do |variant|
      expand_aa[letter].map do |substitute|
        copy = variant.dup
        copy[pos] = substitute
        copy
      end
    end.flatten
  end
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# Wrapper whose comparison is the inverse of the wrapped object's comparison
class Reverser
  attr_accessor :obj

  def initialize(obj)
    @obj = obj
  end

  # compares with the operands swapped, so larger wrapped values order first
  def <=>(other)
    theirs = other.obj
    theirs <=> obj
  end
end
|
14
|
+
|
15
|
+
class Object
  # returns a Reverser wrapping the receiver
  def rev
    Reverser.new(self)
  end
end
|
20
|
+
|
21
|
+
module Enumerable
  # Provides sorting on multiple attributes (each directional) where atts is
  # an array of symbols.
  # the default is to sort ascending (small to large).
  # the option :down => Symbol or ArrayOfSymbols
  #     sort_by_attributes(:age,:height,:weight)  # -> sorts by age, height, and weight
  #     sort_by_attributes(:age,:height,:weight, :down => :height) # -> same as above, but sorts height from large to small
  #     sort_by_attributes(:age,:height,:weight, :down => [:height,:weight]) # -> same as above, but sorts height and weight from large to small
  #
  # returns a new sorted array; an options hash passed by the caller is
  # left unmodified
  def sort_by_attributes(*atts)
    opts = atts.last.is_a?(Hash) ? atts.pop : {}
    # Array() turns nil into [], a lone symbol into [symbol], and keeps arrays
    down = Set.new(Array(opts[:down]))
    sort_by do |obj|
      atts.map do |att|
        value = obj.send(att)
        # descending attributes are wrapped so they compare in reverse
        down.include?(att) ? value.rev : value
      end
    end
  end

end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'transmembrane'
|
2
|
+
|
3
|
+
class Phobius ; end
|
4
|
+
|
5
|
+
# This class will probably change its interface some in the future
|
6
|
+
# That's the web portal
|
7
|
+
# http://phobius.sbc.su.se/
|
8
|
+
# How to run:
|
9
|
+
# Select output format as 'Short'
|
10
|
+
# then hit 'Submit Query'
|
11
|
+
|
12
|
+
# note: to implement some of the TransmembraneIndex features, the update_aaseq
|
13
|
+
# method must be called!
|
14
|
+
class Phobius::Index < Hash
|
15
|
+
include TransmembraneIndex
|
16
|
+
|
17
|
+
# fills this (initially empty) index from the Phobius output in file;
# the parsing itself is delegated to Phobius.default_index
def initialize(file)
  Phobius.default_index(file, self)
end
|
21
|
+
|
22
|
+
# we need to match whatever function phobius uses to generate identifiers if
# we want derivative processes to be fast and accurate
#
# returns nil for a nil reference and '' for an empty one; otherwise the text
# before the first space (the whole reference when there is none) with every
# double quote removed
def reference_to_key(reference)
  return nil unless reference
  return '' unless reference.size > 0

  first_space = reference.index(' ')
  token = first_space ? reference[0...first_space] : reference
  token.gsub('"','')
end
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
class Phobius
  include TransmembraneIndex

  # returns the default index: file is parsed with the :short parser and its
  # entries are added to index (a plain hash unless one is supplied)
  def self.default_index(file, index={})
    Phobius::Parser.new(:short).file_to_index(file, index)
  end

end
|
55
|
+
|
56
|
+
module Phobius::Parser

  # factory: returns a new parser object for parser_type (only :short is
  # recognized); raises ArgumentError for anything else
  def self.new(parser_type=:short)
    unless parser_type == :short
      raise ArgumentError, "don't recognize parser type: #{parser_type}"
    end
    Phobius::ParserShort.new
  end

  # opens file and passes the handle to to_index; returns to_index's result
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end

end
|
74
|
+
|
75
|
+
|
76
|
+
class Phobius::ParserShort
|
77
|
+
include Phobius::Parser
|
78
|
+
|
79
|
+
# takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
# and returns an array of hashes with the keys :start and :stop, one hash per
# "<i or o><digits>-<digits>" stretch, in order of appearance
def prediction_to_array(string)
  string.scan(/[io](\d+)-(\d+)/).map do |first, last|
    { :start => first.to_i, :stop => last.to_i }
  end
end
|
88
|
+
|
89
|
+
# returns a hash structure in this form: { identifier => {
# :num_certain_transmembrane_segments => Int,
# :signal_peptide => true (for 'Y'), false (for '0') or nil (anything else),
# :transmembrane_segments => [:start => Int, :stop => Int] } }
# (:transmembrane_segments is only present when the segment count is > 0)
# can parse io even if there is no header to key in on.
def to_index(io, index={})
  start_pos = io.pos

  # look for the header line within the first 11 lines; when there is none,
  # go back to where we started so that no record is skipped
  # NOTE(review): "SEQENCE" presumably mirrors the spelling in Phobius's own
  # header line -- confirm against real output before "correcting" it
  saw_header = false
  11.times do
    if io.gets =~ /SEQENCE/
      saw_header = true
      break
    end
  end
  io.pos = start_pos unless saw_header

  io.each do |line|
    fields = line.chomp.split(/\s+/)
    # a record is exactly four whitespace separated columns
    next if fields.size != 4
    (key, num_tms, signal_peptide, prediction) = fields
    num_tms = num_tms.to_i
    index[key] = {
      :num_certain_transmembrane_segments => num_tms,
      :signal_peptide => {'Y' => true, '0' => false}[signal_peptide],
    }
    if num_tms > 0
      index[key][:transmembrane_segments] = prediction_to_array(prediction)
    end
  end
  index
end
|
135
|
+
|
136
|
+
end
|
@@ -0,0 +1,368 @@
|
|
1
|
+
require 'transmem'
|
2
|
+
require 'xml_style_parser'
|
3
|
+
|
4
|
+
class TopPred ; end
|
5
|
+
|
6
|
+
|
7
|
+
class TopPred::Index < Hash
|
8
|
+
include TransmemIndex
|
9
|
+
|
10
|
+
# we need to match whatever function toppred uses to generate identifiers if
# we want derivative processes to be fast and accurate
#
# returns nil for a nil reference; otherwise the text before the first space
# (the whole reference when there is none) with every character other than
# 0-9, a-z and A-Z replaced by '_'
def reference_to_key(reference)
  return nil unless reference

  cut = reference.index(' ')
  head = cut ? reference[0...cut] : reference
  head.gsub(/[^0-9a-zA-Z]/,'_')
end
|
30
|
+
|
31
|
+
# fills this index from the toppred output in file via TopPred.default_index;
# :default is the only kind handled -- any other kind aborts the process
def initialize(file, kind=:default)
  abort "can't do #{kind}" unless kind == :default
  TopPred.default_index(file, self)
end
|
39
|
+
|
40
|
+
# This class will probably change its interface some in the future
|
41
|
+
# That's the web portal
|
42
|
+
# http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
|
43
|
+
# How to run:
|
44
|
+
# uncheck 'Produce hydrophobicity graph image (-g)'
|
45
|
+
# choose 'Xml' or 'New: new text' output
|
46
|
+
# type in your email, then hit 'Run toppred'
|
47
|
+
end
|
48
|
+
|
49
|
+
class TopPred
  include TransmemIndex

  # returns the default index: the parser is picked from the file's type
  # (TopPred::Parser.filetype) and the file's entries are added to index
  def self.default_index(file, index={})
    parser_type = TopPred::Parser.filetype(file)
    TopPred::Parser.new(parser_type).file_to_index(file, index)
  end

end
|
58
|
+
|
59
|
+
module TopPred::Parser
|
60
|
+
# returns :xml or :text, judged from the first line of file only
# (nil when that line looks like neither)
def self.filetype(file)
  first_line = File.open(file) {|fh| fh.gets }
  if first_line =~ /<\?xml version.*>/
    :xml
  elsif first_line =~ /Algorithm specific/
    :text
  end
end
|
73
|
+
|
74
|
+
# factory: parser_type = :xml or :text
# returns TopPred::Parser_XML.new or TopPred::Parser_Text.new respectively;
# any other type aborts the process
def self.new(parser_type=:xml)
  klass =
    if parser_type == :xml
      TopPred::Parser_XML
    elsif parser_type == :text
      TopPred::Parser_Text
    else
      abort "don't recognize parser type: #{parser_type}"
    end
  klass.new
end
|
87
|
+
|
88
|
+
# opens file and passes the handle to to_index; returns to_index's result
def file_to_index(file, index={})
  File.open(file) do |fh|
    to_index(fh, index)
  end
end
|
91
|
+
|
92
|
+
# Adds to every segment the stretch of aaseq (a string) that it covers.
# segments are either all arrays [prob, first, last] -- the subsequence is
# appended as a fourth element -- or all hashes with :start and :stop -- the
# subsequence is stored under :aaseq.  Which form is in use is decided from
# the first segment.  first/last (start/stop) are '1' indexed and inclusive.
# returns segments (the same, modified, objects)
def add_sequences_to_segments(segments, aaseq)
  if segments.first.is_a? Array
    segments.each do |seg|
      (_prob, first, last) = seg
      seg.push( aaseq[first - 1, (last - first) + 1] )
    end
  else
    segments.each do |seg|
      span = (seg[:stop] - seg[:start]) + 1
      seg[:aaseq] = aaseq[seg[:start] - 1, span]
    end
  end
  segments
end
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  # factory: asks XMLStyleParser.choose_parser for the parser class to use
  # and returns a new instance of it that remembers meth for #parse
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    # store the method name on the parser itself: a plain "@method = meth"
    # here would set it on this module, where #parse never sees it
    parser.instance_variable_set(:@method, meth)
    parser
  end

  # calls the method chosen at construction time with file
  def parse(file)
    send(@method, file)
  end
end
|
132
|
+
|
133
|
+
class TopPred::Parser_XML::DOM
|
134
|
+
include TopPred::Parser_XML
|
135
|
+
include XMLStyleParser
|
136
|
+
|
137
|
+
=begin
|
138
|
+
YAL010C:
|
139
|
+
num_putative_transmembrane_segments: 1
|
140
|
+
aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
|
141
|
+
best_structure_probability: 1.0
|
142
|
+
transmembrane_segments:
|
143
|
+
- aaseq: SLGAEFWLGLVSLSPGCSTTL
|
144
|
+
stop: 252
|
145
|
+
start: 232
|
146
|
+
probability: 1.0
|
147
|
+
num_certain_transmembrane_segments: 1
|
148
|
+
num_found: 2
|
149
|
+
=end
|
150
|
+
|
151
|
+
# should return a index
# Walks the <toppreds> document read from io and adds one entry per
# <toppred> to index, keyed by the 'id' attribute of its <sequence> node.
# Every entry gets :aaseq and :num_found; entries with num_found > 0 also get
# :num_putative_transmembrane_segments, :num_certain_transmembrane_segments
# and :transmembrane_segments (taken from the highest 'prob' topology).
# The process is aborted when the document does not have the expected shape.
def to_index(io, index = {})
  get_root_node_from_io(io) do |root_n|

    abort if root_n.name != 'toppreds'
    root_n.find('child::toppred').each do |toppred_n|
      record = {}
      sequence_n = toppred_n.find_first('child::sequence')
      index[sequence_n['id']] = record
      # the sequence text is stripped of all whitespace and checked against
      # the declared size
      record[:aaseq] = sequence_n.content.gsub(/[\s\n]/,'')
      abort if record[:aaseq].size != sequence_n['size'].to_i
      tmsummary_n = sequence_n.find_first('following-sibling::tmsummary')

      record[:num_found] = tmsummary_n['segments'].to_i
      next unless record[:num_found] > 0

      # any segment whose type is not 'certain' is counted as putative
      certain = 0
      putative = 0
      tmsummary_n.find('child::segment').each do |segment_n|
        abort if segment_n.name != 'segment'
        if 'certain' == segment_n['type']
          certain += 1
        else
          putative += 1
        end
      end
      record[:num_putative_transmembrane_segments] = putative
      record[:num_certain_transmembrane_segments] = certain

      topologies_n = tmsummary_n.next
      abort if topologies_n.name != 'topologies'
      # get the top probability topology:
      best_topology_n = topologies_n.find('child::topology').to_a.max {|a,b| a['prob'].to_f <=> b['prob'].to_f }
      tmsegments = []
      best_topology_n.find('child::tmsegment').each do |tmsegment_n|
        tmsegments << {
          :start => tmsegment_n['start'].to_i,
          :stop => tmsegment_n['stop'].to_i,
          ## WARNING! it appears the probability is broken on xml output!!
          :probability => tmsegment_n['prob'].to_f,
        }
      end
      add_sequences_to_segments(tmsegments, record[:aaseq])
      record[:transmembrane_segments] = tmsegments
    end
  end
  index
end
|
202
|
+
|
203
|
+
end
|
204
|
+
|
205
|
+
class TopPred::Parser_Text
|
206
|
+
include TopPred::Parser
|
207
|
+
|
208
|
+
|
209
|
+
# returns a hash structure in this form: {identifier => {aaseq => String,
# num_found: Int, num_certain_transmembrane_segments => Int,
# num_putative_transmembrane_segments => Int, best_structure_probability =>
# Float, transmembrane_segments => [probability => Float, start => Int, stop
# => Int, aaseq => String] } }
#
# a "Sequence : <identifier> (" line starts a new record; a
# "HEADER START STOP" line starts the structures belonging to the record
# most recently started
def to_index(io, index={})
  record = nil

  io.each do |line|
    case line
    when /^Sequence : (.*?) +\(/
      record = (index[$1.dup] = {})
      record[:aaseq] = read_aaseq(io)
      read_segment_summary(io, record)
    when /^HEADER\s+START\s+STOP/
      best = top_structure( read_structures(io) )
      record[:best_structure_probability] = best[:probability]
      segments = best[:tm]
      record[:transmembrane_segments] = segments
      add_sequences_to_segments(segments, record[:aaseq])
      # converted in place, so the record sees the hashes too
      segment_arrays_to_hashes(segments)
    end
  end
  index
end
|
234
|
+
|
235
|
+
private
|
236
|
+
|
237
|
+
# returns a list of all structures given a filehandle starting just after
# the first "HEADER START STOP ..." line
# after each structure one more line is read: reading goes on only while
# that line is another "HEADER START STOP ..." line (and stops at eof)
def read_structures(fh)
  structures = [ read_structure(fh) ]
  until fh.eof?
    break unless fh.readline =~ /^HEADER\s+START\s+STOP/
    structures << read_structure(fh)
  end
  structures
end
|
251
|
+
|
252
|
+
# returns a hash with key :probability and key :tm contains an array of
# arrays: [prob(Float), start(Int), stop(Int)]
# the probability is the third whitespace separated field of the first line
# read from fh; the segments are whatever read_segments finds after it
def read_structure(fh)
  first_line = fh.readline
  {
    :probability => first_line.split(/\s+/)[2].to_f,
    :tm => read_segments(fh),
  }
end
|
262
|
+
|
263
|
+
# returns an array of arrays of transmembrane segments: [prob(Float),
# start(Int), stop(Int)]
# one per line beginning with TRANSMEM (columns: label, start, stop, length,
# probability); other lines are skipped
# returns after seeing '//'
def read_segments(fh)
  segments = []
  fh.each do |line|
    if line =~ /^TRANSMEM/
      (_label, first, last, _len, prob) = line.split(/\s+/)[0,5]
      segments << [prob.to_f, first.to_i, last.to_i]
    elsif line =~ %r{//}
      break
    end
  end
  segments
end
|
279
|
+
|
280
|
+
# returns the top probability structure (first on tie)
# list is an array of hashes that each have a :probability
def top_structure(list)
  winner = list.first
  best = winner[:probability]
  list.drop(1).each do |candidate|
    # strictly greater, so that an earlier structure wins a tie
    next unless candidate[:probability] > best
    winner = candidate
    best = candidate[:probability]
  end
  winner
end
|
292
|
+
|
293
|
+
# concatenates the (chomped) lines read from fh up to, and not including,
# the first line that has neither a word character nor a '*' in it
# (that line is consumed); returns the concatenation
def read_aaseq(fh)
  aaseq = ''
  while (line = fh.gets)
    line.chomp!
    break unless line =~ /[\w\*]/
    aaseq << line
  end
  aaseq
end
|
304
|
+
|
305
|
+
# replaces, in place, each [probability, start, stop, aaseq] array of list
# with a hash using those four names as keys; returns list
def segment_arrays_to_hashes(list)
  list.map! do |(prob, first, last, seq)|
    { :probability => prob, :start => first, :stop => last, :aaseq => seq }
  end
end
|
314
|
+
|
315
|
+
# returns [certain, putative]
# expects first line to be a tm segment
# the last whitespace separated word of each line is looked at: 'Certain'
# and 'Putative' are counted, anything else is ignored, and the first line
# without any word at all ends the count
def num_certain_putative(fh)
  tally = { 'Certain' => 0, 'Putative' => 0 }
  fh.each do |line|
    certainty = line.chomp.split(/\s+/).last
    break unless certainty
    tally[certainty] += 1 if tally.key?(certainty)
  end
  [tally['Certain'], tally['Putative']]
end
|
331
|
+
|
332
|
+
# reads fh up to the segment summary and records it in rec:
#   "Found: <n> segments" sets rec[:num_found] (reading stops when n is 0)
#   a "Helix Begin" line is followed by the segment lines, whose counts are
#   stored under :num_certain_transmembrane_segments and
#   :num_putative_transmembrane_segments (reading then stops)
def read_segment_summary(fh, rec)
  fh.each do |line|
    case line
    when /Found: (.*?) segments/
      rec[:num_found] = $1.to_i
      break if rec[:num_found] == 0
    when /Helix\s+Begin/
      (certain, putative) = num_certain_putative(fh)
      rec[:num_certain_transmembrane_segments] = certain
      rec[:num_putative_transmembrane_segments] = putative
      break
    end
  end
end
|
345
|
+
end
|
346
|
+
|
347
|
+
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # parses io with XML::Parser and calls the block with the document's root
  # node; returns what the block returns (as the AXML variant does)
  def get_root_node_from_io(io, &block)
    # turn off warnings because this doesn't seem to work:
    # XML::Parser.default_load_external_dtd = false
    # (There is a warning about not finding DTD)
    xml_parser_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      block.call(XML::Parser.io(io).parse.root)
    ensure
      # reset the warning level of XML::Parser, even when parsing or the
      # block raised -- it is a global setting
      XML::Parser.default_warnings = xml_parser_warnings
    end
  end
end
|
361
|
+
|
362
|
+
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # parses io with ::AXML and calls the block with the result (used as the
  # root node); returns what the block returns
  def get_root_node_from_io(io, &block)
    block.call(::AXML.parse(io))
  end
end
|
368
|
+
|