rubst 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +32 -0
- data/PostInstall.txt +4 -0
- data/README.rdoc +56 -0
- data/Rakefile +31 -0
- data/bin/rubst +7 -0
- data/config/hoe.rb +1 -0
- data/config/website.yml +2 -0
- data/javascripts/rounded_corners_lite.inc.js +285 -0
- data/lib/enumerable_extensions.rb +11 -0
- data/lib/environment.rb +58 -0
- data/lib/environment_feature.rb +14 -0
- data/lib/narray_extensions.rb +21 -0
- data/lib/nmatrix_extensions.rb +26 -0
- data/lib/rubst.rb +6 -0
- data/lib/rubst/cli.rb +536 -0
- data/rubst.gemspec +52 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +71 -0
- data/test/test_enumerable_extensions.rb +16 -0
- data/test/test_environment_feature.rb +11 -0
- data/test/test_helper.rb +2 -0
- data/test/test_nmatrix_extensions.rb +16 -0
- data/test/test_rubst.rb +11 -0
- data/test/test_rubst_cli.rb +8 -0
- data/website/index.html +73 -0
- data/website/index.txt +38 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +165 -0
- metadata.gz.sig +0 -0
data/lib/environment.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "set"
|
3
|
+
require "narray"
|
4
|
+
require "facets"
|
5
|
+
|
6
|
+
# Per-environment accumulator of amino-acid statistics.
#
# An instance is identified by a +number+ and a +label+ and owns four
# 21-element NArray vectors (raw counts, probabilities, log-odds and smoothed
# probabilities), one slot per entry of AMINO_ACIDS.
class Environment

  # One-letter residue codes; the position of a code in this list is its
  # index in every 21-element array held by an Environment.
  AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWYJ".split("").freeze

  attr_accessor :number,
                :label,
                :freq_array,
                :prob_array,
                :logodd_array,
                :smooth_prob_array

  # number:: identifier of this environment
  # label::  label string; label_set reads it one character at a time
  def initialize(number, label)
    @number = number
    @label = label
    @freq_array = NArray.int(21)
    @prob_array = NArray.float(21)
    @logodd_array = NArray.float(21)
    @smooth_prob_array = NArray.float(21)
  end

  # Adds +inc+ (default 1) to the count of residue +a+ (case-insensitive).
  #
  # Returns the updated count.
  # Raises ArgumentError when +a+ is not one of AMINO_ACIDS; previously this
  # surfaced as a TypeError from indexing the array with nil.
  def add_residue_count(a, inc = 1)
    idx = AMINO_ACIDS.index(a.upcase)
    raise ArgumentError, "unknown amino acid: #{a.inspect}" if idx.nil?

    @freq_array[idx] += inc
  end

  # Returns a Set with one "<position><character>" string per character of
  # the label, e.g. "AH" => #<Set: {"0A", "1H"}>.
  def label_set
    label.split("").each_with_index.map { |l, i| "#{i}#{l}" }.to_set
  end

  # Returns "<number>-<label>".
  def to_s
    "#{number}-#{label}"
  end
end
|
38
|
+
|
39
|
+
# Self-test: the test case below is only defined when this file is the
# program being run, not when it is required as a library.
if __FILE__ == $0

  require "test/unit"

  class TestEnvironment < Test::Unit::TestCase

    # Fresh environment numbered 1 with a six-character label for each test.
    def setup
      @env = Environment.new(1, "AHaSon")
    end

    # label_set prefixes each label character with its position.
    def test_label_set
      expected = %w[0A 1H 2a 3S 4o 5n].to_set
      assert_equal(expected, @env.label_set)
    end

    # to_s joins number and label with a dash.
    def test_to_s
      expected = "1-AHaSon"
      assert_equal(expected, @env.to_s)
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "facets"
|
3
|
+
|
4
|
+
# Plain-text table rendering for one-dimensional numeric vectors.
# The receiver only needs to respond to +to_a+.
module NArrayExtensions

  # Renders the receiver as a two-line table: a header line followed by one
  # row of values.
  #
  # opts[:col_header]:: list of column captions, each right-aligned in 7
  #                     characters (default: none)
  # opts[:row_header]:: caption of the value row, left-aligned in 3
  #                     characters (default: blank)
  #
  # Float elements are printed as "%7.2f", everything else as "%7d".
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # The original merged the defaults into a throw-away hash, so they were
    # never applied and a missing :col_header raised NoMethodError on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    header = (opts[:col_header] || []).inject("%-3s" % "#") { |s, a| s + ("%7s" % a) }

    row = self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
      s + (v.is_a?(Float) ? ("%7.2f" % v) : ("%7d" % v))
    }

    header + "\n" + row
  end
end
|
20
|
+
|
21
|
+
# Mix NArrayExtensions into NArray so its instances gain #pretty_string.
NArray.send(:include, NArrayExtensions)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "facets"
|
3
|
+
require "narray"
|
4
|
+
|
5
|
+
# Plain-text table rendering for two-dimensional numeric matrices.
# The receiver only needs +to_a+ to return an array of row arrays.
module NMatrixExtensions

  # Renders the receiver as a table: one header line followed by one line
  # per row of +to_a+.
  #
  # opts[:col_header]:: list of column captions, each right-aligned in 7
  #                     characters (default: none)
  # opts[:row_header]:: list of row captions, indexed by row number and
  #                     left-aligned in 3 characters (default: blank)
  #
  # Float elements are printed as "%7.2f", everything else as "%7d".
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # The original merged the defaults into a throw-away hash, so they were
    # never applied and missing headers raised NoMethodError on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)
    col_header = opts[:col_header] || []
    row_header = opts[:row_header] || []

    header = col_header.inject("%-3s" % "#") { |s, a| s + ("%7s" % a) }

    # each_with_index.map is the core-Ruby equivalent of facets' map_with_index.
    rows = self.to_a.each_with_index.map { |a, i|
      a.inject("%-3s" % row_header[i]) { |s, v|
        s + (v.is_a?(Float) ? ("%7.2f" % v) : ("%7d" % v))
      }
    }

    header + "\n" + rows.join("\n")
  end
end
|
25
|
+
|
26
|
+
# Mix NMatrixExtensions into NMatrix so its instances gain #pretty_string.
NMatrix.send(:include, NMatrixExtensions)
|
data/lib/rubst.rb
ADDED
data/lib/rubst/cli.rb
ADDED
@@ -0,0 +1,536 @@
|
|
1
|
+
require "getoptlong"
|
2
|
+
require "rdoc/usage"
|
3
|
+
require "logger"
|
4
|
+
require "rubygems"
|
5
|
+
require "narray"
|
6
|
+
require "bio"
|
7
|
+
require "set"
|
8
|
+
require "facets"
|
9
|
+
|
10
|
+
require "narray_extensions"
|
11
|
+
require "nmatrix_extensions"
|
12
|
+
require "enumerable_extensions"
|
13
|
+
require "environment_feature"
|
14
|
+
require "environment"
|
15
|
+
|
16
|
+
module Rubst
|
17
|
+
class CLI
|
18
|
+
|
19
|
+
# Writes the value of Rubst::VERSION, followed by a newline, to standard output.
def self.print_version
  version = Rubst::VERSION
  $stdout.puts(version)
end
|
22
|
+
|
23
|
+
# Prints the command-line synopsis and the option reference to standard
# output.
#
# The text is meant to describe the options declared in +execute+: the output
# selector is spelled "--output" (the previous text showed "-output", which
# the option parser rejects), and two spelling mistakes in the descriptions
# are corrected.
def self.print_usage
  puts <<-USAGE

rubst [ options ] -f TEM-file
or
rubst [ options ] -l TEMLIST-file

Available options:

-h, --help
show help
-f, --tem-file FILE
a tem file
-l, --tem-list FILE
a list for tem files
-o, --outfile FILE
output filename ("allmat.dat" if not specified)
--weight INTEGER (PID)
clustering level (PID) for the BLOSUM-like weighting (not supported yet)
--noweight
calculate substitution counts with no weights (default)
-c, --classdef FILE
a file for the definition of environments (default: 'classdef.dat')
-y, --cys INTEGER
0 for using C and J only for structure
1 for both structure and sequence (default)
--output INTEGER
0 for raw counts (no-smoothing performed)
1 for probabilities
2 for log-odds (default)
--scale INTEGER
log-odds matrices in 1/n bit units (default 3)
--sigma DOUBLE
change the sigma value for smoothing (default 5)
--add DOUBLE
add this value to raw counts when deriving log-odds without smoothing
(default 1/#classes)
--penv
use environment-dependent frequencies for log-odds calculation (default false)
--pidmin DOUBLE
count substitutions only for pairs with PID equal to or greater than this value (default none)
--pidmax DOUBLE
count substitutions only for pairs with PID smaller than this value (default none)
--analysis
analyze structural environments (also -D)
-v, --verbose INTEGER
0 for WARN level (default)
1 for INFO level or more severe
2 for DEBUG level or more severe
--version
print version

  USAGE
end
|
77
|
+
|
78
|
+
# Entry point of the rubst command-line tool.
#
# Steps, in order:
# 1. parse the command-line options (GetoptLong reads them from ARGV),
# 2. read the environment class definition file,
# 3. read one TEM file (-f) or every TEM file named in a list file (-l) and
#    count residue substitutions per environment,
# 4. derive amino-acid frequencies/mutabilities and smoothed probabilities,
# 5. print one smoothed probability matrix per environment group and the
#    averaged total matrix to standard output.
#
# arguments:: kept for interface compatibility; it is not read because
#             GetoptLong consumes ARGV directly.
#
# Terminates the process with status 0 after --help or --version and with
# status 1 when the invocation is invalid (stray arguments, or not exactly
# one of -f / -l). Intermediate and final state is stored in global variables.
#
# Fixes relative to the previous revision:
# * '--cys' is now handled (the branch was spelled '--cyc' and never ran);
# * defaults for --classdef, --output and -o/--outfile apply when the option
#   is absent, and the logger starts at WARN as documented in the usage text;
# * --help / --version stop after printing; usage errors exit non-zero;
# * a single TEM file given with -f is processed (all processing used to be
#   nested under the -l branch);
# * each TEM file is parsed once and its handle is closed;
# * residues that never mutate or never occur no longer cause nil arithmetic;
# * the "not standard amino acid" message for the target names key2.
def self.execute(arguments=[])
  #
  # Abbreviations in the source codes
  #
  # * env: environment
  # * tem: (FUGUE) template
  # * classdef: (environment) class definition
  # * aa: amino acid
  # * tot: total
  # * rel: relative
  # * obs: observation (frequency)
  # * mut: mutation
  # * mutb: mutability
  # * freq: frequency
  # * prob: probability
  # * opts: options

  #
  # Global variables & defaults
  #
  $logger = Logger.new(STDOUT)
  $logger.level = Logger::WARN # documented default for --verbose
  $amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("")
  $tem_list = nil
  $tem_file = nil
  $classdef = 'classdef.dat'
  $outfile = 'allmat.dat' # NOTE(review): parsed but not used below; results go to stdout
  $format = 2             # NOTE(review): parsed but not used below
  $cysteine = false       # value the '--cys' branch yields for the documented default (1);
                          # NOTE(review): not read anywhere in this method
  $aa_tot_obs = {}
  $aa_mut_obs = {}
  $aa_mutb = {}
  $aa_rel_mutb = {}
  $aa_rel_freq = {}
  $tot_aa = 0
  $sigma = 5.0
  $smooth_prob = {}

  #
  # Options parsing
  #
  opts = GetoptLong.new(
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
    [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--version', GetoptLong::NO_ARGUMENT ]
  )

  opts.each do |opt, arg|
    case opt
    when '--help'
      print_usage
      exit 0
    when '--tem-list'
      $tem_list = arg
    when '--tem-file'
      $tem_file = arg
    when '--classdef'
      $classdef = arg
    when '--output'
      $format = arg.to_i
    when '--outfile'
      $outfile = arg
    when '--cys'
      $cysteine = (arg.to_i != 1)
    when '--verbose'
      $logger.level = case arg.to_i
                      when 0 then Logger::WARN
                      when 1 then Logger::INFO
                      when 2 then Logger::DEBUG
                      else Logger::INFO
                      end
    when '--version'
      print_version
      exit 0
    end
  end

  # Exactly one of -l / -f must be given and nothing may be left in ARGV.
  if ARGV.length != 0 ||
     (!$tem_list && !$tem_file) ||
     ($tem_list && $tem_file)
    print_usage
    exit 1
  end

  #
  # Reading Environment Class Definition File
  #
  # The first feature is always the amino-acid ("sequence") feature; the code
  # below relies on it sitting at index 0.
  $env_features = []
  $env_features << EnvironmentFeature.new("sequence",
                                          $amino_acids,
                                          $amino_acids,
                                          "F",
                                          "F")

  IO.foreach($classdef) do |l|
    next if l =~ /^#/
    if (env_ftr = l.chomp.split(/;/)).length == 5
      $logger.info ">>> An environment feature, #{l.chomp} detected"
      if env_ftr[-1] == "T"
        $logger.warn "!!! The environment feature, #{l.chomp} silent"
        next
      end
      if env_ftr[-2] == "T"
        $logger.warn "!!! The environment feature, #{l.chomp} constrained"
      end
      $env_features << EnvironmentFeature.new(env_ftr[0],
                                              env_ftr[1].split(""),
                                              env_ftr[2].split(""),
                                              env_ftr[3],
                                              env_ftr[4])
    end
  end

  # One Environment per combination of feature labels, keyed by the
  # concatenated labels. [e].flatten also covers the single-feature case, in
  # which the elements are plain strings rather than nested arrays.
  $envs = {}
  $env_features.map { |ec| ec.labels }.inject { |pro, lb|
    pro.product(lb)
  }.each_with_index { |e, i|
    env_label = [e].flatten.join
    $envs[env_label] = Environment.new(i, env_label)
  }

  #
  # Reading TEM file(s)
  #
  tem_files = if $tem_list
                IO.readlines($tem_list).map { |l| l.chomp }.reject { |l| l.empty? }
              else
                [$tem_file]
              end

  tem_files.each do |tem_file|
    $logger.debug ">>> Processing #{tem_file} ..."

    # Parse the file once and release the handle; the entries are scanned
    # several times below.
    entries = []
    ff = Bio::FlatFile.auto(tem_file)
    begin
      ff.each_entry { |pir| entries << pir }
    ensure
      ff.close
    end

    ali = Bio::Alignment::OriginalAlignment.new
    entries.each do |pir|
      if pir.definition == "sequence"
        ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
      end
    end

    env_labels = {}
    disulphide = {}

    ali.each_pair do |key, _seq|
      # check disulphide bond environment first!
      entries.each do |pir|
        if (pir.entry_id == key) && (pir.definition == "disulphide")
          disulphide[key] = pir.data.gsub("\n", "").split("")
        end
      end

      $env_features.each_with_index do |ec, ei|
        env_labels[key] = [] unless env_labels.has_key?(key)

        entries.each do |pir|
          next unless (pir.entry_id == key) && (pir.definition == ec.name)

          labels = pir.data.gsub("\n", "").split("").each_with_index.map { |sym, pos|
            if sym == "-"
              "-"
            elsif sym == "X" || sym == "x"
              "X"
            elsif ei == 0 # Amino Acid Environment Feature
              # a "C" whose disulphide flag is "F" is relabelled "J"
              ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
            else
              ec.labels[ec.symbols.index(sym)]
            end
          }

          # per-position label = concatenation of all feature labels
          if env_labels[key].empty?
            env_labels[key] = labels
          else
            env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
          end
        end
      end
    end

    # Count substitutions over every ordered pair of distinct sequences.
    ali.each_pair do |key1, seq1|
      ali.each_pair do |key2, seq2|
        next if key1 == key2

        s1 = seq1.split("")
        s2 = seq2.split("")
        s1.each_with_index do |source, pos|
          if env_labels[key1][pos].include?("X")
            $logger.info ">>> Substitutions from #{key1}-#{pos}-#{source} were masked"
            next
          end

          source.upcase!
          target = s2[pos].upcase

          if !$amino_acids.include?(source)
            $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid"
            next
          end

          if !$amino_acids.include?(target)
            $logger.info "!!! #{key2}-#{pos}-#{target} is not standard amino acid"
            next
          end

          source = (((disulphide[key1][pos] == "F") && (source == "C")) ? "J" : source)
          target = (((disulphide[key2][pos] == "F") && (target == "C")) ? "J" : target)

          $envs[env_labels[key1][pos]].add_residue_count(target)

          $aa_tot_obs[source] = ($aa_tot_obs[source] || 0) + 1
          if source != target
            $aa_mut_obs[source] = ($aa_mut_obs[source] || 0) + 1
          end
          $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}"
        end
      end
    end
  end

  # #
  # # Frequency matrix
  # #
  # $envs.values.sort_by { |v| v.number }.each do |env|
  #   puts ">#{env.label} #{env.number}"
  #   puts env.freq_array.pretty_string(:col_header => $amino_acids,
  #                                     :row_header => "Prb")
  # end

  $tot_freq_matrix = NMatrix.int(21,21)

  # for each combination of environment features
  # (environments are grouped by their label minus the leading amino-acid character)
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
  env_groups.each_pair do |label, group|
    $grp_freq_matrix = NMatrix.int(21,21)

    $amino_acids.each_with_index do |aa, ai|
      freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array
      0.upto($grp_freq_matrix.shape[1] - 1) do |j|
        $grp_freq_matrix[ai, j] = freq_array[j]
      end
    end

    $tot_freq_matrix += $grp_freq_matrix

    # puts ">#{label}"
    # puts $grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
  end

  # # for total
  # puts ">Total"
  # puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)

  #
  # Amino Acid Frequencies and Mutabilities
  #
  # NOTE(review): still assumes residue "A" was observed at least once.
  ala_factor = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
  $tot_aa = $aa_tot_obs.values.inject(0) { |s, v| s + v }

  # puts ">Total amino acid frequencies"
  # puts "%-3s %8s %8s %8s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUT REL_MUT REL_FRQ]

  $aa_tot_obs.each_pair do |res, freq|
    # a residue that was observed but never substituted has no entry in $aa_mut_obs
    $aa_mutb[res] = ($aa_mut_obs[res] || 0) / freq.to_f
    $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
    $aa_rel_freq[res] = freq / $tot_aa.to_f

    # puts "%-3s %8d %8d %8.2f %8d %8.4f" % [res, $aa_mut_obs[res], freq, $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
  end

  #
  # Probability matrices
  #
  # NOTE(review): tot_prob_matrix is only consumed by the commented-out
  # output below.
  tot_prob_matrix = NMatrix.float(21, 21)

  0.upto($tot_freq_matrix.shape[0] - 1) do |i|
    col_sum = 0
    0.upto($tot_freq_matrix.shape[1] - 1) do |j|
      col_sum += $tot_freq_matrix[i, j]
    end
    0.upto($tot_freq_matrix.shape[1] - 1) do |k|
      # normalized substitutions probabilities with mutability
      #tot_prob_matrix[i, k] = $aa_rel_mutb[$amino_acids[k]] * $tot_freq_matrix[i,k] / col_sum.to_f

      # raw substitution probabilities with just frequencies
      tot_prob_matrix[i, k] = 100 * $tot_freq_matrix[i,k] / col_sum.to_f
    end
  end

  # puts ">Total probability"
  # puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)

  # # a new way of getting probability matrix
  # new_tot_prob_matrix = NMatrix.float(21, 21)
  #
  # 0.upto($tot_freq_matrix.shape[0] - 1) do |i|
  #   col_sum = 0
  #   0.upto($tot_freq_matrix.shape[1] - 1) do |j|
  #     col_sum += ((1 - $aa_mutb[$amino_acids[j]]) / $aa_rel_freq[$amino_acids[j]]) * $tot_freq_matrix[i, j]
  #   end
  #   0.upto($tot_freq_matrix.shape[1] - 1) do |k|
  #     new_tot_prob_matrix[i, k] = 100 * ((1 - $aa_mutb[$amino_acids[k]]) / $aa_rel_freq[$amino_acids[k]]) * $tot_freq_matrix[i, k] / col_sum
  #   end
  # end

  #
  # p1 probability
  #
  p1 = NArray.float(21)
  a0 = NArray.float(21).fill(1 / 21.0)
  big_N = $tot_aa
  small_n = 21
  omega1 = 1.0 / (1 + big_N.to_f / ($sigma * small_n.to_f))
  omega2 = 1.0 - omega1

  # residues that never occurred contribute a relative frequency of 0.0
  0.upto(p1.shape[0] - 1) do |i|
    p1[i] = 100.0 * (omega1 * a0[i] + omega2 * ($aa_rel_freq[$amino_acids[i]] || 0.0))
  end
  $smooth_prob[1] = p1

  # puts "P1 probability matrix"
  # puts p1.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
  # puts p1.sum

  #
  # p2 and above
  #
  # Each label is tagged with the index of its feature, e.g. "0A", "1H".
  env_labels = $env_features.each_with_index.map { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }

  1.upto($env_features.size) do |ci|
    env_labels.combination(ci) do |c1|
      Enumerable.cart_prod(*c1).each do |labels|
        # "." at every feature position not fixed by this label combination
        pattern = "." * $env_features.size
        labels.each do |label|
          j = label[0].chr.to_i
          l = label[1].chr
          pattern[j] = l
        end

        # get environments, frequencies, and probabilities
        envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
        freq_arr = envs.inject(NArray.int(21)) { |sum, env| sum + env.freq_array }

        # if freq_arr.sum == 0
        #   $logger.warn "!!! Environment combination, #{labels.to_set} has no frequency"
        #
        #   # store smoothed probabilities in a hash using a set of environment labels as a key
        #   smooth_prob_arr = NArray.float(21).fill(0.0)
        #   if !$smooth_prob.has_key?(ci + 1)
        #     $smooth_prob[ci + 1] = {}
        #     $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
        #   else
        #     $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
        #   end
        #
        #   # print smoothed probabilities
        #   puts "P#{ci + 1} probability for the combination of environments, #{labels.join}"
        #   puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
        #   puts smooth_prob_arr.sum
        #   next
        # end

        freq_total = freq_arr.sum.to_f
        prob_arr = NArray.float(21)
        0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_total }

        # collect priors
        priors = []
        if ci > 1
          labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
        else
          priors << $smooth_prob[1]
        end

        # averaging priors... have a look at Entropy based normalization!
        #pri_avg = priors.sum / priors.size

        # entropy based weighting priors
        # NOTE(review): the priors are stored scaled by 100 (see p1 and the
        # normalization step below) while prob_arr holds fractions in 0..1;
        # confirm that the entropy term and the smoothing step are meant to
        # combine values on these two scales.
        entropy_max = Math::log(21)
        entropies = priors.map do |prior|
          (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
        end
        entropy_total = entropies.inject(0.0) { |s, e| s + e }
        pri_avg = priors.each_with_index.map { |p, i|
          p * entropies[i] / entropy_total
        }.inject { |s, w| s + w }

        smooth_prob_arr = NArray.float(21)
        big_N = freq_total
        small_n = 21.0
        omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
        omega2 = 1.0 - omega1

        # smoothing step
        0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * pri_avg[i] + omega2 * prob_arr[i]) }

        # normalization step
        smooth_prob_arr_sum = smooth_prob_arr.sum
        0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

        # store smoothed probabilities in a hash using a set of environment labels as a key
        $smooth_prob[ci + 1] ||= {}
        $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr

        # # print smoothed probabilities
        # puts "P#{ci + 1} probability for the combination of environments, #{labels.join}"
        # puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
        # puts smooth_prob_arr.sum
      end
    end
  end

  # summarizing ...
  $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
  # $envs.values.each do |e|
  #   puts ">" + e.label
  #   puts e.smooth_prob_array.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
  # end

  tot_smooth_prob_matrix = NMatrix.float(21,21)

  # for each combination of environment features
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
  env_groups.sort.each do |label, group|
    grp_prob_matrix = NMatrix.float(21,21)

    $amino_acids.each_with_index do |aa, ai|
      smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array
      0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
    end

    tot_smooth_prob_matrix += grp_prob_matrix

    puts ">#{label}"
    puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
  end

  tot_smooth_prob_matrix /= env_groups.size

  # for total
  puts ">Total Probability"
  puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
end
|
535
|
+
end
|
536
|
+
end
|