rubst 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +32 -0
- data/PostInstall.txt +4 -0
- data/README.rdoc +56 -0
- data/Rakefile +31 -0
- data/bin/rubst +7 -0
- data/config/hoe.rb +1 -0
- data/config/website.yml +2 -0
- data/javascripts/rounded_corners_lite.inc.js +285 -0
- data/lib/enumerable_extensions.rb +11 -0
- data/lib/environment.rb +58 -0
- data/lib/environment_feature.rb +14 -0
- data/lib/narray_extensions.rb +21 -0
- data/lib/nmatrix_extensions.rb +26 -0
- data/lib/rubst.rb +6 -0
- data/lib/rubst/cli.rb +536 -0
- data/rubst.gemspec +52 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +71 -0
- data/test/test_enumerable_extensions.rb +16 -0
- data/test/test_environment_feature.rb +11 -0
- data/test/test_helper.rb +2 -0
- data/test/test_nmatrix_extensions.rb +16 -0
- data/test/test_rubst.rb +11 -0
- data/test/test_rubst_cli.rb +8 -0
- data/website/index.html +73 -0
- data/website/index.txt +38 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +165 -0
- metadata.gz.sig +0 -0
data/lib/environment.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "set"
|
3
|
+
require "narray"
|
4
|
+
require "facets"
|
5
|
+
|
6
|
+
# A structural environment: one combination of environment-feature labels
# (e.g. "AHaSon") together with the per-residue statistics collected for it.
#
# Every array held by an instance has 21 slots, one per entry of AMINO_ACIDS
# and in the same order.
class Environment

  # One-letter residue codes; the position of a code in this list is its
  # index in freq_array, prob_array, logodd_array and smooth_prob_array.
  AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWYJ".split("").freeze

  attr_accessor :number,
                :label,
                :freq_array,
                :prob_array,
                :logodd_array,
                :smooth_prob_array

  # number:: identifier of this environment
  # label::  string of feature labels, one character per environment feature
  def initialize(number, label)
    @number             = number
    @label              = label
    @freq_array         = NArray.int(21)
    @prob_array         = NArray.float(21)
    @logodd_array       = NArray.float(21)
    @smooth_prob_array  = NArray.float(21)
  end

  # Adds +inc+ (default 1) to the count of residue +a+ (case-insensitive).
  #
  # Raises ArgumentError when +a+ is not one of AMINO_ACIDS. Without this
  # check the lookup yields nil and the counts would be indexed with nil.
  # NOTE(review): NArray appears to treat a nil index as "every element",
  # which would silently bump all 21 counts - confirm against NArray docs.
  def add_residue_count(a, inc = 1)
    index = AMINO_ACIDS.index(a.to_s.upcase)
    raise ArgumentError, "unknown residue: #{a.inspect}" if index.nil?

    @freq_array[index] += inc
  end

  # Returns the label as a Set of "<position><character>" strings,
  # e.g. "AH" => #<Set: {"0A", "1H"}>.
  def label_set
    label.split("").each_with_index.map { |l, i| "#{i}#{l}" }.to_set
  end

  # "<number>-<label>"
  def to_s
    "#{number}-#{label}"
  end
end
|
38
|
+
|
39
|
+
# Self-test, executed only when this file is run directly as a script.
if __FILE__ == $0

  require "test/unit"

  # Checks the two derived representations of an Environment.
  class TestEnvironment < Test::Unit::TestCase

    def setup
      @env = Environment.new(1, "AHaSon")
    end

    # Each label character is prefixed with its position in the label.
    def test_label_set
      expected = Set.new(%w[0A 1H 2a 3S 4o 5n])
      assert_equal(expected, @env.label_set)
    end

    # The string form joins number and label with a dash.
    def test_to_s
      expected = "1-AHaSon"
      assert_equal(expected, @env.to_s)
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "facets"
|
3
|
+
|
4
|
+
# Pretty-printing helper for one-dimensional arrays. The includer only has
# to respond to +to_a+ with a flat list of numbers.
module NArrayExtensions

  # Renders the receiver as a two-line, fixed-width table.
  #
  # opts[:col_header]:: enumerable of column titles (each printed with "%7s");
  #                     omitted or nil prints no titles
  # opts[:row_header]:: title of the single data row (printed with "%-3s");
  #                     omitted or nil prints three blanks
  #
  # Floats are printed with "%7.2f", everything else with "%7d".
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # Non-destructive merge: the caller's hash is left alone and the
    # defaults are actually applied to the local copy.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    titles = (opts[:col_header] || []).map { |title| "%7s" % title }.join
    cells  = to_a.map { |v| v.is_a?(Float) ? ("%7.2f" % v) : ("%7d" % v) }.join

    ("%-3s" % "#") + titles + "\n" + ("%-3s" % opts[:row_header]) + cells
  end
end
|
20
|
+
|
21
|
+
# Mix NArrayExtensions into NArray so that NArray instances respond to
# pretty_string (send is used to call include from outside the class body).
NArray.send(:include, NArrayExtensions)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "facets"
|
3
|
+
require "narray"
|
4
|
+
|
5
|
+
# Pretty-printing helper for two-dimensional arrays. The includer only has
# to respond to +to_a+ with a list of rows, each row a list of numbers.
module NMatrixExtensions

  # Renders the receiver as a fixed-width table, one text line per row.
  #
  # opts[:col_header]:: enumerable of column titles (each printed with "%7s");
  #                     omitted or nil prints no titles
  # opts[:row_header]:: row titles, looked up by row index (printed with
  #                     "%-3s"); omitted, nil or too short prints blanks
  #
  # Floats are printed with "%7.2f", everything else with "%7d".
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # Non-destructive merge: the caller's hash is left alone and the
    # defaults are actually applied to the local copy.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    row_titles = opts[:row_header] || []
    titles     = (opts[:col_header] || []).map { |title| "%7s" % title }.join

    lines = to_a.each_with_index.map do |row, i|
      cells = row.map { |v| v.is_a?(Float) ? ("%7.2f" % v) : ("%7d" % v) }.join
      ("%-3s" % row_titles[i]) + cells
    end

    ("%-3s" % "#") + titles + "\n" + lines.join("\n")
  end
end
|
25
|
+
|
26
|
+
# Mix NMatrixExtensions into NMatrix so that NMatrix instances respond to
# pretty_string (send is used to call include from outside the class body).
NMatrix.send(:include, NMatrixExtensions)
|
data/lib/rubst.rb
ADDED
data/lib/rubst/cli.rb
ADDED
@@ -0,0 +1,536 @@
|
|
1
|
+
require "getoptlong"
|
2
|
+
require "rdoc/usage"
|
3
|
+
require "logger"
|
4
|
+
require "rubygems"
|
5
|
+
require "narray"
|
6
|
+
require "bio"
|
7
|
+
require "set"
|
8
|
+
require "facets"
|
9
|
+
|
10
|
+
require "narray_extensions"
|
11
|
+
require "nmatrix_extensions"
|
12
|
+
require "enumerable_extensions"
|
13
|
+
require "environment_feature"
|
14
|
+
require "environment"
|
15
|
+
|
16
|
+
module Rubst
|
17
|
+
class CLI
|
18
|
+
|
19
|
+
# Writes the gem version (Rubst::VERSION) to standard output,
# followed by a newline.
def self.print_version
  $stdout.puts(Rubst::VERSION)
end
|
22
|
+
|
23
|
+
# Prints the command-line help text to standard output.
#
# The text lists every option the tool is meant to offer; spelling of the
# options shown here must match the names registered with GetoptLong in
# +execute+ (e.g. "--output", with two dashes).
def self.print_usage
  puts <<-USAGE

rubst [ options ] -f TEM-file
    or
rubst [ options ] -l TEMLIST-file

Available options:

    -h, --help
        show help
    -f, --tem-file FILE
        a tem file
    -l, --tem-list FILE
        a list for tem files
    -o, --outfile FILE
        output filename ("allmat.dat" if not specified)
    --weight INTEGER (PID)
        clustering level (PID) for the BLOSUM-like weighting (not supported yet)
    --noweight
        calculate substitution counts with no weights (default)
    -c, --classdef FILE
        a file for the definition of environments (default: 'classdef.dat')
    -y, --cys INTEGER
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER
        log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE
        change the sigma value for smoothing (default 5)
    --add DOUBLE
        add this value to raw counts when deriving log-odds without smoothing
        (default 1/#classes)
    --penv
        use environment-dependent frequencies for log-odds calculation (default false)
    --pidmin DOUBLE
        count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE
        count substitutions only for pairs with PID smaller than this value (default none)
    --analysis
        analyze structural environments (also -D)
    -v, --verbose INTEGER
        0 for WARN level (default)
        1 for INFO level or more severe
        2 for DEBUG level or more severe
    --version
        print version

  USAGE
end
|
77
|
+
|
78
|
+
# Runs the whole rubst pipeline:
#
# 1. parse the command line,
# 2. read the environment class definition file and enumerate every
#    combination of feature labels as an Environment,
# 3. read the TEM file(s) and count, for every ordered pair of aligned
#    sequences, which residue replaces which in which environment,
# 4. derive residue frequencies / mutabilities and smoothed substitution
#    probabilities,
# 5. print one probability table per environment group and a total table.
#
# The results are kept in global variables ($envs, $smooth_prob, ...), as in
# the rest of this file.
#
# NOTE: options are parsed by GetoptLong, which reads ARGV directly; the
# +arguments+ parameter is accepted for interface compatibility but is not
# consulted.
def self.execute(arguments=[])
  #
  # Abbreviations in the source codes
  #
  # * env:      environment
  # * tem:      (FUGUE) template
  # * classdef: (environment) class definition
  # * aa:       amino acid
  # * tot:      total
  # * rel:      relative
  # * obs:      observation (frequency)
  # * mut:      mutation
  # * mutb:     mutability
  # * freq:     frequency
  # * prob:     probability
  # * opts:     options

  #
  # Global variables
  #
  $logger       = Logger.new(STDOUT)
  $amino_acids  = "ACDEFGHIKLMNPQRSTVWYJ".split("")
  $tem_list     = nil
  $tem_file     = nil
  # Defaults as documented in print_usage. They have to be set here: the
  # option handlers below only run when the option is actually given.
  $classdef     = 'classdef.dat'
  $outfile      = 'allmat.dat'
  $format       = 2
  # Counters default to 0 so that residues that were never seen / never
  # mutated do not produce nil in the arithmetic further down.
  $aa_tot_obs   = Hash.new(0)
  $aa_mut_obs   = Hash.new(0)
  $aa_mutb      = {}
  $aa_rel_mutb  = {}
  $aa_rel_freq  = {}
  $tot_aa       = 0
  $sigma        = 5.0
  $smooth_prob  = {}

  #
  # Options parsing
  #
  opts = GetoptLong.new(
    [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
    [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
    [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--version',        GetoptLong::NO_ARGUMENT ]
  )

  opts.each do |opt, arg|
    case opt
    when '--help'
      print_usage
      exit 0
    when '--tem-list'
      $tem_list = arg
    when '--tem-file'
      $tem_file = arg
    when '--classdef'
      $classdef = arg
    when '--output'
      $format = arg.to_i
    when '--outfile'
      $outfile = arg
    when '--cys'
      # NOTE(review): 1 maps to false and anything else to true; $cysteine is
      # not read anywhere in this method - confirm the intended meaning.
      $cysteine = (arg.to_i == 1 ? false : true)
    when '--verbose'
      $logger.level = case arg.to_i
                      when 0 then Logger::WARN
                      when 1 then Logger::INFO
                      when 2 then Logger::DEBUG
                      else Logger::INFO
                      end
    when '--version'
      print_version
      exit 0
    end
  end

  # Exactly one of -l / -f must be given and nothing may be left over.
  if !ARGV.empty? || (!$tem_list && !$tem_file) || ($tem_list && $tem_file)
    print_usage
    exit 0
  end

  #
  # Reading Environment Class Definition File
  #
  # The residue itself is always the first environment feature.
  $env_features = []
  $env_features << EnvironmentFeature.new("sequence",
                                          $amino_acids,
                                          $amino_acids,
                                          "F",
                                          "F")

  IO.foreach($classdef) do |line|
    next if line =~ /^#/

    definition = line.chomp
    env_ftr    = definition.split(/;/)
    # Only lines with exactly five ';'-separated fields define a feature.
    next unless env_ftr.length == 5

    $logger.info ">>> An environment feature, #{definition} detected"
    if env_ftr[-1] == "T"
      $logger.warn "!!! The environment feature, #{definition} silent"
      next
    end
    if env_ftr[-2] == "T"
      $logger.warn "!!! The environment feature, #{definition} constrained"
    end
    $env_features << EnvironmentFeature.new(env_ftr[0],
                                            env_ftr[1].split(""),
                                            env_ftr[2].split(""),
                                            env_ftr[3],
                                            env_ftr[4])
  end

  # One Environment per element of the cartesian product of all label lists.
  $envs = {}
  $env_features.map { |ec| ec.labels }.inject { |pro, lb|
    pro.product(lb)
  }.each_with_index { |e, i|
    # With a single feature there is nothing to take a product with and +e+
    # is a plain label string rather than a nested array.
    label = Array(e).flatten.join
    $envs[label] = Environment.new(i, label)
  }

  #
  # Reading TEM file(s)
  #
  tem_files = if $tem_list
                IO.readlines($tem_list).map { |l| l.chomp }.reject { |f| f.empty? }
              else
                [$tem_file]
              end

  tem_files.each do |tem_file|
    $logger.debug ">>> Processing #{tem_file} ..."

    # Read every entry once and release the file handle; all later look-ups
    # work on this in-memory list (entry id, definition, data without
    # newlines) in file order.
    entries = []
    ff = Bio::FlatFile.auto(tem_file)
    begin
      ff.each_entry do |pir|
        entries << [pir.entry_id, pir.definition, pir.data.gsub("\n", "")]
      end
    ensure
      ff.close
    end

    ali = Bio::Alignment::OriginalAlignment.new
    entries.each do |entry_id, definition, data|
      ali.add_seq(data, entry_id) if definition == "sequence"
    end

    env_labels = {}
    disulphide = {}

    # A "C" whose disulphide annotation at the same position is "F" is
    # recorded as "J"; every other symbol is returned unchanged. Sequences
    # without a disulphide entry are left as they are.
    to_residue = lambda do |key, pos, sym|
      flags = disulphide[key]
      (flags && flags[pos] == "F" && sym == "C") ? "J" : sym
    end

    ali.each_pair do |key, seq|
      # check disulphide bond environment first!
      entries.each do |entry_id, definition, data|
        if entry_id == key && definition == "disulphide"
          disulphide[key] = data.split("")
        end
      end

      $env_features.each_with_index do |ec, ei|
        env_labels[key] ||= []

        entries.each do |entry_id, definition, data|
          next unless entry_id == key && definition == ec.name

          labels = data.split("").each_with_index.map { |sym, pos|
            if sym == "-"
              "-"
            elsif sym == "X" || sym == "x"
              "X"
            elsif ei == 0 # Amino Acid Environment Feature
              to_residue.call(key, pos, sym)
            else
              ec.labels[ec.symbols.index(sym)]
            end
          }

          # Per position, append this feature's label to the labels of the
          # features processed before it.
          if env_labels[key].empty?
            env_labels[key] = labels
          else
            env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
          end
        end
      end
    end

    # Count substitutions for every ordered pair of distinct sequences.
    ali.each_pair do |key1, seq1|
      s1 = seq1.split("")

      ali.each_pair do |key2, seq2|
        next if key1 == key2

        s2 = seq2.split("")

        s1.each_with_index do |source, pos|
          if env_labels[key1][pos].include?("X")
            $logger.info ">>> Substitutions from #{key1}-#{pos}-#{source} were masked"
            next
          end

          source = source.upcase
          target = s2[pos].upcase

          if !$amino_acids.include?(source)
            $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid"
            next
          end

          if !$amino_acids.include?(target)
            $logger.info "!!! #{key2}-#{pos}-#{target} is not standard amino acid"
            next
          end

          source = to_residue.call(key1, pos, source)
          target = to_residue.call(key2, pos, target)

          $envs[env_labels[key1][pos]].add_residue_count(target)

          $aa_tot_obs[source] += 1
          $aa_mut_obs[source] += 1 if source != target

          $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}"
        end
      end
    end
  end

  #
  # Frequency matrices
  #
  $tot_freq_matrix = NMatrix.int(21,21)

  # Environments that differ only in their first label character (the
  # residue) form one group, i.e. one 21x21 table.
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
  env_groups.each_pair do |_label, group|
    $grp_freq_matrix = NMatrix.int(21,21)

    $amino_acids.each_with_index do |aa, ai|
      freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array
      0.upto($grp_freq_matrix.shape[1] - 1) do |j|
        $grp_freq_matrix[ai, j] = freq_array[j]
      end
    end

    $tot_freq_matrix += $grp_freq_matrix
  end

  #
  # Amino Acid Frequencies and Mutabilities
  #
  # ala_factor is NaN/Infinity when "A" was never observed / never mutated;
  # it only feeds $aa_rel_mutb.
  ala_factor = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
  $tot_aa    = $aa_tot_obs.values.inject(0) { |sum, n| sum + n }

  $aa_tot_obs.each_pair do |res, freq|
    $aa_mutb[res]     = $aa_mut_obs[res] / freq.to_f
    $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
    $aa_rel_freq[res] = freq / $tot_aa.to_f
  end

  #
  # Probability matrices
  #
  # Raw substitution probabilities (in percent) from the total counts.
  # The result is not consumed further down yet.
  tot_prob_matrix = NMatrix.float(21, 21)

  0.upto($tot_freq_matrix.shape[0] - 1) do |i|
    col_sum = 0
    0.upto($tot_freq_matrix.shape[1] - 1) do |j|
      col_sum += $tot_freq_matrix[i, j]
    end
    0.upto($tot_freq_matrix.shape[1] - 1) do |k|
      tot_prob_matrix[i, k] = 100 * $tot_freq_matrix[i,k] / col_sum.to_f
    end
  end

  #
  # p1 probability
  #
  # Mix of a uniform distribution (a0) and the observed residue frequencies;
  # the more observations, the smaller the weight of the uniform part.
  p1      = NArray.float(21)
  a0      = NArray.float(21).fill(1 / 21.0)
  big_N   = $tot_aa
  small_n = 21
  omega1  = 1.0 / (1 + big_N.to_f / ($sigma * small_n.to_f))
  omega2  = 1.0 - omega1

  0.upto(p1.shape[0] - 1) do |i|
    # Residues that were never observed have relative frequency 0.
    rel_freq = $aa_rel_freq.fetch($amino_acids[i], 0.0)
    p1[i] = 100.0 * (omega1 * a0[i] + omega2 * rel_freq)
  end
  $smooth_prob[1] = p1

  #
  # p2 and above
  #
  # Every label is tagged with the index of its feature: "<index><label>".
  feature_labels = $env_features.each_with_index.map { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
  entropy_max    = Math::log(21)

  1.upto($env_features.size) do |ci|
    feature_labels.combination(ci) do |c1|
      Enumerable.cart_prod(*c1).each do |labels|
        # get environments, frequencies, and probabilities
        # An environment belongs to this combination when its label carries
        # the requested character at each requested feature position.
        constraints = labels.map { |label| [label[0, 1].to_i, label[1, 1]] }
        envs = $envs.values.select do |env|
          constraints.all? { |j, l| env.label[j, 1] == l }
        end
        freq_arr = envs.inject(NArray.int(21)) { |sum, env| sum + env.freq_array }
        freq_sum = freq_arr.sum

        prob_arr = NArray.float(21)
        0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_sum.to_f }

        # collect priors: the smoothed distributions of every sub-combination
        # that is one label shorter (or p1 for single labels)
        priors = if ci > 1
                   labels.combination(ci - 1).map { |c2| $smooth_prob[ci][c2.to_set] }
                 else
                   [$smooth_prob[1]]
                 end

        # entropy based weighting priors
        entropies = priors.map do |prior|
          (entropy_max + prior.to_a.inject(0.0) { |s, q| s + q * Math::log(q) }) / entropy_max
        end
        entropy_tot = entropies.inject(0.0) { |s, e| s + e }
        pri_avg = priors.each_with_index.map { |prior, i|
          prior * entropies[i] / entropy_tot
        }.inject { |s, weighted| s + weighted }

        smooth_prob_arr = NArray.float(21)
        obs_count       = freq_sum.to_f
        prior_weight    = 1.0 / (1 + obs_count / ($sigma * 21.0))
        obs_weight      = 1.0 - prior_weight

        # smoothing step
        # NOTE(review): pri_avg is on a 0-100 scale (priors are stored as
        # percentages) while prob_arr is on a 0-1 scale - this looks
        # unintended; confirm before relying on the numbers.
        0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (prior_weight * pri_avg[i] + obs_weight * prob_arr[i]) }

        # normalization step
        smooth_prob_arr_sum = smooth_prob_arr.sum
        0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

        # store smoothed probabilities in a hash using a set of environment labels as a key
        ($smooth_prob[ci + 1] ||= {})[labels.to_set] = smooth_prob_arr
      end
    end
  end

  # summarizing ...
  $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }

  tot_smooth_prob_matrix = NMatrix.float(21,21)

  # for each combination of environment features
  env_groups.sort.each do |label, group|
    grp_prob_matrix = NMatrix.float(21,21)

    $amino_acids.each_with_index do |aa, ai|
      smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array
      0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
    end

    tot_smooth_prob_matrix += grp_prob_matrix

    puts ">#{label}"
    puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
  end

  tot_smooth_prob_matrix /= env_groups.size

  # for total
  puts ">Total Probability"
  puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
end
|
535
|
+
end
|
536
|
+
end
|