egor 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +32 -0
- data/PostInstall.txt +5 -0
- data/README.rdoc +88 -0
- data/Rakefile +32 -0
- data/bin/egor +10 -0
- data/config/website.yml +2 -0
- data/config/website.yml.sample +2 -0
- data/egor.gemspec +53 -0
- data/lib/egor/cli.rb +1063 -0
- data/lib/egor.rb +6 -0
- data/lib/enumerable_extensions.rb +11 -0
- data/lib/environment.rb +58 -0
- data/lib/environment_feature.rb +14 -0
- data/lib/math_extensions.rb +7 -0
- data/lib/narray_extensions.rb +21 -0
- data/lib/nmatrix_extensions.rb +24 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +71 -0
- data/test/test_egor.rb +11 -0
- data/test/test_egor_cli.rb +8 -0
- data/test/test_enumerable_extensions.rb +16 -0
- data/test/test_environment_feature.rb +11 -0
- data/test/test_helper.rb +2 -0
- data/test/test_nmatrix_extensions.rb +16 -0
- data/website/index.html +78 -0
- data/website/index.txt +48 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +154 -0
data/lib/egor/cli.rb
ADDED
@@ -0,0 +1,1063 @@
|
|
1
|
+
require "getoptlong"
|
2
|
+
require "logger"
|
3
|
+
require "rubygems"
|
4
|
+
require "narray"
|
5
|
+
require "bio"
|
6
|
+
require "set"
|
7
|
+
require "facets"
|
8
|
+
require "simple_memoize"
|
9
|
+
|
10
|
+
require "narray_extensions"
|
11
|
+
require "nmatrix_extensions"
|
12
|
+
require "enumerable_extensions"
|
13
|
+
require "math_extensions"
|
14
|
+
require "environment_feature"
|
15
|
+
require "environment"
|
16
|
+
|
17
|
+
# This is a module for an actual command line interpreter for Egor
|
18
|
+
# ---
|
19
|
+
# Copyright (C) 2008-9 Semin Lee
|
20
|
+
module Egor
|
21
|
+
class CLI
|
22
|
+
class << self
|
23
|
+
|
24
|
+
# :nodoc:
|
25
|
+
# Write the Egor version string (the Egor::VERSION constant) to standard
# output, followed by a newline.
def print_version
  $stdout.puts(Egor::VERSION)
end
|
28
|
+
|
29
|
+
# Print Egor's Usage on the screen
|
30
|
+
#
|
31
|
+
# :call-seq:
|
32
|
+
# Egor::CLI::print_usage
|
33
|
+
#
|
34
|
+
# Print Egor's command-line help (synopsis plus one entry per option) to
# standard output. Takes no arguments.
#
# The defaults quoted in the text follow the values +execute+ assigns before
# it parses options: BLOSUM-like weighting at PID 60 is what happens when
# neither --weight nor --noweight is given ($weight = 60, $noweight = false),
# so the "(default)" marker belongs to --weight, not to --noweight.
def print_usage
  # <<- keeps the text verbatim; only the terminator may be indented.
  puts <<-USAGE
egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.

Usage:
    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    egor [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) STRING: a tem file
    --tem-list (-l) STRING: a list for tem files
    --classdef (-c) STRING: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER: (NOT implemented yet)
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER:
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5)
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --verbose (-v) INTEGER
        0 for ERROR level (default)
        1 for WARN or above level
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
|
77
|
+
|
78
|
+
# Calculate PID between two sequences
|
79
|
+
#
|
80
|
+
# :call-seq:
|
81
|
+
# Egor::CLI::calc_pid(seq1, seq2) -> Float
|
82
|
+
#
|
83
|
+
# Calculate the percentage identity (PID) between two aligned sequences.
#
# seq1, seq2:: aligned sequences as Strings, one character per column,
#              with "-" marking a gap.
#
# Returns a Float: 100 * identical / (aligned + gapped), where
# * aligned   = columns in which neither sequence has a gap,
# * identical = aligned columns holding the same character,
# * gapped    = columns in which exactly one sequence has a gap.
# Columns where both sequences have a gap are ignored.
#
# When no column contributes (empty input, or only gap-gap columns) the
# result is 0.0 rather than the NaN that dividing by zero would give.
def calc_pid(seq1, seq2)
  aligned   = 0
  identical = 0
  gapped    = 0

  seq1.split("").zip(seq2.split("")).each do |res1, res2|
    gap1 = (res1 == "-")
    gap2 = (res2 == "-")

    if !gap1 && !gap2
      aligned   += 1
      identical += 1 if res1 == res2
    elsif gap1 != gap2
      # exactly one side is a gap
      gapped += 1
    end
  end

  compared = aligned + gapped
  return 0.0 if compared.zero?

  100.0 * identical / compared
end
|
105
|
+
memoize :calc_pid
|
106
|
+
|
107
|
+
# :nodoc:
|
108
|
+
def execute(arguments=[])
|
109
|
+
#
|
110
|
+
# Abbreviations in the aa1 codes
|
111
|
+
#
|
112
|
+
# * env: environment
|
113
|
+
# * tem: (FUGUE) template
|
114
|
+
# * classdef: (environment) class definition
|
115
|
+
# * aa: amino acid
|
116
|
+
# * aa: weighted amino acid
|
117
|
+
# * tot: total
|
118
|
+
# * rel: relative
|
119
|
+
# * obs: observation (frequency)
|
120
|
+
# * mut: mutation
|
121
|
+
# * mutb: mutability
|
122
|
+
# * freq: frequency
|
123
|
+
# * prob: probability
|
124
|
+
# * opts: options
|
125
|
+
#
|
126
|
+
|
127
|
+
# Part 1.
|
128
|
+
#
|
129
|
+
# Global variables and their default values
|
130
|
+
#
|
131
|
+
$logger = Logger.new(STDOUT)
|
132
|
+
$logger.level = Logger::ERROR
|
133
|
+
$amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("")
|
134
|
+
$tem_list = nil
|
135
|
+
$tem_file = nil
|
136
|
+
$classdef = "classdef.dat"
|
137
|
+
$outfile = "allmat.dat"
|
138
|
+
$outfh = nil # file hanfle for outfile
|
139
|
+
$output = 2
|
140
|
+
$aa_tot_obs = {}
|
141
|
+
$aa_mut_obs = {}
|
142
|
+
$aa_mutb = {}
|
143
|
+
$aa_rel_mutb = {}
|
144
|
+
$aa_rel_freq = {}
|
145
|
+
$env_aa_obs = {}
|
146
|
+
$ali_size = 0
|
147
|
+
$tot_aa = 0
|
148
|
+
$sigma = 5.0
|
149
|
+
$weight = 60
|
150
|
+
$noweight = false
|
151
|
+
$smooth = :partial
|
152
|
+
$nosmooth = false
|
153
|
+
$scale = 3
|
154
|
+
$pidmin = nil
|
155
|
+
$pidmax = nil
|
156
|
+
$scale = 3
|
157
|
+
$add = 0
|
158
|
+
$penv = false
|
159
|
+
$heatmap = false
|
160
|
+
$smooth_prob = {}
|
161
|
+
|
162
|
+
# Part 2.
|
163
|
+
#
|
164
|
+
# Parsing options
|
165
|
+
#
|
166
|
+
opts = GetoptLong.new(
|
167
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
168
|
+
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
169
|
+
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
170
|
+
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
171
|
+
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
172
|
+
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
173
|
+
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
174
|
+
[ '--heatmap', GetoptLong::NO_ARGUMENT ],
|
175
|
+
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
176
|
+
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
177
|
+
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
178
|
+
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
179
|
+
[ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
|
180
|
+
[ '--version', GetoptLong::NO_ARGUMENT ]
|
181
|
+
)
|
182
|
+
|
183
|
+
opts.each do |opt, arg|
|
184
|
+
case opt
|
185
|
+
when '--help'
|
186
|
+
print_usage
|
187
|
+
exit 0
|
188
|
+
when '--tem-list'
|
189
|
+
$tem_list = arg
|
190
|
+
when '--tem-file'
|
191
|
+
$tem_file = arg
|
192
|
+
when '--classdef'
|
193
|
+
$classdef = arg
|
194
|
+
when '--output'
|
195
|
+
$output = arg.to_i
|
196
|
+
when '--outfile'
|
197
|
+
$outfile = arg
|
198
|
+
when '--cyc'
|
199
|
+
$logger.error "!!! --cys option is not available yet"
|
200
|
+
exit 1
|
201
|
+
$cysteine = (arg.to_i == 1 ? false : true)
|
202
|
+
when '--weight'
|
203
|
+
$weight = arg.to_i
|
204
|
+
when '--sigma'
|
205
|
+
$sigma = arg.to_f
|
206
|
+
when '--pidmin'
|
207
|
+
$pidmin = arg.to_f
|
208
|
+
when '--pidmax'
|
209
|
+
$pidmax = arg.to_f
|
210
|
+
when '--noweight'
|
211
|
+
$noweight = true
|
212
|
+
when '--smooth'
|
213
|
+
$smooth = (arg.to_i == 1 ? :full : :parital)
|
214
|
+
when '--nosmooth'
|
215
|
+
$nosmooth = true
|
216
|
+
when '--scale'
|
217
|
+
$scale = arg.to_f
|
218
|
+
when '--add'
|
219
|
+
$add = arg.to_f
|
220
|
+
when '--penv'
|
221
|
+
$logger.error "!!! --penv option is not available yet"
|
222
|
+
exit 1
|
223
|
+
$penv = true
|
224
|
+
when '--heatmap'
|
225
|
+
$heatmap = true
|
226
|
+
when '--verbose'
|
227
|
+
$logger.level = case arg.to_i
|
228
|
+
when 0 then Logger::ERROR
|
229
|
+
when 1 then Logger::WARN
|
230
|
+
when 2 then Logger::INFO
|
231
|
+
when 3 then Logger::DEBUG
|
232
|
+
else Logger::ERROR
|
233
|
+
end
|
234
|
+
when '--version'
|
235
|
+
print_version
|
236
|
+
exit 0
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# when arguments are nonsense, print usage
|
241
|
+
if ((ARGV.length != 0) ||
|
242
|
+
(!$tem_list && !$tem_file) ||
|
243
|
+
($tem_list && $tem_file))
|
244
|
+
print_usage
|
245
|
+
exit 1
|
246
|
+
end
|
247
|
+
|
248
|
+
# Part 3.
|
249
|
+
#
|
250
|
+
# Reading Environment Class Definition File
|
251
|
+
#
|
252
|
+
|
253
|
+
# an array for storing all environment feature objects
|
254
|
+
$env_features = []
|
255
|
+
|
256
|
+
# aa1 amino acid in a substitution itself is a environment feature
|
257
|
+
$env_features << EnvironmentFeature.new("sequence",
|
258
|
+
$amino_acids,
|
259
|
+
$amino_acids,
|
260
|
+
"F",
|
261
|
+
"F")
|
262
|
+
|
263
|
+
# read environment class definition file and
|
264
|
+
# store them into the array prepared above
|
265
|
+
IO.foreach($classdef) do |line|
|
266
|
+
if line.start_with?("#")
|
267
|
+
next
|
268
|
+
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
269
|
+
$logger.info ">>> An environment feature, #{line.chomp} detected"
|
270
|
+
if env_ftr[-1] == "T"
|
271
|
+
# skip silenced environment feature
|
272
|
+
$logger.warn "!!! The environment feature, #{line.chomp} silent"
|
273
|
+
next
|
274
|
+
end
|
275
|
+
if env_ftr[-2] == "T"
|
276
|
+
$logger.warn "!!! The environment feature, #{line.chomp} constrained"
|
277
|
+
end
|
278
|
+
$env_features << EnvironmentFeature.new(env_ftr[0],
|
279
|
+
env_ftr[1].split(""),
|
280
|
+
env_ftr[2].split(""),
|
281
|
+
env_ftr[3],
|
282
|
+
env_ftr[4])
|
283
|
+
else
|
284
|
+
$logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
|
285
|
+
exit 1
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
# a hash for storing all environment objects
|
290
|
+
$envs = {}
|
291
|
+
|
292
|
+
# generate all possible combinations of environment labels, and
|
293
|
+
# create & store every environment object into the hash prepared above with the label as a key
|
294
|
+
$env_features.inject([]) { |sum, ec|
|
295
|
+
sum << ec.labels
|
296
|
+
}.inject { |pro, lb|
|
297
|
+
pro.product(lb)
|
298
|
+
}.each_with_index { |e, i|
|
299
|
+
$envs[e.flatten.join] = Environment.new(i, e.flatten.join)
|
300
|
+
}
|
301
|
+
|
302
|
+
# Part 4.
|
303
|
+
#
|
304
|
+
# Reading TEM file or TEMLIST list file and counting substitutions
|
305
|
+
#
|
306
|
+
|
307
|
+
# a global file handle for output
|
308
|
+
$outfh = File.open($outfile, "w")
|
309
|
+
|
310
|
+
if $tem_file
|
311
|
+
$tem_list = [$tem_file]
|
312
|
+
end
|
313
|
+
|
314
|
+
if $tem_list
|
315
|
+
IO.foreach($tem_list) do |tem_file|
|
316
|
+
tem_file.chomp!
|
317
|
+
|
318
|
+
$logger.info ">>> Analysing #{tem_file} ..."
|
319
|
+
|
320
|
+
ali = Bio::Alignment::OriginalAlignment.new
|
321
|
+
ff = Bio::FlatFile.auto(tem_file)
|
322
|
+
ff.each_entry do |pir|
|
323
|
+
if pir.definition == "sequence"
|
324
|
+
ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
$ali_size += ali.size
|
329
|
+
env_labels = {}
|
330
|
+
disulphide = {}
|
331
|
+
|
332
|
+
ali.each_pair do |key, seq|
|
333
|
+
# check disulphide bond environment first!
|
334
|
+
ff.rewind
|
335
|
+
ff.each_entry do |pir|
|
336
|
+
if (pir.entry_id == key) && (pir.definition == "disulphide")
|
337
|
+
disulphide[key] = pir.data.gsub("\n", "").split("")
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
341
|
+
$env_features.each_with_index do |ec, ei|
|
342
|
+
env_labels[key] = [] unless env_labels.has_key?(key)
|
343
|
+
|
344
|
+
ff.rewind
|
345
|
+
ff.each_entry do |pir|
|
346
|
+
if (pir.entry_id == key) && (pir.definition == ec.name)
|
347
|
+
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
348
|
+
if sym == "-"
|
349
|
+
"-"
|
350
|
+
elsif sym == "X" || sym == "x"
|
351
|
+
"X"
|
352
|
+
else
|
353
|
+
if ei == 0 # Amino Acid Environment Feature
|
354
|
+
((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
355
|
+
else
|
356
|
+
ec.labels[ec.symbols.index(sym)]
|
357
|
+
end
|
358
|
+
end
|
359
|
+
end
|
360
|
+
|
361
|
+
if env_labels[key].empty?
|
362
|
+
env_labels[key] = labels
|
363
|
+
else
|
364
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
365
|
+
end
|
366
|
+
end
|
367
|
+
end
|
368
|
+
end
|
369
|
+
end
|
370
|
+
|
371
|
+
if $noweight
|
372
|
+
ali.each_pair do |id1, seq1|
|
373
|
+
ali.each_pair do |id2, seq2|
|
374
|
+
if id1 != id2
|
375
|
+
pid = calc_pid(seq1, seq2)
|
376
|
+
s1 = seq1.split("")
|
377
|
+
s2 = seq2.split("")
|
378
|
+
|
379
|
+
# check PID_MIN
|
380
|
+
if $pidmin && (pid < $pidmin)
|
381
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
|
382
|
+
next
|
383
|
+
end
|
384
|
+
|
385
|
+
# check PID_MAX
|
386
|
+
if $pidmax && (pid > $pidmax)
|
387
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
388
|
+
next
|
389
|
+
end
|
390
|
+
|
391
|
+
s1.each_with_index do |aa1, pos|
|
392
|
+
if env_labels[id1][pos].include?("X")
|
393
|
+
$logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
394
|
+
next
|
395
|
+
end
|
396
|
+
|
397
|
+
aa1.upcase!
|
398
|
+
aa2 = s2[pos].upcase
|
399
|
+
|
400
|
+
if !$amino_acids.include?(aa1)
|
401
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
402
|
+
next
|
403
|
+
end
|
404
|
+
|
405
|
+
if !$amino_acids.include?(aa2)
|
406
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
407
|
+
next
|
408
|
+
end
|
409
|
+
|
410
|
+
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
411
|
+
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
412
|
+
|
413
|
+
$envs[env_labels[id1][pos]].add_residue_count(aa2)
|
414
|
+
|
415
|
+
grp_label = env_labels[id1][pos][1..-1]
|
416
|
+
|
417
|
+
if $env_aa_obs.has_key? grp_label
|
418
|
+
if $env_aa_obs[grp_label].has_key? aa1
|
419
|
+
$env_aa_obs[grp_label][aa1] += 1
|
420
|
+
else
|
421
|
+
$env_aa_obs[grp_label][aa1] = 1
|
422
|
+
end
|
423
|
+
else
|
424
|
+
$env_aa_obs[grp_label] = Hash.new(0)
|
425
|
+
$env_aa_obs[grp_label][aa1] = 1
|
426
|
+
end
|
427
|
+
|
428
|
+
if $aa_tot_obs.has_key? aa1
|
429
|
+
$aa_tot_obs[aa1] += 1
|
430
|
+
else
|
431
|
+
$aa_tot_obs[aa1] = 1
|
432
|
+
end
|
433
|
+
|
434
|
+
if aa1 != aa2
|
435
|
+
if $aa_mut_obs.has_key? aa1
|
436
|
+
$aa_mut_obs[aa1] += 1
|
437
|
+
else
|
438
|
+
$aa_mut_obs[aa1] = 1
|
439
|
+
end
|
440
|
+
end
|
441
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
442
|
+
end
|
443
|
+
end
|
444
|
+
end
|
445
|
+
end
|
446
|
+
else
|
447
|
+
# BLOSUM-like weighting
|
448
|
+
clusters = []
|
449
|
+
ali.each_pair { |i, s| clusters << [i] }
|
450
|
+
|
451
|
+
# a loop for single linkage clustering
|
452
|
+
begin
|
453
|
+
continue = false
|
454
|
+
0.upto(clusters.size - 2) do |i|
|
455
|
+
indexes = []
|
456
|
+
(i + 1).upto(clusters.size - 1) do |j|
|
457
|
+
found = false
|
458
|
+
clusters[i].each do |c1|
|
459
|
+
clusters[j].each do |c2|
|
460
|
+
if calc_pid(ali[c1], ali[c2]) >= $weight
|
461
|
+
indexes << j
|
462
|
+
found = true
|
463
|
+
break
|
464
|
+
end
|
465
|
+
end
|
466
|
+
break if found
|
467
|
+
end
|
468
|
+
end
|
469
|
+
|
470
|
+
unless indexes.empty?
|
471
|
+
continue = true
|
472
|
+
group = clusters[i]
|
473
|
+
indexes.each do |k|
|
474
|
+
group = group.concat(clusters[k])
|
475
|
+
clusters[k] = nil
|
476
|
+
end
|
477
|
+
clusters[i] = group
|
478
|
+
clusters.compact!
|
479
|
+
end
|
480
|
+
end
|
481
|
+
end while(continue)
|
482
|
+
|
483
|
+
clusters.combination(2).each do |cluster1, cluster2|
|
484
|
+
cluster1.each do |id1|
|
485
|
+
cluster2.each do |id2|
|
486
|
+
seq1 = ali[id1].split("")
|
487
|
+
seq2 = ali[id2].split("")
|
488
|
+
seq1.each_with_index do |aa1, pos|
|
489
|
+
if env_labels[id1][pos].include?("X")
|
490
|
+
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
491
|
+
next
|
492
|
+
end
|
493
|
+
|
494
|
+
aa1.upcase!
|
495
|
+
aa2 = seq2[pos].upcase
|
496
|
+
|
497
|
+
if !$amino_acids.include?(aa1)
|
498
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
499
|
+
next
|
500
|
+
end
|
501
|
+
|
502
|
+
if !$amino_acids.include?(aa2)
|
503
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
504
|
+
next
|
505
|
+
end
|
506
|
+
|
507
|
+
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
508
|
+
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
509
|
+
size1 = cluster1.size
|
510
|
+
size2 = cluster2.size
|
511
|
+
obs1 = 1.0 / size1
|
512
|
+
obs2 = 1.0 / size2
|
513
|
+
|
514
|
+
$envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
|
515
|
+
$envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))
|
516
|
+
|
517
|
+
grp_label1 = env_labels[id1][pos][1..-1]
|
518
|
+
grp_label2 = env_labels[id2][pos][1..-1]
|
519
|
+
|
520
|
+
if $env_aa_obs.has_key? grp_label1
|
521
|
+
if $env_aa_obs[grp_label1].has_key? aa1
|
522
|
+
$env_aa_obs[grp_label1][aa1] += obs1
|
523
|
+
else
|
524
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
525
|
+
end
|
526
|
+
else
|
527
|
+
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
528
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
529
|
+
end
|
530
|
+
|
531
|
+
if $env_aa_obs.has_key? grp_label2
|
532
|
+
if $env_aa_obs[grp_label2].has_key? aa2
|
533
|
+
$env_aa_obs[grp_label2][aa2] += obs2
|
534
|
+
else
|
535
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
536
|
+
end
|
537
|
+
else
|
538
|
+
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
539
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
540
|
+
end
|
541
|
+
|
542
|
+
if $aa_tot_obs.has_key? aa1
|
543
|
+
$aa_tot_obs[aa1] += obs1
|
544
|
+
else
|
545
|
+
$aa_tot_obs[aa1] = obs1
|
546
|
+
end
|
547
|
+
|
548
|
+
if $aa_tot_obs.has_key? aa2
|
549
|
+
$aa_tot_obs[aa2] += obs2
|
550
|
+
else
|
551
|
+
$aa_tot_obs[aa2] = obs2
|
552
|
+
end
|
553
|
+
|
554
|
+
if aa1 != aa2
|
555
|
+
if $aa_mut_obs.has_key? aa1
|
556
|
+
$aa_mut_obs[aa1] += obs1
|
557
|
+
else
|
558
|
+
$aa_mut_obs[aa1] = obs1
|
559
|
+
end
|
560
|
+
if $aa_mut_obs.has_key? aa2
|
561
|
+
$aa_mut_obs[aa2] += obs2
|
562
|
+
else
|
563
|
+
$aa_mut_obs[aa2] = obs2
|
564
|
+
end
|
565
|
+
end
|
566
|
+
|
567
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
568
|
+
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
|
569
|
+
end
|
570
|
+
end
|
571
|
+
end
|
572
|
+
end
|
573
|
+
end # if !$nosmooth
|
574
|
+
end # IO.foreach($tem_list)
|
575
|
+
|
576
|
+
# print out default header
|
577
|
+
$outfh.puts <<HEADER
|
578
|
+
# Environment-specific amino acid substitution matrices
|
579
|
+
# Creator: egor version #{Egor::VERSION}
|
580
|
+
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
581
|
+
#
|
582
|
+
# Definitions for structural environments:
|
583
|
+
# #{$env_features.size - 1} features used
|
584
|
+
#
|
585
|
+
HEADER
|
586
|
+
|
587
|
+
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
588
|
+
|
589
|
+
$outfh.puts <<HEADER
|
590
|
+
#
|
591
|
+
# (read in from #{$classdef})
|
592
|
+
#
|
593
|
+
# Number of alignments: #{$ali_size}
|
594
|
+
# (list of .tem files read in from #{$tem_list})
|
595
|
+
#
|
596
|
+
# Total number of environments: #{Integer($envs.size / $amino_acids.size)}
|
597
|
+
#
|
598
|
+
# There are #{$amino_acids.size} amino acids considered.
|
599
|
+
# #{$amino_acids.join}
|
600
|
+
#
|
601
|
+
HEADER
|
602
|
+
|
603
|
+
if $noweight
|
604
|
+
$outfh.puts "# Weighting scheme: none"
|
605
|
+
else
|
606
|
+
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
607
|
+
end
|
608
|
+
$outfh.puts "#"
|
609
|
+
|
610
|
+
# calculate amino acid frequencies and mutabilities, and
|
611
|
+
# print them as default statistics in the header part
|
612
|
+
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
613
|
+
$tot_aa = $aa_tot_obs.values.sum
|
614
|
+
|
615
|
+
$outfh.puts "#"
|
616
|
+
$outfh.puts "# Total amino acid frequencies:\n"
|
617
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]
|
618
|
+
|
619
|
+
$aa_tot_obs.each_pair do |res, freq|
|
620
|
+
$aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
|
621
|
+
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
622
|
+
$aa_rel_freq[res] = freq / $tot_aa.to_f
|
623
|
+
end
|
624
|
+
|
625
|
+
$amino_acids.each do |res|
|
626
|
+
if $noweight
|
627
|
+
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
628
|
+
[res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
629
|
+
else
|
630
|
+
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
631
|
+
[res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
632
|
+
end
|
633
|
+
end
|
634
|
+
$outfh.puts "#"
|
635
|
+
|
636
|
+
# calculating probabilities for each environment
|
637
|
+
$envs.values.each do |e|
|
638
|
+
if e.freq_array.sum != 0
|
639
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
# count raw frequencies
|
644
|
+
$tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
645
|
+
|
646
|
+
# for each combination of environment features
|
647
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
648
|
+
|
649
|
+
env_groups.to_a.sort_by { |env_group|
|
650
|
+
# a bit clumsy sorting here...
|
651
|
+
env_group[0].split("").map_with_index { |l, i|
|
652
|
+
$env_features[i + 1].labels.index(l)
|
653
|
+
}
|
654
|
+
}.each_with_index do |group, group_no|
|
655
|
+
grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
656
|
+
|
657
|
+
$amino_acids.each_with_index do |aa, ai|
|
658
|
+
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
659
|
+
0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
|
660
|
+
end
|
661
|
+
|
662
|
+
$tot_freq_matrix += grp_freq_matrix
|
663
|
+
|
664
|
+
if $output == 0
|
665
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
666
|
+
$outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
667
|
+
end
|
668
|
+
end
|
669
|
+
|
670
|
+
if $output == 0
|
671
|
+
$outfh.puts ">Total"
|
672
|
+
$outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
673
|
+
exit 0
|
674
|
+
end
|
675
|
+
|
676
|
+
# for probability
|
677
|
+
if $output == 1
|
678
|
+
$outfh.puts <<HEADER
|
679
|
+
#
|
680
|
+
# Each column (j) represents the probability distribution for the
|
681
|
+
# likelihood of acceptance of a mutational event by a residue type j in
|
682
|
+
# a particular structural environment (specified after >) leading to
|
683
|
+
# any other residue type (i) and sums up to 100.
|
684
|
+
#
|
685
|
+
HEADER
|
686
|
+
end
|
687
|
+
|
688
|
+
if ($output > 0) && $nosmooth
|
689
|
+
# Probability matrices
|
690
|
+
tot_prob_matrix = NMatrix.float(21, 21)
|
691
|
+
|
692
|
+
# for each combination of environment features
|
693
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
694
|
+
env_groups.to_a.sort_by { |env_group|
|
695
|
+
# a bit clumsy sorting here...
|
696
|
+
env_group[0].split("").map_with_index { |l, i|
|
697
|
+
$env_features[i + 1].labels.index(l)
|
698
|
+
}
|
699
|
+
}.each_with_index do |group, group_no|
|
700
|
+
grp_prob_matrix = NMatrix.float(21,21)
|
701
|
+
|
702
|
+
$amino_acids.each_with_index do |aa, ai|
|
703
|
+
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
704
|
+
0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
|
705
|
+
end
|
706
|
+
|
707
|
+
tot_prob_matrix += grp_prob_matrix
|
708
|
+
|
709
|
+
if ($output == 1)
|
710
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
711
|
+
$outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
712
|
+
end
|
713
|
+
end
|
714
|
+
|
715
|
+
if ($output == 1)
|
716
|
+
$outfh.puts ">Total"
|
717
|
+
$outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
718
|
+
$outfh.close
|
719
|
+
exit 0
|
720
|
+
end
|
721
|
+
end
|
722
|
+
|
723
|
+
# for smoothing...
|
724
|
+
if ($output > 0) && !$nosmooth
|
725
|
+
#
|
726
|
+
# p1 probability
|
727
|
+
#
|
728
|
+
p1 = NArray.float(21)
|
729
|
+
a0 = NArray.float(21).fill(1 / 21.0)
|
730
|
+
big_N = $tot_aa.to_f
|
731
|
+
small_n = 21.0
|
732
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
733
|
+
omega2 = 1.0 - omega1
|
734
|
+
|
735
|
+
if $smooth == :partial
|
736
|
+
# for partial smoothing, p1 probability is not smoothed!
|
737
|
+
0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
738
|
+
$smooth_prob[1] = p1
|
739
|
+
else
|
740
|
+
# for full smoothing, p1 probability is smoothed
|
741
|
+
0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
742
|
+
$smooth_prob[1] = p1
|
743
|
+
end
|
744
|
+
|
745
|
+
#
|
746
|
+
# p2 and above
|
747
|
+
#
|
748
|
+
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
749
|
+
|
750
|
+
if $smooth == :partial
|
751
|
+
$outfh.puts <<HEADER
|
752
|
+
# Partial Smoothing:
|
753
|
+
#
|
754
|
+
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
755
|
+
# each row in all matrices (no smoothing)
|
756
|
+
# ^^^^^^^^^^^^
|
757
|
+
# p2(ri|Rj) is estimated as:
|
758
|
+
# p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
|
759
|
+
#
|
760
|
+
# p3(ri|Rj,fq) is estimated as:
|
761
|
+
# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
|
762
|
+
# where
|
763
|
+
# A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
|
764
|
+
#
|
765
|
+
# The smoothing procedure is curtailed here and finally
|
766
|
+
# p5(ri|Rj,...) is estimated as:
|
767
|
+
# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
|
768
|
+
# where
|
769
|
+
# A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
|
770
|
+
#
|
771
|
+
# Weights (omegas) are calculated as in Topham et al. 1993)
|
772
|
+
#
|
773
|
+
# sigma value used is: 5.00
|
774
|
+
#
|
775
|
+
HEADER
|
776
|
+
1.upto($env_features.size) do |ci|
|
777
|
+
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
778
|
+
next if (ci > 2) && (ci < $env_features.size)
|
779
|
+
|
780
|
+
env_labels.combination(ci) do |c1|
|
781
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
782
|
+
pattern = "." * $env_features.size
|
783
|
+
|
784
|
+
labels.each do |label|
|
785
|
+
i = label[0].chr.to_i
|
786
|
+
l = label[1].chr
|
787
|
+
pattern[i] = l
|
788
|
+
end
|
789
|
+
|
790
|
+
if pattern =~ /^\./
|
791
|
+
$logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
|
792
|
+
next
|
793
|
+
end
|
794
|
+
|
795
|
+
# get environments, frequencies, and probabilities
|
796
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
797
|
+
freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
798
|
+
prob_arr = NArray.float(21)
|
799
|
+
0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
|
800
|
+
|
801
|
+
# # assess whether a residue type j is compatible with a particular combination of structural features
|
802
|
+
# # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
|
803
|
+
# if ci == $env_features.size
|
804
|
+
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
805
|
+
# sub_pattern = "." * $env_features.size
|
806
|
+
# sub_pattern[0] = aa_label
|
807
|
+
# sub_freq_sum = 0
|
808
|
+
#
|
809
|
+
# labels[1..-1].each do |label|
|
810
|
+
# next if label.start_with?("0")
|
811
|
+
# i = label[0].chr.to_i
|
812
|
+
# l = label[1].chr
|
813
|
+
# sub_pattern[i] = l
|
814
|
+
# sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
815
|
+
# sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
816
|
+
# sub_freq_sum += sub_freq_arr.sum
|
817
|
+
# end
|
818
|
+
#
|
819
|
+
# if sub_freq_sum == 0
|
820
|
+
# if $smooth_prob.has_key?(ci + 1)
|
821
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
822
|
+
# else
|
823
|
+
# $smooth_prob[ci + 1] = {}
|
824
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
825
|
+
# end
|
826
|
+
# $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
|
827
|
+
# next
|
828
|
+
# end
|
829
|
+
# end
|
830
|
+
|
831
|
+
# collect priors if ci > 1
|
832
|
+
priors = []
|
833
|
+
|
834
|
+
if ci == 2
|
835
|
+
labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
|
836
|
+
priors << $smooth_prob[2][c3.to_set]
|
837
|
+
}
|
838
|
+
elsif ci == $env_features.size
|
839
|
+
labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
|
840
|
+
priors << $smooth_prob[3][c3.to_set]
|
841
|
+
}
|
842
|
+
end
|
843
|
+
|
844
|
+
# entropy based weighting priors
|
845
|
+
entropy_max = Math::log(21)
|
846
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
|
847
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
848
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
849
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
850
|
+
|
851
|
+
# smoothing step
|
852
|
+
smooth_prob_arr = NArray.float(21)
|
853
|
+
big_N = freq_arr.sum.to_f
|
854
|
+
small_n = 21.0
|
855
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
856
|
+
omega2 = 1.0 - omega1
|
857
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
858
|
+
|
859
|
+
# normalization step
|
860
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
861
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
862
|
+
|
863
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
864
|
+
if !$smooth_prob.has_key?(ci + 1)
|
865
|
+
$smooth_prob[ci + 1] = {}
|
866
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
867
|
+
else
|
868
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
869
|
+
end
|
870
|
+
end
|
871
|
+
end
|
872
|
+
end
|
873
|
+
else
|
874
|
+
$outfh.puts <<HEADER
|
875
|
+
# Full Smoothing:
|
876
|
+
#
|
877
|
+
# p1(ri) is estimated as:
|
878
|
+
# p1(ri) = omega1 * A0 + omega2 * W1(ri)
|
879
|
+
#
|
880
|
+
# p2(ri|f1q) is estimated as:
|
881
|
+
# p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
|
882
|
+
#
|
883
|
+
# (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
|
884
|
+
#
|
885
|
+
# p3(ri|f1q,f2q) is estimated as:
|
886
|
+
# p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
|
887
|
+
# where
|
888
|
+
# A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
|
889
|
+
#
|
890
|
+
# The smoothing procedure is NOT curtailed here and it goes upto
|
891
|
+
#
|
892
|
+
# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
|
893
|
+
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * W5(ri|f1q,f2q,...,fn-1q)
|
894
|
+
# where
|
895
|
+
# An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
|
896
|
+
#
|
897
|
+
# Weights (omegas) are calculated as in Topham et al. 1993)
|
898
|
+
#
|
899
|
+
# sigma value used is: 5.00
|
900
|
+
#
|
901
|
+
HEADER
|
902
|
+
# full smoothing
|
903
|
+
1.upto($env_features.size) do |ci|
|
904
|
+
env_labels.combination(ci) do |c1|
|
905
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
906
|
+
pattern = "." * $env_features.size
|
907
|
+
labels.each do |label|
|
908
|
+
j = label[0].chr.to_i
|
909
|
+
l = label[1].chr
|
910
|
+
pattern[j] = l
|
911
|
+
end
|
912
|
+
|
913
|
+
# get environments, frequencies, and probabilities
|
914
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
915
|
+
freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
916
|
+
prob_arr = NArray.float(21)
|
917
|
+
0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
918
|
+
|
919
|
+
# collect priors
|
920
|
+
priors = []
|
921
|
+
if ci > 1
|
922
|
+
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
923
|
+
else
|
924
|
+
priors << $smooth_prob[1]
|
925
|
+
end
|
926
|
+
|
927
|
+
# entropy based weighting priors
|
928
|
+
entropy_max = Math::log(21)
|
929
|
+
entropies = priors.map do |prior|
|
930
|
+
(entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
|
931
|
+
end
|
932
|
+
weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
|
933
|
+
|
934
|
+
# smoothing step
|
935
|
+
smooth_prob_arr = NArray.float(21)
|
936
|
+
big_N = freq_arr.sum.to_f
|
937
|
+
small_n = 21.0
|
938
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
939
|
+
omega2 = 1.0 - omega1
|
940
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
941
|
+
|
942
|
+
# normalization step
|
943
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
944
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
945
|
+
|
946
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
947
|
+
if !$smooth_prob.has_key?(ci + 1)
|
948
|
+
$smooth_prob[ci + 1] = {}
|
949
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
950
|
+
else
|
951
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
952
|
+
end
|
953
|
+
end
|
954
|
+
end
|
955
|
+
end
|
956
|
+
end
|
957
|
+
|
958
|
+
# updating smoothed probability array for each environment
|
959
|
+
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
960
|
+
|
961
|
+
# for a total substitution probability matrix
|
962
|
+
tot_smooth_prob_matrix = NMatrix.float(21,21)
|
963
|
+
|
964
|
+
# grouping environments by their environment labels, ignoring the leading amino acid label
|
965
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
966
|
+
|
967
|
+
# sorting environments and build 21X21 substitution matrices
|
968
|
+
env_groups.to_a.sort_by { |env_group|
|
969
|
+
# a bit clumsy sorting here...
|
970
|
+
env_group[0].split("").map_with_index { |l, i|
|
971
|
+
$env_features[i + 1].labels.index(l)
|
972
|
+
}
|
973
|
+
}.each_with_index do |group, group_no|
|
974
|
+
# calculating 21X21 substitution probability matrix for each environment
|
975
|
+
grp_prob_matrix = NMatrix.float(21,21)
|
976
|
+
|
977
|
+
$amino_acids.each_with_index do |aa, ai|
|
978
|
+
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
979
|
+
0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
|
980
|
+
end
|
981
|
+
|
982
|
+
tot_smooth_prob_matrix += grp_prob_matrix
|
983
|
+
|
984
|
+
if $output == 1
|
985
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
986
|
+
$outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
987
|
+
end
|
988
|
+
end
|
989
|
+
|
990
|
+
tot_smooth_prob_matrix /= env_groups.size
|
991
|
+
|
992
|
+
if $output == 1
|
993
|
+
$outfh.puts ">Total"
|
994
|
+
$outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
995
|
+
$outfh.close
|
996
|
+
exit 0
|
997
|
+
end
|
998
|
+
|
999
|
+
if $output == 2
|
1000
|
+
$outfh.puts <<HEADER
|
1001
|
+
#
|
1002
|
+
# The probabilities were then divided by the background probabilities
|
1003
|
+
# which were derived from the environment-independent amino acid frequencies.
|
1004
|
+
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1005
|
+
#
|
1006
|
+
# Shown here are logarithms of these values multiplied by 3/log(2)
|
1007
|
+
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1008
|
+
#
|
1009
|
+
# For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
|
1010
|
+
#
|
1011
|
+
HEADER
|
1012
|
+
|
1013
|
+
# log-odds ratio matrices from now on
|
1014
|
+
tot_logo_mat = NMatrix.float(21,21)
|
1015
|
+
factor = $scale / Math::log(2)
|
1016
|
+
|
1017
|
+
# grouping environments by their environment labels, ignoring the leading amino acid label
|
1018
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1019
|
+
|
1020
|
+
# sorting environments and build 21X21 substitution matrices
|
1021
|
+
env_groups.to_a.sort_by { |env_group|
|
1022
|
+
# a bit clumsy sorting here...
|
1023
|
+
env_group[0].split("").map_with_index { |l, i|
|
1024
|
+
$env_features[i + 1].labels.index(l)
|
1025
|
+
}
|
1026
|
+
}.each_with_index do |group, group_no|
|
1027
|
+
# calculating 21X21 log-odds ratio matrix for each environment
|
1028
|
+
grp_label = group[0]
|
1029
|
+
grp_envs = group[1]
|
1030
|
+
grp_logo_mat = NMatrix.float(21,21)
|
1031
|
+
|
1032
|
+
$amino_acids.each_with_index do |aa, ai|
|
1033
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1034
|
+
logo_arr = NArray.float(21)
|
1035
|
+
|
1036
|
+
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1037
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1038
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1039
|
+
logo_arr[j] = factor * Math::log(odds)
|
1040
|
+
end
|
1041
|
+
0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1042
|
+
end
|
1043
|
+
|
1044
|
+
tot_logo_mat += grp_logo_mat
|
1045
|
+
|
1046
|
+
$outfh.puts ">#{grp_label} #{group_no}"
|
1047
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1048
|
+
end
|
1049
|
+
|
1050
|
+
tot_logo_mat /= env_groups.size
|
1051
|
+
|
1052
|
+
$outfh.puts ">Total"
|
1053
|
+
$outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1054
|
+
$outfh.close
|
1055
|
+
exit 0
|
1056
|
+
end
|
1057
|
+
end
|
1058
|
+
end
|
1059
|
+
end
|
1060
|
+
|
1061
|
+
end # class << self
|
1062
|
+
end # class CLI
|
1063
|
+
end # module Egor
|