egor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +32 -0
- data/PostInstall.txt +5 -0
- data/README.rdoc +88 -0
- data/Rakefile +32 -0
- data/bin/egor +10 -0
- data/config/website.yml +2 -0
- data/config/website.yml.sample +2 -0
- data/egor.gemspec +53 -0
- data/lib/egor/cli.rb +1063 -0
- data/lib/egor.rb +6 -0
- data/lib/enumerable_extensions.rb +11 -0
- data/lib/environment.rb +58 -0
- data/lib/environment_feature.rb +14 -0
- data/lib/math_extensions.rb +7 -0
- data/lib/narray_extensions.rb +21 -0
- data/lib/nmatrix_extensions.rb +24 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +71 -0
- data/test/test_egor.rb +11 -0
- data/test/test_egor_cli.rb +8 -0
- data/test/test_enumerable_extensions.rb +16 -0
- data/test/test_environment_feature.rb +11 -0
- data/test/test_helper.rb +2 -0
- data/test/test_nmatrix_extensions.rb +16 -0
- data/website/index.html +78 -0
- data/website/index.txt +48 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +154 -0
data/lib/egor/cli.rb
ADDED
@@ -0,0 +1,1063 @@
|
|
1
|
+
require "getoptlong"
|
2
|
+
require "logger"
|
3
|
+
require "rubygems"
|
4
|
+
require "narray"
|
5
|
+
require "bio"
|
6
|
+
require "set"
|
7
|
+
require "facets"
|
8
|
+
require "simple_memoize"
|
9
|
+
|
10
|
+
require "narray_extensions"
|
11
|
+
require "nmatrix_extensions"
|
12
|
+
require "enumerable_extensions"
|
13
|
+
require "math_extensions"
|
14
|
+
require "environment_feature"
|
15
|
+
require "environment"
|
16
|
+
|
17
|
+
# This is a module for an actual command line interpreter for Egor
|
18
|
+
# ---
|
19
|
+
# Copyright (C) 2008-9 Semin Lee
|
20
|
+
module Egor
|
21
|
+
class CLI
|
22
|
+
class << self
|
23
|
+
|
24
|
+
# :nodoc:
|
25
|
+
def print_version
  # Writes the library version constant (Egor::VERSION) to standard output,
  # followed by a newline. Returns whatever Kernel#puts returns (nil).
  puts Egor::VERSION
end
|
28
|
+
|
29
|
+
# Print Egor's Usage on the screen
|
30
|
+
#
|
31
|
+
# :call-seq:
|
32
|
+
# Egor::CLI::print_usage
|
33
|
+
#
|
34
|
+
def print_usage
  # Writes the command-line help screen to standard output.
  #
  # The text is a `<<-` heredoc, so every character between the opening line
  # and the USAGE terminator (including leading spaces) is printed verbatim;
  # only the terminator itself may be indented. The trailing blank line is
  # intentional and is part of the output.
  puts <<-USAGE
egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.

Usage:
    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    egor [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) STRING: a tem file
    --tem-list (-l) STRING: a list for tem files
    --classdef (-c) STRING: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
    --noweight: calculate substitution counts with no weights (default)
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER: (NOT implemented yet)
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER:
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5)
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --verbose (-v) INTEGER
        0 for ERROR level (default)
        1 for WARN or above level
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
|
77
|
+
|
78
|
+
# Calculate PID between two sequences
|
79
|
+
#
|
80
|
+
# :call-seq:
|
81
|
+
# Egor::CLI::calc_pid(seq1, seq2) -> Float
|
82
|
+
#
|
83
|
+
def calc_pid(seq1, seq2)
  # Percentage identity (PID) between two aligned sequences given as Strings,
  # compared column by column with "-" as the gap symbol.
  #
  #   PID = 100 * identical / (aligned + gapped)
  #
  # * aligned   - columns where neither sequence has a gap
  # * identical - aligned columns whose two symbols are equal
  # * gapped    - columns where exactly one sequence has a gap
  #
  # Columns where both sequences have a gap are ignored. Returns a Float.
  # When there is no comparable column at all (empty input, or only
  # gap-vs-gap columns) the result is 0.0 rather than NaN.
  #
  # Note: columns are paired with Array#zip, so if seq2 is shorter than seq1
  # the missing symbols are nil and are treated as non-gap mismatches; symbols
  # of seq2 beyond the length of seq1 are not examined.
  aligned = 0
  identical = 0
  gapped = 0

  seq1.split("").zip(seq2.split("")).each do |res1, res2|
    gap1 = (res1 == "-")
    gap2 = (res2 == "-")

    if !gap1 && !gap2
      aligned += 1
      identical += 1 if res1 == res2
    elsif gap1 != gap2
      # exactly one of the two positions is a gap
      gapped += 1
    end
  end

  compared = aligned + gapped
  # Guard the division: 0.0 / 0 would otherwise produce NaN.
  return 0.0 if compared == 0

  100.0 * identical / compared
end
|
105
|
+
memoize :calc_pid
|
106
|
+
|
107
|
+
# :nodoc:
|
108
|
+
def execute(arguments=[])
|
109
|
+
#
|
110
|
+
# Abbreviations in the aa1 codes
|
111
|
+
#
|
112
|
+
# * env: environment
|
113
|
+
# * tem: (FUGUE) template
|
114
|
+
# * classdef: (environment) class definition
|
115
|
+
# * aa: amino acid
|
116
|
+
# * aa: weighted amino acid
|
117
|
+
# * tot: total
|
118
|
+
# * rel: relative
|
119
|
+
# * obs: observation (frequency)
|
120
|
+
# * mut: mutation
|
121
|
+
# * mutb: mutability
|
122
|
+
# * freq: frequency
|
123
|
+
# * prob: probability
|
124
|
+
# * opts: options
|
125
|
+
#
|
126
|
+
|
127
|
+
# Part 1.
|
128
|
+
#
|
129
|
+
# Global variables and their default values
|
130
|
+
#
|
131
|
+
$logger = Logger.new(STDOUT)
|
132
|
+
$logger.level = Logger::ERROR
|
133
|
+
$amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("")
|
134
|
+
$tem_list = nil
|
135
|
+
$tem_file = nil
|
136
|
+
$classdef = "classdef.dat"
|
137
|
+
$outfile = "allmat.dat"
|
138
|
+
$outfh = nil # file hanfle for outfile
|
139
|
+
$output = 2
|
140
|
+
$aa_tot_obs = {}
|
141
|
+
$aa_mut_obs = {}
|
142
|
+
$aa_mutb = {}
|
143
|
+
$aa_rel_mutb = {}
|
144
|
+
$aa_rel_freq = {}
|
145
|
+
$env_aa_obs = {}
|
146
|
+
$ali_size = 0
|
147
|
+
$tot_aa = 0
|
148
|
+
$sigma = 5.0
|
149
|
+
$weight = 60
|
150
|
+
$noweight = false
|
151
|
+
$smooth = :partial
|
152
|
+
$nosmooth = false
|
153
|
+
$scale = 3
|
154
|
+
$pidmin = nil
|
155
|
+
$pidmax = nil
|
156
|
+
$scale = 3
|
157
|
+
$add = 0
|
158
|
+
$penv = false
|
159
|
+
$heatmap = false
|
160
|
+
$smooth_prob = {}
|
161
|
+
|
162
|
+
# Part 2.
|
163
|
+
#
|
164
|
+
# Parsing options
|
165
|
+
#
|
166
|
+
opts = GetoptLong.new(
|
167
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
168
|
+
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
169
|
+
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
170
|
+
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
171
|
+
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
172
|
+
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
173
|
+
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
174
|
+
[ '--heatmap', GetoptLong::NO_ARGUMENT ],
|
175
|
+
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
176
|
+
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
177
|
+
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
178
|
+
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
179
|
+
[ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
|
180
|
+
[ '--version', GetoptLong::NO_ARGUMENT ]
|
181
|
+
)
|
182
|
+
|
183
|
+
opts.each do |opt, arg|
|
184
|
+
case opt
|
185
|
+
when '--help'
|
186
|
+
print_usage
|
187
|
+
exit 0
|
188
|
+
when '--tem-list'
|
189
|
+
$tem_list = arg
|
190
|
+
when '--tem-file'
|
191
|
+
$tem_file = arg
|
192
|
+
when '--classdef'
|
193
|
+
$classdef = arg
|
194
|
+
when '--output'
|
195
|
+
$output = arg.to_i
|
196
|
+
when '--outfile'
|
197
|
+
$outfile = arg
|
198
|
+
when '--cyc'
|
199
|
+
$logger.error "!!! --cys option is not available yet"
|
200
|
+
exit 1
|
201
|
+
$cysteine = (arg.to_i == 1 ? false : true)
|
202
|
+
when '--weight'
|
203
|
+
$weight = arg.to_i
|
204
|
+
when '--sigma'
|
205
|
+
$sigma = arg.to_f
|
206
|
+
when '--pidmin'
|
207
|
+
$pidmin = arg.to_f
|
208
|
+
when '--pidmax'
|
209
|
+
$pidmax = arg.to_f
|
210
|
+
when '--noweight'
|
211
|
+
$noweight = true
|
212
|
+
when '--smooth'
|
213
|
+
$smooth = (arg.to_i == 1 ? :full : :parital)
|
214
|
+
when '--nosmooth'
|
215
|
+
$nosmooth = true
|
216
|
+
when '--scale'
|
217
|
+
$scale = arg.to_f
|
218
|
+
when '--add'
|
219
|
+
$add = arg.to_f
|
220
|
+
when '--penv'
|
221
|
+
$logger.error "!!! --penv option is not available yet"
|
222
|
+
exit 1
|
223
|
+
$penv = true
|
224
|
+
when '--heatmap'
|
225
|
+
$heatmap = true
|
226
|
+
when '--verbose'
|
227
|
+
$logger.level = case arg.to_i
|
228
|
+
when 0 then Logger::ERROR
|
229
|
+
when 1 then Logger::WARN
|
230
|
+
when 2 then Logger::INFO
|
231
|
+
when 3 then Logger::DEBUG
|
232
|
+
else Logger::ERROR
|
233
|
+
end
|
234
|
+
when '--version'
|
235
|
+
print_version
|
236
|
+
exit 0
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# when arguments are nonsense, print usage
|
241
|
+
if ((ARGV.length != 0) ||
|
242
|
+
(!$tem_list && !$tem_file) ||
|
243
|
+
($tem_list && $tem_file))
|
244
|
+
print_usage
|
245
|
+
exit 1
|
246
|
+
end
|
247
|
+
|
248
|
+
# Part 3.
|
249
|
+
#
|
250
|
+
# Reading Environment Class Definition File
|
251
|
+
#
|
252
|
+
|
253
|
+
# an array for storing all environment feature objects
|
254
|
+
$env_features = []
|
255
|
+
|
256
|
+
# aa1 amino acid in a substitution itself is a environment feature
|
257
|
+
$env_features << EnvironmentFeature.new("sequence",
|
258
|
+
$amino_acids,
|
259
|
+
$amino_acids,
|
260
|
+
"F",
|
261
|
+
"F")
|
262
|
+
|
263
|
+
# read environment class definition file and
|
264
|
+
# store them into the array prepared above
|
265
|
+
IO.foreach($classdef) do |line|
|
266
|
+
if line.start_with?("#")
|
267
|
+
next
|
268
|
+
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
269
|
+
$logger.info ">>> An environment feature, #{line.chomp} detected"
|
270
|
+
if env_ftr[-1] == "T"
|
271
|
+
# skip silenced environment feature
|
272
|
+
$logger.warn "!!! The environment feature, #{line.chomp} silent"
|
273
|
+
next
|
274
|
+
end
|
275
|
+
if env_ftr[-2] == "T"
|
276
|
+
$logger.warn "!!! The environment feature, #{line.chomp} constrained"
|
277
|
+
end
|
278
|
+
$env_features << EnvironmentFeature.new(env_ftr[0],
|
279
|
+
env_ftr[1].split(""),
|
280
|
+
env_ftr[2].split(""),
|
281
|
+
env_ftr[3],
|
282
|
+
env_ftr[4])
|
283
|
+
else
|
284
|
+
$logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
|
285
|
+
exit 1
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
# a hash for storing all environment objects
|
290
|
+
$envs = {}
|
291
|
+
|
292
|
+
# generate all possible combinations of environment labels, and
|
293
|
+
# create & store every environment object into the hash prepared above with the label as a key
|
294
|
+
$env_features.inject([]) { |sum, ec|
|
295
|
+
sum << ec.labels
|
296
|
+
}.inject { |pro, lb|
|
297
|
+
pro.product(lb)
|
298
|
+
}.each_with_index { |e, i|
|
299
|
+
$envs[e.flatten.join] = Environment.new(i, e.flatten.join)
|
300
|
+
}
|
301
|
+
|
302
|
+
# Part 4.
|
303
|
+
#
|
304
|
+
# Reading TEM file or TEMLIST list file and counting substitutions
|
305
|
+
#
|
306
|
+
|
307
|
+
# a global file handle for output
|
308
|
+
$outfh = File.open($outfile, "w")
|
309
|
+
|
310
|
+
if $tem_file
|
311
|
+
$tem_list = [$tem_file]
|
312
|
+
end
|
313
|
+
|
314
|
+
if $tem_list
|
315
|
+
IO.foreach($tem_list) do |tem_file|
|
316
|
+
tem_file.chomp!
|
317
|
+
|
318
|
+
$logger.info ">>> Analysing #{tem_file} ..."
|
319
|
+
|
320
|
+
ali = Bio::Alignment::OriginalAlignment.new
|
321
|
+
ff = Bio::FlatFile.auto(tem_file)
|
322
|
+
ff.each_entry do |pir|
|
323
|
+
if pir.definition == "sequence"
|
324
|
+
ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
$ali_size += ali.size
|
329
|
+
env_labels = {}
|
330
|
+
disulphide = {}
|
331
|
+
|
332
|
+
ali.each_pair do |key, seq|
|
333
|
+
# check disulphide bond environment first!
|
334
|
+
ff.rewind
|
335
|
+
ff.each_entry do |pir|
|
336
|
+
if (pir.entry_id == key) && (pir.definition == "disulphide")
|
337
|
+
disulphide[key] = pir.data.gsub("\n", "").split("")
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
341
|
+
$env_features.each_with_index do |ec, ei|
|
342
|
+
env_labels[key] = [] unless env_labels.has_key?(key)
|
343
|
+
|
344
|
+
ff.rewind
|
345
|
+
ff.each_entry do |pir|
|
346
|
+
if (pir.entry_id == key) && (pir.definition == ec.name)
|
347
|
+
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
348
|
+
if sym == "-"
|
349
|
+
"-"
|
350
|
+
elsif sym == "X" || sym == "x"
|
351
|
+
"X"
|
352
|
+
else
|
353
|
+
if ei == 0 # Amino Acid Environment Feature
|
354
|
+
((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
355
|
+
else
|
356
|
+
ec.labels[ec.symbols.index(sym)]
|
357
|
+
end
|
358
|
+
end
|
359
|
+
end
|
360
|
+
|
361
|
+
if env_labels[key].empty?
|
362
|
+
env_labels[key] = labels
|
363
|
+
else
|
364
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
365
|
+
end
|
366
|
+
end
|
367
|
+
end
|
368
|
+
end
|
369
|
+
end
|
370
|
+
|
371
|
+
if $noweight
|
372
|
+
ali.each_pair do |id1, seq1|
|
373
|
+
ali.each_pair do |id2, seq2|
|
374
|
+
if id1 != id2
|
375
|
+
pid = calc_pid(seq1, seq2)
|
376
|
+
s1 = seq1.split("")
|
377
|
+
s2 = seq2.split("")
|
378
|
+
|
379
|
+
# check PID_MIN
|
380
|
+
if $pidmin && (pid < $pidmin)
|
381
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
|
382
|
+
next
|
383
|
+
end
|
384
|
+
|
385
|
+
# check PID_MAX
|
386
|
+
if $pidmax && (pid > $pidmax)
|
387
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
388
|
+
next
|
389
|
+
end
|
390
|
+
|
391
|
+
s1.each_with_index do |aa1, pos|
|
392
|
+
if env_labels[id1][pos].include?("X")
|
393
|
+
$logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
394
|
+
next
|
395
|
+
end
|
396
|
+
|
397
|
+
aa1.upcase!
|
398
|
+
aa2 = s2[pos].upcase
|
399
|
+
|
400
|
+
if !$amino_acids.include?(aa1)
|
401
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
402
|
+
next
|
403
|
+
end
|
404
|
+
|
405
|
+
if !$amino_acids.include?(aa2)
|
406
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
407
|
+
next
|
408
|
+
end
|
409
|
+
|
410
|
+
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
411
|
+
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
412
|
+
|
413
|
+
$envs[env_labels[id1][pos]].add_residue_count(aa2)
|
414
|
+
|
415
|
+
grp_label = env_labels[id1][pos][1..-1]
|
416
|
+
|
417
|
+
if $env_aa_obs.has_key? grp_label
|
418
|
+
if $env_aa_obs[grp_label].has_key? aa1
|
419
|
+
$env_aa_obs[grp_label][aa1] += 1
|
420
|
+
else
|
421
|
+
$env_aa_obs[grp_label][aa1] = 1
|
422
|
+
end
|
423
|
+
else
|
424
|
+
$env_aa_obs[grp_label] = Hash.new(0)
|
425
|
+
$env_aa_obs[grp_label][aa1] = 1
|
426
|
+
end
|
427
|
+
|
428
|
+
if $aa_tot_obs.has_key? aa1
|
429
|
+
$aa_tot_obs[aa1] += 1
|
430
|
+
else
|
431
|
+
$aa_tot_obs[aa1] = 1
|
432
|
+
end
|
433
|
+
|
434
|
+
if aa1 != aa2
|
435
|
+
if $aa_mut_obs.has_key? aa1
|
436
|
+
$aa_mut_obs[aa1] += 1
|
437
|
+
else
|
438
|
+
$aa_mut_obs[aa1] = 1
|
439
|
+
end
|
440
|
+
end
|
441
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
442
|
+
end
|
443
|
+
end
|
444
|
+
end
|
445
|
+
end
|
446
|
+
else
|
447
|
+
# BLOSUM-like weighting
|
448
|
+
clusters = []
|
449
|
+
ali.each_pair { |i, s| clusters << [i] }
|
450
|
+
|
451
|
+
# a loop for single linkage clustering
|
452
|
+
begin
|
453
|
+
continue = false
|
454
|
+
0.upto(clusters.size - 2) do |i|
|
455
|
+
indexes = []
|
456
|
+
(i + 1).upto(clusters.size - 1) do |j|
|
457
|
+
found = false
|
458
|
+
clusters[i].each do |c1|
|
459
|
+
clusters[j].each do |c2|
|
460
|
+
if calc_pid(ali[c1], ali[c2]) >= $weight
|
461
|
+
indexes << j
|
462
|
+
found = true
|
463
|
+
break
|
464
|
+
end
|
465
|
+
end
|
466
|
+
break if found
|
467
|
+
end
|
468
|
+
end
|
469
|
+
|
470
|
+
unless indexes.empty?
|
471
|
+
continue = true
|
472
|
+
group = clusters[i]
|
473
|
+
indexes.each do |k|
|
474
|
+
group = group.concat(clusters[k])
|
475
|
+
clusters[k] = nil
|
476
|
+
end
|
477
|
+
clusters[i] = group
|
478
|
+
clusters.compact!
|
479
|
+
end
|
480
|
+
end
|
481
|
+
end while(continue)
|
482
|
+
|
483
|
+
clusters.combination(2).each do |cluster1, cluster2|
|
484
|
+
cluster1.each do |id1|
|
485
|
+
cluster2.each do |id2|
|
486
|
+
seq1 = ali[id1].split("")
|
487
|
+
seq2 = ali[id2].split("")
|
488
|
+
seq1.each_with_index do |aa1, pos|
|
489
|
+
if env_labels[id1][pos].include?("X")
|
490
|
+
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
491
|
+
next
|
492
|
+
end
|
493
|
+
|
494
|
+
aa1.upcase!
|
495
|
+
aa2 = seq2[pos].upcase
|
496
|
+
|
497
|
+
if !$amino_acids.include?(aa1)
|
498
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
499
|
+
next
|
500
|
+
end
|
501
|
+
|
502
|
+
if !$amino_acids.include?(aa2)
|
503
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
504
|
+
next
|
505
|
+
end
|
506
|
+
|
507
|
+
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
508
|
+
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
509
|
+
size1 = cluster1.size
|
510
|
+
size2 = cluster2.size
|
511
|
+
obs1 = 1.0 / size1
|
512
|
+
obs2 = 1.0 / size2
|
513
|
+
|
514
|
+
$envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
|
515
|
+
$envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))
|
516
|
+
|
517
|
+
grp_label1 = env_labels[id1][pos][1..-1]
|
518
|
+
grp_label2 = env_labels[id2][pos][1..-1]
|
519
|
+
|
520
|
+
if $env_aa_obs.has_key? grp_label1
|
521
|
+
if $env_aa_obs[grp_label1].has_key? aa1
|
522
|
+
$env_aa_obs[grp_label1][aa1] += obs1
|
523
|
+
else
|
524
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
525
|
+
end
|
526
|
+
else
|
527
|
+
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
528
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
529
|
+
end
|
530
|
+
|
531
|
+
if $env_aa_obs.has_key? grp_label2
|
532
|
+
if $env_aa_obs[grp_label2].has_key? aa2
|
533
|
+
$env_aa_obs[grp_label2][aa2] += obs2
|
534
|
+
else
|
535
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
536
|
+
end
|
537
|
+
else
|
538
|
+
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
539
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
540
|
+
end
|
541
|
+
|
542
|
+
if $aa_tot_obs.has_key? aa1
|
543
|
+
$aa_tot_obs[aa1] += obs1
|
544
|
+
else
|
545
|
+
$aa_tot_obs[aa1] = obs1
|
546
|
+
end
|
547
|
+
|
548
|
+
if $aa_tot_obs.has_key? aa2
|
549
|
+
$aa_tot_obs[aa2] += obs2
|
550
|
+
else
|
551
|
+
$aa_tot_obs[aa2] = obs2
|
552
|
+
end
|
553
|
+
|
554
|
+
if aa1 != aa2
|
555
|
+
if $aa_mut_obs.has_key? aa1
|
556
|
+
$aa_mut_obs[aa1] += obs1
|
557
|
+
else
|
558
|
+
$aa_mut_obs[aa1] = obs1
|
559
|
+
end
|
560
|
+
if $aa_mut_obs.has_key? aa2
|
561
|
+
$aa_mut_obs[aa2] += obs2
|
562
|
+
else
|
563
|
+
$aa_mut_obs[aa2] = obs2
|
564
|
+
end
|
565
|
+
end
|
566
|
+
|
567
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
568
|
+
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
|
569
|
+
end
|
570
|
+
end
|
571
|
+
end
|
572
|
+
end
|
573
|
+
end # if !$nosmooth
|
574
|
+
end # IO.foreach($tem_list)
|
575
|
+
|
576
|
+
# print out default header
|
577
|
+
$outfh.puts <<HEADER
|
578
|
+
# Environment-specific amino acid substitution matrices
|
579
|
+
# Creator: egor version #{Egor::VERSION}
|
580
|
+
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
581
|
+
#
|
582
|
+
# Definitions for structural environments:
|
583
|
+
# #{$env_features.size - 1} features used
|
584
|
+
#
|
585
|
+
HEADER
|
586
|
+
|
587
|
+
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
588
|
+
|
589
|
+
$outfh.puts <<HEADER
|
590
|
+
#
|
591
|
+
# (read in from #{$classdef})
|
592
|
+
#
|
593
|
+
# Number of alignments: #{$ali_size}
|
594
|
+
# (list of .tem files read in from #{$tem_list})
|
595
|
+
#
|
596
|
+
# Total number of environments: #{Integer($envs.size / $amino_acids.size)}
|
597
|
+
#
|
598
|
+
# There are #{$amino_acids.size} amino acids considered.
|
599
|
+
# #{$amino_acids.join}
|
600
|
+
#
|
601
|
+
HEADER
|
602
|
+
|
603
|
+
if $noweight
|
604
|
+
$outfh.puts "# Weighting scheme: none"
|
605
|
+
else
|
606
|
+
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
607
|
+
end
|
608
|
+
$outfh.puts "#"
|
609
|
+
|
610
|
+
# calculate amino acid frequencies and mutabilities, and
|
611
|
+
# print them as default statistics in the header part
|
612
|
+
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
613
|
+
$tot_aa = $aa_tot_obs.values.sum
|
614
|
+
|
615
|
+
$outfh.puts "#"
|
616
|
+
$outfh.puts "# Total amino acid frequencies:\n"
|
617
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]
|
618
|
+
|
619
|
+
$aa_tot_obs.each_pair do |res, freq|
|
620
|
+
$aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
|
621
|
+
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
622
|
+
$aa_rel_freq[res] = freq / $tot_aa.to_f
|
623
|
+
end
|
624
|
+
|
625
|
+
$amino_acids.each do |res|
|
626
|
+
if $noweight
|
627
|
+
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
628
|
+
[res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
629
|
+
else
|
630
|
+
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
631
|
+
[res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
632
|
+
end
|
633
|
+
end
|
634
|
+
$outfh.puts "#"
|
635
|
+
|
636
|
+
# calculating probabilities for each environment
|
637
|
+
$envs.values.each do |e|
|
638
|
+
if e.freq_array.sum != 0
|
639
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
# count raw frequencies
|
644
|
+
$tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
645
|
+
|
646
|
+
# for each combination of environment features
|
647
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
648
|
+
|
649
|
+
env_groups.to_a.sort_by { |env_group|
|
650
|
+
# a bit clumsy sorting here...
|
651
|
+
env_group[0].split("").map_with_index { |l, i|
|
652
|
+
$env_features[i + 1].labels.index(l)
|
653
|
+
}
|
654
|
+
}.each_with_index do |group, group_no|
|
655
|
+
grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
656
|
+
|
657
|
+
$amino_acids.each_with_index do |aa, ai|
|
658
|
+
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
659
|
+
0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
|
660
|
+
end
|
661
|
+
|
662
|
+
$tot_freq_matrix += grp_freq_matrix
|
663
|
+
|
664
|
+
if $output == 0
|
665
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
666
|
+
$outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
667
|
+
end
|
668
|
+
end
|
669
|
+
|
670
|
+
if $output == 0
|
671
|
+
$outfh.puts ">Total"
|
672
|
+
$outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
673
|
+
exit 0
|
674
|
+
end
|
675
|
+
|
676
|
+
# for probability
|
677
|
+
if $output == 1
|
678
|
+
$outfh.puts <<HEADER
|
679
|
+
#
|
680
|
+
# Each column (j) represents the probability distribution for the
|
681
|
+
# likelihood of acceptance of a mutational event by a residue type j in
|
682
|
+
# a particular structural environment (specified after >) leading to
|
683
|
+
# any other residue type (i) and sums up to 100.
|
684
|
+
#
|
685
|
+
HEADER
|
686
|
+
end
|
687
|
+
|
688
|
+
if ($output > 0) && $nosmooth
|
689
|
+
# Probability matrices
|
690
|
+
tot_prob_matrix = NMatrix.float(21, 21)
|
691
|
+
|
692
|
+
# for each combination of environment features
|
693
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
694
|
+
env_groups.to_a.sort_by { |env_group|
|
695
|
+
# a bit clumsy sorting here...
|
696
|
+
env_group[0].split("").map_with_index { |l, i|
|
697
|
+
$env_features[i + 1].labels.index(l)
|
698
|
+
}
|
699
|
+
}.each_with_index do |group, group_no|
|
700
|
+
grp_prob_matrix = NMatrix.float(21,21)
|
701
|
+
|
702
|
+
$amino_acids.each_with_index do |aa, ai|
|
703
|
+
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
704
|
+
0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
|
705
|
+
end
|
706
|
+
|
707
|
+
tot_prob_matrix += grp_prob_matrix
|
708
|
+
|
709
|
+
if ($output == 1)
|
710
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
711
|
+
$outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
712
|
+
end
|
713
|
+
end
|
714
|
+
|
715
|
+
if ($output == 1)
|
716
|
+
$outfh.puts ">Total"
|
717
|
+
$outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
718
|
+
$outfh.close
|
719
|
+
exit 0
|
720
|
+
end
|
721
|
+
end
|
722
|
+
|
723
|
+
# for smoothing...
|
724
|
+
if ($output > 0) && !$nosmooth
|
725
|
+
#
|
726
|
+
# p1 probability
|
727
|
+
#
|
728
|
+
p1 = NArray.float(21)
|
729
|
+
a0 = NArray.float(21).fill(1 / 21.0)
|
730
|
+
big_N = $tot_aa.to_f
|
731
|
+
small_n = 21.0
|
732
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
733
|
+
omega2 = 1.0 - omega1
|
734
|
+
|
735
|
+
if $smooth == :partial
|
736
|
+
# for partial smoothing, p1 probability is not smoothed!
|
737
|
+
0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
738
|
+
$smooth_prob[1] = p1
|
739
|
+
else
|
740
|
+
# for full smoothing, p1 probability is smoothed
|
741
|
+
0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
742
|
+
$smooth_prob[1] = p1
|
743
|
+
end
|
744
|
+
|
745
|
+
#
|
746
|
+
# p2 and above
|
747
|
+
#
|
748
|
+
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
749
|
+
|
750
|
+
if $smooth == :partial
|
751
|
+
$outfh.puts <<HEADER
|
752
|
+
# Partial Smoothing:
|
753
|
+
#
|
754
|
+
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
755
|
+
# each row in all matrices (no smoothing)
|
756
|
+
# ^^^^^^^^^^^^
|
757
|
+
# p2(ri|Rj) is estimated as:
|
758
|
+
# p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
|
759
|
+
#
|
760
|
+
# p3(ri|Rj,fq) is estimated as:
|
761
|
+
# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
|
762
|
+
# where
|
763
|
+
# A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
|
764
|
+
#
|
765
|
+
# The smoothing procedure is curtailed here and finally
|
766
|
+
# p5(ri|Rj,...) is estimated as:
|
767
|
+
# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
|
768
|
+
# where
|
769
|
+
# A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
|
770
|
+
#
|
771
|
+
# Weights (omegas) are calculated as in Topham et al. 1993)
|
772
|
+
#
|
773
|
+
# sigma value used is: 5.00
|
774
|
+
#
|
775
|
+
HEADER
|
776
|
+
1.upto($env_features.size) do |ci|
|
777
|
+
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
778
|
+
next if (ci > 2) && (ci < $env_features.size)
|
779
|
+
|
780
|
+
env_labels.combination(ci) do |c1|
|
781
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
782
|
+
pattern = "." * $env_features.size
|
783
|
+
|
784
|
+
labels.each do |label|
|
785
|
+
i = label[0].chr.to_i
|
786
|
+
l = label[1].chr
|
787
|
+
pattern[i] = l
|
788
|
+
end
|
789
|
+
|
790
|
+
if pattern =~ /^\./
|
791
|
+
$logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
|
792
|
+
next
|
793
|
+
end
|
794
|
+
|
795
|
+
# get environmetns, frequencies, and probabilities
|
796
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
797
|
+
freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
798
|
+
prob_arr = NArray.float(21)
|
799
|
+
0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
|
800
|
+
|
801
|
+
# # assess whether a residue type j is compatible with a particular combination of structural features
|
802
|
+
# # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
|
803
|
+
# if ci == $env_features.size
|
804
|
+
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
805
|
+
# sub_pattern = "." * $env_features.size
|
806
|
+
# sub_pattern[0] = aa_label
|
807
|
+
# sub_freq_sum = 0
|
808
|
+
#
|
809
|
+
# labels[1..-1].each do |label|
|
810
|
+
# next if label.start_with?("0")
|
811
|
+
# i = label[0].chr.to_i
|
812
|
+
# l = label[1].chr
|
813
|
+
# sub_pattern[i] = l
|
814
|
+
# sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
815
|
+
# sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
816
|
+
# sub_freq_sum += sub_freq_arr.sum
|
817
|
+
# end
|
818
|
+
#
|
819
|
+
# if sub_freq_sum == 0
|
820
|
+
# if $smooth_prob.has_key?(ci + 1)
|
821
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
822
|
+
# else
|
823
|
+
# $smooth_prob[ci + 1] = {}
|
824
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
825
|
+
# end
|
826
|
+
# $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
|
827
|
+
# next
|
828
|
+
# end
|
829
|
+
# end
|
830
|
+
|
831
|
+
# collect priors if ci > 1
|
832
|
+
priors = []
|
833
|
+
|
834
|
+
if ci == 2
|
835
|
+
labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
|
836
|
+
priors << $smooth_prob[2][c3.to_set]
|
837
|
+
}
|
838
|
+
elsif ci == $env_features.size
|
839
|
+
labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
|
840
|
+
priors << $smooth_prob[3][c3.to_set]
|
841
|
+
}
|
842
|
+
end
|
843
|
+
|
844
|
+
# entropy based weighting priors
|
845
|
+
entropy_max = Math::log(21)
|
846
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
|
847
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
848
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
849
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
850
|
+
|
851
|
+
# smoothing step
|
852
|
+
smooth_prob_arr = NArray.float(21)
|
853
|
+
big_N = freq_arr.sum.to_f
|
854
|
+
small_n = 21.0
|
855
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
856
|
+
omega2 = 1.0 - omega1
|
857
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
858
|
+
|
859
|
+
# normalization step
|
860
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
861
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
862
|
+
|
863
|
+
# store smoothed probabilties in a hash using a set of envrionment labels as a key
|
864
|
+
if !$smooth_prob.has_key?(ci + 1)
|
865
|
+
$smooth_prob[ci + 1] = {}
|
866
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
867
|
+
else
|
868
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
869
|
+
end
|
870
|
+
end
|
871
|
+
end
|
872
|
+
end
|
873
|
+
else
|
874
|
+
$outfh.puts <<HEADER
|
875
|
+
# Full Smoothing:
|
876
|
+
#
|
877
|
+
# p1(ri) is estimated as:
|
878
|
+
# p1(ri) = omega1 * A0 + omega2 * W1(ri)
|
879
|
+
#
|
880
|
+
# p2(ri|f1q) is estimated as:
|
881
|
+
# p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
|
882
|
+
#
|
883
|
+
# (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
|
884
|
+
#
|
885
|
+
# p3(ri|f1q,f2q) is estimated as:
|
886
|
+
# p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
|
887
|
+
# where
|
888
|
+
# A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
|
889
|
+
#
|
890
|
+
# The smoothing procedure is NOT curtailed here and it goes upto
|
891
|
+
#
|
892
|
+
# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
|
893
|
+
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * W5(ri|f1q,f2q,...,fn-1q)
|
894
|
+
# where
|
895
|
+
# An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
|
896
|
+
#
|
897
|
+
# Weights (omegas) are calculated as in Topham et al. 1993)
|
898
|
+
#
|
899
|
+
# sigma value used is: 5.00
|
900
|
+
#
|
901
|
+
HEADER
|
902
|
+
# full smoothing
|
903
|
+
1.upto($env_features.size) do |ci|
|
904
|
+
env_labels.combination(ci) do |c1|
|
905
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
906
|
+
pattern = "." * $env_features.size
|
907
|
+
labels.each do |label|
|
908
|
+
j = label[0].chr.to_i
|
909
|
+
l = label[1].chr
|
910
|
+
pattern[j] = l
|
911
|
+
end
|
912
|
+
|
913
|
+
# get environments, frequencies, and probabilities
|
914
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
915
|
+
freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
|
916
|
+
prob_arr = NArray.float(21)
|
917
|
+
0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
918
|
+
|
919
|
+
# collect priors
|
920
|
+
priors = []
|
921
|
+
if ci > 1
|
922
|
+
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
923
|
+
else
|
924
|
+
priors << $smooth_prob[1]
|
925
|
+
end
|
926
|
+
|
927
|
+
# entropy based weighting priors
|
928
|
+
entropy_max = Math::log(21)
|
929
|
+
entropies = priors.map do |prior|
|
930
|
+
(entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
|
931
|
+
end
|
932
|
+
weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
|
933
|
+
|
934
|
+
# smoothing step
|
935
|
+
smooth_prob_arr = NArray.float(21)
|
936
|
+
big_N = freq_arr.sum.to_f
|
937
|
+
small_n = 21.0
|
938
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
939
|
+
omega2 = 1.0 - omega1
|
940
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
941
|
+
|
942
|
+
# normalization step
|
943
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
944
|
+
0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
945
|
+
|
946
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
947
|
+
if !$smooth_prob.has_key?(ci + 1)
|
948
|
+
$smooth_prob[ci + 1] = {}
|
949
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
950
|
+
else
|
951
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
952
|
+
end
|
953
|
+
end
|
954
|
+
end
|
955
|
+
end
|
956
|
+
end
|
957
|
+
|
958
|
+
# updating smoothed probability array for each environment
|
959
|
+
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
960
|
+
|
961
|
+
# for a total substitution probability matrix
|
962
|
+
tot_smooth_prob_matrix = NMatrix.float(21,21)
|
963
|
+
|
964
|
+
# grouping environments by their environment labels, excluding the leading amino acid label
|
965
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
966
|
+
|
967
|
+
# sorting environments and build 21X21 substitution matrices
|
968
|
+
env_groups.to_a.sort_by { |env_group|
|
969
|
+
# a bit clumsy sorting here...
|
970
|
+
env_group[0].split("").map_with_index { |l, i|
|
971
|
+
$env_features[i + 1].labels.index(l)
|
972
|
+
}
|
973
|
+
}.each_with_index do |group, group_no|
|
974
|
+
# calculating 21X21 substitution probability matrix for each environment
|
975
|
+
grp_prob_matrix = NMatrix.float(21,21)
|
976
|
+
|
977
|
+
$amino_acids.each_with_index do |aa, ai|
|
978
|
+
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
979
|
+
0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
|
980
|
+
end
|
981
|
+
|
982
|
+
tot_smooth_prob_matrix += grp_prob_matrix
|
983
|
+
|
984
|
+
if $output == 1
|
985
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
986
|
+
$outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
987
|
+
end
|
988
|
+
end
|
989
|
+
|
990
|
+
tot_smooth_prob_matrix /= env_groups.size
|
991
|
+
|
992
|
+
if $output == 1
|
993
|
+
$outfh.puts ">Total"
|
994
|
+
$outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
995
|
+
$outfh.close
|
996
|
+
exit 0
|
997
|
+
end
|
998
|
+
|
999
|
+
if $output == 2
|
1000
|
+
$outfh.puts <<HEADER
|
1001
|
+
#
|
1002
|
+
# The probabilities were then divided by the background probabilities
|
1003
|
+
# which were derived from the environment-independent amino acid frequencies.
|
1004
|
+
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1005
|
+
#
|
1006
|
+
# Shown here are logarithms of these values multiplied by 3/log(2)
|
1007
|
+
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1008
|
+
#
|
1009
|
+
# For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
|
1010
|
+
#
|
1011
|
+
HEADER
|
1012
|
+
|
1013
|
+
# log-odds ratio matrices from now on
|
1014
|
+
tot_logo_mat = NMatrix.float(21,21)
|
1015
|
+
factor = $scale / Math::log(2)
|
1016
|
+
|
1017
|
+
# grouping environments by their environment labels, excluding the leading amino acid label
|
1018
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1019
|
+
|
1020
|
+
# sorting environments and build 21X21 substitution matrices
|
1021
|
+
env_groups.to_a.sort_by { |env_group|
|
1022
|
+
# a bit clumsy sorting here...
|
1023
|
+
env_group[0].split("").map_with_index { |l, i|
|
1024
|
+
$env_features[i + 1].labels.index(l)
|
1025
|
+
}
|
1026
|
+
}.each_with_index do |group, group_no|
|
1027
|
+
# calculating 21X21 log-odds ratio matrix for each environment
|
1028
|
+
grp_label = group[0]
|
1029
|
+
grp_envs = group[1]
|
1030
|
+
grp_logo_mat = NMatrix.float(21,21)
|
1031
|
+
|
1032
|
+
$amino_acids.each_with_index do |aa, ai|
|
1033
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1034
|
+
logo_arr = NArray.float(21)
|
1035
|
+
|
1036
|
+
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1037
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1038
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1039
|
+
logo_arr[j] = factor * Math::log(odds)
|
1040
|
+
end
|
1041
|
+
0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1042
|
+
end
|
1043
|
+
|
1044
|
+
tot_logo_mat += grp_logo_mat
|
1045
|
+
|
1046
|
+
$outfh.puts ">#{grp_label} #{group_no}"
|
1047
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1048
|
+
end
|
1049
|
+
|
1050
|
+
tot_logo_mat /= env_groups.size
|
1051
|
+
|
1052
|
+
$outfh.puts ">Total"
|
1053
|
+
$outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1054
|
+
$outfh.close
|
1055
|
+
exit 0
|
1056
|
+
end
|
1057
|
+
end
|
1058
|
+
end
|
1059
|
+
end
|
1060
|
+
|
1061
|
+
end # class << self
|
1062
|
+
end # class CLI
|
1063
|
+
end # module Egor
|