semin-ulla 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/History.txt +38 -0
- data/Manifest.txt +90 -0
- data/PostInstall.txt +5 -0
- data/README.rdoc +259 -0
- data/Rakefile +32 -0
- data/bin/ulla +10 -0
- data/config/website.yml +2 -0
- data/config/website.yml.sample +2 -0
- data/lib/math_extensions.rb +7 -0
- data/lib/narray_extensions.rb +22 -0
- data/lib/nmatrix_extensions.rb +245 -0
- data/lib/string_extensions.rb +17 -0
- data/lib/ulla/cli.rb +1742 -0
- data/lib/ulla/environment.rb +34 -0
- data/lib/ulla/environment_class_hash.rb +20 -0
- data/lib/ulla/environment_feature.rb +26 -0
- data/lib/ulla/environment_feature_array.rb +12 -0
- data/lib/ulla/heatmap_array.rb +111 -0
- data/lib/ulla.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +71 -0
- data/test/test_helper.rb +2 -0
- data/test/test_math_extensions.rb +11 -0
- data/test/test_narray_extensions.rb +14 -0
- data/test/test_nmatrix_extensions.rb +16 -0
- data/test/test_string_extensions.rb +11 -0
- data/test/test_ulla.rb +11 -0
- data/test/ulla/test_cli.rb +9 -0
- data/test/ulla/test_environment_class_hash.rb +25 -0
- data/test/ulla/test_environment_feature.rb +29 -0
- data/website/index.html +16 -0
- data/website/index.txt +217 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +158 -0
- data/website/template.html.erb +57 -0
- metadata +215 -0
data/lib/ulla/cli.rb
ADDED
@@ -0,0 +1,1742 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'getoptlong'
|
3
|
+
require 'logger'
|
4
|
+
require 'narray'
|
5
|
+
require 'bio'
|
6
|
+
require 'set'
|
7
|
+
require 'facets'
|
8
|
+
|
9
|
+
require 'math_extensions'
|
10
|
+
require 'string_extensions'
|
11
|
+
require 'narray_extensions'
|
12
|
+
require 'nmatrix_extensions'
|
13
|
+
|
14
|
+
require 'ulla/environment'
|
15
|
+
require 'ulla/environment_class_hash'
|
16
|
+
require 'ulla/environment_feature'
|
17
|
+
require 'ulla/environment_feature_array'
|
18
|
+
require 'ulla/heatmap_array'
|
19
|
+
|
20
|
+
# This is a module for an actual command line interpreter for Ulla
|
21
|
+
# ---
|
22
|
+
# Copyright (C) 2008-9 Semin Lee
|
23
|
+
module Ulla
|
24
|
+
class CLI
|
25
|
+
class << self
|
26
|
+
|
27
|
+
# Print the Ulla version string to standard output.
#
# Writes the VERSION constant followed by a newline. VERSION is not defined
# in this part of the file; presumably it is Ulla::VERSION from lib/ulla.rb
# (TODO confirm).
#
# :nodoc:
def print_version
  puts VERSION
end
|
31
|
+
|
32
|
+
# Print Ulla's usage on the screen
#
# Writes the command-line help text (program summary, the two invocation
# forms and the list of options with their defaults) to standard output.
# The defaults quoted here mirror the initial values assigned at the start
# of Ulla::CLI::execute; keep the two in sync when changing either.
#
# :call-seq:
#   Ulla::CLI::print_usage
#
def print_usage
  puts <<-USAGE
ulla: a program to calculate environment-specific amino acid substitution tables.

Usage:
    ulla [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    ulla [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) FILE: a tem file
    --tem-list (-l) FILE: a list for tem files
    --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) FILE: output filename (default 'allmat.dat')
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER:
        0 for using C and J only for structure (default)
        1 for both structure and sequence
        2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
    --output INTEGER:
        0 for raw counts (no smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --noroundoff: do not round off log odds ratio
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
    --autosigma: automatically adjust the sigma value for smoothing
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --heatmap INTEGER:
        0 create a heat map file for each substitution table
        1 create one big file containing all heat maps from substitution tables
        2 do both 0 and 1
    --heatmap-format INTEGER:
        0 for Portable Network Graphics (PNG) Format (default)
        1 for Graphics Interchange Format (GIF)
        2 for Joint Photographic Experts Group (JPEG) Format
        3 for Microsoft Windows bitmap (BMP) Format
        4 for Portable Document Format (PDF)
    --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
    --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
    --heatmap-values: print values in the cells when generating heat maps
    --verbose (-v) INTEGER
        0 for ERROR level
        1 for WARN or above level (default)
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
|
96
|
+
|
97
|
+
# Calculate PID (percentage identity) between two aligned sequences
#
# The sequences are compared column by column (case-sensitively):
#
# * a column with a residue in both sequences counts as aligned, and also
#   as identical when the two characters are equal;
# * a column with a gap ('-') in exactly one sequence counts as a gap column;
# * a column with a gap in both sequences is ignored.
#
#   PID = 100 * identical / (aligned + gap columns)
#
# Only the columns of seq1 are visited (Array#zip). When seq2 is shorter,
# its missing positions are nil, which is not '-' and is therefore treated
# like a non-gap, non-matching symbol.
#
# Returns 0.0 when there is no comparable column at all (empty input or
# columns consisting only of double gaps) instead of the NaN that a
# 0.0 / 0 division would produce.
#
# :call-seq:
#   Ulla::CLI::calculate_pid(seq1, seq2) -> Float
#
def calculate_pid(seq1, seq2)
  align = 0 # no. of columns with a residue in both sequences
  ident = 0 # no. of aligned columns with identical residues
  intgp = 0 # no. of columns with a gap in exactly one sequence

  seq1.split('').zip(seq2.split('')).each do |aa1, aa2|
    gap1 = (aa1 == '-')
    gap2 = (aa2 == '-')

    if !gap1 && !gap2
      align += 1
      ident += 1 if aa1 == aa2
    elsif gap1 != gap2
      # exactly one of the two positions is a gap
      intgp += 1
    end
  end

  compared = align + intgp

  # Guard against 0.0 / 0 => NaN, which silently fails every comparison
  # (>=, <, >) made on the returned value.
  return 0.0 if compared.zero?

  100.0 * ident / compared
end
|
124
|
+
|
125
|
+
# :nodoc:
|
126
|
+
def execute(arguments=[])
|
127
|
+
#
|
128
|
+
# * Abbreviations in the codes
|
129
|
+
#
|
130
|
+
# env: environment
|
131
|
+
# tem: (FUGUE) template
|
132
|
+
# classdef: (environment) class definition
|
133
|
+
# aa: amino acid
|
134
|
+
# aa: weighted amino acid
|
135
|
+
# tot: total
|
136
|
+
# rel: relative
|
137
|
+
# jnt: joint
|
138
|
+
# cnt: count
|
139
|
+
# mut: mutation
|
140
|
+
# mutb: mutability
|
141
|
+
# freq: frequency
|
142
|
+
# prob: probability
|
143
|
+
# logo: log odds ratio
|
144
|
+
# opts: options
|
145
|
+
# fh: file handle
|
146
|
+
# ff: flat file
|
147
|
+
# ali: alignment
|
148
|
+
# mat: matrix
|
149
|
+
# arr: array
|
150
|
+
|
151
|
+
|
152
|
+
# Part 1.
|
153
|
+
#
|
154
|
+
# Global variables and their default values
|
155
|
+
#
|
156
|
+
|
157
|
+
$logger = Logger.new(STDOUT)
|
158
|
+
$logger.level = Logger::WARN
|
159
|
+
|
160
|
+
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
161
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
162
|
+
$tem_list = nil
|
163
|
+
$tem_file = nil
|
164
|
+
$classdef = 'classdef.dat'
|
165
|
+
$outfile = 'allmat.dat'
|
166
|
+
$outfh = nil # file hanfle for outfile
|
167
|
+
$output = 2 # default: log odds matrix
|
168
|
+
$ali_size = 0
|
169
|
+
$tot_aa = 0
|
170
|
+
$sigma = 5.0
|
171
|
+
$autosigma = false
|
172
|
+
$weight = 60
|
173
|
+
$noweight = false
|
174
|
+
$smooth = :partial
|
175
|
+
$nosmooth = false
|
176
|
+
$noroundoff = false
|
177
|
+
$p1smooth = false
|
178
|
+
$scale = 3
|
179
|
+
$pidmin = nil
|
180
|
+
$pidmax = nil
|
181
|
+
$scale = 3
|
182
|
+
$add = nil
|
183
|
+
$cys = 0
|
184
|
+
$targetenv = false
|
185
|
+
$penv = false
|
186
|
+
$heatmap = nil
|
187
|
+
$heatmapcol = nil
|
188
|
+
$heatmapformat = 'png'
|
189
|
+
$heatmapstem = 'heatmaps'
|
190
|
+
$heatmapvalues = false
|
191
|
+
$rvg_width = 550
|
192
|
+
$rvg_height = 650
|
193
|
+
$canvas_width = 550
|
194
|
+
$canvas_height = 650
|
195
|
+
$cell_width = 20
|
196
|
+
$cell_height = 20
|
197
|
+
|
198
|
+
$aa_tot_cnt = Hash.new(0)
|
199
|
+
$aa_mut_cnt = Hash.new(0)
|
200
|
+
$aa_mutb = {}
|
201
|
+
$aa_rel_mutb = {}
|
202
|
+
$aa_tot_freq = {}
|
203
|
+
$aa_env_cnt = Hash.new(0)
|
204
|
+
$smooth_prob = {}
|
205
|
+
$tot_cnt_mat = nil
|
206
|
+
$tot_prob_mat = nil
|
207
|
+
$tot_logo_mat = nil
|
208
|
+
$tot_smooth_prob = {}
|
209
|
+
|
210
|
+
# minimum ratio of amino acid count to sigma value
|
211
|
+
$min_cnt_sigma_ratio = 500.0
|
212
|
+
|
213
|
+
#
|
214
|
+
# Part 1 END
|
215
|
+
#
|
216
|
+
|
217
|
+
# Part 2.
|
218
|
+
#
|
219
|
+
# Parsing options
|
220
|
+
#
|
221
|
+
|
222
|
+
opts = GetoptLong.new(
|
223
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
224
|
+
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
225
|
+
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
226
|
+
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
227
|
+
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
228
|
+
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
229
|
+
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
230
|
+
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
231
|
+
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
232
|
+
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
233
|
+
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
234
|
+
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
235
|
+
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
236
|
+
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
237
|
+
[ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
|
238
|
+
[ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
|
239
|
+
[ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
|
240
|
+
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
241
|
+
[ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
|
242
|
+
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
243
|
+
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
244
|
+
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
245
|
+
[ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
|
246
|
+
[ '--version', GetoptLong::NO_ARGUMENT ]
|
247
|
+
)
|
248
|
+
|
249
|
+
begin
|
250
|
+
opts.each do |opt, arg|
|
251
|
+
case opt
|
252
|
+
when '--help'
|
253
|
+
print_usage
|
254
|
+
exit 0
|
255
|
+
when '--tem-list'
|
256
|
+
$tem_list = arg
|
257
|
+
when '--tem-file'
|
258
|
+
$tem_file = arg
|
259
|
+
when '--classdef'
|
260
|
+
$classdef = arg
|
261
|
+
when '--output'
|
262
|
+
$output = arg.to_i
|
263
|
+
when '--outfile'
|
264
|
+
$outfile = arg
|
265
|
+
when '--cys'
|
266
|
+
$cys = arg.to_i
|
267
|
+
when '--targetenv'
|
268
|
+
$targetenv = (arg.to_i == 1) ? true : false
|
269
|
+
when '--weight'
|
270
|
+
$weight = arg.to_i
|
271
|
+
when '--sigma'
|
272
|
+
$sigma = arg.to_f
|
273
|
+
when '--autosigma'
|
274
|
+
$autosigma = true
|
275
|
+
when '--pidmin'
|
276
|
+
$pidmin = arg.to_f
|
277
|
+
when '--pidmax'
|
278
|
+
$pidmax = arg.to_f
|
279
|
+
when '--noweight'
|
280
|
+
$noweight = true
|
281
|
+
when '--noroundoff'
|
282
|
+
$noroundoff = true
|
283
|
+
when '--smooth'
|
284
|
+
$smooth = (arg.to_i == 1) ? :full : :partial
|
285
|
+
when '--nosmooth'
|
286
|
+
$nosmooth = true
|
287
|
+
when '--p1smooth'
|
288
|
+
$p1smooth = true
|
289
|
+
when '--scale'
|
290
|
+
$scale = arg.to_f
|
291
|
+
when '--add'
|
292
|
+
$add = arg.to_f
|
293
|
+
when '--penv'
|
294
|
+
warn "--penv option is not supported."
|
295
|
+
exit 1
|
296
|
+
$penv = true
|
297
|
+
when '--heatmap'
|
298
|
+
$heatmap = case arg.to_i
|
299
|
+
when (0..2) then arg.to_i
|
300
|
+
else
|
301
|
+
warn "--heatmap #{arg.to_i} is not allowed."
|
302
|
+
exit1
|
303
|
+
end
|
304
|
+
when '--heatmap-columns'
|
305
|
+
$heatmapcol = arg.to_i
|
306
|
+
when '--heatmap-stem'
|
307
|
+
$heatmapstem = arg.to_s
|
308
|
+
when '--heatmap-format'
|
309
|
+
$heatmapformat = case arg.to_i
|
310
|
+
when 0 then 'png'
|
311
|
+
when 1 then 'gif'
|
312
|
+
when 2 then 'jpg'
|
313
|
+
when 3 then 'bmp'
|
314
|
+
when 4 then 'pdf'
|
315
|
+
else
|
316
|
+
warn "--heatmap-format #{arg.to_i} is not supported."
|
317
|
+
exit 1
|
318
|
+
end
|
319
|
+
when '--heatmap-values'
|
320
|
+
$heatmapvalues = true
|
321
|
+
when '--verbose'
|
322
|
+
$logger.level = case arg.to_i
|
323
|
+
when 0 then Logger::ERROR
|
324
|
+
when 1 then Logger::WARN
|
325
|
+
when 2 then Logger::INFO
|
326
|
+
when 3 then Logger::DEBUG
|
327
|
+
else
|
328
|
+
warn "--verbose (-v) #{arg.to_i} is not supported."
|
329
|
+
exit 1
|
330
|
+
end
|
331
|
+
when '--version'
|
332
|
+
print_version
|
333
|
+
exit 0
|
334
|
+
end
|
335
|
+
end
|
336
|
+
rescue
|
337
|
+
# invalid option
|
338
|
+
exit 1
|
339
|
+
end
|
340
|
+
|
341
|
+
# when arguments are nonsense, print usage
|
342
|
+
if ((ARGV.length != 0) ||
|
343
|
+
(!$tem_list && !$tem_file) ||
|
344
|
+
($tem_list && $tem_file))
|
345
|
+
print_usage
|
346
|
+
exit 1
|
347
|
+
end
|
348
|
+
|
349
|
+
# warn if any input file is missing
|
350
|
+
if $tem_list && !File.exist?($tem_list)
|
351
|
+
warn "Cannot find template list file, #{$tem_list}"
|
352
|
+
exit 1
|
353
|
+
end
|
354
|
+
|
355
|
+
if $tem_file && !File.exist?($tem_file)
|
356
|
+
warn "Cannot find template file, #{$tem_file}"
|
357
|
+
exit 1
|
358
|
+
end
|
359
|
+
|
360
|
+
if $classdef && !File.exist?($classdef)
|
361
|
+
warn "Cannot find environment class definition file, #{$classdef}"
|
362
|
+
exit 1
|
363
|
+
end
|
364
|
+
|
365
|
+
#
|
366
|
+
# Part 2 END
|
367
|
+
#
|
368
|
+
|
369
|
+
|
370
|
+
# Part 3.
|
371
|
+
#
|
372
|
+
# Reading Environment Class Definition File
|
373
|
+
#
|
374
|
+
|
375
|
+
# check --cys option and modify amino_acids set if necessary
|
376
|
+
if $cys == 2
|
377
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
|
378
|
+
end
|
379
|
+
|
380
|
+
# create an EnvironmentFeatureList object for storing all environment
|
381
|
+
# features
|
382
|
+
$env_features = EnvironmentFeatureArray.new
|
383
|
+
|
384
|
+
# an array for storing indexes of constrained environment features
|
385
|
+
$cst_features = []
|
386
|
+
|
387
|
+
# add substituted amino acid (aa1) in a substitution to the environment
|
388
|
+
# feature list
|
389
|
+
$env_features << EnvironmentFeature.new('sequence',
|
390
|
+
$amino_acids,
|
391
|
+
$amino_acids,
|
392
|
+
'F',
|
393
|
+
'F')
|
394
|
+
|
395
|
+
# read environment class definition file and store them into
|
396
|
+
# the hash prepared above
|
397
|
+
env_index = 1
|
398
|
+
|
399
|
+
IO.foreach($classdef) do |line|
|
400
|
+
line.chomp!
|
401
|
+
if line.start_with?('#')
|
402
|
+
next
|
403
|
+
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
404
|
+
$logger.info "An environment feature, #{line} detected."
|
405
|
+
if env_ftr[-1] == 'T'
|
406
|
+
# skip silenced environment feature
|
407
|
+
$logger.warn "The environment feature, #{line} silent."
|
408
|
+
next
|
409
|
+
end
|
410
|
+
if env_ftr[-2] == 'T'
|
411
|
+
$cst_features << env_index
|
412
|
+
$logger.warn "The environment feature, #{line} constrained."
|
413
|
+
end
|
414
|
+
$env_features << EnvironmentFeature.new(env_ftr[0],
|
415
|
+
env_ftr[1].split(''),
|
416
|
+
env_ftr[2].split(''),
|
417
|
+
env_ftr[3],
|
418
|
+
env_ftr[4])
|
419
|
+
env_index += 1
|
420
|
+
else
|
421
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for" +
|
422
|
+
"a environment class definition."
|
423
|
+
exit 1
|
424
|
+
end
|
425
|
+
end
|
426
|
+
|
427
|
+
# a hash for storing all environment classes
|
428
|
+
$env_classes = EnvironmentClassHash.new
|
429
|
+
|
430
|
+
# generate all possible combinations of environment labels, and store
|
431
|
+
# every environment class into the hash prepared above with the label
|
432
|
+
# as a key
|
433
|
+
$env_features.label_combinations.each_with_index { |e, i|
|
434
|
+
$env_classes[e.flatten.join] = Environment.new(i,
|
435
|
+
e.flatten.join,
|
436
|
+
$amino_acids)
|
437
|
+
}
|
438
|
+
|
439
|
+
#
|
440
|
+
# Part 3 END
|
441
|
+
#
|
442
|
+
|
443
|
+
|
444
|
+
# Part 4.
|
445
|
+
#
|
446
|
+
# Reading TEM file or TEMLIST list file and counting substitutions
|
447
|
+
#
|
448
|
+
|
449
|
+
# a global file handle for output
|
450
|
+
$outfh = File.open($outfile, 'w')
|
451
|
+
|
452
|
+
if $tem_file
|
453
|
+
$tem_list_io = StringIO.new($tem_file)
|
454
|
+
end
|
455
|
+
|
456
|
+
if $tem_list
|
457
|
+
$tem_list_io = File.open($tem_list)
|
458
|
+
end
|
459
|
+
|
460
|
+
$tem_list_io.each_line do |tem_file|
|
461
|
+
tem_file.chomp!
|
462
|
+
|
463
|
+
ali = Bio::Alignment::OriginalAlignment.new
|
464
|
+
ff = Bio::FlatFile.auto(tem_file)
|
465
|
+
|
466
|
+
ff.each_entry do |pir|
|
467
|
+
if (pir.definition == 'sequence') || (pir.definition == 'structure')
|
468
|
+
ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
|
469
|
+
end
|
470
|
+
end
|
471
|
+
|
472
|
+
if ali.size < 2
|
473
|
+
$logger.warn "Skipped #{tem_file} which has only one unique entry."
|
474
|
+
next
|
475
|
+
end
|
476
|
+
|
477
|
+
$ali_size += 1
|
478
|
+
env_labels = {}
|
479
|
+
disulphide = {}
|
480
|
+
|
481
|
+
ali.each_pair do |key, seq|
|
482
|
+
# check disulphide bond environment first!
|
483
|
+
ff.rewind
|
484
|
+
ff.each_entry do |pir|
|
485
|
+
if ((pir.entry_id == key) &&
|
486
|
+
((pir.definition == "disulphide") ||
|
487
|
+
(pir.definition == "disulfide")))
|
488
|
+
disulphide[key] = pir.data.remove_internal_spaces.split('')
|
489
|
+
end
|
490
|
+
end
|
491
|
+
|
492
|
+
$env_features.each_with_index do |ec, ei|
|
493
|
+
env_labels[key] = [] unless env_labels.has_key?(key)
|
494
|
+
|
495
|
+
ff.rewind
|
496
|
+
ff.each_entry do |pir|
|
497
|
+
if (pir.entry_id == key) && (pir.definition == ec.name)
|
498
|
+
labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
|
499
|
+
if sym == '-'
|
500
|
+
'-'
|
501
|
+
elsif sym == 'X' || sym == 'x'
|
502
|
+
'X'
|
503
|
+
else
|
504
|
+
if ei == 0 # Amino Acid Environment Feature
|
505
|
+
(disulphide.has_key?(key) &&
|
506
|
+
(disulphide[key][pos] == 'F') &&
|
507
|
+
(sym == 'C')) ? 'J' : sym
|
508
|
+
else
|
509
|
+
ec.labels[ec.symbols.index(sym)]
|
510
|
+
end
|
511
|
+
end
|
512
|
+
end
|
513
|
+
|
514
|
+
if env_labels[key].empty?
|
515
|
+
env_labels[key] = labels
|
516
|
+
else
|
517
|
+
env_labels[key].each_with_index { |e, i|
|
518
|
+
env_labels[key][i] = e + labels[i]
|
519
|
+
}
|
520
|
+
end
|
521
|
+
end
|
522
|
+
end
|
523
|
+
end
|
524
|
+
end
|
525
|
+
|
526
|
+
if $noweight
|
527
|
+
ali.each_pair do |id1, seq1|
|
528
|
+
ali.each_pair do |id2, seq2|
|
529
|
+
if id1 != id2
|
530
|
+
pid = calculate_pid(seq1, seq2)
|
531
|
+
s1 = seq1.split('')
|
532
|
+
s2 = seq2.split('')
|
533
|
+
|
534
|
+
# check PID_MIN
|
535
|
+
if $pidmin && (pid < $pidmin)
|
536
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
537
|
+
"having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
538
|
+
next
|
539
|
+
end
|
540
|
+
|
541
|
+
# check PID_MAX
|
542
|
+
if $pidmax && (pid > $pidmax)
|
543
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
544
|
+
"having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
545
|
+
next
|
546
|
+
end
|
547
|
+
|
548
|
+
s1.each_with_index do |aa1, pos|
|
549
|
+
aa1.upcase!
|
550
|
+
aa2 = s2[pos].upcase
|
551
|
+
|
552
|
+
if env_labels[id1][pos].include?('X')
|
553
|
+
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
|
554
|
+
next
|
555
|
+
end
|
556
|
+
|
557
|
+
if env_labels[id2][pos].include?('X')
|
558
|
+
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
|
559
|
+
next
|
560
|
+
end
|
561
|
+
|
562
|
+
unless $amino_acids.include?(aa1)
|
563
|
+
$logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
|
564
|
+
next
|
565
|
+
end
|
566
|
+
|
567
|
+
unless $amino_acids.include?(aa2)
|
568
|
+
$logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
|
569
|
+
next
|
570
|
+
end
|
571
|
+
|
572
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
573
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
574
|
+
|
575
|
+
if $cst_features.empty?
|
576
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
|
577
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
578
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
|
579
|
+
else
|
580
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
|
581
|
+
next
|
582
|
+
end
|
583
|
+
|
584
|
+
grp_label = env_labels[id1][pos][1..-1]
|
585
|
+
|
586
|
+
if $aa_env_cnt.has_key? grp_label
|
587
|
+
if $aa_env_cnt[grp_label].has_key? aa1
|
588
|
+
$aa_env_cnt[grp_label][aa1] += 1
|
589
|
+
else
|
590
|
+
$aa_env_cnt[grp_label][aa1] = 1
|
591
|
+
end
|
592
|
+
else
|
593
|
+
$aa_env_cnt[grp_label] = Hash.new(0)
|
594
|
+
$aa_env_cnt[grp_label][aa1] = 1
|
595
|
+
end
|
596
|
+
|
597
|
+
if $aa_tot_cnt.has_key? aa1
|
598
|
+
$aa_tot_cnt[aa1] += 1
|
599
|
+
else
|
600
|
+
$aa_tot_cnt[aa1] = 1
|
601
|
+
end
|
602
|
+
|
603
|
+
if aa1 != aa2
|
604
|
+
if $aa_mut_cnt.has_key? aa1
|
605
|
+
$aa_mut_cnt[aa1] += 1
|
606
|
+
else
|
607
|
+
$aa_mut_cnt[aa1] = 1
|
608
|
+
end
|
609
|
+
end
|
610
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
|
611
|
+
end
|
612
|
+
end
|
613
|
+
end
|
614
|
+
end
|
615
|
+
else
|
616
|
+
# BLOSUM-like weighting
|
617
|
+
clusters = []
|
618
|
+
ali.each_pair { |i, s| clusters << [i] }
|
619
|
+
|
620
|
+
# a loop for single linkage clustering
|
621
|
+
begin
|
622
|
+
continue = false
|
623
|
+
0.upto(clusters.size - 2) do |i|
|
624
|
+
indexes = []
|
625
|
+
(i + 1).upto(clusters.size - 1) do |j|
|
626
|
+
found = false
|
627
|
+
clusters[i].each do |c1|
|
628
|
+
clusters[j].each do |c2|
|
629
|
+
if calculate_pid(ali[c1], ali[c2]) >= $weight
|
630
|
+
indexes << j
|
631
|
+
found = true
|
632
|
+
break
|
633
|
+
end
|
634
|
+
end
|
635
|
+
break if found
|
636
|
+
end
|
637
|
+
end
|
638
|
+
|
639
|
+
unless indexes.empty?
|
640
|
+
continue = true
|
641
|
+
group = clusters[i]
|
642
|
+
indexes.each do |k|
|
643
|
+
group = group.concat(clusters[k])
|
644
|
+
clusters[k] = nil
|
645
|
+
end
|
646
|
+
clusters[i] = group
|
647
|
+
clusters.compact!
|
648
|
+
end
|
649
|
+
end
|
650
|
+
end while(continue)
|
651
|
+
|
652
|
+
if clusters.size < 2
|
653
|
+
$logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
|
654
|
+
next
|
655
|
+
end
|
656
|
+
|
657
|
+
clusters.combination(2).each do |cluster1, cluster2|
|
658
|
+
cluster1.each do |id1|
|
659
|
+
cluster2.each do |id2|
|
660
|
+
seq1 = ali[id1].split('')
|
661
|
+
seq2 = ali[id2].split('')
|
662
|
+
|
663
|
+
seq1.each_with_index do |aa1, pos|
|
664
|
+
aa1.upcase!
|
665
|
+
aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
|
666
|
+
|
667
|
+
if env_labels[id1][pos].include?('X')
|
668
|
+
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
|
669
|
+
next
|
670
|
+
end
|
671
|
+
|
672
|
+
if env_labels[id2][pos].include?('X')
|
673
|
+
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
|
674
|
+
next
|
675
|
+
end
|
676
|
+
|
677
|
+
unless $amino_acids.include?(aa1)
|
678
|
+
$logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
|
679
|
+
next
|
680
|
+
end
|
681
|
+
|
682
|
+
unless $amino_acids.include?(aa2)
|
683
|
+
$logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
|
684
|
+
next
|
685
|
+
end
|
686
|
+
|
687
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
688
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
689
|
+
cnt1 = 1.0 / cluster1.size
|
690
|
+
cnt2 = 1.0 / cluster2.size
|
691
|
+
jnt_cnt = cnt1 * cnt2
|
692
|
+
|
693
|
+
if $cst_features.empty?
|
694
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
695
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
696
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
697
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
698
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
699
|
+
else
|
700
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
|
701
|
+
next
|
702
|
+
end
|
703
|
+
|
704
|
+
grp_label1 = env_labels[id1][pos][1..-1]
|
705
|
+
grp_label2 = env_labels[id2][pos][1..-1]
|
706
|
+
|
707
|
+
if $aa_env_cnt.has_key? grp_label1
|
708
|
+
if $aa_env_cnt[grp_label1].has_key? aa1
|
709
|
+
$aa_env_cnt[grp_label1][aa1] += cnt1
|
710
|
+
else
|
711
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
712
|
+
end
|
713
|
+
else
|
714
|
+
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
715
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
716
|
+
end
|
717
|
+
|
718
|
+
if $aa_env_cnt.has_key? grp_label2
|
719
|
+
if $aa_env_cnt[grp_label2].has_key? aa2
|
720
|
+
$aa_env_cnt[grp_label2][aa2] += cnt2
|
721
|
+
else
|
722
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
723
|
+
end
|
724
|
+
else
|
725
|
+
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
726
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
727
|
+
end
|
728
|
+
|
729
|
+
if $aa_tot_cnt.has_key? aa1
|
730
|
+
$aa_tot_cnt[aa1] += cnt1
|
731
|
+
else
|
732
|
+
$aa_tot_cnt[aa1] = cnt1
|
733
|
+
end
|
734
|
+
|
735
|
+
if $aa_tot_cnt.has_key? aa2
|
736
|
+
$aa_tot_cnt[aa2] += cnt2
|
737
|
+
else
|
738
|
+
$aa_tot_cnt[aa2] = cnt2
|
739
|
+
end
|
740
|
+
|
741
|
+
if aa1 != aa2
|
742
|
+
if $aa_mut_cnt.has_key? aa1
|
743
|
+
$aa_mut_cnt[aa1] += cnt1
|
744
|
+
else
|
745
|
+
$aa_mut_cnt[aa1] = cnt1
|
746
|
+
end
|
747
|
+
if $aa_mut_cnt.has_key? aa2
|
748
|
+
$aa_mut_cnt[aa2] += cnt2
|
749
|
+
else
|
750
|
+
$aa_mut_cnt[aa2] = cnt2
|
751
|
+
end
|
752
|
+
end
|
753
|
+
|
754
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
755
|
+
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
756
|
+
end
|
757
|
+
end
|
758
|
+
end
|
759
|
+
end
|
760
|
+
end
|
761
|
+
$logger.info "Analysing #{tem_file} done."
|
762
|
+
end
|
763
|
+
|
764
|
+
# print out default header
|
765
|
+
$outfh.puts <<HEADER
|
766
|
+
# Environment-specific amino acid substitution matrices
|
767
|
+
# Creator: ulla version #{VERSION}
|
768
|
+
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
769
|
+
#
|
770
|
+
# Definitions for structural environments:
|
771
|
+
# #{$env_features.size - 1} features used
|
772
|
+
#
|
773
|
+
HEADER
|
774
|
+
|
775
|
+
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
776
|
+
|
777
|
+
$outfh.puts <<HEADER
|
778
|
+
# (read in from #{$classdef})
|
779
|
+
#
|
780
|
+
# Number of alignments: #{$ali_size}
|
781
|
+
# (list of .tem files read in from #{$tem_list})
|
782
|
+
#
|
783
|
+
# Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
|
784
|
+
#
|
785
|
+
# There are #{$amino_acids.size} amino acids considered.
|
786
|
+
# #{$amino_acids.join}
|
787
|
+
#
|
788
|
+
HEADER
|
789
|
+
|
790
|
+
if $amino_acids.include? 'J'
|
791
|
+
$outfh.puts <<HEADER
|
792
|
+
# C: Cystine (the disulfide-bonded form)
|
793
|
+
# J: Cysteine (the free thiol form)
|
794
|
+
#
|
795
|
+
HEADER
|
796
|
+
end
|
797
|
+
|
798
|
+
if $noweight
|
799
|
+
$outfh.puts '# Weighting scheme: none'
|
800
|
+
else
|
801
|
+
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
802
|
+
end
|
803
|
+
|
804
|
+
# calculate amino acid frequencies and mutabilities, and
|
805
|
+
# print them as default statistics in the header part
|
806
|
+
ala_factor = if $aa_tot_cnt['A'] == 0
|
807
|
+
0.0
|
808
|
+
elsif $aa_mut_cnt['A'] == 0
|
809
|
+
0.0
|
810
|
+
else
|
811
|
+
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
812
|
+
end
|
813
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
814
|
+
|
815
|
+
$outfh.puts '#'
|
816
|
+
$outfh.puts "# Total amino acid frequencies:\n"
|
817
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
818
|
+
|
819
|
+
min_cnt = -1
|
820
|
+
min_sigma = nil
|
821
|
+
|
822
|
+
$amino_acids.each do |res|
|
823
|
+
if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
|
824
|
+
if min_cnt < 0
|
825
|
+
min_cnt = $aa_tot_cnt[res]
|
826
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
827
|
+
elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
|
828
|
+
min_cnt = $aa_tot_cnt[res]
|
829
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
830
|
+
end
|
831
|
+
|
832
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
|
833
|
+
end
|
834
|
+
|
835
|
+
$aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
|
836
|
+
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
837
|
+
$aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
|
838
|
+
end
|
839
|
+
|
840
|
+
$amino_acids.each do |res|
|
841
|
+
if $noweight
|
842
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
|
843
|
+
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
844
|
+
else
|
845
|
+
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
|
846
|
+
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
847
|
+
end
|
848
|
+
end
|
849
|
+
|
850
|
+
if min_cnt > -1
|
851
|
+
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
852
|
+
if $autosigma
|
853
|
+
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
854
|
+
$sigma = min_sigma
|
855
|
+
end
|
856
|
+
end
|
857
|
+
|
858
|
+
$outfh.puts '#'
|
859
|
+
$outfh.puts '# RES: Amino acid one letter code'
|
860
|
+
$outfh.puts '# TOT_OBS: Total count of incidence'
|
861
|
+
$outfh.puts '# MUT_OBS: Total count of mutation'
|
862
|
+
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
863
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
|
864
|
+
$outfh.puts '# REL_FREQ: Relative frequency'
|
865
|
+
$outfh.puts '#'
|
866
|
+
|
867
|
+
#
|
868
|
+
# Part 4. END
|
869
|
+
#
|
870
|
+
|
871
|
+
|
872
|
+
# Part 5.
|
873
|
+
#
|
874
|
+
# Generating substitution frequency matrices
|
875
|
+
#
|
876
|
+
|
877
|
+
# calculating probabilities for each environment
|
878
|
+
$env_classes.values.each do |e|
|
879
|
+
if e.freq_array.sum != 0
|
880
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
881
|
+
end
|
882
|
+
end
|
883
|
+
|
884
|
+
# count raw frequencies
|
885
|
+
$tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
886
|
+
group_matrices = []
|
887
|
+
|
888
|
+
# for each combination of environment features
|
889
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
890
|
+
grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
891
|
+
|
892
|
+
$amino_acids.each_with_index do |aa, aj|
|
893
|
+
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
894
|
+
0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
|
895
|
+
end
|
896
|
+
|
897
|
+
$tot_cnt_mat += grp_cnt_mat
|
898
|
+
group_matrices << [group[0], grp_cnt_mat]
|
899
|
+
end
|
900
|
+
|
901
|
+
$logger.info "Counting substitutions done."
|
902
|
+
|
903
|
+
if $output == 0
|
904
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
905
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
|
906
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
907
|
+
|
908
|
+
group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
|
909
|
+
# for a matrix file
|
910
|
+
stem = "#{grp_no}. #{grp_label}"
|
911
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
912
|
+
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
|
913
|
+
:row_header => $amino_acids)
|
914
|
+
|
915
|
+
# for a heat map
|
916
|
+
if $heatmap == 0 or $heatmap == 2
|
917
|
+
grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
918
|
+
:row_header => $amino_acids,
|
919
|
+
:rvg_width => $rvg_width,
|
920
|
+
:rvg_height => $rvg_height,
|
921
|
+
:canvas_width => $canvas_width,
|
922
|
+
:canvas_height => $canvas_height,
|
923
|
+
:max_val => grp_max_val.ceil,
|
924
|
+
:min_val => 0,
|
925
|
+
:print_value => $heatmapvalues,
|
926
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
927
|
+
|
928
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
929
|
+
end
|
930
|
+
|
931
|
+
if $heatmap == 1 or $heatmap == 2
|
932
|
+
title_font_size = $rvg_width * $heatmapcol / 80.0
|
933
|
+
heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
934
|
+
:row_header => $amino_acids,
|
935
|
+
:rvg_width => $rvg_width,
|
936
|
+
:rvg_height => $rvg_height - 50,
|
937
|
+
:canvas_width => $canvas_width,
|
938
|
+
:canvas_height => $canvas_height - 50,
|
939
|
+
:max_val => grp_max_val.ceil,
|
940
|
+
:min_val => 0,
|
941
|
+
:print_value => $heatmapvalues,
|
942
|
+
:print_gradient => false,
|
943
|
+
:title => stem,
|
944
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
945
|
+
end
|
946
|
+
end
|
947
|
+
|
948
|
+
if $heatmap == 1 or $heatmap == 2
|
949
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
950
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
951
|
+
:rvg_width => $rvg_width,
|
952
|
+
:max_val => grp_max_val.ceil,
|
953
|
+
:min_val => 0).write(file)
|
954
|
+
|
955
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
956
|
+
end
|
957
|
+
|
958
|
+
# total
|
959
|
+
$outfh.puts '>Total'
|
960
|
+
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
961
|
+
:row_header => $amino_acids)
|
962
|
+
|
963
|
+
if $heatmap == 0 or $heatmap == 2
|
964
|
+
stem = "#{group_matrices.size}. TOTAL"
|
965
|
+
heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
|
966
|
+
:row_header => $amino_acids,
|
967
|
+
:rvg_width => $rvg_width,
|
968
|
+
:rvg_height => $rvg_height,
|
969
|
+
:canvas_width => $canvas_width,
|
970
|
+
:canvas_height => $canvas_height,
|
971
|
+
:max_val => $tot_cnt_mat.max.ceil,
|
972
|
+
:min_val => 0,
|
973
|
+
:print_value => $heatmapvalues,
|
974
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
975
|
+
|
976
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
977
|
+
end
|
978
|
+
exit 0
|
979
|
+
end
|
980
|
+
|
981
|
+
#
|
982
|
+
# Part 5. END
|
983
|
+
#
|
984
|
+
|
985
|
+
|
986
|
+
# Part 6.
|
987
|
+
#
|
988
|
+
# Calculating substitution probability tables
|
989
|
+
#
|
990
|
+
|
991
|
+
if $output == 1
|
992
|
+
$outfh.puts <<HEADER
|
993
|
+
#
|
994
|
+
# Each column (j) represents the probability distribution for the
|
995
|
+
# likelihood of acceptance of a mutational event by a residue type j in
|
996
|
+
# a particular structural environment (specified after >) leading to
|
997
|
+
# any other residue type (i) and sums up to 100.
|
998
|
+
#
|
999
|
+
HEADER
|
1000
|
+
end
|
1001
|
+
|
1002
|
+
# when nosmoothing !!!
|
1003
|
+
if ($output > 0) && $nosmooth
|
1004
|
+
# reinitialize $tot_cnt_mat for pseudocounts
|
1005
|
+
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1006
|
+
|
1007
|
+
# for each combination of environment features
|
1008
|
+
pseudo_cnt = $add || (1.0 / $env_classes.group_size)
|
1009
|
+
|
1010
|
+
# add pseudo counts for each frequency vector
|
1011
|
+
$env_classes.values.each { |e| e.freq_array += pseudo_cnt }
|
1012
|
+
|
1013
|
+
# re-calculate probability vector for each environment class
|
1014
|
+
$env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
|
1015
|
+
|
1016
|
+
group_matrices = []
|
1017
|
+
|
1018
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1019
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1020
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1021
|
+
|
1022
|
+
$amino_acids.each_with_index do |aa, aj|
|
1023
|
+
env_class = group[1].find { |e| e.label.start_with?(aa) }
|
1024
|
+
0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
|
1025
|
+
0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
|
1026
|
+
end
|
1027
|
+
|
1028
|
+
$tot_cnt_mat += grp_cnt_mat
|
1029
|
+
group_matrices << [group[0], grp_prob_mat]
|
1030
|
+
end
|
1031
|
+
|
1032
|
+
if $output == 1
|
1033
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1034
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1035
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1036
|
+
|
1037
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1038
|
+
# for a matrix file
|
1039
|
+
stem = "#{grp_no}. #{grp_label}"
|
1040
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1041
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1042
|
+
:row_header => $amino_acids)
|
1043
|
+
|
1044
|
+
|
1045
|
+
# for a heat map
|
1046
|
+
if $heatmap == 0 or $heatmap == 2
|
1047
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1048
|
+
:row_header => $amino_acids,
|
1049
|
+
:rvg_width => $rvg_width,
|
1050
|
+
:rvg_height => $rvg_height,
|
1051
|
+
:canvas_width => $canvas_width,
|
1052
|
+
:canvas_height => $canvas_height,
|
1053
|
+
:max_val => grp_max_val.ceil,
|
1054
|
+
:min_val => 0,
|
1055
|
+
:print_value => $heatmapvalues,
|
1056
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1057
|
+
|
1058
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1059
|
+
end
|
1060
|
+
|
1061
|
+
if $heatmap == 1 or $heatmap == 2
|
1062
|
+
title_font_size = $rvg_width * $heatmapcol / 80.0
|
1063
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1064
|
+
:row_header => $amino_acids,
|
1065
|
+
:rvg_width => $rvg_width,
|
1066
|
+
:rvg_height => $rvg_height - 50,
|
1067
|
+
:canvas_width => $canvas_width,
|
1068
|
+
:canvas_height => $canvas_height - 50,
|
1069
|
+
:max_val => grp_max_val.ceil,
|
1070
|
+
:min_val => 0,
|
1071
|
+
:print_value => $heatmapvalues,
|
1072
|
+
:print_gradient => false,
|
1073
|
+
:title => stem,
|
1074
|
+
:title_font_size => title_font_size)
|
1075
|
+
end
|
1076
|
+
end
|
1077
|
+
|
1078
|
+
# for heat maps in a single file
|
1079
|
+
if $heatmap == 1 or $heatmap == 2
|
1080
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1081
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1082
|
+
:rvg_width => $rvg_width,
|
1083
|
+
:max_val => grp_max_val.ceil,
|
1084
|
+
:min_val => 0).write(file)
|
1085
|
+
|
1086
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1087
|
+
end
|
1088
|
+
end
|
1089
|
+
|
1090
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1091
|
+
|
1092
|
+
0.upto($amino_acids.size - 1) do |aj|
|
1093
|
+
col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
|
1094
|
+
0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
if $output == 1
|
1098
|
+
$outfh.puts '>Total'
|
1099
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1100
|
+
:row_header => $amino_acids)
|
1101
|
+
$outfh.close
|
1102
|
+
|
1103
|
+
# for a heat map
|
1104
|
+
if $heatmap == 0 or $heatmap == 2
|
1105
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1106
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1107
|
+
:row_header => $amino_acids,
|
1108
|
+
:rvg_width => $rvg_width,
|
1109
|
+
:rvg_height => $rvg_height,
|
1110
|
+
:canvas_width => $canvas_width,
|
1111
|
+
:canvas_height => $canvas_height,
|
1112
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1113
|
+
:min_val => 0,
|
1114
|
+
:print_value => $heatmapvalues,
|
1115
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1116
|
+
|
1117
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1118
|
+
end
|
1119
|
+
exit 0
|
1120
|
+
end
|
1121
|
+
|
1122
|
+
$logger.info 'Calculating substitution probabilities (no smoothing) done.'
|
1123
|
+
end
|
1124
|
+
|
1125
|
+
# when smoothing!!!
|
1126
|
+
if ($output > 0) && !$nosmooth
|
1127
|
+
#
|
1128
|
+
# p1 probabilities
|
1129
|
+
#
|
1130
|
+
p1 = NArray.float($amino_acids.size)
|
1131
|
+
a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
|
1132
|
+
big_N = $tot_aa.to_f
|
1133
|
+
small_n = $amino_acids.size.to_f
|
1134
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
1135
|
+
omega2 = 1.0 - omega1
|
1136
|
+
|
1137
|
+
if ($smooth == :full) || $p1smooth
|
1138
|
+
# smooth the p1 probabilities when full smoothing is used, or when --p1smooth is on for the partial smoothing procedure
|
1139
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
|
1140
|
+
$smooth_prob[1] = p1
|
1141
|
+
elsif ($smooth == :partial)
|
1142
|
+
# no smoothing for p1 probabilities just as Kenji's subst
|
1143
|
+
# in this case, p1 probabilities were taken from the amino acid frequencies of your data set
|
1144
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
|
1145
|
+
$smooth_prob[1] = p1
|
1146
|
+
end
|
1147
|
+
|
1148
|
+
#
|
1149
|
+
# p2 and above
|
1150
|
+
#
|
1151
|
+
env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
1152
|
+
|
1153
|
+
if $smooth == :partial
|
1154
|
+
$outfh.puts <<HEADER
|
1155
|
+
#
|
1156
|
+
# Partial Smoothing:
|
1157
|
+
#
|
1158
|
+
HEADER
|
1159
|
+
if $p1smooth
|
1160
|
+
$outfh.puts <<HEADER
|
1161
|
+
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
1162
|
+
# each row in all matrices and smoothing them with A0 (a uniform distribution)
|
1163
|
+
# ^^^^^^^^^
|
1164
|
+
HEADER
|
1165
|
+
else
|
1166
|
+
$outfh.puts <<HEADER
|
1167
|
+
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
1168
|
+
# each row in all matrices without smoothing
|
1169
|
+
# ^^^^^^^^^^^^^^^^^
|
1170
|
+
HEADER
|
1171
|
+
end
|
1172
|
+
|
1173
|
+
$outfh.puts <<HEADER
|
1174
|
+
# p2(ri|Rj) is estimated as:
|
1175
|
+
# p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
|
1176
|
+
#
|
1177
|
+
# p3(ri|Rj,fq) is estimated as:
|
1178
|
+
# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
|
1179
|
+
# where
|
1180
|
+
# A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
|
1181
|
+
#
|
1182
|
+
# The smoothing procedure is curtailed here and finally
|
1183
|
+
# ^^^^^^^^^
|
1184
|
+
# p5(ri|Rj,...) is estimated as:
|
1185
|
+
# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
|
1186
|
+
# where
|
1187
|
+
# A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
|
1188
|
+
#
|
1189
|
+
# Weights (omegas) are calculated as in Topham et al. (1993)
|
1190
|
+
#
|
1191
|
+
# sigma value used is: #{$sigma}
|
1192
|
+
#
|
1193
|
+
HEADER
|
1194
|
+
1.upto($env_features.size) do |ci|
|
1195
|
+
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
1196
|
+
if (ci > 2) && (ci < $env_features.size)
|
1197
|
+
$logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
|
1198
|
+
next
|
1199
|
+
end
|
1200
|
+
|
1201
|
+
env_labels.combination(ci) do |c1|
|
1202
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
1203
|
+
pattern = '.' * $env_features.size
|
1204
|
+
|
1205
|
+
labels.each do |label|
|
1206
|
+
i = label[0].chr.to_i
|
1207
|
+
l = label[1].chr
|
1208
|
+
pattern[i] = l
|
1209
|
+
end
|
1210
|
+
|
1211
|
+
if pattern =~ /^\./
|
1212
|
+
$logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
|
1213
|
+
next
|
1214
|
+
end
|
1215
|
+
|
1216
|
+
# get environments matching the pattern created above
|
1217
|
+
# and calculate amino acid frequencies and their probabilities for all the environments
|
1218
|
+
envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
1219
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1220
|
+
prob_arr = NArray.float($amino_acids.size)
|
1221
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
|
1222
|
+
|
1223
|
+
# # assess whether a residue type j is compatible with a particular combination of structural features
|
1224
|
+
# # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
|
1225
|
+
# if ci == $env_features.size
|
1226
|
+
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
1227
|
+
# sub_pattern = '.' * $env_features.size
|
1228
|
+
# sub_pattern[0] = aa_label
|
1229
|
+
# sub_freq_sum = 0
|
1230
|
+
#
|
1231
|
+
# labels[1..-1].each do |label|
|
1232
|
+
# next if label.start_with?('0')
|
1233
|
+
# i = label[0].chr.to_i
|
1234
|
+
# l = label[1].chr
|
1235
|
+
# sub_pattern[i] = l
|
1236
|
+
# sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
1237
|
+
# sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1238
|
+
# sub_freq_sum += sub_freq_arr.sum
|
1239
|
+
# end
|
1240
|
+
#
|
1241
|
+
# if sub_freq_sum == 0
|
1242
|
+
# if $smooth_prob.has_key?(ci + 1)
|
1243
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
1244
|
+
# else
|
1245
|
+
# $smooth_prob[ci + 1] = {}
|
1246
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
1247
|
+
# end
|
1248
|
+
# $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
|
1249
|
+
# next
|
1250
|
+
# end
|
1251
|
+
# end
|
1252
|
+
|
1253
|
+
# collect priors
|
1254
|
+
priors = []
|
1255
|
+
|
1256
|
+
if ci == 1
|
1257
|
+
priors << $smooth_prob[1]
|
1258
|
+
elsif ci == 2
|
1259
|
+
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
|
1260
|
+
priors << $smooth_prob[2][c3.to_set]
|
1261
|
+
}
|
1262
|
+
elsif ci == $env_features.size
|
1263
|
+
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
|
1264
|
+
priors << $smooth_prob[3][c3.to_set]
|
1265
|
+
}
|
1266
|
+
end
|
1267
|
+
|
1268
|
+
# entropy based prior weighting step
|
1269
|
+
entropy_max = Math::log($amino_acids.size)
|
1270
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
1271
|
+
begin
|
1272
|
+
p == 0.0 ? s - 1 : s + p * Math::log(p)
|
1273
|
+
rescue
|
1274
|
+
#puts "P: #{p}"
|
1275
|
+
end
|
1276
|
+
} }
|
1277
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
1278
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
1279
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
1280
|
+
|
1281
|
+
# smoothing step
|
1282
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
1283
|
+
big_N = freq_arr.sum.to_f
|
1284
|
+
small_n = $amino_acids.size.to_f
|
1285
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
1286
|
+
omega2 = 1.0 - omega1
|
1287
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1288
|
+
|
1289
|
+
# normalization step
|
1290
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
1291
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
1292
|
+
|
1293
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1294
|
+
if $smooth_prob.has_key?(ci + 1)
|
1295
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1296
|
+
else
|
1297
|
+
$smooth_prob[ci + 1] = {}
|
1298
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1299
|
+
end
|
1300
|
+
end
|
1301
|
+
end
|
1302
|
+
end
|
1303
|
+
$logger.info 'Calculating substitution probabilities (partial smoothing) done.'
|
1304
|
+
else
|
1305
|
+
$outfh.puts <<HEADER
|
1306
|
+
#
|
1307
|
+
# Full Smoothing:
|
1308
|
+
#
|
1309
|
+
# p1(ri) is estimated as:
|
1310
|
+
# p1(ri) = omega1 * A0 + omega2 * W1(ri)
|
1311
|
+
#
|
1312
|
+
# p2(ri|f1q) is estimated as:
|
1313
|
+
# p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
|
1314
|
+
#
|
1315
|
+
# (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
|
1316
|
+
#
|
1317
|
+
# p3(ri|f1q,f2q) is estimated as:
|
1318
|
+
# p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
|
1319
|
+
# where
|
1320
|
+
# A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
|
1321
|
+
#
|
1322
|
+
# The smoothing procedure is NOT curtailed here and it goes upto
|
1323
|
+
# ^^^^^^^^^^^^^
|
1324
|
+
#
|
1325
|
+
# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
|
1326
|
+
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
|
1327
|
+
# where
|
1328
|
+
# An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
|
1329
|
+
#
|
1330
|
+
# Weights (omegas) are calculated as in Topham et al. (1993)
|
1331
|
+
#
|
1332
|
+
# sigma value used is: #{$sigma}
|
1333
|
+
#
|
1334
|
+
HEADER
|
1335
|
+
# full smoothing
|
1336
|
+
1.upto($env_features.size) do |ci|
|
1337
|
+
env_labels.combination(ci) do |c1|
|
1338
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
1339
|
+
pattern = '.' * $env_features.size
|
1340
|
+
labels.each do |label|
|
1341
|
+
j = label[0].chr.to_i
|
1342
|
+
l = label[1].chr
|
1343
|
+
pattern[j] = l
|
1344
|
+
end
|
1345
|
+
|
1346
|
+
# get environments, frequencies, and probabilities
|
1347
|
+
envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
1348
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1349
|
+
prob_arr = NArray.float($amino_acids.size)
|
1350
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
1351
|
+
|
1352
|
+
# collect priors
|
1353
|
+
priors = []
|
1354
|
+
if ci > 1
|
1355
|
+
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
1356
|
+
else
|
1357
|
+
priors << $smooth_prob[1]
|
1358
|
+
end
|
1359
|
+
|
1360
|
+
# entropy based weighting priors
|
1361
|
+
entropy_max = Math::log($amino_acids.size)
|
1362
|
+
entropies = priors.map do |prior|
|
1363
|
+
(entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
|
1364
|
+
end
|
1365
|
+
weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
|
1366
|
+
|
1367
|
+
# smoothing step
|
1368
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
1369
|
+
big_N = freq_arr.sum.to_f
|
1370
|
+
small_n = $amino_acids.size.to_f
|
1371
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
1372
|
+
omega2 = 1.0 - omega1
|
1373
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1374
|
+
|
1375
|
+
# normalization step
|
1376
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
1377
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
1378
|
+
|
1379
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1380
|
+
if $smooth_prob.has_key?(ci + 1)
|
1381
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1382
|
+
else
|
1383
|
+
$smooth_prob[ci + 1] = {}
|
1384
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1385
|
+
end
|
1386
|
+
end
|
1387
|
+
end
|
1388
|
+
end
|
1389
|
+
$logger.info 'Calculating substitution probabilities (full smoothing) done.'
|
1390
|
+
end
|
1391
|
+
|
1392
|
+
# updating smoothed probability array for each environment
|
1393
|
+
$env_classes.values.each do |env|
|
1394
|
+
env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
|
1395
|
+
end
|
1396
|
+
|
1397
|
+
# sorting environments and build 21X21 substitution matrices
|
1398
|
+
group_matrices = []
|
1399
|
+
|
1400
|
+
$env_classes.groups_sorted_by_residue_labels.each do |group|
|
1401
|
+
# calculating 21X21 substitution probability matrix for each environment
|
1402
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1403
|
+
|
1404
|
+
$amino_acids.each_with_index do |aa, ai|
|
1405
|
+
smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
1406
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
|
1407
|
+
end
|
1408
|
+
|
1409
|
+
group_matrices << [group[0], grp_prob_mat]
|
1410
|
+
end
|
1411
|
+
|
1412
|
+
if $output == 1
|
1413
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1414
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1415
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1416
|
+
|
1417
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1418
|
+
# for a matrix file
|
1419
|
+
stem = "#{grp_no}. #{grp_label}"
|
1420
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1421
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1422
|
+
:row_header => $amino_acids)
|
1423
|
+
|
1424
|
+
# for heat map generation
|
1425
|
+
if $heatmap == 0 or $heatmap == 2
|
1426
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1427
|
+
:row_header => $amino_acids,
|
1428
|
+
:rvg_width => $rvg_width,
|
1429
|
+
:rvg_height => $rvg_height,
|
1430
|
+
:canvas_width => $canvas_width,
|
1431
|
+
:canvas_height => $canvas_height,
|
1432
|
+
:max_val => grp_max_val.ceil,
|
1433
|
+
:min_val => 0,
|
1434
|
+
:print_value => $heatmapvalues,
|
1435
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1436
|
+
|
1437
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1438
|
+
end
|
1439
|
+
|
1440
|
+
if $heatmap == 1 or $heatmap == 2
|
1441
|
+
title_font_size = $rvg_width * $heatmapcol / 80.0
|
1442
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1443
|
+
:row_header => $amino_acids,
|
1444
|
+
:rvg_width => $rvg_width,
|
1445
|
+
:rvg_height => $rvg_height - 50,
|
1446
|
+
:canvas_width => $canvas_width,
|
1447
|
+
:canvas_height => $canvas_height - 50,
|
1448
|
+
:max_val => grp_max_val.ceil,
|
1449
|
+
:min_val => 0,
|
1450
|
+
:print_value => $heatmapvalues,
|
1451
|
+
:print_gradient => false,
|
1452
|
+
:title => stem,
|
1453
|
+
:title_font_size => title_font_size)
|
1454
|
+
end
|
1455
|
+
end
|
1456
|
+
|
1457
|
+
# for heat maps in a single file
|
1458
|
+
if $heatmap == 1 or $heatmap == 2
|
1459
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1460
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1461
|
+
:rvg_width => $rvg_width,
|
1462
|
+
:max_val => grp_max_val.ceil,
|
1463
|
+
:min_val => 0).write(file)
|
1464
|
+
|
1465
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1466
|
+
end
|
1467
|
+
end
|
1468
|
+
|
1469
|
+
# for a total substitution probability matrix
|
1470
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1471
|
+
|
1472
|
+
$amino_acids.each_with_index do |aa, aj|
|
1473
|
+
0.upto($amino_acids.size - 1) do |ai|
|
1474
|
+
$tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
|
1475
|
+
end
|
1476
|
+
end
|
1477
|
+
|
1478
|
+
if $output == 1
|
1479
|
+
$outfh.puts '>Total'
|
1480
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1481
|
+
:row_header => $amino_acids)
|
1482
|
+
$outfh.close
|
1483
|
+
|
1484
|
+
# for a heat map
|
1485
|
+
if $heatmap == 0 or $heatmap == 2
|
1486
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1487
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1488
|
+
:row_header => $amino_acids,
|
1489
|
+
:rvg_width => $rvg_width,
|
1490
|
+
:rvg_height => $rvg_height,
|
1491
|
+
:canvas_width => $canvas_width,
|
1492
|
+
:canvas_height => $canvas_height,
|
1493
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1494
|
+
:min_val => 0,
|
1495
|
+
:print_value => $heatmapvalues,
|
1496
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1497
|
+
|
1498
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1499
|
+
end
|
1500
|
+
exit 0
|
1501
|
+
end
|
1502
|
+
end
|
1503
|
+
|
1504
|
+
#
|
1505
|
+
# Part 6. END
|
1506
|
+
#
|
1507
|
+
|
1508
|
+
|
1509
|
+
# Part 7.
|
1510
|
+
#
|
1511
|
+
# Calculating log odds ratio scoring matrices
|
1512
|
+
#
|
1513
|
+
if $output == 2
|
1514
|
+
$outfh.puts <<HEADER
|
1515
|
+
#
|
1516
|
+
# The probabilities were then divided by the background probabilities
|
1517
|
+
HEADER
|
1518
|
+
if $penv
|
1519
|
+
$outfh.puts <<HEADER
|
1520
|
+
# which were derived from the environment-dependent amino acid frequencies.
|
1521
|
+
# ^^^^^^^^^^^^^^^^^^^^^
|
1522
|
+
HEADER
|
1523
|
+
else
|
1524
|
+
$outfh.puts <<HEADER
|
1525
|
+
# which were derived from the environment-independent amino acid frequencies.
|
1526
|
+
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1527
|
+
HEADER
|
1528
|
+
end
|
1529
|
+
|
1530
|
+
grp_logo_mats = []
|
1531
|
+
factor = $scale / Math::log(2)
|
1532
|
+
|
1533
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1534
|
+
# calculating substitution probability matrix for each environment
|
1535
|
+
grp_label = group[0]
|
1536
|
+
grp_envs = group[1]
|
1537
|
+
grp_logo_mat = $cys == 0 ?
|
1538
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1539
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1540
|
+
|
1541
|
+
$amino_acids.each_with_index do |aa, aj|
|
1542
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1543
|
+
env.logo_array = $cys == 0 ?
|
1544
|
+
NArray.float($amino_acids.size + 1) :
|
1545
|
+
NArray.float($amino_acids.size)
|
1546
|
+
|
1547
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1548
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1549
|
+
odds = prob / pai
|
1550
|
+
env.logo_array[ai] = factor * Math::log(odds)
|
1551
|
+
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
1552
|
+
end
|
1553
|
+
|
1554
|
+
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1555
|
+
if $cys == 0
|
1556
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1557
|
+
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1558
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1559
|
+
odds = prob / pai
|
1560
|
+
env.logo_array[$amino_acids.size] = factor * Math::log(odds)
|
1561
|
+
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
1562
|
+
end
|
1563
|
+
end
|
1564
|
+
|
1565
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1566
|
+
end
|
1567
|
+
|
1568
|
+
$tot_logo_mat = $cys == 0 ?
|
1569
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1570
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1571
|
+
|
1572
|
+
$amino_acids.each_with_index do |aa1, aj|
|
1573
|
+
$amino_acids.each_with_index do |aa2, ai|
|
1574
|
+
prob = $tot_prob_mat[aj, ai]
|
1575
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1576
|
+
odds = prob / pai
|
1577
|
+
$tot_logo_mat[aj, ai] = factor * Math::log(odds)
|
1578
|
+
end
|
1579
|
+
|
1580
|
+
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1581
|
+
if $cys == 0
|
1582
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1583
|
+
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1584
|
+
odds = prob / pai
|
1585
|
+
$tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
|
1586
|
+
end
|
1587
|
+
end
|
1588
|
+
|
1589
|
+
|
1590
|
+
# calculating relative entropy for each amino acid pair H and
|
1591
|
+
# the expected score E in bit units
|
1592
|
+
tot_E = 0.0
|
1593
|
+
tot_H = 0.0
|
1594
|
+
|
1595
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1596
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
|
1597
|
+
if j != i
|
1598
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
|
1599
|
+
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
|
1600
|
+
else
|
1601
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
|
1602
|
+
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
|
1603
|
+
end
|
1604
|
+
end
|
1605
|
+
end
|
1606
|
+
|
1607
|
+
$outfh.puts <<HEADER
|
1608
|
+
#
|
1609
|
+
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1610
|
+
HEADER
|
1611
|
+
unless $noroundoff
|
1612
|
+
$outfh.puts <<HEADER
|
1613
|
+
# rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
|
1614
|
+
HEADER
|
1615
|
+
end
|
1616
|
+
|
1617
|
+
$outfh.puts <<HEADER
|
1618
|
+
# For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
|
1619
|
+
#
|
1620
|
+
HEADER
|
1621
|
+
|
1622
|
+
grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
|
1623
|
+
grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
|
1624
|
+
abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
|
1625
|
+
row_header = $cys ? $amino_acids + %w[U] : $amino_acids
|
1626
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1627
|
+
$heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
|
1628
|
+
|
1629
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1630
|
+
grp_label = arr[0]
|
1631
|
+
grp_logo_mat = arr[1]
|
1632
|
+
stem = "#{grp_no}. #{grp_label}"
|
1633
|
+
|
1634
|
+
unless $noroundoff
|
1635
|
+
grp_logo_mat = grp_logo_mat.round
|
1636
|
+
end
|
1637
|
+
|
1638
|
+
# for a matrix file
|
1639
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1640
|
+
$outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
|
1641
|
+
:row_header => row_header)
|
1642
|
+
# for a heat map
|
1643
|
+
if $heatmap == 0 or $heatmap == 2
|
1644
|
+
grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1645
|
+
:row_header => row_header,
|
1646
|
+
:rvg_width => $rvg_width,
|
1647
|
+
:rvg_height => $rvg_height,
|
1648
|
+
:canvas_width => $canvas_width,
|
1649
|
+
:canvas_height => $canvas_height,
|
1650
|
+
:gradient_beg_color => '#0000FF',
|
1651
|
+
:gradient_mid_color => '#FFFFFF',
|
1652
|
+
:gradient_end_color => '#FF0000',
|
1653
|
+
:max_val => abs_max_val.ceil,
|
1654
|
+
:mid_val => 0,
|
1655
|
+
:min_val => -1 * abs_max_val.ceil,
|
1656
|
+
:print_value => $heatmapvalues,
|
1657
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1658
|
+
|
1659
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1660
|
+
end
|
1661
|
+
|
1662
|
+
if $heatmap == 1 or $heatmap == 2
|
1663
|
+
title_font_size = $rvg_width * $heatmapcol / 80.0
|
1664
|
+
heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1665
|
+
:row_header => row_header,
|
1666
|
+
:rvg_width => $rvg_width,
|
1667
|
+
:rvg_height => $rvg_height - 50,
|
1668
|
+
:canvas_width => $canvas_width,
|
1669
|
+
:canvas_height => $canvas_height - 50,
|
1670
|
+
:gradient_beg_color => '#0000FF',
|
1671
|
+
:gradient_mid_color => '#FFFFFF',
|
1672
|
+
:gradient_end_color => '#FF0000',
|
1673
|
+
:max_val => abs_max_val.ceil,
|
1674
|
+
:mid_val => 0,
|
1675
|
+
:min_val => -1 * abs_max_val.ceil,
|
1676
|
+
:print_value => $heatmapvalues,
|
1677
|
+
:print_gradient => false,
|
1678
|
+
:title => stem,
|
1679
|
+
:title_font_size => title_font_size)
|
1680
|
+
end
|
1681
|
+
end
|
1682
|
+
|
1683
|
+
# for heat maps in a single file
|
1684
|
+
if $heatmap == 1 or $heatmap == 2
|
1685
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1686
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1687
|
+
:rvg_width => $rvg_width,
|
1688
|
+
:gradient_beg_color => '#0000FF',
|
1689
|
+
:gradient_mid_color => '#FFFFFF',
|
1690
|
+
:gradient_end_color => '#FF0000',
|
1691
|
+
:max_val => abs_max_val.ceil,
|
1692
|
+
:mid_val => 0,
|
1693
|
+
:min_val => -1 * abs_max_val.ceil).write(file)
|
1694
|
+
|
1695
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1696
|
+
end
|
1697
|
+
|
1698
|
+
# for a matrix file
|
1699
|
+
unless $noroundoff
|
1700
|
+
$tot_logo_mat = $tot_logo_mat.round
|
1701
|
+
end
|
1702
|
+
|
1703
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1704
|
+
$outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
|
1705
|
+
:row_header => row_header)
|
1706
|
+
|
1707
|
+
# for a heat map
|
1708
|
+
if $heatmap == 0 or $heatmap == 2
|
1709
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1710
|
+
tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
|
1711
|
+
$tot_logo_mat.heatmap(:col_header => $amino_acids,
|
1712
|
+
:row_header => row_header,
|
1713
|
+
:rvg_width => $rvg_width,
|
1714
|
+
:rvg_height => $rvg_height,
|
1715
|
+
:canvas_width => $canvas_width,
|
1716
|
+
:canvas_height => $canvas_height,
|
1717
|
+
:gradient_beg_color => '#0000FF',
|
1718
|
+
:gradient_mid_color => '#FFFFFF',
|
1719
|
+
:gradient_end_color => '#FF0000',
|
1720
|
+
:max_val => tot_abs_max_val.ceil,
|
1721
|
+
:mid_val => 0,
|
1722
|
+
:min_val => -1 * tot_abs_max_val.ceil,
|
1723
|
+
:print_value => $heatmapvalues,
|
1724
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1725
|
+
|
1726
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1727
|
+
end
|
1728
|
+
|
1729
|
+
$logger.info "Calculating log odds ratios done."
|
1730
|
+
end
|
1731
|
+
|
1732
|
+
#
|
1733
|
+
# Part 7. END
|
1734
|
+
#
|
1735
|
+
|
1736
|
+
$outfh.close
|
1737
|
+
exit 0
|
1738
|
+
end
|
1739
|
+
end
|
1740
|
+
|
1741
|
+
end # class CLI
|
1742
|
+
end # module Ulla
|