semin-ulla 0.9.6 → 0.9.7
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/Manifest.txt +0 -1
- data/README.rdoc +9 -8
- data/lib/narray_extensions.rb +9 -6
- data/lib/nmatrix_extensions.rb +10 -8
- data/lib/ulla.rb +1 -1
- data/lib/ulla/cli.rb +372 -274
- data/lib/ulla/environment.rb +8 -4
- data/lib/ulla/environment_class_hash.rb +6 -2
- data/lib/ulla/environment_feature.rb +5 -5
- data/lib/ulla/environment_feature_array.rb +8 -0
- data/script/console +3 -3
- data/ulla.gemspec +9 -10
- metadata +7 -10
- data/.autotest +0 -5
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
== 0.9.7 30/05/2009
|
2
|
+
|
3
|
+
* Added --environment option to consider not only substituted amino acids' environments but also substituting amino acids' environments
|
4
|
+
* Changed the default pseudocount from (1 / # of environment classes) to 0; a pseudocount now needs to be explicitly provided with the --add option
|
5
|
+
* Changed the fixed column size (7) of raw count matrices to vary depending on the number of digits of a maximum amino acid count
|
6
|
+
* Fixed a bug in reading an environment feature definition file (empty line)
|
7
|
+
|
8
|
+
== 0.9.6 18/03/2009
|
9
|
+
|
10
|
+
* Warns if Rmagick is not properly installed or missing.
|
11
|
+
|
1
12
|
== 0.9.5 18/03/2009
|
2
13
|
|
3
14
|
* Fixed a bug in the order of requiring libraries
|
data/Manifest.txt
CHANGED
data/README.rdoc
CHANGED
@@ -18,7 +18,7 @@ http://www-cryst.bioc.cam.ac.uk/ulla
|
|
18
18
|
|
19
19
|
== Requirements
|
20
20
|
|
21
|
-
* ruby 1.8.7 or above (http://www.ruby-lang.org)
|
21
|
+
* ruby 1.8.7 or above (1.9.0 or above recommended, http://www.ruby-lang.org)
|
22
22
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems)
|
23
23
|
|
24
24
|
The following RubyGems will be automatically installed if you have rubygems installed on your machine
|
@@ -49,7 +49,10 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
49
49
|
--classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
|
50
50
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
51
51
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
52
|
-
--noweight: calculate substitution
|
52
|
+
--noweight: calculate substitution counts with no weights
|
53
|
+
--environment (-e) INTEGER:
|
54
|
+
0 for considering only substituted amino acids' environments (default)
|
55
|
+
1 for considering both substituted and substituting amino acids' environments
|
53
56
|
--smooth (-s) INTEGER:
|
54
57
|
0 for partial smoothing (default)
|
55
58
|
1 for full smoothing
|
@@ -60,14 +63,14 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
60
63
|
1 for both structure and sequence
|
61
64
|
2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
|
62
65
|
--output INTEGER:
|
63
|
-
0 for raw
|
66
|
+
0 for raw counts (no smoothing performed)
|
64
67
|
1 for probabilities
|
65
|
-
2 for log
|
68
|
+
2 for log-odds (default)
|
66
69
|
--noroundoff: do not round off log odds ratio
|
67
|
-
--scale INTEGER: log
|
70
|
+
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
68
71
|
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
69
72
|
--autosigma: automatically adjust the sigma value for smoothing
|
70
|
-
--add DOUBLE: add this value to raw
|
73
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
|
71
74
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
72
75
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
73
76
|
--heatmap INTEGER:
|
@@ -234,8 +237,6 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
234
237
|
|
235
238
|
== TODO
|
236
239
|
|
237
|
-
* Substitution table generation considering both the source and the target environmental classes
|
238
|
-
|
239
240
|
== Repository
|
240
241
|
|
241
242
|
You can download a pre-built RubyGems package from
|
data/lib/narray_extensions.rb
CHANGED
@@ -4,16 +4,19 @@ require 'facets'
|
|
4
4
|
|
5
5
|
module NArrayExtensions
|
6
6
|
|
7
|
-
def pretty_string(
|
8
|
-
{
|
9
|
-
|
7
|
+
def pretty_string(options={})
|
8
|
+
opts = {:col_header => nil,
|
9
|
+
:row_header => nil,
|
10
|
+
:col_size => 7}.merge(options)
|
10
11
|
|
11
|
-
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
12
|
+
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
13
|
+
s + ("%#{opts[:col_size]}s" % a)
|
14
|
+
} + "\n" +
|
12
15
|
self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
|
13
16
|
if v.is_a? Float
|
14
|
-
s + ("
|
17
|
+
s + ("%#{opts[:col_size]}.2f" % v)
|
15
18
|
else
|
16
|
-
s + ("
|
19
|
+
s + ("%#{opts[:col_size]}d" % v)
|
17
20
|
end
|
18
21
|
}
|
19
22
|
end
|
data/lib/nmatrix_extensions.rb
CHANGED
@@ -13,17 +13,18 @@ end
|
|
13
13
|
module NMatrixExtensions
|
14
14
|
|
15
15
|
def pretty_string(options={})
|
16
|
-
opts = {:col_header
|
17
|
-
:row_header
|
16
|
+
opts = {:col_header => nil,
|
17
|
+
:row_header => nil,
|
18
|
+
:col_size => 7}.merge(options)
|
18
19
|
|
19
20
|
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
20
|
-
s + ("
|
21
|
+
s + ("%#{opts[:col_size]}s" % a)
|
21
22
|
} + "\n" + self.to_a.map_with_index { |a, i|
|
22
23
|
("%-3s" % opts[:row_header][i]) + a.inject("") { |s, v|
|
23
24
|
if v.is_a? Float
|
24
|
-
s + ("
|
25
|
+
s + ("%#{opts[:col_size]}.2f" % v)
|
25
26
|
else
|
26
|
-
s + ("
|
27
|
+
s + ("%#{opts[:col_size]}d" % v)
|
27
28
|
end
|
28
29
|
}
|
29
30
|
}.join("\n")
|
@@ -67,6 +68,7 @@ module NMatrixExtensions
|
|
67
68
|
:title? => true,
|
68
69
|
:title => '',
|
69
70
|
:title_font_size => 35,
|
71
|
+
:title_font_scale => 1.0,
|
70
72
|
:print_value => false,
|
71
73
|
:key_font_size => 15,
|
72
74
|
:value_font_size => 8,
|
@@ -75,15 +77,15 @@ module NMatrixExtensions
|
|
75
77
|
RVG::dpi = opts[:dpi]
|
76
78
|
|
77
79
|
rvg = RVG.new(opts[:rvg_width], opts[:rvg_height]) do |canvas|
|
78
|
-
title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * 0
|
79
|
-
title_y = opts[:header_height] - opts[:title_font_size] *
|
80
|
+
title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * opts[:title_font_scale] / 2.0) / 2.0
|
81
|
+
title_y = opts[:header_height] - opts[:title_font_size] * opts[:title_font_scale]
|
80
82
|
|
81
83
|
canvas.viewbox(0, 0, opts[:canvas_width], opts[:canvas_height])
|
82
84
|
canvas.background_fill = opts[:background]
|
83
85
|
canvas.desc = opts[:title]
|
84
86
|
|
85
87
|
if opts[:title?]
|
86
|
-
canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size])
|
88
|
+
canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size] * opts[:title_font_scale])
|
87
89
|
end
|
88
90
|
|
89
91
|
# border for whole matrix
|
data/lib/ulla.rb
CHANGED
data/lib/ulla/cli.rb
CHANGED
@@ -39,6 +39,9 @@ Options:
|
|
39
39
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
40
40
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
41
41
|
--noweight: calculate substitution counts with no weights
|
42
|
+
--environment (-e) INTEGER:
|
43
|
+
0 for considering only substituted amino acids' environments (default)
|
44
|
+
1 for considering both substituted and substituting amino acids' environments
|
42
45
|
--smooth (-s) INTEGER:
|
43
46
|
0 for partial smoothing (default)
|
44
47
|
1 for full smoothing
|
@@ -56,7 +59,7 @@ Options:
|
|
56
59
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
57
60
|
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
58
61
|
--autosigma: automatically adjust the sigma value for smoothing
|
59
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default
|
62
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
|
60
63
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
61
64
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
62
65
|
--heatmap INTEGER:
|
@@ -88,22 +91,23 @@ Options:
|
|
88
91
|
# :call-seq:
|
89
92
|
# Ulla::CLI::calculate_pid(seq1, seq2) -> Float
|
90
93
|
#
|
91
|
-
def calculate_pid(seq1, seq2)
|
92
|
-
aas1 = seq1.
|
93
|
-
aas2 = seq2.
|
94
|
+
def calculate_pid(seq1, seq2, unit)
|
95
|
+
aas1 = seq1.scan(/\w{#{unit}}/)
|
96
|
+
aas2 = seq2.scan(/\w{#{unit}}/)
|
94
97
|
cols = aas1.zip(aas2)
|
98
|
+
gap = ($gap || '-') * unit
|
95
99
|
align = 0 # no. of aligned columns
|
96
100
|
ident = 0 # no. of identical columns
|
97
101
|
intgp = 0 # no. of internal gaps
|
98
102
|
|
99
103
|
cols.each do |col|
|
100
|
-
if (col[0] !=
|
104
|
+
if (col[0] != gap) && (col[1] != gap)
|
101
105
|
align += 1
|
102
106
|
if col[0] == col[1]
|
103
107
|
ident += 1
|
104
108
|
end
|
105
|
-
elsif (((col[0] ==
|
106
|
-
((col[0] !=
|
109
|
+
elsif (((col[0] == gap) && (col[1] != gap)) ||
|
110
|
+
((col[0] != gap) && (col[1] == gap)))
|
107
111
|
intgp += 1
|
108
112
|
end
|
109
113
|
end
|
@@ -148,8 +152,11 @@ Options:
|
|
148
152
|
|
149
153
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
150
154
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
155
|
+
$gap = '-'
|
151
156
|
$tem_list = nil
|
152
157
|
$tem_file = nil
|
158
|
+
$environment = 0
|
159
|
+
$col_size = nil
|
153
160
|
$classdef = 'classdef.dat'
|
154
161
|
$outfile = 'allmat.dat'
|
155
162
|
$outfh = nil # file handle for outfile
|
@@ -176,7 +183,7 @@ Options:
|
|
176
183
|
$heatmapcol = nil
|
177
184
|
$heatmapformat = 'png'
|
178
185
|
$heatmapstem = 'heatmaps'
|
179
|
-
$heatmapvalues
|
186
|
+
$heatmapvalues = false
|
180
187
|
$rvg_width = 550
|
181
188
|
$rvg_height = 650
|
182
189
|
$canvas_width = 550
|
@@ -189,7 +196,6 @@ Options:
|
|
189
196
|
$aa_mutb = {}
|
190
197
|
$aa_rel_mutb = {}
|
191
198
|
$aa_tot_freq = {}
|
192
|
-
$aa_env_cnt = Hash.new(0)
|
193
199
|
$smooth_prob = {}
|
194
200
|
$tot_cnt_mat = nil
|
195
201
|
$tot_prob_mat = nil
|
@@ -209,30 +215,32 @@ Options:
|
|
209
215
|
#
|
210
216
|
|
211
217
|
opts = GetoptLong.new(
|
212
|
-
[ '--help',
|
213
|
-
[ '--tem-list',
|
214
|
-
[ '--tem-file',
|
215
|
-
[ '--classdef',
|
216
|
-
[ '--
|
217
|
-
[ '--
|
218
|
-
[ '--
|
219
|
-
[ '--
|
220
|
-
[ '--
|
221
|
-
[ '--
|
222
|
-
[ '--
|
223
|
-
[ '--
|
224
|
-
[ '--
|
225
|
-
[ '--
|
226
|
-
[ '--heatmap
|
227
|
-
[ '--heatmap-
|
228
|
-
[ '--heatmap-
|
229
|
-
[ '--
|
230
|
-
[ '--
|
231
|
-
[ '--
|
232
|
-
[ '--
|
233
|
-
[ '--
|
234
|
-
[ '--
|
235
|
-
[ '--
|
218
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
219
|
+
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
220
|
+
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
221
|
+
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
222
|
+
[ '--environment', '-e', GetoptLong::REQUIRED_ARGUMENT ],
|
223
|
+
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
224
|
+
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
225
|
+
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
226
|
+
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
227
|
+
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
228
|
+
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
229
|
+
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
230
|
+
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
231
|
+
[ '--add', GetoptLong::REQUIRED_ARGUMENT ],
|
232
|
+
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
233
|
+
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
234
|
+
[ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
|
235
|
+
[ '--heatmap-columns', GetoptLong::REQUIRED_ARGUMENT ],
|
236
|
+
[ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
|
237
|
+
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
238
|
+
[ '--targetenv', '-t', GetoptLong::REQUIRED_ARGUMENT ],
|
239
|
+
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
240
|
+
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
241
|
+
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
242
|
+
[ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
|
243
|
+
[ '--version', GetoptLong::NO_ARGUMENT ]
|
236
244
|
)
|
237
245
|
|
238
246
|
begin
|
@@ -247,6 +255,8 @@ Options:
|
|
247
255
|
$tem_file = arg
|
248
256
|
when '--classdef'
|
249
257
|
$classdef = arg
|
258
|
+
when '--environment'
|
259
|
+
$environment = arg.to_i
|
250
260
|
when '--output'
|
251
261
|
$output = arg.to_i
|
252
262
|
when '--outfile'
|
@@ -335,7 +345,7 @@ Options:
|
|
335
345
|
exit 1
|
336
346
|
end
|
337
347
|
|
338
|
-
# warn if any input file is missing
|
348
|
+
# warn if any mandatory input file is missing
|
339
349
|
if $tem_list && !File.exist?($tem_list)
|
340
350
|
warn "Cannot find template list file, #{$tem_list}"
|
341
351
|
exit 1
|
@@ -372,12 +382,12 @@ Options:
|
|
372
382
|
# Reading Environment Class Definition File
|
373
383
|
#
|
374
384
|
|
375
|
-
#
|
385
|
+
# if --cys option is 2, then we don't care about 'J' (C is used for both cysteine and cystine)
|
376
386
|
if $cys == 2
|
377
|
-
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.
|
387
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.delete('J')
|
378
388
|
end
|
379
389
|
|
380
|
-
# create an
|
390
|
+
# create an EnvironmentFeatureArray object for storing all environment
|
381
391
|
# features
|
382
392
|
$env_features = EnvironmentFeatureArray.new
|
383
393
|
|
@@ -398,9 +408,9 @@ Options:
|
|
398
408
|
|
399
409
|
IO.foreach($classdef) do |line|
|
400
410
|
line.chomp!
|
401
|
-
if line.start_with?('#')
|
411
|
+
if line.start_with?('#') || line.blank?
|
402
412
|
next
|
403
|
-
elsif (env_ftr = line.
|
413
|
+
elsif (env_ftr = line.split(/;/)).length == 5
|
404
414
|
$logger.info "An environment feature, #{line} detected."
|
405
415
|
if env_ftr[-1] == 'T'
|
406
416
|
# skip silenced environment feature
|
@@ -418,23 +428,39 @@ Options:
|
|
418
428
|
env_ftr[4])
|
419
429
|
env_index += 1
|
420
430
|
else
|
421
|
-
$logger.error "\"#{line}\" doesn't seem to be a proper format for" +
|
422
|
-
"
|
431
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for " +
|
432
|
+
"an environment class definition."
|
423
433
|
exit 1
|
424
434
|
end
|
425
435
|
end
|
426
436
|
|
437
|
+
# set the size of amino acid column unit, extended gap
|
438
|
+
# and extended amino acid labels
|
439
|
+
$col_size = $environment == 1 ? $env_features.size : 1
|
440
|
+
$ext_gap = $gap * $col_size
|
441
|
+
$ext_amino_acids = []
|
442
|
+
|
427
443
|
# a hash for storing all environment classes
|
428
444
|
$env_classes = EnvironmentClassHash.new
|
429
445
|
|
430
446
|
# generate all possible combinations of environment labels, and store
|
431
447
|
# every environment class into the hash prepared above with the label
|
432
448
|
# as a key
|
433
|
-
$env_features.label_combinations.each_with_index
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
449
|
+
$env_features.label_combinations.each_with_index do |ef1, i|
|
450
|
+
key1 = ef1.flatten.join
|
451
|
+
$ext_amino_acids << key1
|
452
|
+
|
453
|
+
if $environment == 0
|
454
|
+
$env_classes[key1] = Environment.new(i, key1, $amino_acids)
|
455
|
+
else
|
456
|
+
# when considering both substituted and substituting amino acids' environments,
|
457
|
+
# add target (substituting) aa's environment label
|
458
|
+
$env_features.label_combinations_without_aa_type.each_with_index do |ef2, j|
|
459
|
+
key2 = key1 + "-" + ef2.flatten.join
|
460
|
+
$env_classes[key2] = Environment.new(i + j, key2, $amino_acids)
|
461
|
+
end
|
462
|
+
end
|
463
|
+
end
|
438
464
|
|
439
465
|
#
|
440
466
|
# Part 3 END
|
@@ -512,9 +538,7 @@ Options:
|
|
512
538
|
if env_labels[key].empty?
|
513
539
|
env_labels[key] = labels
|
514
540
|
else
|
515
|
-
env_labels[key].each_with_index { |e, i|
|
516
|
-
env_labels[key][i] = e + labels[i]
|
517
|
-
}
|
541
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
518
542
|
end
|
519
543
|
end
|
520
544
|
end
|
@@ -523,97 +547,92 @@ Options:
|
|
523
547
|
|
524
548
|
if $noweight
|
525
549
|
ali.each_pair do |id1, seq1|
|
550
|
+
if $environment == 1
|
551
|
+
seq1 = seq1.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id1][pos] }.join
|
552
|
+
end
|
553
|
+
|
526
554
|
ali.each_pair do |id2, seq2|
|
527
555
|
if id1 != id2
|
528
|
-
|
529
|
-
|
530
|
-
|
556
|
+
if $environment == 1
|
557
|
+
seq2 = seq2.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
|
558
|
+
end
|
559
|
+
|
560
|
+
pid = calculate_pid(seq1, seq2, $col_size)
|
561
|
+
s1 = seq1.scan(/\S{#{$col_size}}/)
|
562
|
+
s2 = seq2.scan(/\S{#{$col_size}}/)
|
531
563
|
|
532
564
|
# check PID_MIN
|
533
565
|
if $pidmin && (pid < $pidmin)
|
534
|
-
$logger.info "Skip alignment between #{id1} and #{id2} "
|
535
|
-
"having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
566
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
536
567
|
next
|
537
568
|
end
|
538
569
|
|
539
570
|
# check PID_MAX
|
540
571
|
if $pidmax && (pid > $pidmax)
|
541
|
-
$logger.info "Skip alignment between #{id1} and #{id2} "
|
542
|
-
"having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
572
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
543
573
|
next
|
544
574
|
end
|
545
575
|
|
546
576
|
s1.each_with_index do |aa1, pos|
|
547
|
-
|
548
|
-
aa2 = s2[pos].upcase
|
577
|
+
aa2 = s2[pos]
|
549
578
|
|
550
579
|
if env_labels[id1][pos].include?('X')
|
551
|
-
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
|
580
|
+
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1[0].chr} were masked."
|
552
581
|
next
|
553
582
|
end
|
554
583
|
|
555
584
|
if env_labels[id2][pos].include?('X')
|
556
|
-
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
|
585
|
+
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2[0].chr} were masked."
|
557
586
|
next
|
558
587
|
end
|
559
588
|
|
560
|
-
unless $amino_acids.include?(aa1)
|
561
|
-
$logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 ==
|
589
|
+
unless $amino_acids.include?(aa1[0].chr)
|
590
|
+
$logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not a standard amino acid." unless aa1 == $ext_gap
|
562
591
|
next
|
563
592
|
end
|
564
593
|
|
565
|
-
unless $amino_acids.include?(aa2)
|
566
|
-
$logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 ==
|
594
|
+
unless $amino_acids.include?(aa2[0].chr)
|
595
|
+
$logger.warn "#{id1}-#{pos}-#{aa2[0].chr} is not a standard amino acid." unless aa2 == $ext_gap
|
567
596
|
next
|
568
597
|
end
|
569
598
|
|
570
|
-
aa1
|
571
|
-
aa2
|
599
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
600
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
601
|
+
env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
572
602
|
|
573
603
|
if $cst_features.empty?
|
574
|
-
$env_classes[
|
604
|
+
$env_classes[env_label].increase_residue_count(aa2[0].chr)
|
575
605
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
576
|
-
$env_classes[
|
606
|
+
$env_classes[env_label].increase_residue_count(aa2[0].chr)
|
577
607
|
else
|
578
|
-
$logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}
|
608
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
579
609
|
next
|
580
610
|
end
|
581
611
|
|
582
|
-
|
583
|
-
|
584
|
-
if $aa_env_cnt.has_key? grp_label
|
585
|
-
if $aa_env_cnt[grp_label].has_key? aa1
|
586
|
-
$aa_env_cnt[grp_label][aa1] += 1
|
587
|
-
else
|
588
|
-
$aa_env_cnt[grp_label][aa1] = 1
|
589
|
-
end
|
590
|
-
else
|
591
|
-
$aa_env_cnt[grp_label] = Hash.new(0)
|
592
|
-
$aa_env_cnt[grp_label][aa1] = 1
|
593
|
-
end
|
594
|
-
|
595
|
-
if $aa_tot_cnt.has_key? aa1
|
596
|
-
$aa_tot_cnt[aa1] += 1
|
597
|
-
else
|
598
|
-
$aa_tot_cnt[aa1] = 1
|
599
|
-
end
|
612
|
+
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += 1 : $aa_tot_cnt[aa1] = 1
|
613
|
+
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += 1 : $aa_mut_cnt[aa1] = 1 if aa1 != aa2
|
600
614
|
|
601
|
-
|
602
|
-
if $aa_mut_cnt.has_key? aa1
|
603
|
-
$aa_mut_cnt[aa1] += 1
|
604
|
-
else
|
605
|
-
$aa_mut_cnt[aa1] = 1
|
606
|
-
end
|
607
|
-
end
|
608
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
|
615
|
+
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (1) was added to the environments class, #{env_label}."
|
609
616
|
end
|
610
617
|
end
|
611
618
|
end
|
612
619
|
end
|
613
620
|
else
|
614
621
|
# BLOSUM-like weighting
|
615
|
-
clusters
|
616
|
-
|
622
|
+
clusters = []
|
623
|
+
ext_ali = Bio::Alignment::OriginalAlignment.new
|
624
|
+
|
625
|
+
ali.each_pair do |key, seq|
|
626
|
+
clusters << [key]
|
627
|
+
if $environment == 1
|
628
|
+
ext_seq = seq.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[key][pos] }.join
|
629
|
+
ext_ali.add_seq(ext_seq, key)
|
630
|
+
end
|
631
|
+
end
|
632
|
+
|
633
|
+
if $environment == 1
|
634
|
+
ali = ext_ali
|
635
|
+
end
|
617
636
|
|
618
637
|
# a loop for single linkage clustering
|
619
638
|
begin
|
@@ -624,7 +643,7 @@ Options:
|
|
624
643
|
found = false
|
625
644
|
clusters[i].each do |c1|
|
626
645
|
clusters[j].each do |c2|
|
627
|
-
if calculate_pid(ali[c1], ali[c2]) >= $weight
|
646
|
+
if calculate_pid(ali[c1], ali[c2], $col_size) >= $weight
|
628
647
|
indexes << j
|
629
648
|
found = true
|
630
649
|
break
|
@@ -655,102 +674,58 @@ Options:
|
|
655
674
|
clusters.combination(2).each do |cluster1, cluster2|
|
656
675
|
cluster1.each do |id1|
|
657
676
|
cluster2.each do |id2|
|
658
|
-
seq1 = ali[id1].
|
659
|
-
seq2 = ali[id2].
|
677
|
+
seq1 = ali[id1].scan(/\S{#{$col_size}}/)
|
678
|
+
seq2 = ali[id2].scan(/\S{#{$col_size}}/)
|
660
679
|
|
661
680
|
seq1.each_with_index do |aa1, pos|
|
662
|
-
|
663
|
-
aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
|
681
|
+
aa2 = seq2[pos]
|
664
682
|
|
665
683
|
if env_labels[id1][pos].include?('X')
|
666
|
-
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
|
684
|
+
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
|
667
685
|
next
|
668
686
|
end
|
669
687
|
|
670
688
|
if env_labels[id2][pos].include?('X')
|
671
|
-
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
|
689
|
+
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
|
672
690
|
next
|
673
691
|
end
|
674
692
|
|
675
|
-
unless $amino_acids.include?(aa1)
|
676
|
-
$logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 ==
|
693
|
+
unless $amino_acids.include?(aa1[0].chr)
|
694
|
+
$logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not standard amino acid." unless aa1 == $ext_gap
|
677
695
|
next
|
678
696
|
end
|
679
697
|
|
680
|
-
unless $amino_acids.include?(aa2)
|
681
|
-
$logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 ==
|
698
|
+
unless $amino_acids.include?(aa2[0].chr)
|
699
|
+
$logger.warn "#{id2}-#{pos}-#{aa2[0].chr} is not standard amino acid." unless aa2 == $ext_gap
|
682
700
|
next
|
683
701
|
end
|
684
702
|
|
685
|
-
aa1
|
686
|
-
aa2
|
687
|
-
cnt1
|
688
|
-
cnt2
|
689
|
-
jnt_cnt
|
703
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
704
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
705
|
+
cnt1 = 1.0 / cluster1.size.to_f
|
706
|
+
cnt2 = 1.0 / cluster2.size.to_f
|
707
|
+
jnt_cnt = cnt1 * cnt2
|
708
|
+
env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
709
|
+
env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
|
690
710
|
|
691
711
|
if $cst_features.empty?
|
692
|
-
$env_classes[
|
693
|
-
$env_classes[
|
712
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
713
|
+
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
694
714
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
695
|
-
$env_classes[
|
696
|
-
$env_classes[
|
715
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
716
|
+
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt) # aa1[0] is the residue letter; any following characters are its environment label
|
697
717
|
else
|
698
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}
|
718
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
699
719
|
next
|
700
720
|
end
|
701
721
|
|
702
|
-
|
703
|
-
|
722
|
+
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
723
|
+
$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
724
|
+
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1 != aa2 # a mutation is observed only when the two aligned residues differ
|
725
|
+
$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1 != aa2 # a mutation is observed only when the two aligned residues differ
|
704
726
|
|
705
|
-
|
706
|
-
|
707
|
-
$aa_env_cnt[grp_label1][aa1] += cnt1
|
708
|
-
else
|
709
|
-
$aa_env_cnt[grp_label1][aa1] = cnt1
|
710
|
-
end
|
711
|
-
else
|
712
|
-
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
713
|
-
$aa_env_cnt[grp_label1][aa1] = cnt1
|
714
|
-
end
|
715
|
-
|
716
|
-
if $aa_env_cnt.has_key? grp_label2
|
717
|
-
if $aa_env_cnt[grp_label2].has_key? aa2
|
718
|
-
$aa_env_cnt[grp_label2][aa2] += cnt2
|
719
|
-
else
|
720
|
-
$aa_env_cnt[grp_label2][aa2] = cnt2
|
721
|
-
end
|
722
|
-
else
|
723
|
-
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
724
|
-
$aa_env_cnt[grp_label2][aa2] = cnt2
|
725
|
-
end
|
726
|
-
|
727
|
-
if $aa_tot_cnt.has_key? aa1
|
728
|
-
$aa_tot_cnt[aa1] += cnt1
|
729
|
-
else
|
730
|
-
$aa_tot_cnt[aa1] = cnt1
|
731
|
-
end
|
732
|
-
|
733
|
-
if $aa_tot_cnt.has_key? aa2
|
734
|
-
$aa_tot_cnt[aa2] += cnt2
|
735
|
-
else
|
736
|
-
$aa_tot_cnt[aa2] = cnt2
|
737
|
-
end
|
738
|
-
|
739
|
-
if aa1 != aa2
|
740
|
-
if $aa_mut_cnt.has_key? aa1
|
741
|
-
$aa_mut_cnt[aa1] += cnt1
|
742
|
-
else
|
743
|
-
$aa_mut_cnt[aa1] = cnt1
|
744
|
-
end
|
745
|
-
if $aa_mut_cnt.has_key? aa2
|
746
|
-
$aa_mut_cnt[aa2] += cnt2
|
747
|
-
else
|
748
|
-
$aa_mut_cnt[aa2] = cnt2
|
749
|
-
end
|
750
|
-
end
|
751
|
-
|
752
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
753
|
-
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
727
|
+
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
|
728
|
+
$logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
|
754
729
|
end
|
755
730
|
end
|
756
731
|
end
|
@@ -799,66 +774,108 @@ HEADER
|
|
799
774
|
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
800
775
|
end
|
801
776
|
|
777
|
+
if $environment == 0
|
778
|
+
$outfh.puts '#'
|
779
|
+
$outfh.puts '# Considered environments: substituted a.a.'
|
780
|
+
else
|
781
|
+
$outfh.puts '#'
|
782
|
+
$outfh.puts '# Considered environments: substituted a.a. and substituting a.a.'
|
783
|
+
end
|
784
|
+
|
802
785
|
# calculate amino acid frequencies and mutabilities, and
|
803
786
|
# print them as default statistics in the header part
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
0
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
787
|
+
if $environment == 0
|
788
|
+
ala_factor = if $aa_tot_cnt['A'] == 0
|
789
|
+
0.0
|
790
|
+
elsif $aa_mut_cnt['A'] == 0
|
791
|
+
0.0
|
792
|
+
else
|
793
|
+
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
794
|
+
end
|
795
|
+
end
|
796
|
+
|
797
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
812
798
|
|
813
799
|
$outfh.puts '#'
|
814
800
|
$outfh.puts "# Total amino acid frequencies:\n"
|
815
|
-
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
816
801
|
|
817
|
-
|
818
|
-
|
802
|
+
if $environment == 0
|
803
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
804
|
+
else
|
805
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9s %9s %8s" % %w[RES ENV TOT_OBS MUT_OBS REL_FREQ]
|
806
|
+
end
|
819
807
|
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
808
|
+
min_cnt = 0
|
809
|
+
min_sigma = nil
|
810
|
+
aas = $environment == 0 ? $amino_acids : $ext_amino_acids
|
811
|
+
|
812
|
+
aas.each do |aa|
|
813
|
+
if ($aa_tot_cnt[aa] / $sigma) < $min_cnt_sigma_ratio
|
814
|
+
if $aa_tot_cnt[aa] > 0 and min_cnt > $aa_tot_cnt[aa]
|
815
|
+
min_cnt = $aa_tot_cnt[aa]
|
816
|
+
elsif min_cnt == 0
|
817
|
+
min_cnt = 1
|
828
818
|
end
|
829
819
|
|
830
|
-
|
831
|
-
end
|
820
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
832
821
|
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
822
|
+
if $environment == 0
|
823
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
|
824
|
+
"the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa}."
|
825
|
+
else
|
826
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
|
827
|
+
"the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa[0].chr} under the environment class #{aa[1..-1]}."
|
828
|
+
end
|
829
|
+
end
|
837
830
|
|
838
|
-
|
839
|
-
|
840
|
-
$
|
841
|
-
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
842
|
-
else
|
843
|
-
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
|
844
|
-
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
831
|
+
if $environment == 0
|
832
|
+
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 1.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
|
833
|
+
$aa_rel_mutb[aa] = $aa_mutb[aa] * ala_factor
|
845
834
|
end
|
835
|
+
|
836
|
+
$aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
|
846
837
|
end
|
847
838
|
|
848
|
-
if min_cnt >
|
839
|
+
if min_cnt > 0
|
849
840
|
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
841
|
+
|
850
842
|
if $autosigma
|
851
843
|
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
852
844
|
$sigma = min_sigma
|
853
845
|
end
|
854
846
|
end
|
855
847
|
|
848
|
+
aas.each do |aa|
|
849
|
+
columns = $environment == 0 ?
|
850
|
+
[aa, $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_mutb[aa], $aa_rel_mutb[aa], $aa_tot_freq[aa]] :
|
851
|
+
[aa[0].chr, aa[1..-1], $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_tot_freq[aa]]
|
852
|
+
|
853
|
+
if $noweight
|
854
|
+
if $environment == 0
|
855
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % columns
|
856
|
+
else
|
857
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
|
858
|
+
end
|
859
|
+
else
|
860
|
+
if $environment == 0
|
861
|
+
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % columns
|
862
|
+
else
|
863
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9.2f %9.2f %8.4f" % columns
|
864
|
+
end
|
865
|
+
end
|
866
|
+
end
|
867
|
+
|
856
868
|
$outfh.puts '#'
|
857
869
|
$outfh.puts '# RES: Amino acid one letter code'
|
870
|
+
$outfh.puts '# ENV: Environment label of amino acid'
|
858
871
|
$outfh.puts '# TOT_OBS: Total count of incidence'
|
859
872
|
$outfh.puts '# MUT_OBS: Total count of mutation'
|
860
|
-
|
861
|
-
$
|
873
|
+
|
874
|
+
if $environment == 0
|
875
|
+
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
876
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
|
877
|
+
end
|
878
|
+
|
862
879
|
$outfh.puts '# REL_FREQ: Relative frequency'
|
863
880
|
$outfh.puts '#'
|
864
881
|
|
@@ -872,7 +889,7 @@ HEADER
|
|
872
889
|
# Generating substitution frequency matrices
|
873
890
|
#
|
874
891
|
|
875
|
-
# calculating probabilities for each environment
|
892
|
+
# calculating probabilities for each environment class
|
876
893
|
$env_classes.values.each do |e|
|
877
894
|
if e.freq_array.sum != 0
|
878
895
|
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
@@ -880,12 +897,12 @@ HEADER
|
|
880
897
|
end
|
881
898
|
|
882
899
|
# count raw frequencies
|
883
|
-
$tot_cnt_mat = NMatrix.
|
900
|
+
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
884
901
|
group_matrices = []
|
885
902
|
|
886
903
|
# for each combination of environment features
|
887
904
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
888
|
-
grp_cnt_mat = NMatrix.
|
905
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
889
906
|
|
890
907
|
$amino_acids.each_with_index do |aa, aj|
|
891
908
|
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
@@ -901,6 +918,8 @@ HEADER
|
|
901
918
|
if $output == 0
|
902
919
|
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
903
920
|
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
|
921
|
+
aa_max_cnt = $aa_tot_cnt.to_a.map { |k, v| v }.max
|
922
|
+
mat_col_size = aa_max_cnt.floor.to_s.size + 4
|
904
923
|
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
905
924
|
|
906
925
|
group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
|
@@ -908,7 +927,8 @@ HEADER
|
|
908
927
|
stem = "#{grp_no}. #{grp_label}"
|
909
928
|
$outfh.puts ">#{grp_label} #{grp_no}"
|
910
929
|
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
|
911
|
-
:row_header => $amino_acids
|
930
|
+
:row_header => $amino_acids,
|
931
|
+
:col_size => mat_col_size > 7 ? mat_col_size : 7)
|
912
932
|
|
913
933
|
# for a heat map
|
914
934
|
if $heatmap == 0 or $heatmap == 2
|
@@ -956,7 +976,8 @@ HEADER
|
|
956
976
|
# total
|
957
977
|
$outfh.puts '>Total'
|
958
978
|
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
959
|
-
:row_header => $amino_acids
|
979
|
+
:row_header => $amino_acids,
|
980
|
+
:col_size => mat_col_size > 7 ? mat_col_size : 7)
|
960
981
|
|
961
982
|
if $heatmap == 0 or $heatmap == 2
|
962
983
|
stem = "#{group_matrices.size}. TOTAL"
|
@@ -999,23 +1020,28 @@ HEADER
|
|
999
1020
|
|
1000
1021
|
# when nosmoothing !!!
|
1001
1022
|
if ($output > 0) && $nosmooth
|
1002
|
-
# reinitialize $tot_cnt_mat for pseudocounts
|
1003
1023
|
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1004
1024
|
|
1005
|
-
#
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
$env_classes.values.each { |e| e.freq_array += pseudo_cnt }
|
1025
|
+
# if pseudo count provided, reinitialize $tot_cnt_mat by adding pseudocounts
|
1026
|
+
if $add
|
1027
|
+
$env_classes.values.each { |e| e.freq_array += $add }
|
1028
|
+
end
|
1010
1029
|
|
1011
1030
|
# re-calculate probability vector for each environment class
|
1012
|
-
$env_classes.values.each
|
1031
|
+
$env_classes.values.each do |e|
|
1032
|
+
if e.freq_array.sum == 0
|
1033
|
+
# if no observation, then probabilities are zeros, too
|
1034
|
+
e.prob_array = e.freq_array
|
1035
|
+
else
|
1036
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum.to_f
|
1037
|
+
end
|
1038
|
+
end
|
1013
1039
|
|
1014
1040
|
group_matrices = []
|
1015
1041
|
|
1016
1042
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1017
|
-
grp_cnt_mat
|
1018
|
-
grp_prob_mat
|
1043
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1044
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1019
1045
|
|
1020
1046
|
$amino_acids.each_with_index do |aa, aj|
|
1021
1047
|
env_class = group[1].find { |e| e.label.start_with?(aa) }
|
@@ -1039,7 +1065,6 @@ HEADER
|
|
1039
1065
|
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1040
1066
|
:row_header => $amino_acids)
|
1041
1067
|
|
1042
|
-
|
1043
1068
|
# for a heat map
|
1044
1069
|
if $heatmap == 0 or $heatmap == 2
|
1045
1070
|
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
@@ -1134,12 +1159,24 @@ HEADER
|
|
1134
1159
|
|
1135
1160
|
if ($smooth == :full) || $p1smooth
|
1136
1161
|
# smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
|
1137
|
-
0.upto($amino_acids.size - 1)
|
1162
|
+
0.upto($amino_acids.size - 1) do |i|
|
1163
|
+
if $environment == 0
|
1164
|
+
p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]])
|
1165
|
+
else
|
1166
|
+
p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum)
|
1167
|
+
end
|
1168
|
+
end
|
1138
1169
|
$smooth_prob[1] = p1
|
1139
1170
|
elsif ($smooth == :partial)
|
1140
1171
|
# no smoothing for p1 probabilities just as Kenji's subst
|
1141
1172
|
# in this case, p1 probabilities were taken from the amino acid frequencies of your data set
|
1142
|
-
0.upto($amino_acids.size - 1)
|
1173
|
+
0.upto($amino_acids.size - 1) do |i|
|
1174
|
+
if $environment == 0
|
1175
|
+
p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]]
|
1176
|
+
else
|
1177
|
+
p1[i] = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
|
1178
|
+
end
|
1179
|
+
end
|
1143
1180
|
$smooth_prob[1] = p1
|
1144
1181
|
end
|
1145
1182
|
|
@@ -1148,6 +1185,10 @@ HEADER
|
|
1148
1185
|
#
|
1149
1186
|
env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
1150
1187
|
|
1188
|
+
if $environment == 1
|
1189
|
+
env_labels += $env_features[1..-1].map_with_index { |ef, ei| ef.labels.map { |l| "#{ei + $env_features.size}#{l}" } }
|
1190
|
+
end
|
1191
|
+
|
1151
1192
|
if $smooth == :partial
|
1152
1193
|
$outfh.puts <<HEADER
|
1153
1194
|
#
|
@@ -1189,9 +1230,9 @@ HEADER
|
|
1189
1230
|
# sigma value used is: #{$sigma}
|
1190
1231
|
#
|
1191
1232
|
HEADER
|
1192
|
-
1.upto(
|
1233
|
+
1.upto(env_labels.size) do |ci|
|
1193
1234
|
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
1194
|
-
if (ci > 2) && (ci <
|
1235
|
+
if (ci > 2) && (ci < env_labels.size)
|
1195
1236
|
$logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
|
1196
1237
|
next
|
1197
1238
|
end
|
@@ -1200,6 +1241,10 @@ HEADER
|
|
1200
1241
|
c1[0].product(*c1[1..-1]).each do |labels|
|
1201
1242
|
pattern = '.' * $env_features.size
|
1202
1243
|
|
1244
|
+
if $environment == 1
|
1245
|
+
pattern += '.' * ($env_features.size - 1)
|
1246
|
+
end
|
1247
|
+
|
1203
1248
|
labels.each do |label|
|
1204
1249
|
i = label[0].chr.to_i
|
1205
1250
|
l = label[1].chr
|
@@ -1211,12 +1256,22 @@ HEADER
|
|
1211
1256
|
next
|
1212
1257
|
end
|
1213
1258
|
|
1259
|
+
if $environment == 1
|
1260
|
+
pattern[$env_features.size, 0] = "-"
|
1261
|
+
end
|
1262
|
+
|
1214
1263
|
# get environments matching the pattern created above
|
1215
1264
|
# and calculate amino acid frequencies and their probabilities for all the environments
|
1216
|
-
envs = $env_classes.values.select { |env| env.label.match(pattern
|
1265
|
+
envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
|
1217
1266
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1218
1267
|
prob_arr = NArray.float($amino_acids.size)
|
1219
|
-
0.upto($amino_acids.size - 1)
|
1268
|
+
0.upto($amino_acids.size - 1) do |i|
|
1269
|
+
if freq_arr.sum == 0
|
1270
|
+
prob_arr[i] = 0
|
1271
|
+
else
|
1272
|
+
prob_arr[i] = freq_arr[i] / freq_arr.sum.to_f
|
1273
|
+
end
|
1274
|
+
end
|
1220
1275
|
|
1221
1276
|
# # assess whether a residue type j is compatible with a particular combination of structural features
|
1222
1277
|
# # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
|
@@ -1254,29 +1309,23 @@ HEADER
|
|
1254
1309
|
if ci == 1
|
1255
1310
|
priors << $smooth_prob[1]
|
1256
1311
|
elsif ci == 2
|
1257
|
-
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each
|
1312
|
+
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each do |c3|
|
1258
1313
|
priors << $smooth_prob[2][c3.to_set]
|
1259
|
-
|
1260
|
-
elsif ci ==
|
1261
|
-
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each
|
1314
|
+
end
|
1315
|
+
elsif ci == env_labels.size
|
1316
|
+
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each do |c3|
|
1262
1317
|
priors << $smooth_prob[3][c3.to_set]
|
1263
|
-
|
1318
|
+
end
|
1264
1319
|
end
|
1265
1320
|
|
1266
|
-
# entropy based prior
|
1267
|
-
entropy_max =
|
1268
|
-
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
1269
|
-
|
1270
|
-
p == 0.0 ? s - 1 : s + p * Math::log(p)
|
1271
|
-
rescue
|
1272
|
-
#puts "P: #{p}"
|
1273
|
-
end
|
1274
|
-
} }
|
1275
|
-
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
1321
|
+
# entropy based weighting prior step
|
1322
|
+
entropy_max = NMath::log($amino_acids.size)
|
1323
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
|
1324
|
+
mod_entropies = entropies.map { |entropy| (entropy_max - entropy) / entropy_max }
|
1276
1325
|
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
1277
1326
|
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
1278
1327
|
|
1279
|
-
# smoothing step
|
1328
|
+
# actual smoothing step
|
1280
1329
|
smooth_prob_arr = NArray.float($amino_acids.size)
|
1281
1330
|
big_N = freq_arr.sum.to_f
|
1282
1331
|
small_n = $amino_acids.size.to_f
|
@@ -1285,8 +1334,8 @@ HEADER
|
|
1285
1334
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1286
1335
|
|
1287
1336
|
# normalization step
|
1288
|
-
|
1289
|
-
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] /
|
1337
|
+
total = smooth_prob_arr.sum
|
1338
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
|
1290
1339
|
|
1291
1340
|
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1292
1341
|
if $smooth_prob.has_key?(ci + 1)
|
@@ -1331,36 +1380,47 @@ HEADER
|
|
1331
1380
|
#
|
1332
1381
|
HEADER
|
1333
1382
|
# full smoothing
|
1334
|
-
1.upto(
|
1383
|
+
1.upto(env_labels.size) do |ci|
|
1335
1384
|
env_labels.combination(ci) do |c1|
|
1336
1385
|
c1[0].product(*c1[1..-1]).each do |labels|
|
1386
|
+
|
1337
1387
|
pattern = '.' * $env_features.size
|
1388
|
+
|
1389
|
+
if $environment == 1
|
1390
|
+
pattern += '.' * ($env_features.size - 1)
|
1391
|
+
end
|
1392
|
+
|
1338
1393
|
labels.each do |label|
|
1339
1394
|
j = label[0].chr.to_i
|
1340
1395
|
l = label[1].chr
|
1341
1396
|
pattern[j] = l
|
1342
1397
|
end
|
1343
1398
|
|
1399
|
+
if $environment == 1
|
1400
|
+
pattern[$env_features.size, 0] = "-"
|
1401
|
+
end
|
1402
|
+
|
1344
1403
|
# get environments, frequencies, and probabilities
|
1345
|
-
envs = $env_classes.values.select { |env| env.label.match(pattern
|
1404
|
+
envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
|
1346
1405
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1347
1406
|
prob_arr = NArray.float($amino_acids.size)
|
1348
1407
|
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
1349
1408
|
|
1350
1409
|
# collect priors
|
1351
|
-
priors
|
1410
|
+
priors = []
|
1411
|
+
|
1352
1412
|
if ci > 1
|
1353
1413
|
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
1354
1414
|
else
|
1355
1415
|
priors << $smooth_prob[1]
|
1356
1416
|
end
|
1357
1417
|
|
1358
|
-
# entropy based weighting priors
|
1359
|
-
entropy_max
|
1360
|
-
entropies
|
1361
|
-
|
1362
|
-
|
1363
|
-
weighted_priors = priors.map_with_index { |
|
1418
|
+
# entropy based weighting priors step
|
1419
|
+
entropy_max = NMath::log($amino_acids.size)
|
1420
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
|
1421
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
1422
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
1423
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
1364
1424
|
|
1365
1425
|
# smoothing step
|
1366
1426
|
smooth_prob_arr = NArray.float($amino_acids.size)
|
@@ -1371,8 +1431,8 @@ HEADER
|
|
1371
1431
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1372
1432
|
|
1373
1433
|
# normalization step
|
1374
|
-
|
1375
|
-
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] /
|
1434
|
+
total = smooth_prob_arr.sum
|
1435
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
|
1376
1436
|
|
1377
1437
|
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1378
1438
|
if $smooth_prob.has_key?(ci + 1)
|
@@ -1389,7 +1449,7 @@ HEADER
|
|
1389
1449
|
|
1390
1450
|
# updating smoothed probability array for each environment
|
1391
1451
|
$env_classes.values.each do |env|
|
1392
|
-
env.smooth_prob_array = $smooth_prob[
|
1452
|
+
env.smooth_prob_array = $smooth_prob[env_labels.size + 1][env.label_set]
|
1393
1453
|
end
|
1394
1454
|
|
1395
1455
|
# sorting environments and build 21X21 substitution matrices
|
@@ -1526,7 +1586,7 @@ HEADER
|
|
1526
1586
|
end
|
1527
1587
|
|
1528
1588
|
grp_logo_mats = []
|
1529
|
-
factor = $scale /
|
1589
|
+
factor = $scale / NMath::log(2)
|
1530
1590
|
|
1531
1591
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1532
1592
|
# calculating substitution probability matrix for each environment
|
@@ -1536,6 +1596,11 @@ HEADER
|
|
1536
1596
|
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1537
1597
|
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1538
1598
|
|
1599
|
+
if $environment == 1
|
1600
|
+
# parse substituting aa's environment label
|
1601
|
+
tgt_label = grp_label.split('-').last
|
1602
|
+
end
|
1603
|
+
|
1539
1604
|
$amino_acids.each_with_index do |aa, aj|
|
1540
1605
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1541
1606
|
env.logo_array = $cys == 0 ?
|
@@ -1543,19 +1608,29 @@ HEADER
|
|
1543
1608
|
NArray.float($amino_acids.size)
|
1544
1609
|
|
1545
1610
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1546
|
-
|
1611
|
+
if $environment == 0
|
1612
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1613
|
+
else
|
1614
|
+
pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
|
1615
|
+
end
|
1616
|
+
|
1547
1617
|
odds = prob / pai
|
1548
|
-
env.logo_array[ai] = factor *
|
1618
|
+
env.logo_array[ai] = factor * NMath::log(odds)
|
1549
1619
|
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
1550
1620
|
end
|
1551
1621
|
|
1552
1622
|
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1553
1623
|
if $cys == 0
|
1554
|
-
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
|
1624
|
+
if $environment == 0
|
1625
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1626
|
+
else
|
1627
|
+
pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
|
1628
|
+
$aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
|
1629
|
+
end
|
1630
|
+
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1631
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1632
|
+
odds = prob / pai
|
1633
|
+
env.logo_array[$amino_acids.size] = factor * NMath::log(odds)
|
1559
1634
|
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
1560
1635
|
end
|
1561
1636
|
end
|
@@ -1569,22 +1644,32 @@ HEADER
|
|
1569
1644
|
|
1570
1645
|
$amino_acids.each_with_index do |aa1, aj|
|
1571
1646
|
$amino_acids.each_with_index do |aa2, ai|
|
1572
|
-
prob
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1647
|
+
prob = $tot_prob_mat[aj, ai]
|
1648
|
+
|
1649
|
+
if $environment == 0
|
1650
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1651
|
+
else
|
1652
|
+
pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
|
1653
|
+
end
|
1654
|
+
|
1655
|
+
odds = prob / pai
|
1656
|
+
$tot_logo_mat[aj, ai] = factor * NMath::log(odds)
|
1576
1657
|
end
|
1577
1658
|
|
1578
1659
|
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1579
1660
|
if $cys == 0
|
1580
|
-
|
1661
|
+
if $environment == 0
|
1662
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1663
|
+
else
|
1664
|
+
pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
|
1665
|
+
$aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
|
1666
|
+
end
|
1581
1667
|
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1582
1668
|
odds = prob / pai
|
1583
|
-
$tot_logo_mat[aj, $amino_acids.size] = factor *
|
1669
|
+
$tot_logo_mat[aj, $amino_acids.size] = factor * NMath::log(odds)
|
1584
1670
|
end
|
1585
1671
|
end
|
1586
1672
|
|
1587
|
-
|
1588
1673
|
# calculating relative entropy for each amino acid pair H and
|
1589
1674
|
# the expected score E in bit units
|
1590
1675
|
tot_E = 0.0
|
@@ -1593,10 +1678,22 @@ HEADER
|
|
1593
1678
|
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1594
1679
|
0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
|
1595
1680
|
if j != i
|
1596
|
-
|
1681
|
+
if $environment == 0
|
1682
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
|
1683
|
+
else
|
1684
|
+
tot_E += $tot_logo_mat[j, i] *
|
1685
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
|
1686
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum / 2.0
|
1687
|
+
end
|
1597
1688
|
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
|
1598
1689
|
else
|
1599
|
-
|
1690
|
+
if $environment == 0
|
1691
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
|
1692
|
+
else
|
1693
|
+
tot_E += $tot_logo_mat[j, i] *
|
1694
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
|
1695
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
|
1696
|
+
end
|
1600
1697
|
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
|
1601
1698
|
end
|
1602
1699
|
end
|
@@ -1662,9 +1759,9 @@ HEADER
|
|
1662
1759
|
heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1663
1760
|
:row_header => row_header,
|
1664
1761
|
:rvg_width => $rvg_width,
|
1665
|
-
:rvg_height => $rvg_height
|
1762
|
+
:rvg_height => $rvg_height,
|
1666
1763
|
:canvas_width => $canvas_width,
|
1667
|
-
:canvas_height => $canvas_height
|
1764
|
+
:canvas_height => $canvas_height,
|
1668
1765
|
:gradient_beg_color => '#0000FF',
|
1669
1766
|
:gradient_mid_color => '#FFFFFF',
|
1670
1767
|
:gradient_end_color => '#FF0000',
|
@@ -1674,6 +1771,7 @@ HEADER
|
|
1674
1771
|
:print_value => $heatmapvalues,
|
1675
1772
|
:print_gradient => false,
|
1676
1773
|
:title => stem,
|
1774
|
+
:title_font_scale => 1.0,
|
1677
1775
|
:title_font_size => title_font_size)
|
1678
1776
|
end
|
1679
1777
|
end
|