ulla 0.9.6 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/README.rdoc +1 -1
- data/lib/narray_extensions.rb +9 -6
- data/lib/nmatrix_extensions.rb +10 -8
- data/lib/ulla.rb +1 -1
- data/lib/ulla/cli.rb +372 -274
- data/lib/ulla/environment.rb +8 -4
- data/lib/ulla/environment_class_hash.rb +6 -2
- data/lib/ulla/environment_feature.rb +5 -5
- data/lib/ulla/environment_feature_array.rb +8 -0
- data/script/console +3 -3
- data/ulla.gemspec +2 -2
- metadata +7 -5
data/History.txt
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
== 0.9.7 30/05/2009
|
2
|
+
|
3
|
+
* Added --environment option to consider not only substituted amino acids' environments but also substituting amino acids' environments
|
4
|
+
* Changed the default pseudocount from (1 / # of environment classes) to 0; a pseudocount now needs to be explicitly provided with the --add option
|
5
|
+
* Changed the fixed column size (7) of raw count matrices to vary depending on the number of digits of a maximum amino acid count
|
6
|
+
* Fixed a bug in reading an environment feature definition file (empty line)
|
7
|
+
|
1
8
|
== 0.9.6 18/03/2009
|
2
9
|
|
3
10
|
* Warns if Rmagick is not properly installed or missing.
|
data/README.rdoc
CHANGED
@@ -18,7 +18,7 @@ http://www-cryst.bioc.cam.ac.uk/ulla
|
|
18
18
|
|
19
19
|
== Requirements
|
20
20
|
|
21
|
-
* ruby 1.8.7 or above (http://www.ruby-lang.org)
|
21
|
+
* ruby 1.8.7 or above (1.9.0 or above recommended, http://www.ruby-lang.org)
|
22
22
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems)
|
23
23
|
|
24
24
|
Following RubyGems will be automatically installed if you have rubygems installed on your machine
|
data/lib/narray_extensions.rb
CHANGED
@@ -4,16 +4,19 @@ require 'facets'
|
|
4
4
|
|
5
5
|
module NArrayExtensions
|
6
6
|
|
7
|
-
def pretty_string(
|
8
|
-
{
|
9
|
-
|
7
|
+
def pretty_string(options={})
|
8
|
+
opts = {:col_header => nil,
|
9
|
+
:row_header => nil,
|
10
|
+
:col_size => 7}.merge(options)
|
10
11
|
|
11
|
-
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
12
|
+
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
13
|
+
s + ("%#{opts[:col_size]}s" % a)
|
14
|
+
} + "\n" +
|
12
15
|
self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
|
13
16
|
if v.is_a? Float
|
14
|
-
s + ("
|
17
|
+
s + ("%#{opts[:col_size]}.2f" % v)
|
15
18
|
else
|
16
|
-
s + ("
|
19
|
+
s + ("%#{opts[:col_size]}d" % v)
|
17
20
|
end
|
18
21
|
}
|
19
22
|
end
|
data/lib/nmatrix_extensions.rb
CHANGED
@@ -13,17 +13,18 @@ end
|
|
13
13
|
module NMatrixExtensions
|
14
14
|
|
15
15
|
def pretty_string(options={})
|
16
|
-
opts = {:col_header
|
17
|
-
:row_header
|
16
|
+
opts = {:col_header => nil,
|
17
|
+
:row_header => nil,
|
18
|
+
:col_size => 7}.merge(options)
|
18
19
|
|
19
20
|
("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
|
20
|
-
s + ("
|
21
|
+
s + ("%#{opts[:col_size]}s" % a)
|
21
22
|
} + "\n" + self.to_a.map_with_index { |a, i|
|
22
23
|
("%-3s" % opts[:row_header][i]) + a.inject("") { |s, v|
|
23
24
|
if v.is_a? Float
|
24
|
-
s + ("
|
25
|
+
s + ("%#{opts[:col_size]}.2f" % v)
|
25
26
|
else
|
26
|
-
s + ("
|
27
|
+
s + ("%#{opts[:col_size]}d" % v)
|
27
28
|
end
|
28
29
|
}
|
29
30
|
}.join("\n")
|
@@ -67,6 +68,7 @@ module NMatrixExtensions
|
|
67
68
|
:title? => true,
|
68
69
|
:title => '',
|
69
70
|
:title_font_size => 35,
|
71
|
+
:title_font_scale => 1.0,
|
70
72
|
:print_value => false,
|
71
73
|
:key_font_size => 15,
|
72
74
|
:value_font_size => 8,
|
@@ -75,15 +77,15 @@ module NMatrixExtensions
|
|
75
77
|
RVG::dpi = opts[:dpi]
|
76
78
|
|
77
79
|
rvg = RVG.new(opts[:rvg_width], opts[:rvg_height]) do |canvas|
|
78
|
-
title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * 0
|
79
|
-
title_y = opts[:header_height] - opts[:title_font_size] *
|
80
|
+
title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * opts[:title_font_scale] / 2.0) / 2.0
|
81
|
+
title_y = opts[:header_height] - opts[:title_font_size] * opts[:title_font_scale]
|
80
82
|
|
81
83
|
canvas.viewbox(0, 0, opts[:canvas_width], opts[:canvas_height])
|
82
84
|
canvas.background_fill = opts[:background]
|
83
85
|
canvas.desc = opts[:title]
|
84
86
|
|
85
87
|
if opts[:title?]
|
86
|
-
canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size])
|
88
|
+
canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size] * opts[:title_font_scale])
|
87
89
|
end
|
88
90
|
|
89
91
|
# border for whole matrix
|
data/lib/ulla.rb
CHANGED
data/lib/ulla/cli.rb
CHANGED
@@ -39,6 +39,9 @@ Options:
|
|
39
39
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
40
40
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
41
41
|
--noweight: calculate substitution counts with no weights
|
42
|
+
--environment (-e) INTEGER:
|
43
|
+
0 for considering only substituted amino acids' environments (default)
|
44
|
+
1 for considering both substituted and substituting amino acids' environments
|
42
45
|
--smooth (-s) INTEGER:
|
43
46
|
0 for partial smoothing (default)
|
44
47
|
1 for full smoothing
|
@@ -56,7 +59,7 @@ Options:
|
|
56
59
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
57
60
|
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
58
61
|
--autosigma: automatically adjust the sigma value for smoothing
|
59
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default
|
62
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
|
60
63
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
61
64
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
62
65
|
--heatmap INTEGER:
|
@@ -88,22 +91,23 @@ Options:
|
|
88
91
|
# :call-seq:
|
89
92
|
# Ulla::CLI::calculate_pid(seq1, seq2) -> Float
|
90
93
|
#
|
91
|
-
def calculate_pid(seq1, seq2)
|
92
|
-
aas1 = seq1.
|
93
|
-
aas2 = seq2.
|
94
|
+
def calculate_pid(seq1, seq2, unit)
|
95
|
+
aas1 = seq1.scan(/\w{#{unit}}/)
|
96
|
+
aas2 = seq2.scan(/\w{#{unit}}/)
|
94
97
|
cols = aas1.zip(aas2)
|
98
|
+
gap = ($gap || '-') * unit
|
95
99
|
align = 0 # no. of aligned columns
|
96
100
|
ident = 0 # no. of identical columns
|
97
101
|
intgp = 0 # no. of internal gaps
|
98
102
|
|
99
103
|
cols.each do |col|
|
100
|
-
if (col[0] !=
|
104
|
+
if (col[0] != gap) && (col[1] != gap)
|
101
105
|
align += 1
|
102
106
|
if col[0] == col[1]
|
103
107
|
ident += 1
|
104
108
|
end
|
105
|
-
elsif (((col[0] ==
|
106
|
-
((col[0] !=
|
109
|
+
elsif (((col[0] == gap) && (col[1] != gap)) ||
|
110
|
+
((col[0] != gap) && (col[1] == gap)))
|
107
111
|
intgp += 1
|
108
112
|
end
|
109
113
|
end
|
@@ -148,8 +152,11 @@ Options:
|
|
148
152
|
|
149
153
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
150
154
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
155
|
+
$gap = '-'
|
151
156
|
$tem_list = nil
|
152
157
|
$tem_file = nil
|
158
|
+
$environment = 0
|
159
|
+
$col_size = nil
|
153
160
|
$classdef = 'classdef.dat'
|
154
161
|
$outfile = 'allmat.dat'
|
155
162
|
$outfh = nil # file handle for outfile
|
@@ -176,7 +183,7 @@ Options:
|
|
176
183
|
$heatmapcol = nil
|
177
184
|
$heatmapformat = 'png'
|
178
185
|
$heatmapstem = 'heatmaps'
|
179
|
-
$heatmapvalues
|
186
|
+
$heatmapvalues = false
|
180
187
|
$rvg_width = 550
|
181
188
|
$rvg_height = 650
|
182
189
|
$canvas_width = 550
|
@@ -189,7 +196,6 @@ Options:
|
|
189
196
|
$aa_mutb = {}
|
190
197
|
$aa_rel_mutb = {}
|
191
198
|
$aa_tot_freq = {}
|
192
|
-
$aa_env_cnt = Hash.new(0)
|
193
199
|
$smooth_prob = {}
|
194
200
|
$tot_cnt_mat = nil
|
195
201
|
$tot_prob_mat = nil
|
@@ -209,30 +215,32 @@ Options:
|
|
209
215
|
#
|
210
216
|
|
211
217
|
opts = GetoptLong.new(
|
212
|
-
[ '--help',
|
213
|
-
[ '--tem-list',
|
214
|
-
[ '--tem-file',
|
215
|
-
[ '--classdef',
|
216
|
-
[ '--
|
217
|
-
[ '--
|
218
|
-
[ '--
|
219
|
-
[ '--
|
220
|
-
[ '--
|
221
|
-
[ '--
|
222
|
-
[ '--
|
223
|
-
[ '--
|
224
|
-
[ '--
|
225
|
-
[ '--
|
226
|
-
[ '--heatmap
|
227
|
-
[ '--heatmap-
|
228
|
-
[ '--heatmap-
|
229
|
-
[ '--
|
230
|
-
[ '--
|
231
|
-
[ '--
|
232
|
-
[ '--
|
233
|
-
[ '--
|
234
|
-
[ '--
|
235
|
-
[ '--
|
218
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
219
|
+
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
220
|
+
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
221
|
+
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
222
|
+
[ '--environment', '-e', GetoptLong::REQUIRED_ARGUMENT ],
|
223
|
+
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
224
|
+
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
225
|
+
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
226
|
+
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
227
|
+
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
228
|
+
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
229
|
+
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
230
|
+
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
231
|
+
[ '--add', GetoptLong::REQUIRED_ARGUMENT ],
|
232
|
+
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
233
|
+
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
234
|
+
[ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
|
235
|
+
[ '--heatmap-columns', GetoptLong::REQUIRED_ARGUMENT ],
|
236
|
+
[ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
|
237
|
+
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
238
|
+
[ '--targetenv', '-t', GetoptLong::REQUIRED_ARGUMENT ],
|
239
|
+
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
240
|
+
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
241
|
+
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
242
|
+
[ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
|
243
|
+
[ '--version', GetoptLong::NO_ARGUMENT ]
|
236
244
|
)
|
237
245
|
|
238
246
|
begin
|
@@ -247,6 +255,8 @@ Options:
|
|
247
255
|
$tem_file = arg
|
248
256
|
when '--classdef'
|
249
257
|
$classdef = arg
|
258
|
+
when '--environment'
|
259
|
+
$environment = arg.to_i
|
250
260
|
when '--output'
|
251
261
|
$output = arg.to_i
|
252
262
|
when '--outfile'
|
@@ -335,7 +345,7 @@ Options:
|
|
335
345
|
exit 1
|
336
346
|
end
|
337
347
|
|
338
|
-
# warn if any input file is missing
|
348
|
+
# warn if any mandatory input file is missing
|
339
349
|
if $tem_list && !File.exist?($tem_list)
|
340
350
|
warn "Cannot find template list file, #{$tem_list}"
|
341
351
|
exit 1
|
@@ -372,12 +382,12 @@ Options:
|
|
372
382
|
# Reading Environment Class Definition File
|
373
383
|
#
|
374
384
|
|
375
|
-
#
|
385
|
+
# if --cys option is 2, then we don't care about 'J' (no distinction between Cysteine and Cystine)
|
376
386
|
if $cys == 2
|
377
|
-
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.
|
387
|
+
# keep $amino_acids an Array of one-letter codes (String#delete would return a String); 'J' is simply left out
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
|
378
388
|
end
|
379
389
|
|
380
|
-
# create an
|
390
|
+
# create an EnvironmentFeatureArray object for storing all environment
|
381
391
|
# features
|
382
392
|
$env_features = EnvironmentFeatureArray.new
|
383
393
|
|
@@ -398,9 +408,9 @@ Options:
|
|
398
408
|
|
399
409
|
IO.foreach($classdef) do |line|
|
400
410
|
line.chomp!
|
401
|
-
if line.start_with?('#')
|
411
|
+
if line.start_with?('#') || line.blank?
|
402
412
|
next
|
403
|
-
elsif (env_ftr = line.
|
413
|
+
elsif (env_ftr = line.split(/;/)).length == 5
|
404
414
|
$logger.info "An environment feature, #{line} detected."
|
405
415
|
if env_ftr[-1] == 'T'
|
406
416
|
# skip silenced environment feature
|
@@ -418,23 +428,39 @@ Options:
|
|
418
428
|
env_ftr[4])
|
419
429
|
env_index += 1
|
420
430
|
else
|
421
|
-
$logger.error "\"#{line}\" doesn't seem to be a proper format for" +
|
422
|
-
"
|
431
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for " +
|
432
|
+
"an environment class definition."
|
423
433
|
exit 1
|
424
434
|
end
|
425
435
|
end
|
426
436
|
|
437
|
+
# set the size of amino acid column unit, extended gap
|
438
|
+
# and extended amino acid labels
|
439
|
+
$col_size = $environment == 1 ? $env_features.size : 1
|
440
|
+
$ext_gap = $gap * $col_size
|
441
|
+
$ext_amino_acids = []
|
442
|
+
|
427
443
|
# a hash for storing all environment classes
|
428
444
|
$env_classes = EnvironmentClassHash.new
|
429
445
|
|
430
446
|
# generate all possible combinations of environment labels, and store
|
431
447
|
# every environment class into the hash prepared above with the label
|
432
448
|
# as a key
|
433
|
-
$env_features.label_combinations.each_with_index
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
449
|
+
$env_features.label_combinations.each_with_index do |ef1, i|
|
450
|
+
key1 = ef1.flatten.join
|
451
|
+
$ext_amino_acids << key1
|
452
|
+
|
453
|
+
if $environment == 0
|
454
|
+
$env_classes[key1] = Environment.new(i, key1, $amino_acids)
|
455
|
+
else
|
456
|
+
# when considering both substituted and substituting amino acids' environments,
|
457
|
+
# add target (substituting) aa's environment label
|
458
|
+
$env_features.label_combinations_without_aa_type.each_with_index do |ef2, j|
|
459
|
+
key2 = key1 + "-" + ef2.flatten.join
|
460
|
+
$env_classes[key2] = Environment.new(i + j, key2, $amino_acids)
|
461
|
+
end
|
462
|
+
end
|
463
|
+
end
|
438
464
|
|
439
465
|
#
|
440
466
|
# Part 3 END
|
@@ -512,9 +538,7 @@ Options:
|
|
512
538
|
if env_labels[key].empty?
|
513
539
|
env_labels[key] = labels
|
514
540
|
else
|
515
|
-
env_labels[key].each_with_index { |e, i|
|
516
|
-
env_labels[key][i] = e + labels[i]
|
517
|
-
}
|
541
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
518
542
|
end
|
519
543
|
end
|
520
544
|
end
|
@@ -523,97 +547,92 @@ Options:
|
|
523
547
|
|
524
548
|
if $noweight
|
525
549
|
ali.each_pair do |id1, seq1|
|
550
|
+
if $environment == 1
|
551
|
+
seq1 = seq1.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id1][pos] }.join
|
552
|
+
end
|
553
|
+
|
526
554
|
ali.each_pair do |id2, seq2|
|
527
555
|
if id1 != id2
|
528
|
-
|
529
|
-
|
530
|
-
|
556
|
+
if $environment == 1
|
557
|
+
seq2 = seq2.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
|
558
|
+
end
|
559
|
+
|
560
|
+
pid = calculate_pid(seq1, seq2, $col_size)
|
561
|
+
s1 = seq1.scan(/\S{#{$col_size}}/)
|
562
|
+
s2 = seq2.scan(/\S{#{$col_size}}/)
|
531
563
|
|
532
564
|
# check PID_MIN
|
533
565
|
if $pidmin && (pid < $pidmin)
|
534
|
-
$logger.info "Skip alignment between #{id1} and #{id2} "
|
535
|
-
"having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
566
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
536
567
|
next
|
537
568
|
end
|
538
569
|
|
539
570
|
# check PID_MAX
|
540
571
|
if $pidmax && (pid > $pidmax)
|
541
|
-
$logger.info "Skip alignment between #{id1} and #{id2} "
|
542
|
-
"having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
572
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
543
573
|
next
|
544
574
|
end
|
545
575
|
|
546
576
|
s1.each_with_index do |aa1, pos|
|
547
|
-
|
548
|
-
aa2 = s2[pos].upcase
|
577
|
+
aa2 = s2[pos]
|
549
578
|
|
550
579
|
if env_labels[id1][pos].include?('X')
|
551
|
-
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
|
580
|
+
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1[0].chr} were masked."
|
552
581
|
next
|
553
582
|
end
|
554
583
|
|
555
584
|
if env_labels[id2][pos].include?('X')
|
556
|
-
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
|
585
|
+
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2[0].chr} were masked."
|
557
586
|
next
|
558
587
|
end
|
559
588
|
|
560
|
-
unless $amino_acids.include?(aa1)
|
561
|
-
$logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 ==
|
589
|
+
unless $amino_acids.include?(aa1[0].chr)
|
590
|
+
$logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not a standard amino acid." unless aa1 == $ext_gap
|
562
591
|
next
|
563
592
|
end
|
564
593
|
|
565
|
-
unless $amino_acids.include?(aa2)
|
566
|
-
$logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 ==
|
594
|
+
unless $amino_acids.include?(aa2[0].chr)
|
595
|
+
$logger.warn "#{id1}-#{pos}-#{aa2[0].chr} is not a standard amino acid." unless aa2 == $ext_gap
|
567
596
|
next
|
568
597
|
end
|
569
598
|
|
570
|
-
aa1
|
571
|
-
aa2
|
599
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
600
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
601
|
+
env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
572
602
|
|
573
603
|
if $cst_features.empty?
|
574
|
-
$env_classes[
|
604
|
+
$env_classes[env_label].increase_residue_count(aa2[0].chr)
|
575
605
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
576
|
-
$env_classes[
|
606
|
+
$env_classes[env_label].increase_residue_count(aa2[0].chr)
|
577
607
|
else
|
578
|
-
$logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}
|
608
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
579
609
|
next
|
580
610
|
end
|
581
611
|
|
582
|
-
|
583
|
-
|
584
|
-
if $aa_env_cnt.has_key? grp_label
|
585
|
-
if $aa_env_cnt[grp_label].has_key? aa1
|
586
|
-
$aa_env_cnt[grp_label][aa1] += 1
|
587
|
-
else
|
588
|
-
$aa_env_cnt[grp_label][aa1] = 1
|
589
|
-
end
|
590
|
-
else
|
591
|
-
$aa_env_cnt[grp_label] = Hash.new(0)
|
592
|
-
$aa_env_cnt[grp_label][aa1] = 1
|
593
|
-
end
|
594
|
-
|
595
|
-
if $aa_tot_cnt.has_key? aa1
|
596
|
-
$aa_tot_cnt[aa1] += 1
|
597
|
-
else
|
598
|
-
$aa_tot_cnt[aa1] = 1
|
599
|
-
end
|
612
|
+
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += 1 : $aa_tot_cnt[aa1] = 1
|
613
|
+
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += 1 : $aa_mut_cnt[aa1] = 1 if aa1 != aa2
|
600
614
|
|
601
|
-
|
602
|
-
if $aa_mut_cnt.has_key? aa1
|
603
|
-
$aa_mut_cnt[aa1] += 1
|
604
|
-
else
|
605
|
-
$aa_mut_cnt[aa1] = 1
|
606
|
-
end
|
607
|
-
end
|
608
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
|
615
|
+
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (1) was added to the environments class, #{env_label}."
|
609
616
|
end
|
610
617
|
end
|
611
618
|
end
|
612
619
|
end
|
613
620
|
else
|
614
621
|
# BLOSUM-like weighting
|
615
|
-
clusters
|
616
|
-
|
622
|
+
clusters = []
|
623
|
+
ext_ali = Bio::Alignment::OriginalAlignment.new
|
624
|
+
|
625
|
+
ali.each_pair do |key, seq|
|
626
|
+
clusters << [key]
|
627
|
+
if $environment == 1
|
628
|
+
ext_seq = seq.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[key][pos] }.join
|
629
|
+
ext_ali.add_seq(ext_seq, key)
|
630
|
+
end
|
631
|
+
end
|
632
|
+
|
633
|
+
if $environment == 1
|
634
|
+
ali = ext_ali
|
635
|
+
end
|
617
636
|
|
618
637
|
# a loop for single linkage clustering
|
619
638
|
begin
|
@@ -624,7 +643,7 @@ Options:
|
|
624
643
|
found = false
|
625
644
|
clusters[i].each do |c1|
|
626
645
|
clusters[j].each do |c2|
|
627
|
-
if calculate_pid(ali[c1], ali[c2]) >= $weight
|
646
|
+
if calculate_pid(ali[c1], ali[c2], $col_size) >= $weight
|
628
647
|
indexes << j
|
629
648
|
found = true
|
630
649
|
break
|
@@ -655,102 +674,58 @@ Options:
|
|
655
674
|
clusters.combination(2).each do |cluster1, cluster2|
|
656
675
|
cluster1.each do |id1|
|
657
676
|
cluster2.each do |id2|
|
658
|
-
seq1 = ali[id1].
|
659
|
-
seq2 = ali[id2].
|
677
|
+
seq1 = ali[id1].scan(/\S{#{$col_size}}/)
|
678
|
+
seq2 = ali[id2].scan(/\S{#{$col_size}}/)
|
660
679
|
|
661
680
|
seq1.each_with_index do |aa1, pos|
|
662
|
-
|
663
|
-
aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
|
681
|
+
aa2 = seq2[pos]
|
664
682
|
|
665
683
|
if env_labels[id1][pos].include?('X')
|
666
|
-
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
|
684
|
+
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
|
667
685
|
next
|
668
686
|
end
|
669
687
|
|
670
688
|
if env_labels[id2][pos].include?('X')
|
671
|
-
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
|
689
|
+
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
|
672
690
|
next
|
673
691
|
end
|
674
692
|
|
675
|
-
unless $amino_acids.include?(aa1)
|
676
|
-
$logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 ==
|
693
|
+
unless $amino_acids.include?(aa1[0].chr)
|
694
|
+
$logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not standard amino acid." unless aa1 == $ext_gap
|
677
695
|
next
|
678
696
|
end
|
679
697
|
|
680
|
-
unless $amino_acids.include?(aa2)
|
681
|
-
$logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 ==
|
698
|
+
unless $amino_acids.include?(aa2[0].chr)
|
699
|
+
$logger.warn "#{id2}-#{pos}-#{aa2[0].chr} is not standard amino acid." unless aa2 == $ext_gap
|
682
700
|
next
|
683
701
|
end
|
684
702
|
|
685
|
-
aa1
|
686
|
-
aa2
|
687
|
-
cnt1
|
688
|
-
cnt2
|
689
|
-
jnt_cnt
|
703
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
704
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
705
|
+
cnt1 = 1.0 / cluster1.size.to_f
|
706
|
+
cnt2 = 1.0 / cluster2.size.to_f
|
707
|
+
jnt_cnt = cnt1 * cnt2
|
708
|
+
env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
709
|
+
env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
|
690
710
|
|
691
711
|
if $cst_features.empty?
|
692
|
-
$env_classes[
|
693
|
-
$env_classes[
|
712
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
713
|
+
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
694
714
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
695
|
-
$env_classes[
|
696
|
-
$env_classes[
|
715
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
716
|
+
# the residue letter is the first character of aa1 (same as the unconstrained branch); aa1[1] is nil when $environment == 0
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
697
717
|
else
|
698
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}
|
718
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
699
719
|
next
|
700
720
|
end
|
701
721
|
|
702
|
-
|
703
|
-
|
722
|
+
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
723
|
+
$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
724
|
+
# count a mutation for aa1 only when the two residues differ, as the unweighted branch does
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1 != aa2
|
725
|
+
# count a mutation for aa2 only when the two residues differ, as the unweighted branch does
$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1 != aa2
|
704
726
|
|
705
|
-
|
706
|
-
|
707
|
-
$aa_env_cnt[grp_label1][aa1] += cnt1
|
708
|
-
else
|
709
|
-
$aa_env_cnt[grp_label1][aa1] = cnt1
|
710
|
-
end
|
711
|
-
else
|
712
|
-
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
713
|
-
$aa_env_cnt[grp_label1][aa1] = cnt1
|
714
|
-
end
|
715
|
-
|
716
|
-
if $aa_env_cnt.has_key? grp_label2
|
717
|
-
if $aa_env_cnt[grp_label2].has_key? aa2
|
718
|
-
$aa_env_cnt[grp_label2][aa2] += cnt2
|
719
|
-
else
|
720
|
-
$aa_env_cnt[grp_label2][aa2] = cnt2
|
721
|
-
end
|
722
|
-
else
|
723
|
-
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
724
|
-
$aa_env_cnt[grp_label2][aa2] = cnt2
|
725
|
-
end
|
726
|
-
|
727
|
-
if $aa_tot_cnt.has_key? aa1
|
728
|
-
$aa_tot_cnt[aa1] += cnt1
|
729
|
-
else
|
730
|
-
$aa_tot_cnt[aa1] = cnt1
|
731
|
-
end
|
732
|
-
|
733
|
-
if $aa_tot_cnt.has_key? aa2
|
734
|
-
$aa_tot_cnt[aa2] += cnt2
|
735
|
-
else
|
736
|
-
$aa_tot_cnt[aa2] = cnt2
|
737
|
-
end
|
738
|
-
|
739
|
-
if aa1 != aa2
|
740
|
-
if $aa_mut_cnt.has_key? aa1
|
741
|
-
$aa_mut_cnt[aa1] += cnt1
|
742
|
-
else
|
743
|
-
$aa_mut_cnt[aa1] = cnt1
|
744
|
-
end
|
745
|
-
if $aa_mut_cnt.has_key? aa2
|
746
|
-
$aa_mut_cnt[aa2] += cnt2
|
747
|
-
else
|
748
|
-
$aa_mut_cnt[aa2] = cnt2
|
749
|
-
end
|
750
|
-
end
|
751
|
-
|
752
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
753
|
-
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
727
|
+
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
|
728
|
+
$logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
|
754
729
|
end
|
755
730
|
end
|
756
731
|
end
|
@@ -799,66 +774,108 @@ HEADER
|
|
799
774
|
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
800
775
|
end
|
801
776
|
|
777
|
+
if $environment == 0
|
778
|
+
$outfh.puts '#'
|
779
|
+
$outfh.puts '# Considered environments: substituted a.a.'
|
780
|
+
else
|
781
|
+
$outfh.puts '#'
|
782
|
+
$outfh.puts '# Considered environments: substituted a.a. and substituting a.a.'
|
783
|
+
end
|
784
|
+
|
802
785
|
# calculate amino acid frequencies and mutabilities, and
|
803
786
|
# print them as default statistics in the header part
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
0
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
787
|
+
if $environment == 0
|
788
|
+
ala_factor = if $aa_tot_cnt['A'] == 0
|
789
|
+
0.0
|
790
|
+
elsif $aa_mut_cnt['A'] == 0
|
791
|
+
0.0
|
792
|
+
else
|
793
|
+
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
794
|
+
end
|
795
|
+
end
|
796
|
+
|
797
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
812
798
|
|
813
799
|
$outfh.puts '#'
|
814
800
|
$outfh.puts "# Total amino acid frequencies:\n"
|
815
|
-
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
816
801
|
|
817
|
-
|
818
|
-
|
802
|
+
if $environment == 0
|
803
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
804
|
+
else
|
805
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9s %9s %8s" % %w[RES ENV TOT_OBS MUT_OBS REL_FREQ]
|
806
|
+
end
|
819
807
|
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
808
|
+
min_cnt = 0
|
809
|
+
min_sigma = nil
|
810
|
+
aas = $environment == 0 ? $amino_acids : $ext_amino_acids
|
811
|
+
|
812
|
+
aas.each do |aa|
|
813
|
+
if ($aa_tot_cnt[aa] / $sigma) < $min_cnt_sigma_ratio
|
814
|
+
if $aa_tot_cnt[aa] > 0 and min_cnt > $aa_tot_cnt[aa]
|
815
|
+
min_cnt = $aa_tot_cnt[aa]
|
816
|
+
elsif min_cnt == 0
|
817
|
+
min_cnt = 1
|
828
818
|
end
|
829
819
|
|
830
|
-
|
831
|
-
end
|
820
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
832
821
|
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
822
|
+
if $environment == 0
|
823
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
|
824
|
+
"the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa}."
|
825
|
+
else
|
826
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
|
827
|
+
"the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa[0].chr} under the environment class #{aa[1..-1]}."
|
828
|
+
end
|
829
|
+
end
|
837
830
|
|
838
|
-
|
839
|
-
|
840
|
-
$
|
841
|
-
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
842
|
-
else
|
843
|
-
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
|
844
|
-
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
831
|
+
if $environment == 0
|
832
|
+
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 1.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
|
833
|
+
$aa_rel_mutb[aa] = $aa_mutb[aa] * ala_factor
|
845
834
|
end
|
835
|
+
|
836
|
+
$aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
|
846
837
|
end
|
847
838
|
|
848
|
-
if min_cnt >
|
839
|
+
if min_cnt > 0
|
849
840
|
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
841
|
+
|
850
842
|
if $autosigma
|
851
843
|
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
852
844
|
$sigma = min_sigma
|
853
845
|
end
|
854
846
|
end
|
855
847
|
|
848
|
+
aas.each do |aa|
|
849
|
+
columns = $environment == 0 ?
|
850
|
+
[aa, $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_mutb[aa], $aa_rel_mutb[aa], $aa_tot_freq[aa]] :
|
851
|
+
[aa[0].chr, aa[1..-1], $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_tot_freq[aa]]
|
852
|
+
|
853
|
+
if $noweight
|
854
|
+
if $environment == 0
|
855
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % columns
|
856
|
+
else
|
857
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
|
858
|
+
end
|
859
|
+
else
|
860
|
+
if $environment == 0
|
861
|
+
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % columns
|
862
|
+
else
|
863
|
+
$outfh.puts "# %-3s %-#{$env_features.size}s %9.2f %9.2f %8.4f" % columns
|
864
|
+
end
|
865
|
+
end
|
866
|
+
end
|
867
|
+
|
856
868
|
$outfh.puts '#'
|
857
869
|
$outfh.puts '# RES: Amino acid one letter code'
|
870
|
+
$outfh.puts '# ENV: Environment label of amino acid'
|
858
871
|
$outfh.puts '# TOT_OBS: Total count of incidence'
|
859
872
|
$outfh.puts '# MUT_OBS: Total count of mutation'
|
860
|
-
|
861
|
-
$
|
873
|
+
|
874
|
+
if $environment == 0
|
875
|
+
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
876
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
|
877
|
+
end
|
878
|
+
|
862
879
|
$outfh.puts '# REL_FREQ: Relative frequency'
|
863
880
|
$outfh.puts '#'
|
864
881
|
|
@@ -872,7 +889,7 @@ HEADER
|
|
872
889
|
# Generating substitution frequency matrices
|
873
890
|
#
|
874
891
|
|
875
|
-
# calculating probabilities for each environment
|
892
|
+
# calculating probabilities for each environment class
|
876
893
|
$env_classes.values.each do |e|
|
877
894
|
if e.freq_array.sum != 0
|
878
895
|
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
@@ -880,12 +897,12 @@ HEADER
|
|
880
897
|
end
|
881
898
|
|
882
899
|
# count raw frequencies
|
883
|
-
$tot_cnt_mat = NMatrix.
|
900
|
+
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
884
901
|
group_matrices = []
|
885
902
|
|
886
903
|
# for each combination of environment features
|
887
904
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
888
|
-
grp_cnt_mat = NMatrix.
|
905
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
889
906
|
|
890
907
|
$amino_acids.each_with_index do |aa, aj|
|
891
908
|
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
@@ -901,6 +918,8 @@ HEADER
|
|
901
918
|
if $output == 0
|
902
919
|
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
903
920
|
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
|
921
|
+
aa_max_cnt = $aa_tot_cnt.to_a.map { |k, v| v }.max
|
922
|
+
mat_col_size = aa_max_cnt.floor.to_s.size + 4
|
904
923
|
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
905
924
|
|
906
925
|
group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
|
@@ -908,7 +927,8 @@ HEADER
|
|
908
927
|
stem = "#{grp_no}. #{grp_label}"
|
909
928
|
$outfh.puts ">#{grp_label} #{grp_no}"
|
910
929
|
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
|
911
|
-
:row_header => $amino_acids
|
930
|
+
:row_header => $amino_acids,
|
931
|
+
:col_size => mat_col_size > 7 ? mat_col_size : 7)
|
912
932
|
|
913
933
|
# for a heat map
|
914
934
|
if $heatmap == 0 or $heatmap == 2
|
@@ -956,7 +976,8 @@ HEADER
|
|
956
976
|
# total
|
957
977
|
$outfh.puts '>Total'
|
958
978
|
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
959
|
-
:row_header => $amino_acids
|
979
|
+
:row_header => $amino_acids,
|
980
|
+
:col_size => mat_col_size > 7 ? mat_col_size : 7)
|
960
981
|
|
961
982
|
if $heatmap == 0 or $heatmap == 2
|
962
983
|
stem = "#{group_matrices.size}. TOTAL"
|
@@ -999,23 +1020,28 @@ HEADER
|
|
999
1020
|
|
1000
1021
|
# when nosmoothing !!!
|
1001
1022
|
if ($output > 0) && $nosmooth
|
1002
|
-
# reinitialize $tot_cnt_mat for pseudocounts
|
1003
1023
|
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1004
1024
|
|
1005
|
-
#
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
$env_classes.values.each { |e| e.freq_array += pseudo_cnt }
|
1025
|
+
# if pseudo count provided, reinitialize $tot_cnt_mat by adding pseudocounts
|
1026
|
+
if $add
|
1027
|
+
$env_classes.values.each { |e| e.freq_array += $add }
|
1028
|
+
end
|
1010
1029
|
|
1011
1030
|
# re-calculate probability vector for each environment class
|
1012
|
-
$env_classes.values.each
|
1031
|
+
$env_classes.values.each do |e|
|
1032
|
+
if e.freq_array.sum == 0
|
1033
|
+
# if no observation, then probabilities are zeros, too
|
1034
|
+
e.prob_array = e.freq_array
|
1035
|
+
else
|
1036
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum.to_f
|
1037
|
+
end
|
1038
|
+
end
|
1013
1039
|
|
1014
1040
|
group_matrices = []
|
1015
1041
|
|
1016
1042
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1017
|
-
grp_cnt_mat
|
1018
|
-
grp_prob_mat
|
1043
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1044
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1019
1045
|
|
1020
1046
|
$amino_acids.each_with_index do |aa, aj|
|
1021
1047
|
env_class = group[1].find { |e| e.label.start_with?(aa) }
|
@@ -1039,7 +1065,6 @@ HEADER
|
|
1039
1065
|
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1040
1066
|
:row_header => $amino_acids)
|
1041
1067
|
|
1042
|
-
|
1043
1068
|
# for a heat map
|
1044
1069
|
if $heatmap == 0 or $heatmap == 2
|
1045
1070
|
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
@@ -1134,12 +1159,24 @@ HEADER
|
|
1134
1159
|
|
1135
1160
|
if ($smooth == :full) || $p1smooth
|
1136
1161
|
# smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
|
1137
|
-
0.upto($amino_acids.size - 1)
|
1162
|
+
0.upto($amino_acids.size - 1) do |i|
|
1163
|
+
if $environment == 0
|
1164
|
+
p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]])
|
1165
|
+
else
|
1166
|
+
p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum)
|
1167
|
+
end
|
1168
|
+
end
|
1138
1169
|
$smooth_prob[1] = p1
|
1139
1170
|
elsif ($smooth == :partial)
|
1140
1171
|
# no smoothing for p1 probabilities just as Kenji's subst
|
1141
1172
|
# in this case, p1 probabilities were taken from the amino acid frequencies of your data set
|
1142
|
-
0.upto($amino_acids.size - 1)
|
1173
|
+
0.upto($amino_acids.size - 1) do |i|
|
1174
|
+
if $environment == 0
|
1175
|
+
p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]]
|
1176
|
+
else
|
1177
|
+
p1[i] = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
|
1178
|
+
end
|
1179
|
+
end
|
1143
1180
|
$smooth_prob[1] = p1
|
1144
1181
|
end
|
1145
1182
|
|
@@ -1148,6 +1185,10 @@ HEADER
|
|
1148
1185
|
#
|
1149
1186
|
env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
1150
1187
|
|
1188
|
+
if $environment == 1
|
1189
|
+
env_labels += $env_features[1..-1].map_with_index { |ef, ei| ef.labels.map { |l| "#{ei + $env_features.size}#{l}" } }
|
1190
|
+
end
|
1191
|
+
|
1151
1192
|
if $smooth == :partial
|
1152
1193
|
$outfh.puts <<HEADER
|
1153
1194
|
#
|
@@ -1189,9 +1230,9 @@ HEADER
|
|
1189
1230
|
# sigma value used is: #{$sigma}
|
1190
1231
|
#
|
1191
1232
|
HEADER
|
1192
|
-
1.upto(
|
1233
|
+
1.upto(env_labels.size) do |ci|
|
1193
1234
|
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
1194
|
-
if (ci > 2) && (ci <
|
1235
|
+
if (ci > 2) && (ci < env_labels.size)
|
1195
1236
|
$logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
|
1196
1237
|
next
|
1197
1238
|
end
|
@@ -1200,6 +1241,10 @@ HEADER
|
|
1200
1241
|
c1[0].product(*c1[1..-1]).each do |labels|
|
1201
1242
|
pattern = '.' * $env_features.size
|
1202
1243
|
|
1244
|
+
if $environment == 1
|
1245
|
+
pattern += '.' * ($env_features.size - 1)
|
1246
|
+
end
|
1247
|
+
|
1203
1248
|
labels.each do |label|
|
1204
1249
|
i = label[0].chr.to_i
|
1205
1250
|
l = label[1].chr
|
@@ -1211,12 +1256,22 @@ HEADER
|
|
1211
1256
|
next
|
1212
1257
|
end
|
1213
1258
|
|
1259
|
+
if $environment == 1
|
1260
|
+
pattern[$env_features.size, 0] = "-"
|
1261
|
+
end
|
1262
|
+
|
1214
1263
|
# get environments matching the pattern created above
|
1215
1264
|
# and calculate amino acid frequencies and their probabilities for all the environments
|
1216
|
-
envs = $env_classes.values.select { |env| env.label.match(pattern
|
1265
|
+
envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
|
1217
1266
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1218
1267
|
prob_arr = NArray.float($amino_acids.size)
|
1219
|
-
0.upto($amino_acids.size - 1)
|
1268
|
+
0.upto($amino_acids.size - 1) do |i|
|
1269
|
+
if freq_arr.sum == 0
|
1270
|
+
prob_arr[i] = 0
|
1271
|
+
else
|
1272
|
+
prob_arr[i] = freq_arr[i] / freq_arr.sum.to_f
|
1273
|
+
end
|
1274
|
+
end
|
1220
1275
|
|
1221
1276
|
# # assess whether a residue type j is compatible with a particular combination of structural features
|
1222
1277
|
# # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
|
@@ -1254,29 +1309,23 @@ HEADER
|
|
1254
1309
|
if ci == 1
|
1255
1310
|
priors << $smooth_prob[1]
|
1256
1311
|
elsif ci == 2
|
1257
|
-
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each
|
1312
|
+
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each do |c3|
|
1258
1313
|
priors << $smooth_prob[2][c3.to_set]
|
1259
|
-
|
1260
|
-
elsif ci ==
|
1261
|
-
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each
|
1314
|
+
end
|
1315
|
+
elsif ci == env_labels.size
|
1316
|
+
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each do |c3|
|
1262
1317
|
priors << $smooth_prob[3][c3.to_set]
|
1263
|
-
|
1318
|
+
end
|
1264
1319
|
end
|
1265
1320
|
|
1266
|
-
# entropy based prior
|
1267
|
-
entropy_max =
|
1268
|
-
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
1269
|
-
|
1270
|
-
p == 0.0 ? s - 1 : s + p * Math::log(p)
|
1271
|
-
rescue
|
1272
|
-
#puts "P: #{p}"
|
1273
|
-
end
|
1274
|
-
} }
|
1275
|
-
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
1321
|
+
# entropy based weighting prior step
|
1322
|
+
entropy_max = NMath::log($amino_acids.size)
|
1323
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
|
1324
|
+
mod_entropies = entropies.map { |entropy| (entropy_max - entropy) / entropy_max }
|
1276
1325
|
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
1277
1326
|
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
1278
1327
|
|
1279
|
-
# smoothing step
|
1328
|
+
# actual smoothing step
|
1280
1329
|
smooth_prob_arr = NArray.float($amino_acids.size)
|
1281
1330
|
big_N = freq_arr.sum.to_f
|
1282
1331
|
small_n = $amino_acids.size.to_f
|
@@ -1285,8 +1334,8 @@ HEADER
|
|
1285
1334
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1286
1335
|
|
1287
1336
|
# normalization step
|
1288
|
-
|
1289
|
-
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] /
|
1337
|
+
total = smooth_prob_arr.sum
|
1338
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
|
1290
1339
|
|
1291
1340
|
# store smoothed probabilties in a hash using a set of envrionment labels as a key
|
1292
1341
|
if $smooth_prob.has_key?(ci + 1)
|
@@ -1331,36 +1380,47 @@ HEADER
|
|
1331
1380
|
#
|
1332
1381
|
HEADER
|
1333
1382
|
# full smooting
|
1334
|
-
1.upto(
|
1383
|
+
1.upto(env_labels.size) do |ci|
|
1335
1384
|
env_labels.combination(ci) do |c1|
|
1336
1385
|
c1[0].product(*c1[1..-1]).each do |labels|
|
1386
|
+
|
1337
1387
|
pattern = '.' * $env_features.size
|
1388
|
+
|
1389
|
+
if $environment == 1
|
1390
|
+
pattern += '.' * ($env_features.size - 1)
|
1391
|
+
end
|
1392
|
+
|
1338
1393
|
labels.each do |label|
|
1339
1394
|
j = label[0].chr.to_i
|
1340
1395
|
l = label[1].chr
|
1341
1396
|
pattern[j] = l
|
1342
1397
|
end
|
1343
1398
|
|
1399
|
+
if $environment == 1
|
1400
|
+
pattern[$env_features.size, 0] = "-"
|
1401
|
+
end
|
1402
|
+
|
1344
1403
|
# get environmetns, frequencies, and probabilities
|
1345
|
-
envs = $env_classes.values.select { |env| env.label.match(pattern
|
1404
|
+
envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
|
1346
1405
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1347
1406
|
prob_arr = NArray.float($amino_acids.size)
|
1348
1407
|
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
1349
1408
|
|
1350
1409
|
# collect priors
|
1351
|
-
priors
|
1410
|
+
priors = []
|
1411
|
+
|
1352
1412
|
if ci > 1
|
1353
1413
|
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
1354
1414
|
else
|
1355
1415
|
priors << $smooth_prob[1]
|
1356
1416
|
end
|
1357
1417
|
|
1358
|
-
# entropy based weighting priors
|
1359
|
-
entropy_max
|
1360
|
-
entropies
|
1361
|
-
|
1362
|
-
|
1363
|
-
weighted_priors = priors.map_with_index { |
|
1418
|
+
# entropy based weighting priors step
|
1419
|
+
entropy_max = NMath::log($amino_acids.size)
|
1420
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
|
1421
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
1422
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
1423
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
1364
1424
|
|
1365
1425
|
# smoothing step
|
1366
1426
|
smooth_prob_arr = NArray.float($amino_acids.size)
|
@@ -1371,8 +1431,8 @@ HEADER
|
|
1371
1431
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1372
1432
|
|
1373
1433
|
# normalization step
|
1374
|
-
|
1375
|
-
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] /
|
1434
|
+
total = smooth_prob_arr.sum
|
1435
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
|
1376
1436
|
|
1377
1437
|
# store smoothed probabilties in a hash using a set of envrionment labels as a key
|
1378
1438
|
if $smooth_prob.has_key?(ci + 1)
|
@@ -1389,7 +1449,7 @@ HEADER
|
|
1389
1449
|
|
1390
1450
|
# updating smoothed probability array for each envrionment
|
1391
1451
|
$env_classes.values.each do |env|
|
1392
|
-
env.smooth_prob_array = $smooth_prob[
|
1452
|
+
env.smooth_prob_array = $smooth_prob[env_labels.size + 1][env.label_set]
|
1393
1453
|
end
|
1394
1454
|
|
1395
1455
|
# sorting environments and build 21X21 substitution matrices
|
@@ -1526,7 +1586,7 @@ HEADER
|
|
1526
1586
|
end
|
1527
1587
|
|
1528
1588
|
grp_logo_mats = []
|
1529
|
-
factor = $scale /
|
1589
|
+
factor = $scale / NMath::log(2)
|
1530
1590
|
|
1531
1591
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1532
1592
|
# calculating substitution probability matrix for each envrionment
|
@@ -1536,6 +1596,11 @@ HEADER
|
|
1536
1596
|
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1537
1597
|
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1538
1598
|
|
1599
|
+
if $environment == 1
|
1600
|
+
# parse substituting aa's environment label
|
1601
|
+
tgt_label = grp_label.split('-').last
|
1602
|
+
end
|
1603
|
+
|
1539
1604
|
$amino_acids.each_with_index do |aa, aj|
|
1540
1605
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1541
1606
|
env.logo_array = $cys == 0 ?
|
@@ -1543,19 +1608,29 @@ HEADER
|
|
1543
1608
|
NArray.float($amino_acids.size)
|
1544
1609
|
|
1545
1610
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1546
|
-
|
1611
|
+
if $environment == 0
|
1612
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1613
|
+
else
|
1614
|
+
pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
|
1615
|
+
end
|
1616
|
+
|
1547
1617
|
odds = prob / pai
|
1548
|
-
env.logo_array[ai] = factor *
|
1618
|
+
env.logo_array[ai] = factor * NMath::log(odds)
|
1549
1619
|
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
1550
1620
|
end
|
1551
1621
|
|
1552
1622
|
# adding log odds ratio for 'U' (J or C) when --cyc is 0
|
1553
1623
|
if $cys == 0
|
1554
|
-
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
|
1624
|
+
if $environment == 0
|
1625
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1626
|
+
else
|
1627
|
+
pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
|
1628
|
+
$aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
|
1629
|
+
end
|
1630
|
+
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1631
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1632
|
+
odds = prob / pai
|
1633
|
+
env.logo_array[$amino_acids.size] = factor * NMath::log(odds)
|
1559
1634
|
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
1560
1635
|
end
|
1561
1636
|
end
|
@@ -1569,22 +1644,32 @@ HEADER
|
|
1569
1644
|
|
1570
1645
|
$amino_acids.each_with_index do |aa1, aj|
|
1571
1646
|
$amino_acids.each_with_index do |aa2, ai|
|
1572
|
-
prob
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1647
|
+
prob = $tot_prob_mat[aj, ai]
|
1648
|
+
|
1649
|
+
if $environment == 0
|
1650
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1651
|
+
else
|
1652
|
+
pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
|
1653
|
+
end
|
1654
|
+
|
1655
|
+
odds = prob / pai
|
1656
|
+
$tot_logo_mat[aj, ai] = factor * NMath::log(odds)
|
1576
1657
|
end
|
1577
1658
|
|
1578
1659
|
# adding log odds ratio for 'U' (J or C) when --cyc is 0
|
1579
1660
|
if $cys == 0
|
1580
|
-
|
1661
|
+
if $environment == 0
|
1662
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1663
|
+
else
|
1664
|
+
pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
|
1665
|
+
$aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
|
1666
|
+
end
|
1581
1667
|
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1582
1668
|
odds = prob / pai
|
1583
|
-
$tot_logo_mat[aj, $amino_acids.size] = factor *
|
1669
|
+
$tot_logo_mat[aj, $amino_acids.size] = factor * NMath::log(odds)
|
1584
1670
|
end
|
1585
1671
|
end
|
1586
1672
|
|
1587
|
-
|
1588
1673
|
# calculating relative entropy for each amino acid pair H and
|
1589
1674
|
# the expected score E in bit units
|
1590
1675
|
tot_E = 0.0
|
@@ -1593,10 +1678,22 @@ HEADER
|
|
1593
1678
|
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1594
1679
|
0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
|
1595
1680
|
if j != i
|
1596
|
-
|
1681
|
+
if $environment == 0
|
1682
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
|
1683
|
+
else
|
1684
|
+
tot_E += $tot_logo_mat[j, i] *
|
1685
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
|
1686
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum / 2.0
|
1687
|
+
end
|
1597
1688
|
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
|
1598
1689
|
else
|
1599
|
-
|
1690
|
+
if $environment == 0
|
1691
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
|
1692
|
+
else
|
1693
|
+
tot_E += $tot_logo_mat[j, i] *
|
1694
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
|
1695
|
+
$aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
|
1696
|
+
end
|
1600
1697
|
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
|
1601
1698
|
end
|
1602
1699
|
end
|
@@ -1662,9 +1759,9 @@ HEADER
|
|
1662
1759
|
heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1663
1760
|
:row_header => row_header,
|
1664
1761
|
:rvg_width => $rvg_width,
|
1665
|
-
:rvg_height => $rvg_height
|
1762
|
+
:rvg_height => $rvg_height,
|
1666
1763
|
:canvas_width => $canvas_width,
|
1667
|
-
:canvas_height => $canvas_height
|
1764
|
+
:canvas_height => $canvas_height,
|
1668
1765
|
:gradient_beg_color => '#0000FF',
|
1669
1766
|
:gradient_mid_color => '#FFFFFF',
|
1670
1767
|
:gradient_end_color => '#FF0000',
|
@@ -1674,6 +1771,7 @@ HEADER
|
|
1674
1771
|
:print_value => $heatmapvalues,
|
1675
1772
|
:print_gradient => false,
|
1676
1773
|
:title => stem,
|
1774
|
+
:title_font_scale => 1.0,
|
1677
1775
|
:title_font_size => title_font_size)
|
1678
1776
|
end
|
1679
1777
|
end
|