egor 0.0.5 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +28 -26
- data/Manifest.txt +12 -8
- data/README.rdoc +206 -54
- data/Rakefile +9 -9
- data/egor.gemspec +13 -15
- data/lib/egor.rb +1 -1
- data/lib/egor/cli.rb +535 -168
- data/lib/egor/environment.rb +34 -0
- data/lib/egor/environment_class_hash.rb +20 -0
- data/lib/egor/environment_feature.rb +26 -0
- data/lib/egor/environment_feature_array.rb +12 -0
- data/lib/egor/heatmap_array.rb +111 -0
- data/lib/narray_extensions.rb +3 -2
- data/lib/nmatrix_extensions.rb +227 -6
- data/lib/string_extensions.rb +17 -0
- data/test/egor/test_cli.rb +9 -0
- data/test/egor/test_environment_class_hash.rb +25 -0
- data/test/egor/test_environment_feature.rb +29 -0
- data/test/test_math_extensions.rb +11 -0
- data/test/test_narray_extensions.rb +14 -0
- data/test/test_string_extensions.rb +11 -0
- data/website/index.html +5 -76
- data/website/index.txt +183 -18
- data/website/stylesheets/screen.css +0 -1
- metadata +27 -20
- data/lib/enumerable_extensions.rb +0 -11
- data/lib/environment.rb +0 -58
- data/lib/environment_class_hash.rb +0 -18
- data/lib/environment_feature.rb +0 -14
- data/lib/environment_feature_array.rb +0 -10
- data/test/test_egor_cli.rb +0 -8
- data/test/test_enumerable_extensions.rb +0 -16
- data/test/test_environment_feature.rb +0 -11
data/lib/egor.rb
CHANGED
data/lib/egor/cli.rb
CHANGED
@@ -5,16 +5,17 @@ require 'narray'
|
|
5
5
|
require 'bio'
|
6
6
|
require 'set'
|
7
7
|
require 'facets'
|
8
|
-
require 'simple_memoize'
|
9
8
|
|
9
|
+
require 'math_extensions'
|
10
|
+
require 'string_extensions'
|
10
11
|
require 'narray_extensions'
|
11
12
|
require 'nmatrix_extensions'
|
12
|
-
|
13
|
-
require '
|
14
|
-
require '
|
15
|
-
require '
|
16
|
-
require '
|
17
|
-
require '
|
13
|
+
|
14
|
+
require 'egor/environment'
|
15
|
+
require 'egor/environment_class_hash'
|
16
|
+
require 'egor/environment_feature'
|
17
|
+
require 'egor/environment_feature_array'
|
18
|
+
require 'egor/heatmap_array'
|
18
19
|
|
19
20
|
# This is a module for an actual command line interpreter for Egor
|
20
21
|
# ---
|
@@ -25,7 +26,7 @@ module Egor
|
|
25
26
|
|
26
27
|
# :nodoc:
|
27
28
|
def print_version
|
28
|
-
puts
|
29
|
+
puts VERSION
|
29
30
|
end
|
30
31
|
|
31
32
|
# Print Egor's Usage on the screen
|
@@ -62,14 +63,26 @@ Options:
|
|
62
63
|
0 for raw counts (no smoothing performed)
|
63
64
|
1 for probabilities
|
64
65
|
2 for log-odds (default)
|
65
|
-
--
|
66
|
+
--noroundoff: do not round off log odds ratio
|
66
67
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
67
68
|
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
68
69
|
--autosigma: automatically adjust the sigma value for smoothing
|
69
70
|
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
|
70
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
71
71
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
72
72
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
73
|
+
--heatmap INTEGER:
|
74
|
+
0 create a heat map file for each substitution table
|
75
|
+
1 create one big file containing all heat maps from substitution tables
|
76
|
+
2 do both 0 and 1
|
77
|
+
--heatmap-format INTEGER:
|
78
|
+
0 for Portable Network Graphics (PNG) Format (default)
|
79
|
+
1 for Graphics Interchange Format (GIF)
|
80
|
+
2 for Joint Photographic Experts Group (JPEG) Format
|
81
|
+
3 for Microsoft Windows bitmap (BMP) Format
|
82
|
+
4 for Portable Document Format (PDF)
|
83
|
+
--heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
|
84
|
+
--heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap')
|
85
|
+
--heatmap-values: print values in the cells when generating heat maps
|
73
86
|
--verbose (-v) INTEGER
|
74
87
|
0 for ERROR level
|
75
88
|
1 for WARN or above level (default)
|
@@ -87,12 +100,12 @@ Options:
|
|
87
100
|
# Egor::CLI::calculate_pid(seq1, seq2) -> Float
|
88
101
|
#
|
89
102
|
def calculate_pid(seq1, seq2)
|
90
|
-
|
91
|
-
|
92
|
-
cols =
|
93
|
-
align = 0
|
94
|
-
ident = 0
|
95
|
-
intgp = 0
|
103
|
+
aas1 = seq1.split('')
|
104
|
+
aas2 = seq2.split('')
|
105
|
+
cols = aas1.zip(aas2)
|
106
|
+
align = 0 # no. of aligned columns
|
107
|
+
ident = 0 # no. of identical columns
|
108
|
+
intgp = 0 # no. of internal gaps
|
96
109
|
|
97
110
|
cols.each do |col|
|
98
111
|
if (col[0] != '-') && (col[1] != '-')
|
@@ -100,14 +113,14 @@ Options:
|
|
100
113
|
if col[0] == col[1]
|
101
114
|
ident += 1
|
102
115
|
end
|
103
|
-
elsif (((col[0] == '-') && (col[1] != '-')) ||
|
116
|
+
elsif (((col[0] == '-') && (col[1] != '-')) ||
|
117
|
+
((col[0] != '-') && (col[1] == '-')))
|
104
118
|
intgp += 1
|
105
119
|
end
|
106
120
|
end
|
107
121
|
|
108
122
|
pid = 100.0 * ident.to_f / (align + intgp)
|
109
123
|
end
|
110
|
-
memoize :calculate_pid
|
111
124
|
|
112
125
|
# :nodoc:
|
113
126
|
def execute(arguments=[])
|
@@ -121,7 +134,7 @@ Options:
|
|
121
134
|
# aa: weighted amino acid
|
122
135
|
# tot: total
|
123
136
|
# rel: relative
|
124
|
-
#
|
137
|
+
# jnt: joint
|
125
138
|
# cnt: count
|
126
139
|
# mut: mutation
|
127
140
|
# mutb: mutability
|
@@ -145,31 +158,42 @@ Options:
|
|
145
158
|
$logger.level = Logger::WARN
|
146
159
|
|
147
160
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
148
|
-
$amino_acids
|
149
|
-
|
150
|
-
$
|
151
|
-
$
|
152
|
-
$
|
153
|
-
$
|
154
|
-
$
|
155
|
-
$
|
156
|
-
$
|
157
|
-
$
|
158
|
-
$
|
159
|
-
$
|
160
|
-
$
|
161
|
-
$
|
162
|
-
$
|
163
|
-
$
|
164
|
-
$
|
165
|
-
$
|
166
|
-
$
|
167
|
-
$
|
168
|
-
$
|
169
|
-
$
|
170
|
-
$
|
171
|
-
$
|
172
|
-
$penv
|
161
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
162
|
+
$tem_list = nil
|
163
|
+
$tem_file = nil
|
164
|
+
$classdef = 'classdef.dat'
|
165
|
+
$outfile = 'allmat.dat'
|
166
|
+
$outfh = nil # file handle for outfile
|
167
|
+
$output = 2 # default: log odds matrix
|
168
|
+
$ali_size = 0
|
169
|
+
$tot_aa = 0
|
170
|
+
$sigma = 5.0
|
171
|
+
$autosigma = false
|
172
|
+
$weight = 60
|
173
|
+
$noweight = false
|
174
|
+
$smooth = :partial
|
175
|
+
$nosmooth = false
|
176
|
+
$noroundoff = false
|
177
|
+
$p1smooth = false
|
178
|
+
$scale = 3
|
179
|
+
$pidmin = nil
|
180
|
+
$pidmax = nil
|
181
|
+
$scale = 3
|
182
|
+
$add = nil
|
183
|
+
$cys = 0
|
184
|
+
$targetenv = false
|
185
|
+
$penv = false
|
186
|
+
$heatmap = nil
|
187
|
+
$heatmapcol = nil
|
188
|
+
$heatmapformat = 'png'
|
189
|
+
$heatmapstem = 'heatmaps'
|
190
|
+
$heatmapvalues = false
|
191
|
+
$rvg_width = 550
|
192
|
+
$rvg_height = 650
|
193
|
+
$canvas_width = 550
|
194
|
+
$canvas_height = 650
|
195
|
+
$cell_width = 20
|
196
|
+
$cell_height = 20
|
173
197
|
|
174
198
|
$aa_tot_cnt = Hash.new(0)
|
175
199
|
$aa_mut_cnt = Hash.new(0)
|
@@ -184,7 +208,7 @@ Options:
|
|
184
208
|
$tot_smooth_prob = {}
|
185
209
|
|
186
210
|
# minimum ratio of amino acid count to sigma value
|
187
|
-
$
|
211
|
+
$min_cnt_sigma_ratio = 500.0
|
188
212
|
|
189
213
|
#
|
190
214
|
# Part 1 END
|
@@ -205,11 +229,16 @@ Options:
|
|
205
229
|
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
206
230
|
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
207
231
|
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
208
|
-
[ '--
|
232
|
+
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
209
233
|
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
210
234
|
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
211
|
-
|
235
|
+
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
236
|
+
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
237
|
+
[ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
|
238
|
+
[ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
|
239
|
+
[ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
|
212
240
|
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
241
|
+
[ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
|
213
242
|
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
214
243
|
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
215
244
|
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
@@ -235,6 +264,8 @@ Options:
|
|
235
264
|
$outfile = arg
|
236
265
|
when '--cys'
|
237
266
|
$cys = arg.to_i
|
267
|
+
when '--targetenv'
|
268
|
+
$targetenv = (arg.to_i == 1) ? true : false
|
238
269
|
when '--weight'
|
239
270
|
$weight = arg.to_i
|
240
271
|
when '--sigma'
|
@@ -247,8 +278,8 @@ Options:
|
|
247
278
|
$pidmax = arg.to_f
|
248
279
|
when '--noweight'
|
249
280
|
$noweight = true
|
250
|
-
when '--
|
251
|
-
$
|
281
|
+
when '--noroundoff'
|
282
|
+
$noroundoff = true
|
252
283
|
when '--smooth'
|
253
284
|
$smooth = (arg.to_i == 1) ? :full : :partial
|
254
285
|
when '--nosmooth'
|
@@ -260,18 +291,42 @@ Options:
|
|
260
291
|
when '--add'
|
261
292
|
$add = arg.to_f
|
262
293
|
when '--penv'
|
263
|
-
warn "--penv option is not supported
|
294
|
+
warn "--penv option is not supported."
|
264
295
|
exit 1
|
265
296
|
$penv = true
|
266
|
-
|
267
|
-
|
297
|
+
when '--heatmap'
|
298
|
+
$heatmap = case arg.to_i
|
299
|
+
when (0..2) then arg.to_i
|
300
|
+
else
|
301
|
+
warn "--heatmap #{arg.to_i} is not allowed."
|
302
|
+
exit1
|
303
|
+
end
|
304
|
+
when '--heatmap-columns'
|
305
|
+
$heatmapcol = arg.to_i
|
306
|
+
when '--heatmap-stem'
|
307
|
+
$heatmapstem = arg.to_s
|
308
|
+
when '--heatmap-format'
|
309
|
+
$heatmapformat = case arg.to_i
|
310
|
+
when 0 then 'png'
|
311
|
+
when 1 then 'gif'
|
312
|
+
when 2 then 'jpg'
|
313
|
+
when 3 then 'bmp'
|
314
|
+
when 4 then 'pdf'
|
315
|
+
else
|
316
|
+
warn "--heatmap-format #{arg.to_i} is not supported."
|
317
|
+
exit 1
|
318
|
+
end
|
319
|
+
when '--heatmap-values'
|
320
|
+
$heatmapvalues = true
|
268
321
|
when '--verbose'
|
269
322
|
$logger.level = case arg.to_i
|
270
323
|
when 0 then Logger::ERROR
|
271
324
|
when 1 then Logger::WARN
|
272
325
|
when 2 then Logger::INFO
|
273
326
|
when 3 then Logger::DEBUG
|
274
|
-
else
|
327
|
+
else
|
328
|
+
warn "--verbose (-v) #{arg.to_i} is not supported."
|
329
|
+
exit 1
|
275
330
|
end
|
276
331
|
when '--version'
|
277
332
|
print_version
|
@@ -284,7 +339,9 @@ Options:
|
|
284
339
|
end
|
285
340
|
|
286
341
|
# when arguments are nonsense, print usage
|
287
|
-
if ((ARGV.length != 0) ||
|
342
|
+
if ((ARGV.length != 0) ||
|
343
|
+
(!$tem_list && !$tem_file) ||
|
344
|
+
($tem_list && $tem_file))
|
288
345
|
print_usage
|
289
346
|
exit 1
|
290
347
|
end
|
@@ -305,7 +362,6 @@ Options:
|
|
305
362
|
exit 1
|
306
363
|
end
|
307
364
|
|
308
|
-
|
309
365
|
#
|
310
366
|
# Part 2 END
|
311
367
|
#
|
@@ -316,23 +372,28 @@ Options:
|
|
316
372
|
# Reading Environment Class Definition File
|
317
373
|
#
|
318
374
|
|
319
|
-
$logger.info "Egor START."
|
320
|
-
|
321
375
|
# check --cys option and modify amino_acids set if necessary
|
322
376
|
if $cys == 2
|
323
377
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
|
324
378
|
end
|
325
379
|
|
326
|
-
# create an EnvironmentFeatureList object for storing all environment
|
380
|
+
# create an EnvironmentFeatureArray object for storing all environment
|
381
|
+
# features
|
327
382
|
$env_features = EnvironmentFeatureArray.new
|
328
383
|
|
329
384
|
# an array for storing indexes of constrained environment features
|
330
385
|
$cst_features = []
|
331
386
|
|
332
|
-
# add substituted amino acid (aa1) in a substitution to the environment
|
333
|
-
|
387
|
+
# add substituted amino acid (aa1) in a substitution to the environment
|
388
|
+
# feature list
|
389
|
+
$env_features << EnvironmentFeature.new('sequence',
|
390
|
+
$amino_acids,
|
391
|
+
$amino_acids,
|
392
|
+
'F',
|
393
|
+
'F')
|
334
394
|
|
335
|
-
# read environment class definiton file and store them into
|
395
|
+
# read environment class definition file and store them into
|
396
|
+
# the hash prepared above
|
336
397
|
env_index = 1
|
337
398
|
|
338
399
|
IO.foreach($classdef) do |line|
|
@@ -350,10 +411,15 @@ Options:
|
|
350
411
|
$cst_features << env_index
|
351
412
|
$logger.warn "The environment feature, #{line} constrained."
|
352
413
|
end
|
353
|
-
$env_features << EnvironmentFeature.new(env_ftr[0],
|
414
|
+
$env_features << EnvironmentFeature.new(env_ftr[0],
|
415
|
+
env_ftr[1].split(''),
|
416
|
+
env_ftr[2].split(''),
|
417
|
+
env_ftr[3],
|
418
|
+
env_ftr[4])
|
354
419
|
env_index += 1
|
355
420
|
else
|
356
|
-
$logger.error "\"#{line}\" doesn't seem to be a proper format for
|
421
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for" +
|
422
|
+
"a environment class definition."
|
357
423
|
exit 1
|
358
424
|
end
|
359
425
|
end
|
@@ -361,9 +427,13 @@ Options:
|
|
361
427
|
# a hash for storing all environment classes
|
362
428
|
$env_classes = EnvironmentClassHash.new
|
363
429
|
|
364
|
-
# generate all possible combinations of environment labels, and store
|
430
|
+
# generate all possible combinations of environment labels, and store
|
431
|
+
# every environment class into the hash prepared above with the label
|
432
|
+
# as a key
|
365
433
|
$env_features.label_combinations.each_with_index { |e, i|
|
366
|
-
$env_classes[e.flatten.join] = Environment.new(i,
|
434
|
+
$env_classes[e.flatten.join] = Environment.new(i,
|
435
|
+
e.flatten.join,
|
436
|
+
$amino_acids)
|
367
437
|
}
|
368
438
|
|
369
439
|
#
|
@@ -390,19 +460,17 @@ Options:
|
|
390
460
|
$tem_list_io.each_line do |tem_file|
|
391
461
|
tem_file.chomp!
|
392
462
|
|
393
|
-
$logger.info "Analysing #{tem_file} ..."
|
394
|
-
|
395
463
|
ali = Bio::Alignment::OriginalAlignment.new
|
396
464
|
ff = Bio::FlatFile.auto(tem_file)
|
397
465
|
|
398
466
|
ff.each_entry do |pir|
|
399
467
|
if (pir.definition == 'sequence') || (pir.definition == 'structure')
|
400
|
-
ali.add_seq(pir.data.
|
468
|
+
ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
|
401
469
|
end
|
402
470
|
end
|
403
471
|
|
404
472
|
if ali.size < 2
|
405
|
-
$logger.warn "Skipped #{tem_file}
|
473
|
+
$logger.warn "Skipped #{tem_file} which has only one unique entry."
|
406
474
|
next
|
407
475
|
end
|
408
476
|
|
@@ -414,8 +482,10 @@ Options:
|
|
414
482
|
# check disulphide bond environment first!
|
415
483
|
ff.rewind
|
416
484
|
ff.each_entry do |pir|
|
417
|
-
if (pir.entry_id == key) &&
|
418
|
-
|
485
|
+
if ((pir.entry_id == key) &&
|
486
|
+
((pir.definition == "disulphide") ||
|
487
|
+
(pir.definition == "disulfide")))
|
488
|
+
disulphide[key] = pir.data.remove_internal_spaces.split('')
|
419
489
|
end
|
420
490
|
end
|
421
491
|
|
@@ -425,14 +495,16 @@ Options:
|
|
425
495
|
ff.rewind
|
426
496
|
ff.each_entry do |pir|
|
427
497
|
if (pir.entry_id == key) && (pir.definition == ec.name)
|
428
|
-
labels = pir.data.
|
498
|
+
labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
|
429
499
|
if sym == '-'
|
430
500
|
'-'
|
431
501
|
elsif sym == 'X' || sym == 'x'
|
432
502
|
'X'
|
433
503
|
else
|
434
504
|
if ei == 0 # Amino Acid Environment Feature
|
435
|
-
(disulphide.has_key?(key) &&
|
505
|
+
(disulphide.has_key?(key) &&
|
506
|
+
(disulphide[key][pos] == 'F') &&
|
507
|
+
(sym == 'C')) ? 'J' : sym
|
436
508
|
else
|
437
509
|
ec.labels[ec.symbols.index(sym)]
|
438
510
|
end
|
@@ -442,7 +514,9 @@ Options:
|
|
442
514
|
if env_labels[key].empty?
|
443
515
|
env_labels[key] = labels
|
444
516
|
else
|
445
|
-
env_labels[key].each_with_index { |e, i|
|
517
|
+
env_labels[key].each_with_index { |e, i|
|
518
|
+
env_labels[key][i] = e + labels[i]
|
519
|
+
}
|
446
520
|
end
|
447
521
|
end
|
448
522
|
end
|
@@ -459,13 +533,15 @@ Options:
|
|
459
533
|
|
460
534
|
# check PID_MIN
|
461
535
|
if $pidmin && (pid < $pidmin)
|
462
|
-
$logger.info
|
536
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
537
|
+
"having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
463
538
|
next
|
464
539
|
end
|
465
540
|
|
466
541
|
# check PID_MAX
|
467
542
|
if $pidmax && (pid > $pidmax)
|
468
|
-
$logger.info
|
543
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
544
|
+
"having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
469
545
|
next
|
470
546
|
end
|
471
547
|
|
@@ -574,7 +650,7 @@ Options:
|
|
574
650
|
end while(continue)
|
575
651
|
|
576
652
|
if clusters.size < 2
|
577
|
-
$logger.debug "Skipped #{tem_file}
|
653
|
+
$logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
|
578
654
|
next
|
579
655
|
end
|
580
656
|
|
@@ -610,16 +686,16 @@ Options:
|
|
610
686
|
|
611
687
|
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
612
688
|
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
613
|
-
|
614
|
-
|
615
|
-
|
689
|
+
cnt1 = 1.0 / cluster1.size
|
690
|
+
cnt2 = 1.0 / cluster2.size
|
691
|
+
jnt_cnt = cnt1 * cnt2
|
616
692
|
|
617
693
|
if $cst_features.empty?
|
618
|
-
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2,
|
619
|
-
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1,
|
694
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
695
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
620
696
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
621
|
-
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2,
|
622
|
-
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1,
|
697
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
698
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
623
699
|
else
|
624
700
|
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
|
625
701
|
next
|
@@ -630,64 +706,65 @@ Options:
|
|
630
706
|
|
631
707
|
if $aa_env_cnt.has_key? grp_label1
|
632
708
|
if $aa_env_cnt[grp_label1].has_key? aa1
|
633
|
-
$aa_env_cnt[grp_label1][aa1] +=
|
709
|
+
$aa_env_cnt[grp_label1][aa1] += cnt1
|
634
710
|
else
|
635
|
-
$aa_env_cnt[grp_label1][aa1] =
|
711
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
636
712
|
end
|
637
713
|
else
|
638
714
|
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
639
|
-
$aa_env_cnt[grp_label1][aa1] =
|
715
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
640
716
|
end
|
641
717
|
|
642
718
|
if $aa_env_cnt.has_key? grp_label2
|
643
719
|
if $aa_env_cnt[grp_label2].has_key? aa2
|
644
|
-
$aa_env_cnt[grp_label2][aa2] +=
|
720
|
+
$aa_env_cnt[grp_label2][aa2] += cnt2
|
645
721
|
else
|
646
|
-
$aa_env_cnt[grp_label2][aa2] =
|
722
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
647
723
|
end
|
648
724
|
else
|
649
725
|
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
650
|
-
$aa_env_cnt[grp_label2][aa2] =
|
726
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
651
727
|
end
|
652
728
|
|
653
729
|
if $aa_tot_cnt.has_key? aa1
|
654
|
-
$aa_tot_cnt[aa1] +=
|
730
|
+
$aa_tot_cnt[aa1] += cnt1
|
655
731
|
else
|
656
|
-
$aa_tot_cnt[aa1] =
|
732
|
+
$aa_tot_cnt[aa1] = cnt1
|
657
733
|
end
|
658
734
|
|
659
735
|
if $aa_tot_cnt.has_key? aa2
|
660
|
-
$aa_tot_cnt[aa2] +=
|
736
|
+
$aa_tot_cnt[aa2] += cnt2
|
661
737
|
else
|
662
|
-
$aa_tot_cnt[aa2] =
|
738
|
+
$aa_tot_cnt[aa2] = cnt2
|
663
739
|
end
|
664
740
|
|
665
741
|
if aa1 != aa2
|
666
742
|
if $aa_mut_cnt.has_key? aa1
|
667
|
-
$aa_mut_cnt[aa1] +=
|
743
|
+
$aa_mut_cnt[aa1] += cnt1
|
668
744
|
else
|
669
|
-
$aa_mut_cnt[aa1] =
|
745
|
+
$aa_mut_cnt[aa1] = cnt1
|
670
746
|
end
|
671
747
|
if $aa_mut_cnt.has_key? aa2
|
672
|
-
$aa_mut_cnt[aa2] +=
|
748
|
+
$aa_mut_cnt[aa2] += cnt2
|
673
749
|
else
|
674
|
-
$aa_mut_cnt[aa2] =
|
750
|
+
$aa_mut_cnt[aa2] = cnt2
|
675
751
|
end
|
676
752
|
end
|
677
753
|
|
678
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" %
|
679
|
-
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" %
|
754
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
755
|
+
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
680
756
|
end
|
681
757
|
end
|
682
758
|
end
|
683
759
|
end
|
684
760
|
end
|
761
|
+
$logger.info "Analysing #{tem_file} done."
|
685
762
|
end
|
686
763
|
|
687
764
|
# print out default header
|
688
765
|
$outfh.puts <<HEADER
|
689
766
|
# Environment-specific amino acid substitution matrices
|
690
|
-
# Creator: egor version #{
|
767
|
+
# Creator: egor version #{VERSION}
|
691
768
|
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
692
769
|
#
|
693
770
|
# Definitions for structural environments:
|
@@ -739,20 +816,20 @@ HEADER
|
|
739
816
|
$outfh.puts "# Total amino acid frequencies:\n"
|
740
817
|
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
741
818
|
|
742
|
-
|
819
|
+
min_cnt = -1
|
743
820
|
min_sigma = nil
|
744
821
|
|
745
822
|
$amino_acids.each do |res|
|
746
|
-
if ($aa_tot_cnt[res] / $sigma) < $
|
747
|
-
if
|
748
|
-
|
749
|
-
min_sigma =
|
750
|
-
elsif (
|
751
|
-
|
752
|
-
min_sigma =
|
823
|
+
if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
|
824
|
+
if min_cnt < 0
|
825
|
+
min_cnt = $aa_tot_cnt[res]
|
826
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
827
|
+
elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
|
828
|
+
min_cnt = $aa_tot_cnt[res]
|
829
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
753
830
|
end
|
754
831
|
|
755
|
-
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total
|
832
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
|
756
833
|
end
|
757
834
|
|
758
835
|
$aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
|
@@ -770,7 +847,7 @@ HEADER
|
|
770
847
|
end
|
771
848
|
end
|
772
849
|
|
773
|
-
if
|
850
|
+
if min_cnt > -1
|
774
851
|
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
775
852
|
if $autosigma
|
776
853
|
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
@@ -780,12 +857,13 @@ HEADER
|
|
780
857
|
|
781
858
|
$outfh.puts '#'
|
782
859
|
$outfh.puts '# RES: Amino acid one letter code'
|
783
|
-
$outfh.puts '# TOT_OBS: Total
|
784
|
-
$outfh.puts '# MUT_OBS: Total
|
860
|
+
$outfh.puts '# TOT_OBS: Total count of incidence'
|
861
|
+
$outfh.puts '# MUT_OBS: Total count of mutation'
|
785
862
|
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
786
|
-
$outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
|
863
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
|
787
864
|
$outfh.puts '# REL_FREQ: Relative frequency'
|
788
865
|
$outfh.puts '#'
|
866
|
+
|
789
867
|
#
|
790
868
|
# Part 4. END
|
791
869
|
#
|
@@ -804,7 +882,8 @@ HEADER
|
|
804
882
|
end
|
805
883
|
|
806
884
|
# count raw frequencies
|
807
|
-
$tot_cnt_mat
|
885
|
+
$tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
886
|
+
group_matrices = []
|
808
887
|
|
809
888
|
# for each combination of environment features
|
810
889
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
@@ -816,22 +895,88 @@ HEADER
|
|
816
895
|
end
|
817
896
|
|
818
897
|
$tot_cnt_mat += grp_cnt_mat
|
819
|
-
|
820
|
-
if $output == 0
|
821
|
-
$outfh.puts ">#{group[0]} #{group_no}"
|
822
|
-
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
823
|
-
end
|
898
|
+
group_matrices << [group[0], grp_cnt_mat]
|
824
899
|
end
|
825
900
|
|
901
|
+
$logger.info "Counting substitutions done."
|
902
|
+
|
826
903
|
if $output == 0
|
904
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
905
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
|
906
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
907
|
+
|
908
|
+
group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
|
909
|
+
# for a matrix file
|
910
|
+
stem = "#{grp_no}. #{grp_label}"
|
911
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
912
|
+
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
|
913
|
+
:row_header => $amino_acids)
|
914
|
+
|
915
|
+
# for a heat map
|
916
|
+
if $heatmap == 0 or $heatmap == 2
|
917
|
+
grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
918
|
+
:row_header => $amino_acids,
|
919
|
+
:rvg_width => $rvg_width,
|
920
|
+
:rvg_height => $rvg_height,
|
921
|
+
:canvas_width => $canvas_width,
|
922
|
+
:canvas_height => $canvas_height,
|
923
|
+
:max_val => grp_max_val.ceil,
|
924
|
+
:min_val => 0,
|
925
|
+
:print_value => $heatmapvalues,
|
926
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
927
|
+
|
928
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
929
|
+
end
|
930
|
+
|
931
|
+
if $heatmap == 1 or $heatmap == 2
|
932
|
+
heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
933
|
+
:row_header => $amino_acids,
|
934
|
+
:rvg_width => $rvg_width,
|
935
|
+
:rvg_height => $rvg_height - 50,
|
936
|
+
:canvas_width => $canvas_width,
|
937
|
+
:canvas_height => $canvas_height - 50,
|
938
|
+
:max_val => grp_max_val.ceil,
|
939
|
+
:min_val => 0,
|
940
|
+
:print_value => $heatmapvalues,
|
941
|
+
:print_gradient => false,
|
942
|
+
:title => stem,
|
943
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
944
|
+
end
|
945
|
+
end
|
946
|
+
|
947
|
+
if $heatmap == 1 or $heatmap == 2
|
948
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
949
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
950
|
+
:rvg_width => $rvg_width,
|
951
|
+
:max_val => grp_max_val.ceil,
|
952
|
+
:min_val => 0).write(file)
|
953
|
+
|
954
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
955
|
+
end
|
956
|
+
|
957
|
+
# total
|
827
958
|
$outfh.puts '>Total'
|
828
|
-
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
829
|
-
|
959
|
+
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
960
|
+
:row_header => $amino_acids)
|
961
|
+
|
962
|
+
if $heatmap == 0 or $heatmap == 2
|
963
|
+
stem = "#{group_matrices.size}. TOTAL"
|
964
|
+
heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
|
965
|
+
:row_header => $amino_acids,
|
966
|
+
:rvg_width => $rvg_width,
|
967
|
+
:rvg_height => $rvg_height,
|
968
|
+
:canvas_width => $canvas_width,
|
969
|
+
:canvas_height => $canvas_height,
|
970
|
+
:max_val => $tot_cnt_mat.max.ceil,
|
971
|
+
:min_val => 0,
|
972
|
+
:print_value => $heatmapvalues,
|
973
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
974
|
+
|
975
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
976
|
+
end
|
830
977
|
exit 0
|
831
978
|
end
|
832
979
|
|
833
|
-
$logger.info "Counting substitutions is done."
|
834
|
-
|
835
980
|
#
|
836
981
|
# Part 5. END
|
837
982
|
#
|
@@ -867,6 +1012,8 @@ HEADER
|
|
867
1012
|
# re-calculate probability vector for each environment class
|
868
1013
|
$env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
|
869
1014
|
|
1015
|
+
group_matrices = []
|
1016
|
+
|
870
1017
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
871
1018
|
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
872
1019
|
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
@@ -878,10 +1025,63 @@ HEADER
|
|
878
1025
|
end
|
879
1026
|
|
880
1027
|
$tot_cnt_mat += grp_cnt_mat
|
1028
|
+
group_matrices << [group[0], grp_prob_mat]
|
1029
|
+
end
|
1030
|
+
|
1031
|
+
if $output == 1
|
1032
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1033
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1034
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1035
|
+
|
1036
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1037
|
+
# for a matrix file
|
1038
|
+
stem = "#{grp_no}. #{grp_label}"
|
1039
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1040
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1041
|
+
:row_header => $amino_acids)
|
1042
|
+
|
1043
|
+
|
1044
|
+
# for a heat map
|
1045
|
+
if $heatmap == 0 or $heatmap == 2
|
1046
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1047
|
+
:row_header => $amino_acids,
|
1048
|
+
:rvg_width => $rvg_width,
|
1049
|
+
:rvg_height => $rvg_height,
|
1050
|
+
:canvas_width => $canvas_width,
|
1051
|
+
:canvas_height => $canvas_height,
|
1052
|
+
:max_val => grp_max_val.ceil,
|
1053
|
+
:min_val => 0,
|
1054
|
+
:print_value => $heatmapvalues,
|
1055
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1056
|
+
|
1057
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1058
|
+
end
|
1059
|
+
|
1060
|
+
if $heatmap == 1 or $heatmap == 2
|
1061
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1062
|
+
:row_header => $amino_acids,
|
1063
|
+
:rvg_width => $rvg_width,
|
1064
|
+
:rvg_height => $rvg_height - 50,
|
1065
|
+
:canvas_width => $canvas_width,
|
1066
|
+
:canvas_height => $canvas_height - 50,
|
1067
|
+
:max_val => grp_max_val.ceil,
|
1068
|
+
:min_val => 0,
|
1069
|
+
:print_value => $heatmapvalues,
|
1070
|
+
:print_gradient => false,
|
1071
|
+
:title => stem,
|
1072
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1073
|
+
end
|
1074
|
+
end
|
881
1075
|
|
882
|
-
|
883
|
-
|
884
|
-
|
1076
|
+
# for heat maps in a single file
|
1077
|
+
if $heatmap == 1 or $heatmap == 2
|
1078
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1079
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1080
|
+
:rvg_width => $rvg_width,
|
1081
|
+
:max_val => grp_max_val.ceil,
|
1082
|
+
:min_val => 0).write(file)
|
1083
|
+
|
1084
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
885
1085
|
end
|
886
1086
|
end
|
887
1087
|
|
@@ -892,15 +1092,32 @@ HEADER
|
|
892
1092
|
0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
|
893
1093
|
end
|
894
1094
|
|
895
|
-
$
|
896
|
-
|
897
|
-
if ($output == 1)
|
1095
|
+
if $output == 1
|
898
1096
|
$outfh.puts '>Total'
|
899
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1097
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1098
|
+
:row_header => $amino_acids)
|
900
1099
|
$outfh.close
|
901
|
-
|
1100
|
+
|
1101
|
+
# for a heat map
|
1102
|
+
if $heatmap == 0 or $heatmap == 2
|
1103
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1104
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1105
|
+
:row_header => $amino_acids,
|
1106
|
+
:rvg_width => $rvg_width,
|
1107
|
+
:rvg_height => $rvg_height,
|
1108
|
+
:canvas_width => $canvas_width,
|
1109
|
+
:canvas_height => $canvas_height,
|
1110
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1111
|
+
:min_val => 0,
|
1112
|
+
:print_value => $heatmapvalues,
|
1113
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1114
|
+
|
1115
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1116
|
+
end
|
902
1117
|
exit 0
|
903
1118
|
end
|
1119
|
+
|
1120
|
+
$logger.info 'Calculating substitution probabilities (no smoothing) done.'
|
904
1121
|
end
|
905
1122
|
|
906
1123
|
# when smoothing!!!
|
@@ -980,7 +1197,7 @@ HEADER
|
|
980
1197
|
end
|
981
1198
|
|
982
1199
|
env_labels.combination(ci) do |c1|
|
983
|
-
|
1200
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
984
1201
|
pattern = '.' * $env_features.size
|
985
1202
|
|
986
1203
|
labels.each do |label|
|
@@ -1081,7 +1298,7 @@ HEADER
|
|
1081
1298
|
end
|
1082
1299
|
end
|
1083
1300
|
end
|
1084
|
-
$logger.info 'Calculating substitution probabilities
|
1301
|
+
$logger.info 'Calculating substitution probabilities (partial smoothing) done.'
|
1085
1302
|
else
|
1086
1303
|
$outfh.puts <<HEADER
|
1087
1304
|
#
|
@@ -1116,7 +1333,7 @@ HEADER
|
|
1116
1333
|
# full smoothing
|
1117
1334
|
1.upto($env_features.size) do |ci|
|
1118
1335
|
env_labels.combination(ci) do |c1|
|
1119
|
-
|
1336
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
1120
1337
|
pattern = '.' * $env_features.size
|
1121
1338
|
labels.each do |label|
|
1122
1339
|
j = label[0].chr.to_i
|
@@ -1167,7 +1384,7 @@ HEADER
|
|
1167
1384
|
end
|
1168
1385
|
end
|
1169
1386
|
end
|
1170
|
-
$logger.info 'Calculating substitution probabilities
|
1387
|
+
$logger.info 'Calculating substitution probabilities (full smoothing) done.'
|
1171
1388
|
end
|
1172
1389
|
|
1173
1390
|
# updating smoothed probability array for each environment
|
@@ -1176,7 +1393,9 @@ HEADER
|
|
1176
1393
|
end
|
1177
1394
|
|
1178
1395
|
# sorting environments and build 21X21 substitution matrices
|
1179
|
-
|
1396
|
+
group_matrices = []
|
1397
|
+
|
1398
|
+
$env_classes.groups_sorted_by_residue_labels.each do |group|
|
1180
1399
|
# calculating 21X21 substitution probability matrix for each environment
|
1181
1400
|
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1182
1401
|
|
@@ -1185,9 +1404,62 @@ HEADER
|
|
1185
1404
|
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
|
1186
1405
|
end
|
1187
1406
|
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1407
|
+
group_matrices << [group[0], grp_prob_mat]
|
1408
|
+
end
|
1409
|
+
|
1410
|
+
if $output == 1
|
1411
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1412
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1413
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1414
|
+
|
1415
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1416
|
+
# for a matrix file
|
1417
|
+
stem = "#{grp_no}. #{grp_label}"
|
1418
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1419
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1420
|
+
:row_header => $amino_acids)
|
1421
|
+
|
1422
|
+
# for heat map generation
|
1423
|
+
if $heatmap == 0 or $heatmap == 2
|
1424
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1425
|
+
:row_header => $amino_acids,
|
1426
|
+
:rvg_width => $rvg_width,
|
1427
|
+
:rvg_height => $rvg_height,
|
1428
|
+
:canvas_width => $canvas_width,
|
1429
|
+
:canvas_height => $canvas_height,
|
1430
|
+
:max_val => grp_max_val.ceil,
|
1431
|
+
:min_val => 0,
|
1432
|
+
:print_value => $heatmapvalues,
|
1433
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1434
|
+
|
1435
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1436
|
+
end
|
1437
|
+
|
1438
|
+
if $heatmap == 1 or $heatmap == 2
|
1439
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1440
|
+
:row_header => $amino_acids,
|
1441
|
+
:rvg_width => $rvg_width,
|
1442
|
+
:rvg_height => $rvg_height - 50,
|
1443
|
+
:canvas_width => $canvas_width,
|
1444
|
+
:canvas_height => $canvas_height - 50,
|
1445
|
+
:max_val => grp_max_val.ceil,
|
1446
|
+
:min_val => 0,
|
1447
|
+
:print_value => $heatmapvalues,
|
1448
|
+
:print_gradient => false,
|
1449
|
+
:title => stem,
|
1450
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1451
|
+
end
|
1452
|
+
end
|
1453
|
+
|
1454
|
+
# for heat maps in a single file
|
1455
|
+
if $heatmap == 1 or $heatmap == 2
|
1456
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1457
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1458
|
+
:rvg_width => $rvg_width,
|
1459
|
+
:max_val => grp_max_val.ceil,
|
1460
|
+
:min_val => 0).write(file)
|
1461
|
+
|
1462
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1191
1463
|
end
|
1192
1464
|
end
|
1193
1465
|
|
@@ -1202,9 +1474,26 @@ HEADER
|
|
1202
1474
|
|
1203
1475
|
if $output == 1
|
1204
1476
|
$outfh.puts '>Total'
|
1205
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1477
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1478
|
+
:row_header => $amino_acids)
|
1206
1479
|
$outfh.close
|
1207
|
-
|
1480
|
+
|
1481
|
+
# for a heat map
|
1482
|
+
if $heatmap == 0 or $heatmap == 2
|
1483
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1484
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1485
|
+
:row_header => $amino_acids,
|
1486
|
+
:rvg_width => $rvg_width,
|
1487
|
+
:rvg_height => $rvg_height,
|
1488
|
+
:canvas_width => $canvas_width,
|
1489
|
+
:canvas_height => $canvas_height,
|
1490
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1491
|
+
:min_val => 0,
|
1492
|
+
:print_value => $heatmapvalues,
|
1493
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1494
|
+
|
1495
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1496
|
+
end
|
1208
1497
|
exit 0
|
1209
1498
|
end
|
1210
1499
|
end
|
@@ -1242,16 +1531,18 @@ HEADER
|
|
1242
1531
|
# calculating substitution probability matrix for each environment
|
1243
1532
|
grp_label = group[0]
|
1244
1533
|
grp_envs = group[1]
|
1245
|
-
grp_logo_mat = $cys == 0 ?
|
1534
|
+
grp_logo_mat = $cys == 0 ?
|
1535
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1536
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1246
1537
|
|
1247
1538
|
$amino_acids.each_with_index do |aa, aj|
|
1248
1539
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1249
|
-
|
1250
|
-
|
1540
|
+
env.logo_array = $cys == 0 ?
|
1541
|
+
NArray.float($amino_acids.size + 1) :
|
1542
|
+
NArray.float($amino_acids.size)
|
1251
1543
|
|
1252
1544
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1253
1545
|
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1254
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1255
1546
|
odds = prob / pai
|
1256
1547
|
env.logo_array[ai] = factor * Math::log(odds)
|
1257
1548
|
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
@@ -1262,7 +1553,6 @@ HEADER
|
|
1262
1553
|
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1263
1554
|
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1264
1555
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1265
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1266
1556
|
odds = prob / pai
|
1267
1557
|
env.logo_array[$amino_acids.size] = factor * Math::log(odds)
|
1268
1558
|
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
@@ -1272,13 +1562,14 @@ HEADER
|
|
1272
1562
|
grp_logo_mats << [grp_label, grp_logo_mat]
|
1273
1563
|
end
|
1274
1564
|
|
1275
|
-
$tot_logo_mat = $cys == 0 ?
|
1565
|
+
$tot_logo_mat = $cys == 0 ?
|
1566
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1567
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1276
1568
|
|
1277
1569
|
$amino_acids.each_with_index do |aa1, aj|
|
1278
1570
|
$amino_acids.each_with_index do |aa2, ai|
|
1279
1571
|
prob = $tot_prob_mat[aj, ai]
|
1280
1572
|
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1281
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1282
1573
|
odds = prob / pai
|
1283
1574
|
$tot_logo_mat[aj, ai] = factor * Math::log(odds)
|
1284
1575
|
end
|
@@ -1287,7 +1578,6 @@ HEADER
|
|
1287
1578
|
if $cys == 0
|
1288
1579
|
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1289
1580
|
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1290
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1291
1581
|
odds = prob / pai
|
1292
1582
|
$tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
|
1293
1583
|
end
|
@@ -1315,7 +1605,7 @@ HEADER
|
|
1315
1605
|
#
|
1316
1606
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1317
1607
|
HEADER
|
1318
|
-
unless $
|
1608
|
+
unless $noroundoff
|
1319
1609
|
$outfh.puts <<HEADER
|
1320
1610
|
# rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
|
1321
1611
|
HEADER
|
@@ -1326,43 +1616,120 @@ HEADER
|
|
1326
1616
|
#
|
1327
1617
|
HEADER
|
1328
1618
|
|
1619
|
+
grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
|
1620
|
+
grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
|
1621
|
+
abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
|
1622
|
+
row_header = $cys ? $amino_acids + %w[U] : $amino_acids
|
1623
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1624
|
+
$heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
|
1625
|
+
|
1329
1626
|
grp_logo_mats.each_with_index do |arr, grp_no|
|
1330
1627
|
grp_label = arr[0]
|
1331
1628
|
grp_logo_mat = arr[1]
|
1629
|
+
stem = "#{grp_no}. #{grp_label}"
|
1332
1630
|
|
1333
|
-
unless $
|
1631
|
+
unless $noroundoff
|
1334
1632
|
grp_logo_mat = grp_logo_mat.round
|
1335
1633
|
end
|
1336
1634
|
|
1635
|
+
# for a matrix file
|
1337
1636
|
$outfh.puts ">#{grp_label} #{grp_no}"
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1637
|
+
$outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
|
1638
|
+
:row_header => row_header)
|
1639
|
+
# for a heat map
|
1640
|
+
if $heatmap == 0 or $heatmap == 2
|
1641
|
+
grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1642
|
+
:row_header => row_header,
|
1643
|
+
:rvg_width => $rvg_width,
|
1644
|
+
:rvg_height => $rvg_height,
|
1645
|
+
:canvas_width => $canvas_width,
|
1646
|
+
:canvas_height => $canvas_height,
|
1647
|
+
:gradient_beg_color => '#0000FF',
|
1648
|
+
:gradient_mid_color => '#FFFFFF',
|
1649
|
+
:gradient_end_color => '#FF0000',
|
1650
|
+
:max_val => abs_max_val.ceil,
|
1651
|
+
:mid_val => 0,
|
1652
|
+
:min_val => -1 * abs_max_val.ceil,
|
1653
|
+
:print_value => $heatmapvalues,
|
1654
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1655
|
+
|
1656
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1657
|
+
end
|
1658
|
+
|
1659
|
+
if $heatmap == 1 or $heatmap == 2
|
1660
|
+
heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1661
|
+
:row_header => row_header,
|
1662
|
+
:rvg_width => $rvg_width,
|
1663
|
+
:rvg_height => $rvg_height - 50,
|
1664
|
+
:canvas_width => $canvas_width,
|
1665
|
+
:canvas_height => $canvas_height - 50,
|
1666
|
+
:gradient_beg_color => '#0000FF',
|
1667
|
+
:gradient_mid_color => '#FFFFFF',
|
1668
|
+
:gradient_end_color => '#FF0000',
|
1669
|
+
:max_val => abs_max_val.ceil,
|
1670
|
+
:mid_val => 0,
|
1671
|
+
:min_val => -1 * abs_max_val.ceil,
|
1672
|
+
:print_value => $heatmapvalues,
|
1673
|
+
:print_gradient => false,
|
1674
|
+
:title => stem,
|
1675
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1342
1676
|
end
|
1343
1677
|
end
|
1344
1678
|
|
1345
|
-
|
1679
|
+
# for heat maps in a single file
|
1680
|
+
if $heatmap == 1 or $heatmap == 2
|
1681
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1682
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1683
|
+
:rvg_width => $rvg_width,
|
1684
|
+
:gradient_beg_color => '#0000FF',
|
1685
|
+
:gradient_mid_color => '#FFFFFF',
|
1686
|
+
:gradient_end_color => '#FF0000',
|
1687
|
+
:max_val => abs_max_val.ceil,
|
1688
|
+
:mid_val => 0,
|
1689
|
+
:min_val => -1 * abs_max_val.ceil).write(file)
|
1690
|
+
|
1691
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1692
|
+
end
|
1346
1693
|
|
1347
|
-
|
1694
|
+
# for a matrix file
|
1695
|
+
unless $noroundoff
|
1348
1696
|
$tot_logo_mat = $tot_logo_mat.round
|
1349
1697
|
end
|
1350
1698
|
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1699
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1700
|
+
$outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
|
1701
|
+
:row_header => row_header)
|
1702
|
+
|
1703
|
+
# for a heat map
|
1704
|
+
if $heatmap == 0 or $heatmap == 2
|
1705
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1706
|
+
tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
|
1707
|
+
$tot_logo_mat.heatmap(:col_header => $amino_acids,
|
1708
|
+
:row_header => row_header,
|
1709
|
+
:rvg_width => $rvg_width,
|
1710
|
+
:rvg_height => $rvg_height,
|
1711
|
+
:canvas_width => $canvas_width,
|
1712
|
+
:canvas_height => $canvas_height,
|
1713
|
+
:gradient_beg_color => '#0000FF',
|
1714
|
+
:gradient_mid_color => '#FFFFFF',
|
1715
|
+
:gradient_end_color => '#FF0000',
|
1716
|
+
:max_val => tot_abs_max_val.ceil,
|
1717
|
+
:mid_val => 0,
|
1718
|
+
:min_val => -1 * tot_abs_max_val.ceil,
|
1719
|
+
:print_value => $heatmapvalues,
|
1720
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1721
|
+
|
1722
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1355
1723
|
end
|
1356
1724
|
|
1357
|
-
$logger.info "Calculating log odds
|
1358
|
-
|
1359
|
-
#
|
1360
|
-
# Part 7. END
|
1361
|
-
#
|
1725
|
+
$logger.info "Calculating log odds ratios done."
|
1362
1726
|
end
|
1363
1727
|
|
1728
|
+
#
|
1729
|
+
# Part 7. END
|
1730
|
+
#
|
1731
|
+
|
1364
1732
|
$outfh.close
|
1365
|
-
$logger.info "Egor END."
|
1366
1733
|
exit 0
|
1367
1734
|
end
|
1368
1735
|
end
|