egor 0.0.5 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +28 -26
- data/Manifest.txt +12 -8
- data/README.rdoc +206 -54
- data/Rakefile +9 -9
- data/egor.gemspec +13 -15
- data/lib/egor.rb +1 -1
- data/lib/egor/cli.rb +535 -168
- data/lib/egor/environment.rb +34 -0
- data/lib/egor/environment_class_hash.rb +20 -0
- data/lib/egor/environment_feature.rb +26 -0
- data/lib/egor/environment_feature_array.rb +12 -0
- data/lib/egor/heatmap_array.rb +111 -0
- data/lib/narray_extensions.rb +3 -2
- data/lib/nmatrix_extensions.rb +227 -6
- data/lib/string_extensions.rb +17 -0
- data/test/egor/test_cli.rb +9 -0
- data/test/egor/test_environment_class_hash.rb +25 -0
- data/test/egor/test_environment_feature.rb +29 -0
- data/test/test_math_extensions.rb +11 -0
- data/test/test_narray_extensions.rb +14 -0
- data/test/test_string_extensions.rb +11 -0
- data/website/index.html +5 -76
- data/website/index.txt +183 -18
- data/website/stylesheets/screen.css +0 -1
- metadata +27 -20
- data/lib/enumerable_extensions.rb +0 -11
- data/lib/environment.rb +0 -58
- data/lib/environment_class_hash.rb +0 -18
- data/lib/environment_feature.rb +0 -14
- data/lib/environment_feature_array.rb +0 -10
- data/test/test_egor_cli.rb +0 -8
- data/test/test_enumerable_extensions.rb +0 -16
- data/test/test_environment_feature.rb +0 -11
data/lib/egor.rb
CHANGED
data/lib/egor/cli.rb
CHANGED
@@ -5,16 +5,17 @@ require 'narray'
|
|
5
5
|
require 'bio'
|
6
6
|
require 'set'
|
7
7
|
require 'facets'
|
8
|
-
require 'simple_memoize'
|
9
8
|
|
9
|
+
require 'math_extensions'
|
10
|
+
require 'string_extensions'
|
10
11
|
require 'narray_extensions'
|
11
12
|
require 'nmatrix_extensions'
|
12
|
-
|
13
|
-
require '
|
14
|
-
require '
|
15
|
-
require '
|
16
|
-
require '
|
17
|
-
require '
|
13
|
+
|
14
|
+
require 'egor/environment'
|
15
|
+
require 'egor/environment_class_hash'
|
16
|
+
require 'egor/environment_feature'
|
17
|
+
require 'egor/environment_feature_array'
|
18
|
+
require 'egor/heatmap_array'
|
18
19
|
|
19
20
|
# This is a module for an actual command line interpreter for Egor
|
20
21
|
# ---
|
@@ -25,7 +26,7 @@ module Egor
|
|
25
26
|
|
26
27
|
# :nodoc:
|
27
28
|
def print_version
|
28
|
-
puts
|
29
|
+
puts VERSION
|
29
30
|
end
|
30
31
|
|
31
32
|
# Print Egor's Usage on the screen
|
@@ -62,14 +63,26 @@ Options:
|
|
62
63
|
0 for raw counts (no smoothing performed)
|
63
64
|
1 for probabilities
|
64
65
|
2 for log-odds (default)
|
65
|
-
--
|
66
|
+
--noroundoff: do not round off log odds ratio
|
66
67
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
67
68
|
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
68
69
|
--autosigma: automatically adjust the sigma value for smoothing
|
69
70
|
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
|
70
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
71
71
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
72
72
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
73
|
+
--heatmap INTEGER:
|
74
|
+
0 create a heat map file for each substitution table
|
75
|
+
1 create one big file containing all heat maps from substitution tables
|
76
|
+
2 do both 0 and 1
|
77
|
+
--heatmap-format INTEGER:
|
78
|
+
0 for Portable Network Graphics (PNG) Format (default)
|
79
|
+
1 for Graphics Interchange Format (GIF)
|
80
|
+
2 for Joint Photographic Experts Group (JPEG) Format
|
81
|
+
3 for Microsoft Windows bitmap (BMP) Format
|
82
|
+
4 for Portable Document Format (PDF)
|
83
|
+
--heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
|
84
|
+
--heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap')
|
85
|
+
--heatmap-values: print values in the cells when generating heat maps
|
73
86
|
--verbose (-v) INTEGER
|
74
87
|
0 for ERROR level
|
75
88
|
1 for WARN or above level (default)
|
@@ -87,12 +100,12 @@ Options:
|
|
87
100
|
# Egor::CLI::calculate_pid(seq1, seq2) -> Float
|
88
101
|
#
|
89
102
|
def calculate_pid(seq1, seq2)
|
90
|
-
|
91
|
-
|
92
|
-
cols =
|
93
|
-
align = 0
|
94
|
-
ident = 0
|
95
|
-
intgp = 0
|
103
|
+
aas1 = seq1.split('')
|
104
|
+
aas2 = seq2.split('')
|
105
|
+
cols = aas1.zip(aas2)
|
106
|
+
align = 0 # no. of aligned columns
|
107
|
+
ident = 0 # no. of identical columns
|
108
|
+
intgp = 0 # no. of internal gaps
|
96
109
|
|
97
110
|
cols.each do |col|
|
98
111
|
if (col[0] != '-') && (col[1] != '-')
|
@@ -100,14 +113,14 @@ Options:
|
|
100
113
|
if col[0] == col[1]
|
101
114
|
ident += 1
|
102
115
|
end
|
103
|
-
elsif (((col[0] == '-') && (col[1] != '-')) ||
|
116
|
+
elsif (((col[0] == '-') && (col[1] != '-')) ||
|
117
|
+
((col[0] != '-') && (col[1] == '-')))
|
104
118
|
intgp += 1
|
105
119
|
end
|
106
120
|
end
|
107
121
|
|
108
122
|
pid = 100.0 * ident.to_f / (align + intgp)
|
109
123
|
end
|
110
|
-
memoize :calculate_pid
|
111
124
|
|
112
125
|
# :nodoc:
|
113
126
|
def execute(arguments=[])
|
@@ -121,7 +134,7 @@ Options:
|
|
121
134
|
# aa: weighted amino acid
|
122
135
|
# tot: total
|
123
136
|
# rel: relative
|
124
|
-
#
|
137
|
+
# jnt: joint
|
125
138
|
# cnt: count
|
126
139
|
# mut: mutation
|
127
140
|
# mutb: mutability
|
@@ -145,31 +158,42 @@ Options:
|
|
145
158
|
$logger.level = Logger::WARN
|
146
159
|
|
147
160
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
148
|
-
$amino_acids
|
149
|
-
|
150
|
-
$
|
151
|
-
$
|
152
|
-
$
|
153
|
-
$
|
154
|
-
$
|
155
|
-
$
|
156
|
-
$
|
157
|
-
$
|
158
|
-
$
|
159
|
-
$
|
160
|
-
$
|
161
|
-
$
|
162
|
-
$
|
163
|
-
$
|
164
|
-
$
|
165
|
-
$
|
166
|
-
$
|
167
|
-
$
|
168
|
-
$
|
169
|
-
$
|
170
|
-
$
|
171
|
-
$
|
172
|
-
$penv
|
161
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
162
|
+
$tem_list = nil
|
163
|
+
$tem_file = nil
|
164
|
+
$classdef = 'classdef.dat'
|
165
|
+
$outfile = 'allmat.dat'
|
166
|
+
$outfh = nil # file handle for outfile
|
167
|
+
$output = 2 # default: log odds matrix
|
168
|
+
$ali_size = 0
|
169
|
+
$tot_aa = 0
|
170
|
+
$sigma = 5.0
|
171
|
+
$autosigma = false
|
172
|
+
$weight = 60
|
173
|
+
$noweight = false
|
174
|
+
$smooth = :partial
|
175
|
+
$nosmooth = false
|
176
|
+
$noroundoff = false
|
177
|
+
$p1smooth = false
|
178
|
+
$scale = 3
|
179
|
+
$pidmin = nil
|
180
|
+
$pidmax = nil
|
181
|
+
$scale = 3
|
182
|
+
$add = nil
|
183
|
+
$cys = 0
|
184
|
+
$targetenv = false
|
185
|
+
$penv = false
|
186
|
+
$heatmap = nil
|
187
|
+
$heatmapcol = nil
|
188
|
+
$heatmapformat = 'png'
|
189
|
+
$heatmapstem = 'heatmaps'
|
190
|
+
$heatmapvalues = false
|
191
|
+
$rvg_width = 550
|
192
|
+
$rvg_height = 650
|
193
|
+
$canvas_width = 550
|
194
|
+
$canvas_height = 650
|
195
|
+
$cell_width = 20
|
196
|
+
$cell_height = 20
|
173
197
|
|
174
198
|
$aa_tot_cnt = Hash.new(0)
|
175
199
|
$aa_mut_cnt = Hash.new(0)
|
@@ -184,7 +208,7 @@ Options:
|
|
184
208
|
$tot_smooth_prob = {}
|
185
209
|
|
186
210
|
# minimum ratio of amino acid count to sigma value
|
187
|
-
$
|
211
|
+
$min_cnt_sigma_ratio = 500.0
|
188
212
|
|
189
213
|
#
|
190
214
|
# Part 1 END
|
@@ -205,11 +229,16 @@ Options:
|
|
205
229
|
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
206
230
|
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
207
231
|
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
208
|
-
[ '--
|
232
|
+
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
209
233
|
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
210
234
|
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
211
|
-
|
235
|
+
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
236
|
+
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
237
|
+
[ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
|
238
|
+
[ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
|
239
|
+
[ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
|
212
240
|
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
241
|
+
[ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
|
213
242
|
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
214
243
|
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
215
244
|
[ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
|
@@ -235,6 +264,8 @@ Options:
|
|
235
264
|
$outfile = arg
|
236
265
|
when '--cys'
|
237
266
|
$cys = arg.to_i
|
267
|
+
when '--targetenv'
|
268
|
+
$targetenv = (arg.to_i == 1) ? true : false
|
238
269
|
when '--weight'
|
239
270
|
$weight = arg.to_i
|
240
271
|
when '--sigma'
|
@@ -247,8 +278,8 @@ Options:
|
|
247
278
|
$pidmax = arg.to_f
|
248
279
|
when '--noweight'
|
249
280
|
$noweight = true
|
250
|
-
when '--
|
251
|
-
$
|
281
|
+
when '--noroundoff'
|
282
|
+
$noroundoff = true
|
252
283
|
when '--smooth'
|
253
284
|
$smooth = (arg.to_i == 1) ? :full : :partial
|
254
285
|
when '--nosmooth'
|
@@ -260,18 +291,42 @@ Options:
|
|
260
291
|
when '--add'
|
261
292
|
$add = arg.to_f
|
262
293
|
when '--penv'
|
263
|
-
warn "--penv option is not supported
|
294
|
+
warn "--penv option is not supported."
|
264
295
|
exit 1
|
265
296
|
$penv = true
|
266
|
-
|
267
|
-
|
297
|
+
when '--heatmap'
|
298
|
+
$heatmap = case arg.to_i
|
299
|
+
when (0..2) then arg.to_i
|
300
|
+
else
|
301
|
+
warn "--heatmap #{arg.to_i} is not allowed."
|
302
|
+
exit1
|
303
|
+
end
|
304
|
+
when '--heatmap-columns'
|
305
|
+
$heatmapcol = arg.to_i
|
306
|
+
when '--heatmap-stem'
|
307
|
+
$heatmapstem = arg.to_s
|
308
|
+
when '--heatmap-format'
|
309
|
+
$heatmapformat = case arg.to_i
|
310
|
+
when 0 then 'png'
|
311
|
+
when 1 then 'gif'
|
312
|
+
when 2 then 'jpg'
|
313
|
+
when 3 then 'bmp'
|
314
|
+
when 4 then 'pdf'
|
315
|
+
else
|
316
|
+
warn "--heatmap-format #{arg.to_i} is not supported."
|
317
|
+
exit 1
|
318
|
+
end
|
319
|
+
when '--heatmap-values'
|
320
|
+
$heatmapvalues = true
|
268
321
|
when '--verbose'
|
269
322
|
$logger.level = case arg.to_i
|
270
323
|
when 0 then Logger::ERROR
|
271
324
|
when 1 then Logger::WARN
|
272
325
|
when 2 then Logger::INFO
|
273
326
|
when 3 then Logger::DEBUG
|
274
|
-
else
|
327
|
+
else
|
328
|
+
warn "--verbose (-v) #{arg.to_i} is not supported."
|
329
|
+
exit 1
|
275
330
|
end
|
276
331
|
when '--version'
|
277
332
|
print_version
|
@@ -284,7 +339,9 @@ Options:
|
|
284
339
|
end
|
285
340
|
|
286
341
|
# when arguments are nonsense, print usage
|
287
|
-
if ((ARGV.length != 0) ||
|
342
|
+
if ((ARGV.length != 0) ||
|
343
|
+
(!$tem_list && !$tem_file) ||
|
344
|
+
($tem_list && $tem_file))
|
288
345
|
print_usage
|
289
346
|
exit 1
|
290
347
|
end
|
@@ -305,7 +362,6 @@ Options:
|
|
305
362
|
exit 1
|
306
363
|
end
|
307
364
|
|
308
|
-
|
309
365
|
#
|
310
366
|
# Part 2 END
|
311
367
|
#
|
@@ -316,23 +372,28 @@ Options:
|
|
316
372
|
# Reading Environment Class Definition File
|
317
373
|
#
|
318
374
|
|
319
|
-
$logger.info "Egor START."
|
320
|
-
|
321
375
|
# check --cys option and modify amino_acids set if necessary
|
322
376
|
if $cys == 2
|
323
377
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
|
324
378
|
end
|
325
379
|
|
326
|
-
# create an EnvironmentFeatureList object for storing all environment
|
380
|
+
# create an EnvironmentFeatureArray object for storing all environment
|
381
|
+
# features
|
327
382
|
$env_features = EnvironmentFeatureArray.new
|
328
383
|
|
329
384
|
# an array for storing indexes of constrained environment features
|
330
385
|
$cst_features = []
|
331
386
|
|
332
|
-
# add substituted amino acid (aa1) in a substitution to the environment
|
333
|
-
|
387
|
+
# add substituted amino acid (aa1) in a substitution to the environment
|
388
|
+
# feature list
|
389
|
+
$env_features << EnvironmentFeature.new('sequence',
|
390
|
+
$amino_acids,
|
391
|
+
$amino_acids,
|
392
|
+
'F',
|
393
|
+
'F')
|
334
394
|
|
335
|
-
# read environment class definiton file and store them into
|
395
|
+
# read environment class definition file and store them into
|
396
|
+
# the hash prepared above
|
336
397
|
env_index = 1
|
337
398
|
|
338
399
|
IO.foreach($classdef) do |line|
|
@@ -350,10 +411,15 @@ Options:
|
|
350
411
|
$cst_features << env_index
|
351
412
|
$logger.warn "The environment feature, #{line} constrained."
|
352
413
|
end
|
353
|
-
$env_features << EnvironmentFeature.new(env_ftr[0],
|
414
|
+
$env_features << EnvironmentFeature.new(env_ftr[0],
|
415
|
+
env_ftr[1].split(''),
|
416
|
+
env_ftr[2].split(''),
|
417
|
+
env_ftr[3],
|
418
|
+
env_ftr[4])
|
354
419
|
env_index += 1
|
355
420
|
else
|
356
|
-
$logger.error "\"#{line}\" doesn't seem to be a proper format for
|
421
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for" +
|
422
|
+
"a environment class definition."
|
357
423
|
exit 1
|
358
424
|
end
|
359
425
|
end
|
@@ -361,9 +427,13 @@ Options:
|
|
361
427
|
# a hash for storing all environment classes
|
362
428
|
$env_classes = EnvironmentClassHash.new
|
363
429
|
|
364
|
-
# generate all possible combinations of environment labels, and store
|
430
|
+
# generate all possible combinations of environment labels, and store
|
431
|
+
# every environment class into the hash prepared above with the label
|
432
|
+
# as a key
|
365
433
|
$env_features.label_combinations.each_with_index { |e, i|
|
366
|
-
$env_classes[e.flatten.join] = Environment.new(i,
|
434
|
+
$env_classes[e.flatten.join] = Environment.new(i,
|
435
|
+
e.flatten.join,
|
436
|
+
$amino_acids)
|
367
437
|
}
|
368
438
|
|
369
439
|
#
|
@@ -390,19 +460,17 @@ Options:
|
|
390
460
|
$tem_list_io.each_line do |tem_file|
|
391
461
|
tem_file.chomp!
|
392
462
|
|
393
|
-
$logger.info "Analysing #{tem_file} ..."
|
394
|
-
|
395
463
|
ali = Bio::Alignment::OriginalAlignment.new
|
396
464
|
ff = Bio::FlatFile.auto(tem_file)
|
397
465
|
|
398
466
|
ff.each_entry do |pir|
|
399
467
|
if (pir.definition == 'sequence') || (pir.definition == 'structure')
|
400
|
-
ali.add_seq(pir.data.
|
468
|
+
ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
|
401
469
|
end
|
402
470
|
end
|
403
471
|
|
404
472
|
if ali.size < 2
|
405
|
-
$logger.warn "Skipped #{tem_file}
|
473
|
+
$logger.warn "Skipped #{tem_file} which has only one unique entry."
|
406
474
|
next
|
407
475
|
end
|
408
476
|
|
@@ -414,8 +482,10 @@ Options:
|
|
414
482
|
# check disulphide bond environment first!
|
415
483
|
ff.rewind
|
416
484
|
ff.each_entry do |pir|
|
417
|
-
if (pir.entry_id == key) &&
|
418
|
-
|
485
|
+
if ((pir.entry_id == key) &&
|
486
|
+
((pir.definition == "disulphide") ||
|
487
|
+
(pir.definition == "disulfide")))
|
488
|
+
disulphide[key] = pir.data.remove_internal_spaces.split('')
|
419
489
|
end
|
420
490
|
end
|
421
491
|
|
@@ -425,14 +495,16 @@ Options:
|
|
425
495
|
ff.rewind
|
426
496
|
ff.each_entry do |pir|
|
427
497
|
if (pir.entry_id == key) && (pir.definition == ec.name)
|
428
|
-
labels = pir.data.
|
498
|
+
labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
|
429
499
|
if sym == '-'
|
430
500
|
'-'
|
431
501
|
elsif sym == 'X' || sym == 'x'
|
432
502
|
'X'
|
433
503
|
else
|
434
504
|
if ei == 0 # Amino Acid Environment Feature
|
435
|
-
(disulphide.has_key?(key) &&
|
505
|
+
(disulphide.has_key?(key) &&
|
506
|
+
(disulphide[key][pos] == 'F') &&
|
507
|
+
(sym == 'C')) ? 'J' : sym
|
436
508
|
else
|
437
509
|
ec.labels[ec.symbols.index(sym)]
|
438
510
|
end
|
@@ -442,7 +514,9 @@ Options:
|
|
442
514
|
if env_labels[key].empty?
|
443
515
|
env_labels[key] = labels
|
444
516
|
else
|
445
|
-
env_labels[key].each_with_index { |e, i|
|
517
|
+
env_labels[key].each_with_index { |e, i|
|
518
|
+
env_labels[key][i] = e + labels[i]
|
519
|
+
}
|
446
520
|
end
|
447
521
|
end
|
448
522
|
end
|
@@ -459,13 +533,15 @@ Options:
|
|
459
533
|
|
460
534
|
# check PID_MIN
|
461
535
|
if $pidmin && (pid < $pidmin)
|
462
|
-
$logger.info
|
536
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
537
|
+
"having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
463
538
|
next
|
464
539
|
end
|
465
540
|
|
466
541
|
# check PID_MAX
|
467
542
|
if $pidmax && (pid > $pidmax)
|
468
|
-
$logger.info
|
543
|
+
$logger.info "Skip alignment between #{id1} and #{id2} " +
|
544
|
+
"having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
469
545
|
next
|
470
546
|
end
|
471
547
|
|
@@ -574,7 +650,7 @@ Options:
|
|
574
650
|
end while(continue)
|
575
651
|
|
576
652
|
if clusters.size < 2
|
577
|
-
$logger.debug "Skipped #{tem_file}
|
653
|
+
$logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
|
578
654
|
next
|
579
655
|
end
|
580
656
|
|
@@ -610,16 +686,16 @@ Options:
|
|
610
686
|
|
611
687
|
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
612
688
|
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
613
|
-
|
614
|
-
|
615
|
-
|
689
|
+
cnt1 = 1.0 / cluster1.size
|
690
|
+
cnt2 = 1.0 / cluster2.size
|
691
|
+
jnt_cnt = cnt1 * cnt2
|
616
692
|
|
617
693
|
if $cst_features.empty?
|
618
|
-
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2,
|
619
|
-
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1,
|
694
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
695
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
620
696
|
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
621
|
-
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2,
|
622
|
-
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1,
|
697
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
|
698
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
|
623
699
|
else
|
624
700
|
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
|
625
701
|
next
|
@@ -630,64 +706,65 @@ Options:
|
|
630
706
|
|
631
707
|
if $aa_env_cnt.has_key? grp_label1
|
632
708
|
if $aa_env_cnt[grp_label1].has_key? aa1
|
633
|
-
$aa_env_cnt[grp_label1][aa1] +=
|
709
|
+
$aa_env_cnt[grp_label1][aa1] += cnt1
|
634
710
|
else
|
635
|
-
$aa_env_cnt[grp_label1][aa1] =
|
711
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
636
712
|
end
|
637
713
|
else
|
638
714
|
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
639
|
-
$aa_env_cnt[grp_label1][aa1] =
|
715
|
+
$aa_env_cnt[grp_label1][aa1] = cnt1
|
640
716
|
end
|
641
717
|
|
642
718
|
if $aa_env_cnt.has_key? grp_label2
|
643
719
|
if $aa_env_cnt[grp_label2].has_key? aa2
|
644
|
-
$aa_env_cnt[grp_label2][aa2] +=
|
720
|
+
$aa_env_cnt[grp_label2][aa2] += cnt2
|
645
721
|
else
|
646
|
-
$aa_env_cnt[grp_label2][aa2] =
|
722
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
647
723
|
end
|
648
724
|
else
|
649
725
|
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
650
|
-
$aa_env_cnt[grp_label2][aa2] =
|
726
|
+
$aa_env_cnt[grp_label2][aa2] = cnt2
|
651
727
|
end
|
652
728
|
|
653
729
|
if $aa_tot_cnt.has_key? aa1
|
654
|
-
$aa_tot_cnt[aa1] +=
|
730
|
+
$aa_tot_cnt[aa1] += cnt1
|
655
731
|
else
|
656
|
-
$aa_tot_cnt[aa1] =
|
732
|
+
$aa_tot_cnt[aa1] = cnt1
|
657
733
|
end
|
658
734
|
|
659
735
|
if $aa_tot_cnt.has_key? aa2
|
660
|
-
$aa_tot_cnt[aa2] +=
|
736
|
+
$aa_tot_cnt[aa2] += cnt2
|
661
737
|
else
|
662
|
-
$aa_tot_cnt[aa2] =
|
738
|
+
$aa_tot_cnt[aa2] = cnt2
|
663
739
|
end
|
664
740
|
|
665
741
|
if aa1 != aa2
|
666
742
|
if $aa_mut_cnt.has_key? aa1
|
667
|
-
$aa_mut_cnt[aa1] +=
|
743
|
+
$aa_mut_cnt[aa1] += cnt1
|
668
744
|
else
|
669
|
-
$aa_mut_cnt[aa1] =
|
745
|
+
$aa_mut_cnt[aa1] = cnt1
|
670
746
|
end
|
671
747
|
if $aa_mut_cnt.has_key? aa2
|
672
|
-
$aa_mut_cnt[aa2] +=
|
748
|
+
$aa_mut_cnt[aa2] += cnt2
|
673
749
|
else
|
674
|
-
$aa_mut_cnt[aa2] =
|
750
|
+
$aa_mut_cnt[aa2] = cnt2
|
675
751
|
end
|
676
752
|
end
|
677
753
|
|
678
|
-
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" %
|
679
|
-
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" %
|
754
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
755
|
+
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
680
756
|
end
|
681
757
|
end
|
682
758
|
end
|
683
759
|
end
|
684
760
|
end
|
761
|
+
$logger.info "Analysing #{tem_file} done."
|
685
762
|
end
|
686
763
|
|
687
764
|
# print out default header
|
688
765
|
$outfh.puts <<HEADER
|
689
766
|
# Environment-specific amino acid substitution matrices
|
690
|
-
# Creator: egor version #{
|
767
|
+
# Creator: egor version #{VERSION}
|
691
768
|
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
692
769
|
#
|
693
770
|
# Definitions for structural environments:
|
@@ -739,20 +816,20 @@ HEADER
|
|
739
816
|
$outfh.puts "# Total amino acid frequencies:\n"
|
740
817
|
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
741
818
|
|
742
|
-
|
819
|
+
min_cnt = -1
|
743
820
|
min_sigma = nil
|
744
821
|
|
745
822
|
$amino_acids.each do |res|
|
746
|
-
if ($aa_tot_cnt[res] / $sigma) < $
|
747
|
-
if
|
748
|
-
|
749
|
-
min_sigma =
|
750
|
-
elsif (
|
751
|
-
|
752
|
-
min_sigma =
|
823
|
+
if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
|
824
|
+
if min_cnt < 0
|
825
|
+
min_cnt = $aa_tot_cnt[res]
|
826
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
827
|
+
elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
|
828
|
+
min_cnt = $aa_tot_cnt[res]
|
829
|
+
min_sigma = min_cnt / $min_cnt_sigma_ratio
|
753
830
|
end
|
754
831
|
|
755
|
-
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total
|
832
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
|
756
833
|
end
|
757
834
|
|
758
835
|
$aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
|
@@ -770,7 +847,7 @@ HEADER
|
|
770
847
|
end
|
771
848
|
end
|
772
849
|
|
773
|
-
if
|
850
|
+
if min_cnt > -1
|
774
851
|
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
775
852
|
if $autosigma
|
776
853
|
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
@@ -780,12 +857,13 @@ HEADER
|
|
780
857
|
|
781
858
|
$outfh.puts '#'
|
782
859
|
$outfh.puts '# RES: Amino acid one letter code'
|
783
|
-
$outfh.puts '# TOT_OBS: Total
|
784
|
-
$outfh.puts '# MUT_OBS: Total
|
860
|
+
$outfh.puts '# TOT_OBS: Total count of incidence'
|
861
|
+
$outfh.puts '# MUT_OBS: Total count of mutation'
|
785
862
|
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
786
|
-
$outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
|
863
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
|
787
864
|
$outfh.puts '# REL_FREQ: Relative frequency'
|
788
865
|
$outfh.puts '#'
|
866
|
+
|
789
867
|
#
|
790
868
|
# Part 4. END
|
791
869
|
#
|
@@ -804,7 +882,8 @@ HEADER
|
|
804
882
|
end
|
805
883
|
|
806
884
|
# count raw frequencies
|
807
|
-
$tot_cnt_mat
|
885
|
+
$tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
886
|
+
group_matrices = []
|
808
887
|
|
809
888
|
# for each combination of environment features
|
810
889
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
@@ -816,22 +895,88 @@ HEADER
|
|
816
895
|
end
|
817
896
|
|
818
897
|
$tot_cnt_mat += grp_cnt_mat
|
819
|
-
|
820
|
-
if $output == 0
|
821
|
-
$outfh.puts ">#{group[0]} #{group_no}"
|
822
|
-
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
823
|
-
end
|
898
|
+
group_matrices << [group[0], grp_cnt_mat]
|
824
899
|
end
|
825
900
|
|
901
|
+
$logger.info "Counting substitutions done."
|
902
|
+
|
826
903
|
if $output == 0
|
904
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
905
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
|
906
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
907
|
+
|
908
|
+
group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
|
909
|
+
# for a matrix file
|
910
|
+
stem = "#{grp_no}. #{grp_label}"
|
911
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
912
|
+
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
|
913
|
+
:row_header => $amino_acids)
|
914
|
+
|
915
|
+
# for a heat map
|
916
|
+
if $heatmap == 0 or $heatmap == 2
|
917
|
+
grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
918
|
+
:row_header => $amino_acids,
|
919
|
+
:rvg_width => $rvg_width,
|
920
|
+
:rvg_height => $rvg_height,
|
921
|
+
:canvas_width => $canvas_width,
|
922
|
+
:canvas_height => $canvas_height,
|
923
|
+
:max_val => grp_max_val.ceil,
|
924
|
+
:min_val => 0,
|
925
|
+
:print_value => $heatmapvalues,
|
926
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
927
|
+
|
928
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
929
|
+
end
|
930
|
+
|
931
|
+
if $heatmap == 1 or $heatmap == 2
|
932
|
+
heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
|
933
|
+
:row_header => $amino_acids,
|
934
|
+
:rvg_width => $rvg_width,
|
935
|
+
:rvg_height => $rvg_height - 50,
|
936
|
+
:canvas_width => $canvas_width,
|
937
|
+
:canvas_height => $canvas_height - 50,
|
938
|
+
:max_val => grp_max_val.ceil,
|
939
|
+
:min_val => 0,
|
940
|
+
:print_value => $heatmapvalues,
|
941
|
+
:print_gradient => false,
|
942
|
+
:title => stem,
|
943
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
944
|
+
end
|
945
|
+
end
|
946
|
+
|
947
|
+
if $heatmap == 1 or $heatmap == 2
|
948
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
949
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
950
|
+
:rvg_width => $rvg_width,
|
951
|
+
:max_val => grp_max_val.ceil,
|
952
|
+
:min_val => 0).write(file)
|
953
|
+
|
954
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
955
|
+
end
|
956
|
+
|
957
|
+
# total
|
827
958
|
$outfh.puts '>Total'
|
828
|
-
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
829
|
-
|
959
|
+
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
|
960
|
+
:row_header => $amino_acids)
|
961
|
+
|
962
|
+
if $heatmap == 0 or $heatmap == 2
|
963
|
+
stem = "#{group_matrices.size}. TOTAL"
|
964
|
+
heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
|
965
|
+
:row_header => $amino_acids,
|
966
|
+
:rvg_width => $rvg_width,
|
967
|
+
:rvg_height => $rvg_height,
|
968
|
+
:canvas_width => $canvas_width,
|
969
|
+
:canvas_height => $canvas_height,
|
970
|
+
:max_val => $tot_cnt_mat.max.ceil,
|
971
|
+
:min_val => 0,
|
972
|
+
:print_value => $heatmapvalues,
|
973
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
974
|
+
|
975
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
976
|
+
end
|
830
977
|
exit 0
|
831
978
|
end
|
832
979
|
|
833
|
-
$logger.info "Counting substitutions is done."
|
834
|
-
|
835
980
|
#
|
836
981
|
# Part 5. END
|
837
982
|
#
|
@@ -867,6 +1012,8 @@ HEADER
|
|
867
1012
|
# re-calculate probability vector for each environment class
|
868
1013
|
$env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
|
869
1014
|
|
1015
|
+
group_matrices = []
|
1016
|
+
|
870
1017
|
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
871
1018
|
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
872
1019
|
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
@@ -878,10 +1025,63 @@ HEADER
|
|
878
1025
|
end
|
879
1026
|
|
880
1027
|
$tot_cnt_mat += grp_cnt_mat
|
1028
|
+
group_matrices << [group[0], grp_prob_mat]
|
1029
|
+
end
|
1030
|
+
|
1031
|
+
if $output == 1
|
1032
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1033
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1034
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1035
|
+
|
1036
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1037
|
+
# for a matrix file
|
1038
|
+
stem = "#{grp_no}. #{grp_label}"
|
1039
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1040
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1041
|
+
:row_header => $amino_acids)
|
1042
|
+
|
1043
|
+
|
1044
|
+
# for a heat map
|
1045
|
+
if $heatmap == 0 or $heatmap == 2
|
1046
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1047
|
+
:row_header => $amino_acids,
|
1048
|
+
:rvg_width => $rvg_width,
|
1049
|
+
:rvg_height => $rvg_height,
|
1050
|
+
:canvas_width => $canvas_width,
|
1051
|
+
:canvas_height => $canvas_height,
|
1052
|
+
:max_val => grp_max_val.ceil,
|
1053
|
+
:min_val => 0,
|
1054
|
+
:print_value => $heatmapvalues,
|
1055
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1056
|
+
|
1057
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1058
|
+
end
|
1059
|
+
|
1060
|
+
if $heatmap == 1 or $heatmap == 2
|
1061
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1062
|
+
:row_header => $amino_acids,
|
1063
|
+
:rvg_width => $rvg_width,
|
1064
|
+
:rvg_height => $rvg_height - 50,
|
1065
|
+
:canvas_width => $canvas_width,
|
1066
|
+
:canvas_height => $canvas_height - 50,
|
1067
|
+
:max_val => grp_max_val.ceil,
|
1068
|
+
:min_val => 0,
|
1069
|
+
:print_value => $heatmapvalues,
|
1070
|
+
:print_gradient => false,
|
1071
|
+
:title => stem,
|
1072
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1073
|
+
end
|
1074
|
+
end
|
881
1075
|
|
882
|
-
|
883
|
-
|
884
|
-
|
1076
|
+
# for heat maps in a single file
|
1077
|
+
if $heatmap == 1 or $heatmap == 2
|
1078
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1079
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1080
|
+
:rvg_width => $rvg_width,
|
1081
|
+
:max_val => grp_max_val.ceil,
|
1082
|
+
:min_val => 0).write(file)
|
1083
|
+
|
1084
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
885
1085
|
end
|
886
1086
|
end
|
887
1087
|
|
@@ -892,15 +1092,32 @@ HEADER
|
|
892
1092
|
0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
|
893
1093
|
end
|
894
1094
|
|
895
|
-
$
|
896
|
-
|
897
|
-
if ($output == 1)
|
1095
|
+
if $output == 1
|
898
1096
|
$outfh.puts '>Total'
|
899
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1097
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1098
|
+
:row_header => $amino_acids)
|
900
1099
|
$outfh.close
|
901
|
-
|
1100
|
+
|
1101
|
+
# for a heat map
|
1102
|
+
if $heatmap == 0 or $heatmap == 2
|
1103
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1104
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1105
|
+
:row_header => $amino_acids,
|
1106
|
+
:rvg_width => $rvg_width,
|
1107
|
+
:rvg_height => $rvg_height,
|
1108
|
+
:canvas_width => $canvas_width,
|
1109
|
+
:canvas_height => $canvas_height,
|
1110
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1111
|
+
:min_val => 0,
|
1112
|
+
:print_value => $heatmapvalues,
|
1113
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1114
|
+
|
1115
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1116
|
+
end
|
902
1117
|
exit 0
|
903
1118
|
end
|
1119
|
+
|
1120
|
+
$logger.info 'Calculating substitution probabilities (no smoothing) done.'
|
904
1121
|
end
|
905
1122
|
|
906
1123
|
# when smoothing!!!
|
@@ -980,7 +1197,7 @@ HEADER
|
|
980
1197
|
end
|
981
1198
|
|
982
1199
|
env_labels.combination(ci) do |c1|
|
983
|
-
|
1200
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
984
1201
|
pattern = '.' * $env_features.size
|
985
1202
|
|
986
1203
|
labels.each do |label|
|
@@ -1081,7 +1298,7 @@ HEADER
|
|
1081
1298
|
end
|
1082
1299
|
end
|
1083
1300
|
end
|
1084
|
-
$logger.info 'Calculating substitution probabilities
|
1301
|
+
$logger.info 'Calculating substitution probabilities (partial smoothing) done.'
|
1085
1302
|
else
|
1086
1303
|
$outfh.puts <<HEADER
|
1087
1304
|
#
|
@@ -1116,7 +1333,7 @@ HEADER
|
|
1116
1333
|
# full smoothing
|
1117
1334
|
1.upto($env_features.size) do |ci|
|
1118
1335
|
env_labels.combination(ci) do |c1|
|
1119
|
-
|
1336
|
+
c1[0].product(*c1[1..-1]).each do |labels|
|
1120
1337
|
pattern = '.' * $env_features.size
|
1121
1338
|
labels.each do |label|
|
1122
1339
|
j = label[0].chr.to_i
|
@@ -1167,7 +1384,7 @@ HEADER
|
|
1167
1384
|
end
|
1168
1385
|
end
|
1169
1386
|
end
|
1170
|
-
$logger.info 'Calculating substitution probabilities
|
1387
|
+
$logger.info 'Calculating substitution probabilities (full smoothing) done.'
|
1171
1388
|
end
|
1172
1389
|
|
1173
1390
|
# updating smoothed probability array for each environment
|
@@ -1176,7 +1393,9 @@ HEADER
|
|
1176
1393
|
end
|
1177
1394
|
|
1178
1395
|
# sorting environments and build 21X21 substitution matrices
|
1179
|
-
|
1396
|
+
group_matrices = []
|
1397
|
+
|
1398
|
+
$env_classes.groups_sorted_by_residue_labels.each do |group|
|
1180
1399
|
# calculating 21X21 substitution probability matrix for each environment
|
1181
1400
|
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1182
1401
|
|
@@ -1185,9 +1404,62 @@ HEADER
|
|
1185
1404
|
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
|
1186
1405
|
end
|
1187
1406
|
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1407
|
+
group_matrices << [group[0], grp_prob_mat]
|
1408
|
+
end
|
1409
|
+
|
1410
|
+
if $output == 1
|
1411
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1412
|
+
grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
|
1413
|
+
$heatmapcol ||= Math::sqrt(group_matrices.size).round
|
1414
|
+
|
1415
|
+
group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
|
1416
|
+
# for a matrix file
|
1417
|
+
stem = "#{grp_no}. #{grp_label}"
|
1418
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1419
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
|
1420
|
+
:row_header => $amino_acids)
|
1421
|
+
|
1422
|
+
# for heat map generation
|
1423
|
+
if $heatmap == 0 or $heatmap == 2
|
1424
|
+
grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1425
|
+
:row_header => $amino_acids,
|
1426
|
+
:rvg_width => $rvg_width,
|
1427
|
+
:rvg_height => $rvg_height,
|
1428
|
+
:canvas_width => $canvas_width,
|
1429
|
+
:canvas_height => $canvas_height,
|
1430
|
+
:max_val => grp_max_val.ceil,
|
1431
|
+
:min_val => 0,
|
1432
|
+
:print_value => $heatmapvalues,
|
1433
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1434
|
+
|
1435
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1436
|
+
end
|
1437
|
+
|
1438
|
+
if $heatmap == 1 or $heatmap == 2
|
1439
|
+
heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
|
1440
|
+
:row_header => $amino_acids,
|
1441
|
+
:rvg_width => $rvg_width,
|
1442
|
+
:rvg_height => $rvg_height - 50,
|
1443
|
+
:canvas_width => $canvas_width,
|
1444
|
+
:canvas_height => $canvas_height - 50,
|
1445
|
+
:max_val => grp_max_val.ceil,
|
1446
|
+
:min_val => 0,
|
1447
|
+
:print_value => $heatmapvalues,
|
1448
|
+
:print_gradient => false,
|
1449
|
+
:title => stem,
|
1450
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1451
|
+
end
|
1452
|
+
end
|
1453
|
+
|
1454
|
+
# for heat maps in a single file
|
1455
|
+
if $heatmap == 1 or $heatmap == 2
|
1456
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1457
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1458
|
+
:rvg_width => $rvg_width,
|
1459
|
+
:max_val => grp_max_val.ceil,
|
1460
|
+
:min_val => 0).write(file)
|
1461
|
+
|
1462
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1191
1463
|
end
|
1192
1464
|
end
|
1193
1465
|
|
@@ -1202,9 +1474,26 @@ HEADER
|
|
1202
1474
|
|
1203
1475
|
if $output == 1
|
1204
1476
|
$outfh.puts '>Total'
|
1205
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1477
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
|
1478
|
+
:row_header => $amino_acids)
|
1206
1479
|
$outfh.close
|
1207
|
-
|
1480
|
+
|
1481
|
+
# for a heat map
|
1482
|
+
if $heatmap == 0 or $heatmap == 2
|
1483
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1484
|
+
$tot_prob_mat.heatmap(:col_header => $amino_acids,
|
1485
|
+
:row_header => $amino_acids,
|
1486
|
+
:rvg_width => $rvg_width,
|
1487
|
+
:rvg_height => $rvg_height,
|
1488
|
+
:canvas_width => $canvas_width,
|
1489
|
+
:canvas_height => $canvas_height,
|
1490
|
+
:max_val => $tot_prob_mat.max.ceil,
|
1491
|
+
:min_val => 0,
|
1492
|
+
:print_value => $heatmapvalues,
|
1493
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1494
|
+
|
1495
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1496
|
+
end
|
1208
1497
|
exit 0
|
1209
1498
|
end
|
1210
1499
|
end
|
@@ -1242,16 +1531,18 @@ HEADER
|
|
1242
1531
|
# calculating substitution probability matrix for each environment
|
1243
1532
|
grp_label = group[0]
|
1244
1533
|
grp_envs = group[1]
|
1245
|
-
grp_logo_mat = $cys == 0 ?
|
1534
|
+
grp_logo_mat = $cys == 0 ?
|
1535
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1536
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1246
1537
|
|
1247
1538
|
$amino_acids.each_with_index do |aa, aj|
|
1248
1539
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1249
|
-
|
1250
|
-
|
1540
|
+
env.logo_array = $cys == 0 ?
|
1541
|
+
NArray.float($amino_acids.size + 1) :
|
1542
|
+
NArray.float($amino_acids.size)
|
1251
1543
|
|
1252
1544
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1253
1545
|
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1254
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1255
1546
|
odds = prob / pai
|
1256
1547
|
env.logo_array[ai] = factor * Math::log(odds)
|
1257
1548
|
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
@@ -1262,7 +1553,6 @@ HEADER
|
|
1262
1553
|
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1263
1554
|
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1264
1555
|
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1265
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1266
1556
|
odds = prob / pai
|
1267
1557
|
env.logo_array[$amino_acids.size] = factor * Math::log(odds)
|
1268
1558
|
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
@@ -1272,13 +1562,14 @@ HEADER
|
|
1272
1562
|
grp_logo_mats << [grp_label, grp_logo_mat]
|
1273
1563
|
end
|
1274
1564
|
|
1275
|
-
$tot_logo_mat = $cys == 0 ?
|
1565
|
+
$tot_logo_mat = $cys == 0 ?
|
1566
|
+
NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
|
1567
|
+
NMatrix.float($amino_acids.size, $amino_acids.size)
|
1276
1568
|
|
1277
1569
|
$amino_acids.each_with_index do |aa1, aj|
|
1278
1570
|
$amino_acids.each_with_index do |aa2, ai|
|
1279
1571
|
prob = $tot_prob_mat[aj, ai]
|
1280
1572
|
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1281
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1282
1573
|
odds = prob / pai
|
1283
1574
|
$tot_logo_mat[aj, ai] = factor * Math::log(odds)
|
1284
1575
|
end
|
@@ -1287,7 +1578,6 @@ HEADER
|
|
1287
1578
|
if $cys == 0
|
1288
1579
|
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1289
1580
|
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1290
|
-
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1291
1581
|
odds = prob / pai
|
1292
1582
|
$tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
|
1293
1583
|
end
|
@@ -1315,7 +1605,7 @@ HEADER
|
|
1315
1605
|
#
|
1316
1606
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1317
1607
|
HEADER
|
1318
|
-
unless $
|
1608
|
+
unless $noroundoff
|
1319
1609
|
$outfh.puts <<HEADER
|
1320
1610
|
# rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
|
1321
1611
|
HEADER
|
@@ -1326,43 +1616,120 @@ HEADER
|
|
1326
1616
|
#
|
1327
1617
|
HEADER
|
1328
1618
|
|
1619
|
+
grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
|
1620
|
+
grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
|
1621
|
+
abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
|
1622
|
+
row_header = $cys ? $amino_acids + %w[U] : $amino_acids
|
1623
|
+
heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
|
1624
|
+
$heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
|
1625
|
+
|
1329
1626
|
grp_logo_mats.each_with_index do |arr, grp_no|
|
1330
1627
|
grp_label = arr[0]
|
1331
1628
|
grp_logo_mat = arr[1]
|
1629
|
+
stem = "#{grp_no}. #{grp_label}"
|
1332
1630
|
|
1333
|
-
unless $
|
1631
|
+
unless $noroundoff
|
1334
1632
|
grp_logo_mat = grp_logo_mat.round
|
1335
1633
|
end
|
1336
1634
|
|
1635
|
+
# for a matrix file
|
1337
1636
|
$outfh.puts ">#{grp_label} #{grp_no}"
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1637
|
+
$outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
|
1638
|
+
:row_header => row_header)
|
1639
|
+
# for a heat map
|
1640
|
+
if $heatmap == 0 or $heatmap == 2
|
1641
|
+
grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1642
|
+
:row_header => row_header,
|
1643
|
+
:rvg_width => $rvg_width,
|
1644
|
+
:rvg_height => $rvg_height,
|
1645
|
+
:canvas_width => $canvas_width,
|
1646
|
+
:canvas_height => $canvas_height,
|
1647
|
+
:gradient_beg_color => '#0000FF',
|
1648
|
+
:gradient_mid_color => '#FFFFFF',
|
1649
|
+
:gradient_end_color => '#FF0000',
|
1650
|
+
:max_val => abs_max_val.ceil,
|
1651
|
+
:mid_val => 0,
|
1652
|
+
:min_val => -1 * abs_max_val.ceil,
|
1653
|
+
:print_value => $heatmapvalues,
|
1654
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1655
|
+
|
1656
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1657
|
+
end
|
1658
|
+
|
1659
|
+
if $heatmap == 1 or $heatmap == 2
|
1660
|
+
heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
|
1661
|
+
:row_header => row_header,
|
1662
|
+
:rvg_width => $rvg_width,
|
1663
|
+
:rvg_height => $rvg_height - 50,
|
1664
|
+
:canvas_width => $canvas_width,
|
1665
|
+
:canvas_height => $canvas_height - 50,
|
1666
|
+
:gradient_beg_color => '#0000FF',
|
1667
|
+
:gradient_mid_color => '#FFFFFF',
|
1668
|
+
:gradient_end_color => '#FF0000',
|
1669
|
+
:max_val => abs_max_val.ceil,
|
1670
|
+
:mid_val => 0,
|
1671
|
+
:min_val => -1 * abs_max_val.ceil,
|
1672
|
+
:print_value => $heatmapvalues,
|
1673
|
+
:print_gradient => false,
|
1674
|
+
:title => stem,
|
1675
|
+
:title_font_size => $rvg_width * $heatmapcol / 100.0)
|
1342
1676
|
end
|
1343
1677
|
end
|
1344
1678
|
|
1345
|
-
|
1679
|
+
# for heat maps in a single file
|
1680
|
+
if $heatmap == 1 or $heatmap == 2
|
1681
|
+
file = "#{$heatmapstem}.#{$heatmapformat}"
|
1682
|
+
heatmaps.heatmap(:columns => $heatmapcol,
|
1683
|
+
:rvg_width => $rvg_width,
|
1684
|
+
:gradient_beg_color => '#0000FF',
|
1685
|
+
:gradient_mid_color => '#FFFFFF',
|
1686
|
+
:gradient_end_color => '#FF0000',
|
1687
|
+
:max_val => abs_max_val.ceil,
|
1688
|
+
:mid_val => 0,
|
1689
|
+
:min_val => -1 * abs_max_val.ceil).write(file)
|
1690
|
+
|
1691
|
+
$logger.info "Generating heat maps in a file, #{file} done."
|
1692
|
+
end
|
1346
1693
|
|
1347
|
-
|
1694
|
+
# for a matrix file
|
1695
|
+
unless $noroundoff
|
1348
1696
|
$tot_logo_mat = $tot_logo_mat.round
|
1349
1697
|
end
|
1350
1698
|
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1699
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1700
|
+
$outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
|
1701
|
+
:row_header => row_header)
|
1702
|
+
|
1703
|
+
# for a heat map
|
1704
|
+
if $heatmap == 0 or $heatmap == 2
|
1705
|
+
stem = "#{group_matrices.size}. TOTAL"
|
1706
|
+
tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
|
1707
|
+
$tot_logo_mat.heatmap(:col_header => $amino_acids,
|
1708
|
+
:row_header => row_header,
|
1709
|
+
:rvg_width => $rvg_width,
|
1710
|
+
:rvg_height => $rvg_height,
|
1711
|
+
:canvas_width => $canvas_width,
|
1712
|
+
:canvas_height => $canvas_height,
|
1713
|
+
:gradient_beg_color => '#0000FF',
|
1714
|
+
:gradient_mid_color => '#FFFFFF',
|
1715
|
+
:gradient_end_color => '#FF0000',
|
1716
|
+
:max_val => tot_abs_max_val.ceil,
|
1717
|
+
:mid_val => 0,
|
1718
|
+
:min_val => -1 * tot_abs_max_val.ceil,
|
1719
|
+
:print_value => $heatmapvalues,
|
1720
|
+
:title => stem).write("#{stem}.#{$heatmapformat}")
|
1721
|
+
|
1722
|
+
$logger.info "Generating a heat map for #{stem} table done."
|
1355
1723
|
end
|
1356
1724
|
|
1357
|
-
$logger.info "Calculating log odds
|
1358
|
-
|
1359
|
-
#
|
1360
|
-
# Part 7. END
|
1361
|
-
#
|
1725
|
+
$logger.info "Calculating log odds ratios done."
|
1362
1726
|
end
|
1363
1727
|
|
1728
|
+
#
|
1729
|
+
# Part 7. END
|
1730
|
+
#
|
1731
|
+
|
1364
1732
|
$outfh.close
|
1365
|
-
$logger.info "Egor END."
|
1366
1733
|
exit 0
|
1367
1734
|
end
|
1368
1735
|
end
|