semin-egor 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/egor/cli.rb ADDED
@@ -0,0 +1,1738 @@
1
+ require 'rubygems'
2
+ require 'getoptlong'
3
+ require 'logger'
4
+ require 'narray'
5
+ require 'bio'
6
+ require 'set'
7
+ require 'facets'
8
+
9
+ require 'math_extensions'
10
+ require 'string_extensions'
11
+ require 'narray_extensions'
12
+ require 'nmatrix_extensions'
13
+
14
+ require 'egor/environment'
15
+ require 'egor/environment_class_hash'
16
+ require 'egor/environment_feature'
17
+ require 'egor/environment_feature_array'
18
+ require 'egor/heatmap_array'
19
+
20
+ # This is a module for an actual command line interpreter for Egor
21
+ # ---
22
+ # Copyright (C) 2008-9 Semin Lee
23
+ module Egor
24
+ class CLI
25
+ class << self
26
+
27
# :nodoc:
#
# Write the VERSION constant, followed by a newline, to standard output.
def print_version
  $stdout.puts(VERSION)
end
31
+
32
# Print Egor's usage message (synopsis plus one entry per command line
# option) to standard output.
#
# The defaults quoted in the text mirror the initial values assigned at the
# top of Egor::CLI::execute; keep both places in sync when changing one.
#
# :call-seq:
#   Egor::CLI::print_usage
#
def print_usage
  # NOTE: '<<-' keeps the leading whitespace of every content line, so the
  # text below is laid out exactly as it should appear on the terminal.
  puts <<-USAGE
egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.

Usage:
    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    egor [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) FILE: a tem file
    --tem-list (-l) FILE: a list for tem files
    --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) FILE: output filename (default 'allmat.dat')
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER:
        0 for using C and J only for structure (default)
        1 for both structure and sequence
        2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
    --output INTEGER:
        0 for raw counts (no smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --noroundoff: do not round off log odds ratio
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
    --autosigma: automatically adjust the sigma value for smoothing
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --heatmap INTEGER:
        0 create a heat map file for each substitution table
        1 create one big file containing all substitution tables
        2 do both 0 and 1
    --heatmap-format INTEGER:
        0 for Portable Network Graphics (PNG) Format (default)
        1 for Graphics Interchange Format (GIF)
        2 for Joint Photographic Experts Group (JPEG) Format
        3 for Microsoft Windows bitmap (BMP) Format
        4 for Portable Document Format (PDF)
    --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
    --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
    --heatmap-value: print values in the cells when generating heat maps
    --verbose (-v) INTEGER:
        0 for ERROR level
        1 for WARN or above level (default)
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
96
+
97
# Calculate the percentage identity (PID) between two aligned sequences.
#
# The sequences are compared column by column, one character per column,
# for as many columns as +seq1+ has characters. Columns in which both
# sequences carry a gap ('-') are ignored. Every remaining column counts
# towards the denominator, and a column counts as identical when both
# characters are the same non-gap character:
#
#   PID = 100 * identical / (aligned + gapped-on-one-side)
#
# Returns 0.0 (rather than NaN from a zero denominator) when there is no
# comparable column at all, e.g. for empty or gap-only input.
#
# :call-seq:
#   Egor::CLI::calculate_pid(seq1, seq2) -> Float
#
def calculate_pid(seq1, seq2)
  columns = seq1.split('').zip(seq2.split(''))

  # drop gap-vs-gap columns; what is left is "aligned" plus "internal gap"
  compared = columns.reject { |aa1, aa2| (aa1 == '-') && (aa2 == '-') }
  return 0.0 if compared.empty?

  # aa1 == aa2 together with aa1 != '-' implies that aa2 is not a gap either
  identical = compared.count { |aa1, aa2| (aa1 != '-') && (aa1 == aa2) }

  100.0 * identical / compared.size
end
124
+
125
+ # :nodoc:
126
+ def execute(arguments=[])
127
+ #
128
+ # * Abbreviations in the codes
129
+ #
130
+ # env: environment
131
+ # tem: (FUGUE) template
132
+ # classdef: (envlironment) class definition
133
+ # aa: amino acid
134
+ # aa: weighted amino acid
135
+ # tot: total
136
+ # rel: relative
137
+ # jnt: joint
138
+ # cnt: count
139
+ # mut: mutation
140
+ # mutb: mutability
141
+ # freq: frequency
142
+ # prob: probability
143
+ # logo: log odds ratio
144
+ # opts: options
145
+ # fh: file handle
146
+ # ff: flat file
147
+ # ali: alignment
148
+ # mat: matrix
149
+ # arr: array
150
+
151
+
152
+ # Part 1.
153
+ #
154
+ # Global variables and their default values
155
+ #
156
+
157
+ $logger = Logger.new(STDOUT)
158
+ $logger.level = Logger::WARN
159
+
160
+ # default set of 21 amino acids including J (Cysteine, the free thiol form)
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file hanfle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalue = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
197
+
198
+ $aa_tot_cnt = Hash.new(0)
199
+ $aa_mut_cnt = Hash.new(0)
200
+ $aa_mutb = {}
201
+ $aa_rel_mutb = {}
202
+ $aa_tot_freq = {}
203
+ $aa_env_cnt = Hash.new(0)
204
+ $smooth_prob = {}
205
+ $tot_cnt_mat = nil
206
+ $tot_prob_mat = nil
207
+ $tot_logo_mat = nil
208
+ $tot_smooth_prob = {}
209
+
210
+ # minimum ratio of amino acid count to sigma value
211
+ $min_cnt_sigma_ratio = 500.0
212
+
213
+ #
214
+ # Part 1 END
215
+ #
216
+
217
+ # Part 2.
218
+ #
219
+ # Parsing options
220
+ #
221
+
222
+ opts = GetoptLong.new(
223
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
224
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
225
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
226
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
228
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
229
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
230
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
231
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
233
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-value', GetoptLong::NO_ARGUMENT ],
240
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
244
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
245
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
246
+ [ '--version', GetoptLong::NO_ARGUMENT ]
247
+ )
248
+
249
+ begin
250
+ opts.each do |opt, arg|
251
+ case opt
252
+ when '--help'
253
+ print_usage
254
+ exit 0
255
+ when '--tem-list'
256
+ $tem_list = arg
257
+ when '--tem-file'
258
+ $tem_file = arg
259
+ when '--classdef'
260
+ $classdef = arg
261
+ when '--output'
262
+ $output = arg.to_i
263
+ when '--outfile'
264
+ $outfile = arg
265
+ when '--cys'
266
+ $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
269
+ when '--weight'
270
+ $weight = arg.to_i
271
+ when '--sigma'
272
+ $sigma = arg.to_f
273
+ when '--autosigma'
274
+ $autosigma = true
275
+ when '--pidmin'
276
+ $pidmin = arg.to_f
277
+ when '--pidmax'
278
+ $pidmax = arg.to_f
279
+ when '--noweight'
280
+ $noweight = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
283
+ when '--smooth'
284
+ $smooth = (arg.to_i == 1) ? :full : :partial
285
+ when '--nosmooth'
286
+ $nosmooth = true
287
+ when '--p1smooth'
288
+ $p1smooth = true
289
+ when '--scale'
290
+ $scale = arg.to_f
291
+ when '--add'
292
+ $add = arg.to_f
293
+ when '--penv'
294
+ warn "--penv option is not supported."
295
+ exit 1
296
+ $penv = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-value'
320
+ $heatmapvalue = true
321
+ when '--verbose'
322
+ $logger.level = case arg.to_i
323
+ when 0 then Logger::ERROR
324
+ when 1 then Logger::WARN
325
+ when 2 then Logger::INFO
326
+ when 3 then Logger::DEBUG
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
330
+ end
331
+ when '--version'
332
+ print_version
333
+ exit 0
334
+ end
335
+ end
336
+ rescue
337
+ # invalid option
338
+ exit 1
339
+ end
340
+
341
+ # when arguments are nonsense, print usage
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
345
+ print_usage
346
+ exit 1
347
+ end
348
+
349
+ # warn if any input file is missing
350
+ if $tem_list && !File.exist?($tem_list)
351
+ warn "Cannot find template list file, #{$tem_list}"
352
+ exit 1
353
+ end
354
+
355
+ if $tem_file && !File.exist?($tem_file)
356
+ warn "Cannot find template file, #{$tem_file}"
357
+ exit 1
358
+ end
359
+
360
+ if $classdef && !File.exist?($classdef)
361
+ warn "Cannot find environment class definition file, #{$classdef}"
362
+ exit 1
363
+ end
364
+
365
+ #
366
+ # Part 2 END
367
+ #
368
+
369
+
370
+ # Part 3.
371
+ #
372
+ # Reading Environment Class Definition File
373
+ #
374
+
375
+ # check --cys option and modify amino_acids set if necessary
376
+ if $cys == 2
377
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
378
+ end
379
+
380
+ # create an EnvironmentFeatureList object for storing all environment
381
+ # features
382
+ $env_features = EnvironmentFeatureArray.new
383
+
384
+ # an array for storing indexes of constrained environment features
385
+ $cst_features = []
386
+
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
394
+
395
+ # read environment class definiton file and store them into
396
+ # the hash prepared above
397
+ env_index = 1
398
+
399
+ IO.foreach($classdef) do |line|
400
+ line.chomp!
401
+ if line.start_with?('#')
402
+ next
403
+ elsif (env_ftr = line.chomp.split(/;/)).length == 5
404
+ $logger.info "An environment feature, #{line} detected."
405
+ if env_ftr[-1] == 'T'
406
+ # skip silenced environment feature
407
+ $logger.warn "The environment feature, #{line} silent."
408
+ next
409
+ end
410
+ if env_ftr[-2] == 'T'
411
+ $cst_features << env_index
412
+ $logger.warn "The environment feature, #{line} constrained."
413
+ end
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
419
+ env_index += 1
420
+ else
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
+ "a environment class definition."
423
+ exit 1
424
+ end
425
+ end
426
+
427
+ # a hash for storing all environment classes
428
+ $env_classes = EnvironmentClassHash.new
429
+
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
433
+ $env_features.label_combinations.each_with_index { |e, i|
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
437
+ }
438
+
439
+ #
440
+ # Part 3 END
441
+ #
442
+
443
+
444
+ # Part 4.
445
+ #
446
+ # Reading TEM file or TEMLIST list file and couting substitutions
447
+ #
448
+
449
+ # a global file handle for output
450
+ $outfh = File.open($outfile, 'w')
451
+
452
+ if $tem_file
453
+ $tem_list_io = StringIO.new($tem_file)
454
+ end
455
+
456
+ if $tem_list
457
+ $tem_list_io = File.open($tem_list)
458
+ end
459
+
460
+ $tem_list_io.each_line do |tem_file|
461
+ tem_file.chomp!
462
+
463
+ ali = Bio::Alignment::OriginalAlignment.new
464
+ ff = Bio::FlatFile.auto(tem_file)
465
+
466
+ ff.each_entry do |pir|
467
+ if (pir.definition == 'sequence') || (pir.definition == 'structure')
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
469
+ end
470
+ end
471
+
472
+ if ali.size < 2
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
474
+ next
475
+ end
476
+
477
+ $ali_size += 1
478
+ env_labels = {}
479
+ disulphide = {}
480
+
481
+ ali.each_pair do |key, seq|
482
+ # check disulphide bond environment first!
483
+ ff.rewind
484
+ ff.each_entry do |pir|
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
489
+ end
490
+ end
491
+
492
+ $env_features.each_with_index do |ec, ei|
493
+ env_labels[key] = [] unless env_labels.has_key?(key)
494
+
495
+ ff.rewind
496
+ ff.each_entry do |pir|
497
+ if (pir.entry_id == key) && (pir.definition == ec.name)
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
499
+ if sym == '-'
500
+ '-'
501
+ elsif sym == 'X' || sym == 'x'
502
+ 'X'
503
+ else
504
+ if ei == 0 # Amino Acid Environment Feature
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
508
+ else
509
+ ec.labels[ec.symbols.index(sym)]
510
+ end
511
+ end
512
+ end
513
+
514
+ if env_labels[key].empty?
515
+ env_labels[key] = labels
516
+ else
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
520
+ end
521
+ end
522
+ end
523
+ end
524
+ end
525
+
526
+ if $noweight
527
+ ali.each_pair do |id1, seq1|
528
+ ali.each_pair do |id2, seq2|
529
+ if id1 != id2
530
+ pid = calculate_pid(seq1, seq2)
531
+ s1 = seq1.split('')
532
+ s2 = seq2.split('')
533
+
534
+ # check PID_MIN
535
+ if $pidmin && (pid < $pidmin)
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
538
+ next
539
+ end
540
+
541
+ # check PID_MAX
542
+ if $pidmax && (pid > $pidmax)
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
545
+ next
546
+ end
547
+
548
+ s1.each_with_index do |aa1, pos|
549
+ aa1.upcase!
550
+ aa2 = s2[pos].upcase
551
+
552
+ if env_labels[id1][pos].include?('X')
553
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
554
+ next
555
+ end
556
+
557
+ if env_labels[id2][pos].include?('X')
558
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
559
+ next
560
+ end
561
+
562
+ unless $amino_acids.include?(aa1)
563
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
564
+ next
565
+ end
566
+
567
+ unless $amino_acids.include?(aa2)
568
+ $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
569
+ next
570
+ end
571
+
572
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
573
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
574
+
575
+ if $cst_features.empty?
576
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
577
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
578
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
579
+ else
580
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
581
+ next
582
+ end
583
+
584
+ grp_label = env_labels[id1][pos][1..-1]
585
+
586
+ if $aa_env_cnt.has_key? grp_label
587
+ if $aa_env_cnt[grp_label].has_key? aa1
588
+ $aa_env_cnt[grp_label][aa1] += 1
589
+ else
590
+ $aa_env_cnt[grp_label][aa1] = 1
591
+ end
592
+ else
593
+ $aa_env_cnt[grp_label] = Hash.new(0)
594
+ $aa_env_cnt[grp_label][aa1] = 1
595
+ end
596
+
597
+ if $aa_tot_cnt.has_key? aa1
598
+ $aa_tot_cnt[aa1] += 1
599
+ else
600
+ $aa_tot_cnt[aa1] = 1
601
+ end
602
+
603
+ if aa1 != aa2
604
+ if $aa_mut_cnt.has_key? aa1
605
+ $aa_mut_cnt[aa1] += 1
606
+ else
607
+ $aa_mut_cnt[aa1] = 1
608
+ end
609
+ end
610
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
611
+ end
612
+ end
613
+ end
614
+ end
615
+ else
616
+ # BLOSUM-like weighting
617
+ clusters = []
618
+ ali.each_pair { |i, s| clusters << [i] }
619
+
620
+ # a loop for single linkage clustering
621
+ begin
622
+ continue = false
623
+ 0.upto(clusters.size - 2) do |i|
624
+ indexes = []
625
+ (i + 1).upto(clusters.size - 1) do |j|
626
+ found = false
627
+ clusters[i].each do |c1|
628
+ clusters[j].each do |c2|
629
+ if calculate_pid(ali[c1], ali[c2]) >= $weight
630
+ indexes << j
631
+ found = true
632
+ break
633
+ end
634
+ end
635
+ break if found
636
+ end
637
+ end
638
+
639
+ unless indexes.empty?
640
+ continue = true
641
+ group = clusters[i]
642
+ indexes.each do |k|
643
+ group = group.concat(clusters[k])
644
+ clusters[k] = nil
645
+ end
646
+ clusters[i] = group
647
+ clusters.compact!
648
+ end
649
+ end
650
+ end while(continue)
651
+
652
+ if clusters.size < 2
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
654
+ next
655
+ end
656
+
657
+ clusters.combination(2).each do |cluster1, cluster2|
658
+ cluster1.each do |id1|
659
+ cluster2.each do |id2|
660
+ seq1 = ali[id1].split('')
661
+ seq2 = ali[id2].split('')
662
+
663
+ seq1.each_with_index do |aa1, pos|
664
+ aa1.upcase!
665
+ aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
666
+
667
+ if env_labels[id1][pos].include?('X')
668
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
669
+ next
670
+ end
671
+
672
+ if env_labels[id2][pos].include?('X')
673
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
674
+ next
675
+ end
676
+
677
+ unless $amino_acids.include?(aa1)
678
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
679
+ next
680
+ end
681
+
682
+ unless $amino_acids.include?(aa2)
683
+ $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
684
+ next
685
+ end
686
+
687
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
688
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
692
+
693
+ if $cst_features.empty?
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
696
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
699
+ else
700
+ $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
701
+ next
702
+ end
703
+
704
+ grp_label1 = env_labels[id1][pos][1..-1]
705
+ grp_label2 = env_labels[id2][pos][1..-1]
706
+
707
+ if $aa_env_cnt.has_key? grp_label1
708
+ if $aa_env_cnt[grp_label1].has_key? aa1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
710
+ else
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
712
+ end
713
+ else
714
+ $aa_env_cnt[grp_label1] = Hash.new(0.0)
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
716
+ end
717
+
718
+ if $aa_env_cnt.has_key? grp_label2
719
+ if $aa_env_cnt[grp_label2].has_key? aa2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
721
+ else
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
723
+ end
724
+ else
725
+ $aa_env_cnt[grp_label2] = Hash.new(0.0)
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
727
+ end
728
+
729
+ if $aa_tot_cnt.has_key? aa1
730
+ $aa_tot_cnt[aa1] += cnt1
731
+ else
732
+ $aa_tot_cnt[aa1] = cnt1
733
+ end
734
+
735
+ if $aa_tot_cnt.has_key? aa2
736
+ $aa_tot_cnt[aa2] += cnt2
737
+ else
738
+ $aa_tot_cnt[aa2] = cnt2
739
+ end
740
+
741
+ if aa1 != aa2
742
+ if $aa_mut_cnt.has_key? aa1
743
+ $aa_mut_cnt[aa1] += cnt1
744
+ else
745
+ $aa_mut_cnt[aa1] = cnt1
746
+ end
747
+ if $aa_mut_cnt.has_key? aa2
748
+ $aa_mut_cnt[aa2] += cnt2
749
+ else
750
+ $aa_mut_cnt[aa2] = cnt2
751
+ end
752
+ end
753
+
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
756
+ end
757
+ end
758
+ end
759
+ end
760
+ end
761
+ $logger.info "Analysing #{tem_file} done."
762
+ end
763
+
764
+ # print out default header
765
+ $outfh.puts <<HEADER
766
+ # Environment-specific amino acid substitution matrices
767
+ # Creator: egor version #{Egor::VERSION}
768
+ # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
769
+ #
770
+ # Definitions for structural environments:
771
+ # #{$env_features.size - 1} features used
772
+ #
773
+ HEADER
774
+
775
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
776
+
777
+ $outfh.puts <<HEADER
778
+ # (read in from #{$classdef})
779
+ #
780
+ # Number of alignments: #{$ali_size}
781
+ # (list of .tem files read in from #{$tem_list})
782
+ #
783
+ # Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
784
+ #
785
+ # There are #{$amino_acids.size} amino acids considered.
786
+ # #{$amino_acids.join}
787
+ #
788
+ HEADER
789
+
790
+ if $amino_acids.include? 'J'
791
+ $outfh.puts <<HEADER
792
+ # C: Cystine (the disulfide-bonded form)
793
+ # J: Cysteine (the free thiol form)
794
+ #
795
+ HEADER
796
+ end
797
+
798
+ if $noweight
799
+ $outfh.puts '# Weighting scheme: none'
800
+ else
801
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
802
+ end
803
+
804
+ # calculate amino acid frequencies and mutabilities, and
805
+ # print them as default statistics in the header part
806
+ ala_factor = if $aa_tot_cnt['A'] == 0
807
+ 0.0
808
+ elsif $aa_mut_cnt['A'] == 0
809
+ 0.0
810
+ else
811
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
812
+ end
813
+ $tot_aa = $aa_tot_cnt.values.sum
814
+
815
+ $outfh.puts '#'
816
+ $outfh.puts "# Total amino acid frequencies:\n"
817
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
818
+
819
+ min_cnt = -1
820
+ min_sigma = nil
821
+
822
+ $amino_acids.each do |res|
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
830
+ end
831
+
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
833
+ end
834
+
835
+ $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
836
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
837
+ $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
838
+ end
839
+
840
+ $amino_acids.each do |res|
841
+ if $noweight
842
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
843
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
844
+ else
845
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
846
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
847
+ end
848
+ end
849
+
850
+ if min_cnt > -1
851
+ $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
852
+ if $autosigma
853
+ $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
854
+ $sigma = min_sigma
855
+ end
856
+ end
857
+
858
+ $outfh.puts '#'
859
+ $outfh.puts '# RES: Amino acid one letter code'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
862
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
864
+ $outfh.puts '# REL_FREQ: Relative frequency'
865
+ $outfh.puts '#'
866
+
867
+ #
868
+ # Part 4. END
869
+ #
870
+
871
+
872
+ # Part 5.
873
+ #
874
+ # Generating substitution frequency matrices
875
+ #
876
+
877
+ # calculating probabilities for each environment
878
+ $env_classes.values.each do |e|
879
+ if e.freq_array.sum != 0
880
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
881
+ end
882
+ end
883
+
884
+ # count raw frequencies
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
887
+
888
+ # for each combination of environment features
889
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
890
+ grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
891
+
892
+ $amino_acids.each_with_index do |aa, aj|
893
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
894
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
895
+ end
896
+
897
+ $tot_cnt_mat += grp_cnt_mat
898
+ group_matrices << [group[0], grp_cnt_mat]
899
+ end
900
+
901
+ $logger.info "Counting substitutions done."
902
+
903
+ if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalue,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
933
+ :row_header => $amino_acids,
934
+ :rvg_width => $rvg_width,
935
+ :rvg_height => $rvg_height - 50,
936
+ :canvas_width => $canvas_width,
937
+ :canvas_height => $canvas_height - 50,
938
+ :max_val => grp_max_val.ceil,
939
+ :min_val => 0,
940
+ :print_value => $heatmapvalue,
941
+ :print_gradient => false,
942
+ :title => stem,
943
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
944
+ end
945
+ end
946
+
947
+ if $heatmap == 1 or $heatmap == 2
948
+ file = "#{$heatmapstem}.#{$heatmapformat}"
949
+ heatmaps.heatmap(:columns => $heatmapcol,
950
+ :rvg_width => $rvg_width,
951
+ :max_val => grp_max_val.ceil,
952
+ :min_val => 0).write(file)
953
+
954
+ $logger.info "Generating heat maps in a file, #{file} done."
955
+ end
956
+
957
+ # total
958
+ $outfh.puts '>Total'
959
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
960
+ :row_header => $amino_acids)
961
+
962
+ if $heatmap == 0 or $heatmap == 2
963
+ stem = "#{group_matrices.size}. TOTAL"
964
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
965
+ :row_header => $amino_acids,
966
+ :rvg_width => $rvg_width,
967
+ :rvg_height => $rvg_height,
968
+ :canvas_width => $canvas_width,
969
+ :canvas_height => $canvas_height,
970
+ :max_val => $tot_cnt_mat.max.ceil,
971
+ :min_val => 0,
972
+ :print_value => $heatmapvalue,
973
+ :title => stem).write("#{stem}.#{$heatmapformat}")
974
+
975
+ $logger.info "Generating a heat map for #{stem} table done."
976
+ end
977
+ exit 0
978
+ end
979
+
980
+ #
981
+ # Part 5. END
982
+ #
983
+
984
+
985
+ # Part 6.
986
+ #
987
+ # Calculating substitution probability tables
988
+ #
989
+
990
+ if $output == 1
991
+ $outfh.puts <<HEADER
992
+ #
993
+ # Each column (j) represents the probability distribution for the
994
+ # likelihood of acceptance of a mutational event by a residue type j in
995
+ # a particular structural environment (specified after >) leading to
996
+ # any other residue type (i) and sums up to 100.
997
+ #
998
+ HEADER
999
+ end
1000
+
1001
+ # when nosmoothing !!!
1002
+ if ($output > 0) && $nosmooth
1003
+ # reinitialize $tot_cnt_mat for pseudocounts
1004
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1005
+
1006
+ # for each combination of environment features
1007
+ pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1008
+
1009
+ # add pseudo counts for each frequency vector
1010
+ $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1011
+
1012
+ # re-calculate probability vector for each environment class
1013
+ $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1014
+
1015
+ group_matrices = []
1016
+
1017
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1018
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1019
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1020
+
1021
+ $amino_acids.each_with_index do |aa, aj|
1022
+ env_class = group[1].find { |e| e.label.start_with?(aa) }
1023
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
1024
+ 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
1025
+ end
1026
+
1027
+ $tot_cnt_mat += grp_cnt_mat
1028
+ group_matrices << [group[0], grp_prob_mat]
1029
+ end
1030
+
1031
+ if $output == 1
1032
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1033
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1034
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1035
+
1036
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1037
+ # for a matrix file
1038
+ stem = "#{grp_no}. #{grp_label}"
1039
+ $outfh.puts ">#{grp_label} #{grp_no}"
1040
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1041
+ :row_header => $amino_acids)
1042
+
1043
+
1044
+ # for a heat map
1045
+ if $heatmap == 0 or $heatmap == 2
1046
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1047
+ :row_header => $amino_acids,
1048
+ :rvg_width => $rvg_width,
1049
+ :rvg_height => $rvg_height,
1050
+ :canvas_width => $canvas_width,
1051
+ :canvas_height => $canvas_height,
1052
+ :max_val => grp_max_val.ceil,
1053
+ :min_val => 0,
1054
+ :print_value => $heatmapvalue,
1055
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1056
+
1057
+ $logger.info "Generating a heat map for #{stem} table done."
1058
+ end
1059
+
1060
+ if $heatmap == 1 or $heatmap == 2
1061
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1062
+ :row_header => $amino_acids,
1063
+ :rvg_width => $rvg_width,
1064
+ :rvg_height => $rvg_height - 50,
1065
+ :canvas_width => $canvas_width,
1066
+ :canvas_height => $canvas_height - 50,
1067
+ :max_val => grp_max_val.ceil,
1068
+ :min_val => 0,
1069
+ :print_value => $heatmapvalue,
1070
+ :print_gradient => false,
1071
+ :title => stem,
1072
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1073
+ end
1074
+ end
1075
+
1076
+ # for heat maps in a single file
1077
+ if $heatmap == 1 or $heatmap == 2
1078
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1079
+ heatmaps.heatmap(:columns => $heatmapcol,
1080
+ :rvg_width => $rvg_width,
1081
+ :max_val => grp_max_val.ceil,
1082
+ :min_val => 0).write(file)
1083
+
1084
+ $logger.info "Generating heat maps in a file, #{file} done."
1085
+ end
1086
+ end
1087
+
1088
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1089
+
1090
+ 0.upto($amino_acids.size - 1) do |aj|
1091
+ col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
1092
+ 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
1093
+ end
1094
+
1095
+ if $output == 1
1096
+ $outfh.puts '>Total'
1097
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1098
+ :row_header => $amino_acids)
1099
+ $outfh.close
1100
+
1101
+ # for a heat map
1102
+ if $heatmap == 0 or $heatmap == 2
1103
+ stem = "#{group_matrices.size}. TOTAL"
1104
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1105
+ :row_header => $amino_acids,
1106
+ :rvg_width => $rvg_width,
1107
+ :rvg_height => $rvg_height,
1108
+ :canvas_width => $canvas_width,
1109
+ :canvas_height => $canvas_height,
1110
+ :max_val => $tot_prob_mat.max.ceil,
1111
+ :min_val => 0,
1112
+ :print_value => $heatmapvalue,
1113
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1114
+
1115
+ $logger.info "Generating a heat map for #{stem} table done."
1116
+ end
1117
+ exit 0
1118
+ end
1119
+
1120
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
1121
+ end
1122
+
1123
+ # when smoothing!!!
1124
+ if ($output > 0) && !$nosmooth
1125
+ #
1126
+ # p1 probabilities
1127
+ #
1128
+ p1 = NArray.float($amino_acids.size)
1129
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
1130
+ big_N = $tot_aa.to_f
1131
+ small_n = $amino_acids.size.to_f
1132
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1133
+ omega2 = 1.0 - omega1
1134
+
1135
+ if ($smooth == :full) || $p1smooth
1136
+ # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
1137
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1138
+ $smooth_prob[1] = p1
1139
+ elsif ($smooth == :partial)
1140
+ # no smoothing for p1 probabilities just as Kenji's subst
1141
+ # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1142
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1143
+ $smooth_prob[1] = p1
1144
+ end
1145
+
1146
+ #
1147
+ # p2 and above
1148
+ #
1149
+ env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1150
+
1151
+ if $smooth == :partial
1152
+ $outfh.puts <<HEADER
1153
+ #
1154
+ # Partial Smoothing:
1155
+ #
1156
+ HEADER
1157
+ if $p1smooth
1158
+ $outfh.puts <<HEADER
1159
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1160
+ # each row in all matrices and smoothing them with A0 (a uniform distribution)
1161
+ # ^^^^^^^^^
1162
+ HEADER
1163
+ else
1164
+ $outfh.puts <<HEADER
1165
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1166
+ # each row in all matrices without smoothing
1167
+ # ^^^^^^^^^^^^^^^^^
1168
+ HEADER
1169
+ end
1170
+
1171
+ $outfh.puts <<HEADER
1172
+ # p2(ri|Rj) is estimated as:
1173
+ # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
1174
+ #
1175
+ # p3(ri|Rj,fq) is estimated as:
1176
+ # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
1177
+ # where
1178
+ # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
1179
+ #
1180
+ # The smoothing procedure is curtailed here and finally
1181
+ # ^^^^^^^^^
1182
+ # p5(ri|Rj,...) is estimated as:
1183
+ # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
1184
+ # where
1185
+ # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
1186
+ #
1187
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1188
+ #
1189
+ # sigma value used is: #{$sigma}
1190
+ #
1191
+ HEADER
1192
+ 1.upto($env_features.size) do |ci|
1193
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
1194
+ if (ci > 2) && (ci < $env_features.size)
1195
+ $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1196
+ next
1197
+ end
1198
+
1199
+ env_labels.combination(ci) do |c1|
1200
+ c1[0].product(*c1[1..-1]).each do |labels|
1201
+ pattern = '.' * $env_features.size
1202
+
1203
+ labels.each do |label|
1204
+ i = label[0].chr.to_i
1205
+ l = label[1].chr
1206
+ pattern[i] = l
1207
+ end
1208
+
1209
+ if pattern =~ /^\./
1210
+ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
1211
+ next
1212
+ end
1213
+
1214
+ # get environments matching the pattern created above
1215
+ # and calculate amino acid frequencies and their probabilities for all the environments
1216
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1217
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1218
+ prob_arr = NArray.float($amino_acids.size)
1219
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1220
+
1221
+ # # assess whether a residue type j is compatible with a particular combination of structural features
1222
+ # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
1223
+ # if ci == $env_features.size
1224
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
1225
+ # sub_pattern = '.' * $env_features.size
1226
+ # sub_pattern[0] = aa_label
1227
+ # sub_freq_sum = 0
1228
+ #
1229
+ # labels[1..-1].each do |label|
1230
+ # next if label.start_with?('0')
1231
+ # i = label[0].chr.to_i
1232
+ # l = label[1].chr
1233
+ # sub_pattern[i] = l
1234
+ # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1235
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1236
+ # sub_freq_sum += sub_freq_arr.sum
1237
+ # end
1238
+ #
1239
+ # if sub_freq_sum == 0
1240
+ # if $smooth_prob.has_key?(ci + 1)
1241
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1242
+ # else
1243
+ # $smooth_prob[ci + 1] = {}
1244
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1245
+ # end
1246
+ # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
1247
+ # next
1248
+ # end
1249
+ # end
1250
+
1251
+ # collect priors
1252
+ priors = []
1253
+
1254
+ if ci == 1
1255
+ priors << $smooth_prob[1]
1256
+ elsif ci == 2
1257
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1258
+ priors << $smooth_prob[2][c3.to_set]
1259
+ }
1260
+ elsif ci == $env_features.size
1261
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1262
+ priors << $smooth_prob[3][c3.to_set]
1263
+ }
1264
+ end
1265
+
1266
+ # entropy based prior weighting step
1267
+ entropy_max = Math::log($amino_acids.size)
1268
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1269
+ begin
1270
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
1271
+ rescue
1272
+ #puts "P: #{p}"
1273
+ end
1274
+ } }
1275
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1276
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1277
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1278
+
1279
+ # smoothing step
1280
+ smooth_prob_arr = NArray.float($amino_acids.size)
1281
+ big_N = freq_arr.sum.to_f
1282
+ small_n = $amino_acids.size.to_f
1283
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1284
+ omega2 = 1.0 - omega1
1285
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1286
+
1287
+ # normalization step
1288
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1289
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1290
+
1291
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1292
+ if $smooth_prob.has_key?(ci + 1)
1293
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1294
+ else
1295
+ $smooth_prob[ci + 1] = {}
1296
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1297
+ end
1298
+ end
1299
+ end
1300
+ end
1301
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1302
+ else
1303
+ $outfh.puts <<HEADER
1304
+ #
1305
+ # Full Smoothing:
1306
+ #
1307
+ # p1(ri) is estimated as:
1308
+ # p1(ri) = omega1 * A0 + omega2 * W1(ri)
1309
+ #
1310
+ # p2(ri|f1q) is estimated as:
1311
+ # p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
1312
+ #
1313
+ # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
1314
+ #
1315
+ # p3(ri|f1q,f2q) is estimated as:
1316
+ # p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
1317
+ # where
1318
+ # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
1319
+ #
1320
+ # The smoothing procedure is NOT curtailed here and it goes upto
1321
+ # ^^^^^^^^^^^^^
1322
+ #
1323
+ # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
1324
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
1325
+ # where
1326
+ # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
1327
+ #
1328
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1329
+ #
1330
+ # sigma value used is: #{$sigma}
1331
+ #
1332
+ HEADER
1333
+ # full smoothing
1334
+ 1.upto($env_features.size) do |ci|
1335
+ env_labels.combination(ci) do |c1|
1336
+ c1[0].product(*c1[1..-1]).each do |labels|
1337
+ pattern = '.' * $env_features.size
1338
+ labels.each do |label|
1339
+ j = label[0].chr.to_i
1340
+ l = label[1].chr
1341
+ pattern[j] = l
1342
+ end
1343
+
1344
+ # get environments, frequencies, and probabilities
1345
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1346
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1347
+ prob_arr = NArray.float($amino_acids.size)
1348
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1349
+
1350
+ # collect priors
1351
+ priors = []
1352
+ if ci > 1
1353
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1354
+ else
1355
+ priors << $smooth_prob[1]
1356
+ end
1357
+
1358
+ # entropy based weighting priors
1359
+ entropy_max = Math::log($amino_acids.size)
1360
+ entropies = priors.map do |prior|
1361
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1362
+ end
1363
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1364
+
1365
+ # smoothing step
1366
+ smooth_prob_arr = NArray.float($amino_acids.size)
1367
+ big_N = freq_arr.sum.to_f
1368
+ small_n = $amino_acids.size.to_f
1369
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1370
+ omega2 = 1.0 - omega1
1371
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1372
+
1373
+ # normalization step
1374
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1375
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1376
+
1377
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1378
+ if $smooth_prob.has_key?(ci + 1)
1379
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1380
+ else
1381
+ $smooth_prob[ci + 1] = {}
1382
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1383
+ end
1384
+ end
1385
+ end
1386
+ end
1387
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1388
+ end
1389
+
1390
+ # updating smoothed probability array for each environment
1391
+ $env_classes.values.each do |env|
1392
+ env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1393
+ end
1394
+
1395
+ # sorting environments and build 21X21 substitution matrices
1396
+ group_matrices = []
1397
+
1398
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1399
+ # calculating 21X21 substitution probability matrix for each environment
1400
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1401
+
1402
+ $amino_acids.each_with_index do |aa, ai|
1403
+ smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1404
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1405
+ end
1406
+
1407
+ group_matrices << [group[0], grp_prob_mat]
1408
+ end
1409
+
1410
+ if $output == 1
1411
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1412
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1413
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1414
+
1415
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1416
+ # for a matrix file
1417
+ stem = "#{grp_no}. #{grp_label}"
1418
+ $outfh.puts ">#{grp_label} #{grp_no}"
1419
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1420
+ :row_header => $amino_acids)
1421
+
1422
+ # for heat map generation
1423
+ if $heatmap == 0 or $heatmap == 2
1424
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1425
+ :row_header => $amino_acids,
1426
+ :rvg_width => $rvg_width,
1427
+ :rvg_height => $rvg_height,
1428
+ :canvas_width => $canvas_width,
1429
+ :canvas_height => $canvas_height,
1430
+ :max_val => grp_max_val.ceil,
1431
+ :min_val => 0,
1432
+ :print_value => $heatmapvalue,
1433
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1434
+
1435
+ $logger.info "Generating a heat map for #{stem} table done."
1436
+ end
1437
+
1438
+ if $heatmap == 1 or $heatmap == 2
1439
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1440
+ :row_header => $amino_acids,
1441
+ :rvg_width => $rvg_width,
1442
+ :rvg_height => $rvg_height - 50,
1443
+ :canvas_width => $canvas_width,
1444
+ :canvas_height => $canvas_height - 50,
1445
+ :max_val => grp_max_val.ceil,
1446
+ :min_val => 0,
1447
+ :print_value => $heatmapvalue,
1448
+ :print_gradient => false,
1449
+ :title => stem,
1450
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1451
+ end
1452
+ end
1453
+
1454
+ # for heat maps in a single file
1455
+ if $heatmap == 1 or $heatmap == 2
1456
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1457
+ heatmaps.heatmap(:columns => $heatmapcol,
1458
+ :rvg_width => $rvg_width,
1459
+ :max_val => grp_max_val.ceil,
1460
+ :min_val => 0).write(file)
1461
+
1462
+ $logger.info "Generating heat maps in a file, #{file} done."
1463
+ end
1464
+ end
1465
+
1466
+ # for a total substitution probability matrix
1467
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1468
+
1469
+ $amino_acids.each_with_index do |aa, aj|
1470
+ 0.upto($amino_acids.size - 1) do |ai|
1471
+ $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
1472
+ end
1473
+ end
1474
+
1475
+ if $output == 1
1476
+ $outfh.puts '>Total'
1477
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1478
+ :row_header => $amino_acids)
1479
+ $outfh.close
1480
+
1481
+ # for a heat map
1482
+ if $heatmap == 0 or $heatmap == 2
1483
+ stem = "#{group_matrices.size}. TOTAL"
1484
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1485
+ :row_header => $amino_acids,
1486
+ :rvg_width => $rvg_width,
1487
+ :rvg_height => $rvg_height,
1488
+ :canvas_width => $canvas_width,
1489
+ :canvas_height => $canvas_height,
1490
+ :max_val => $tot_prob_mat.max.ceil,
1491
+ :min_val => 0,
1492
+ :print_value => $heatmapvalue,
1493
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1494
+
1495
+ $logger.info "Generating a heat map for #{stem} table done."
1496
+ end
1497
+ exit 0
1498
+ end
1499
+ end
1500
+
1501
+ #
1502
+ # Part 6. END
1503
+ #
1504
+
1505
+
1506
+ # Part 7.
1507
+ #
1508
+ # Calculating log odds ratio scoring matrices
1509
+ #
1510
+ if $output == 2
1511
+ $outfh.puts <<HEADER
1512
+ #
1513
+ # The probabilities were then divided by the background probabilities
1514
+ HEADER
1515
+ if $penv
1516
+ $outfh.puts <<HEADER
1517
+ # which were derived from the environment-dependent amino acid frequencies.
1518
+ # ^^^^^^^^^^^^^^^^^^^^^
1519
+ HEADER
1520
+ else
1521
+ $outfh.puts <<HEADER
1522
+ # which were derived from the environment-independent amino acid frequencies.
1523
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1524
+ HEADER
1525
+ end
1526
+
1527
+ grp_logo_mats = []
1528
+ factor = $scale / Math::log(2)
1529
+
1530
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1531
+ # calculating substitution probability matrix for each environment
1532
+ grp_label = group[0]
1533
+ grp_envs = group[1]
1534
+ grp_logo_mat = $cys == 0 ?
1535
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1536
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1537
+
1538
+ $amino_acids.each_with_index do |aa, aj|
1539
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1540
+ env.logo_array = $cys == 0 ?
1541
+ NArray.float($amino_acids.size + 1) :
1542
+ NArray.float($amino_acids.size)
1543
+
1544
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1545
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1546
+ odds = prob / pai
1547
+ env.logo_array[ai] = factor * Math::log(odds)
1548
+ grp_logo_mat[aj, ai] = env.logo_array[ai]
1549
+ end
1550
+
1551
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1552
+ if $cys == 0
1553
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1554
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1555
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1556
+ odds = prob / pai
1557
+ env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1558
+ grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1559
+ end
1560
+ end
1561
+
1562
+ grp_logo_mats << [grp_label, grp_logo_mat]
1563
+ end
1564
+
1565
+ $tot_logo_mat = $cys == 0 ?
1566
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1567
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1568
+
1569
+ $amino_acids.each_with_index do |aa1, aj|
1570
+ $amino_acids.each_with_index do |aa2, ai|
1571
+ prob = $tot_prob_mat[aj, ai]
1572
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1573
+ odds = prob / pai
1574
+ $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1575
+ end
1576
+
1577
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1578
+ if $cys == 0
1579
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1580
+ prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1581
+ odds = prob / pai
1582
+ $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1583
+ end
1584
+ end
1585
+
1586
+
1587
+ # calculating relative entropy for each amino acid pair H and
1588
+ # the expected score E in bit units
1589
+ tot_E = 0.0
1590
+ tot_H = 0.0
1591
+
1592
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1593
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1594
+ if j != i
1595
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1596
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1597
+ else
1598
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1599
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1600
+ end
1601
+ end
1602
+ end
1603
+
1604
+ $outfh.puts <<HEADER
1605
+ #
1606
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1607
+ HEADER
1608
+ unless $noroundoff
1609
+ $outfh.puts <<HEADER
1610
+ # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1611
+ HEADER
1612
+ end
1613
+
1614
+ $outfh.puts <<HEADER
1615
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1616
+ #
1617
+ HEADER
1618
+
1619
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1620
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1621
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1622
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1623
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1624
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1625
+
1626
+ grp_logo_mats.each_with_index do |arr, grp_no|
1627
+ grp_label = arr[0]
1628
+ grp_logo_mat = arr[1]
1629
+ stem = "#{grp_no}. #{grp_label}"
1630
+
1631
+ unless $noroundoff
1632
+ grp_logo_mat = grp_logo_mat.round
1633
+ end
1634
+
1635
+ # for a matrix file
1636
+ $outfh.puts ">#{grp_label} #{grp_no}"
1637
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1638
+ :row_header => row_header)
1639
+ # for a heat map
1640
+ if $heatmap == 0 or $heatmap == 2
1641
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1642
+ :row_header => row_header,
1643
+ :rvg_width => $rvg_width,
1644
+ :rvg_height => $rvg_height,
1645
+ :canvas_width => $canvas_width,
1646
+ :canvas_height => $canvas_height,
1647
+ :gradient_beg_color => '#0000FF',
1648
+ :gradient_mid_color => '#FFFFFF',
1649
+ :gradient_end_color => '#FF0000',
1650
+ :max_val => abs_max_val.ceil,
1651
+ :mid_val => 0,
1652
+ :min_val => -1 * abs_max_val.ceil,
1653
+ :print_value => $heatmapvalue,
1654
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1655
+
1656
+ $logger.info "Generating a heat map for #{stem} table done."
1657
+ end
1658
+
1659
+ if $heatmap == 1 or $heatmap == 2
1660
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1661
+ :row_header => row_header,
1662
+ :rvg_width => $rvg_width,
1663
+ :rvg_height => $rvg_height - 50,
1664
+ :canvas_width => $canvas_width,
1665
+ :canvas_height => $canvas_height - 50,
1666
+ :gradient_beg_color => '#0000FF',
1667
+ :gradient_mid_color => '#FFFFFF',
1668
+ :gradient_end_color => '#FF0000',
1669
+ :max_val => abs_max_val.ceil,
1670
+ :mid_val => 0,
1671
+ :min_val => -1 * abs_max_val.ceil,
1672
+ :print_value => $heatmapvalue,
1673
+ :print_gradient => false,
1674
+ :title => stem,
1675
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1676
+ end
1677
+ end
1678
+
1679
+ # for heat maps in a single file
1680
+ if $heatmap == 1 or $heatmap == 2
1681
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1682
+ heatmaps.heatmap(:columns => $heatmapcol,
1683
+ :rvg_width => $rvg_width,
1684
+ :gradient_beg_color => '#0000FF',
1685
+ :gradient_mid_color => '#FFFFFF',
1686
+ :gradient_end_color => '#FF0000',
1687
+ :max_val => abs_max_val.ceil,
1688
+ :mid_val => 0,
1689
+ :min_val => -1 * abs_max_val.ceil).write(file)
1690
+
1691
+ $logger.info "Generating heat maps in a file, #{file} done."
1692
+ end
1693
+
1694
+ # for a matrix file
1695
+ unless $noroundoff
1696
+ $tot_logo_mat = $tot_logo_mat.round
1697
+ end
1698
+
1699
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1700
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1701
+ :row_header => row_header)
1702
+
1703
+ # for a heat map
1704
+ if $heatmap == 0 or $heatmap == 2
1705
+ stem = "#{group_matrices.size}. TOTAL"
1706
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1707
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1708
+ :row_header => row_header,
1709
+ :rvg_width => $rvg_width,
1710
+ :rvg_height => $rvg_height,
1711
+ :canvas_width => $canvas_width,
1712
+ :canvas_height => $canvas_height,
1713
+ :gradient_beg_color => '#0000FF',
1714
+ :gradient_mid_color => '#FFFFFF',
1715
+ :gradient_end_color => '#FF0000',
1716
+ :max_val => tot_abs_max_val.ceil,
1717
+ :mid_val => 0,
1718
+ :min_val => -1 * tot_abs_max_val.ceil,
1719
+ :print_value => $heatmapvalue,
1720
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1721
+
1722
+ $logger.info "Generating a heat map for #{stem} table done."
1723
+ end
1724
+
1725
+ $logger.info "Calculating log odds ratios done."
1726
+ end
1727
+
1728
+ #
1729
+ # Part 7. END
1730
+ #
1731
+
1732
+ $outfh.close
1733
+ exit 0
1734
+ end
1735
+ end
1736
+
1737
+ end # class CLI
1738
+ end # module Egor