semin-egor 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/egor/cli.rb ADDED
@@ -0,0 +1,1738 @@
1
+ require 'rubygems'
2
+ require 'getoptlong'
3
+ require 'logger'
4
+ require 'narray'
5
+ require 'bio'
6
+ require 'set'
7
+ require 'facets'
8
+
9
+ require 'math_extensions'
10
+ require 'string_extensions'
11
+ require 'narray_extensions'
12
+ require 'nmatrix_extensions'
13
+
14
+ require 'egor/environment'
15
+ require 'egor/environment_class_hash'
16
+ require 'egor/environment_feature'
17
+ require 'egor/environment_feature_array'
18
+ require 'egor/heatmap_array'
19
+
20
+ # This is a module for an actual command line interpreter for Egor
21
+ # ---
22
+ # Copyright (C) 2008-9 Semin Lee
23
+ module Egor
24
+ class CLI
25
+ class << self
26
+
27
+ # :nodoc:
28
def print_version
  # Write the VERSION constant, followed by a newline, to standard output.
  $stdout.puts(VERSION)
end
31
+
32
+ # Print Egor's usage text (synopsis plus option summary) to standard output via puts.
33
+ # NOTE(review): the text gives 'heatmap' as the --heatmap-stem default, but execute initialises $heatmapstem to 'heatmaps' -- confirm which is intended.
34
+ # :call-seq:
35
+ # Egor::CLI::print_usage
36
+ #
37
+ def print_usage
38
+ puts <<-USAGE
39
+ egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.
40
+
41
+ Usage:
42
+ egor [ options ] -l TEMLIST-file -c CLASSDEF-file
43
+ or
44
+ egor [ options ] -f TEM-file -c CLASSDEF-file
45
+
46
+ Options:
47
+ --tem-file (-f) FILE: a tem file
48
+ --tem-list (-l) FILE: a list for tem files
49
+ --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
50
+ --outfile (-o) FILE: output filename (default 'allmat.dat')
51
+ --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
52
+ --noweight: calculate substitution counts with no weights
53
+ --smooth (-s) INTEGER:
54
+ 0 for partial smoothing (default)
55
+ 1 for full smoothing
56
+ --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
57
+ --nosmooth: perform no smoothing operation
58
+ --cys (-y) INTEGER:
59
+ 0 for using C and J only for structure (default)
60
+ 1 for both structure and sequence
61
+ 2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
62
+ --output INTEGER:
63
+ 0 for raw counts (no smoothing performed)
64
+ 1 for probabilities
65
+ 2 for log-odds (default)
66
+ --noroundoff: do not round off log odds ratio
67
+ --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
68
+ --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
69
+ --autosigma: automatically adjust the sigma value for smoothing
70
+ --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
71
+ --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
72
+ --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
73
+ --heatmap INTEGER:
74
+ 0 create a heat map file for each substitution table
75
+ 1 create one big file containing all substitution tables
76
+ 2 do both 0 and 1
77
+ --heatmap-format INTEGER:
78
+ 0 for Portable Network Graphics (PNG) Format (default)
79
+ 1 for Graphics Interchange Format (GIF)
80
+ 2 for Joint Photographic Experts Group (JPEG) Format
81
+ 3 for Microsoft Windows bitmap (BMP) Format
82
+ 4 for Portable Document Format (PDF)
83
+ --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
84
+ --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap')
85
+ --heatmap-value: print values in the cells when generating heat maps
86
+ --verbose (-v) INTEGER
87
+ 0 for ERROR level
88
+ 1 for WARN or above level (default)
89
+ 2 for INFO or above level
90
+ 3 for DEBUG or above level
91
+ --version: print version
92
+ --help (-h): show help
93
+
94
+ USAGE
95
+ end
96
+
97
+ # Calculate PID between two sequences
98
+ #
99
+ # :call-seq:
100
+ # Egor::CLI::calculate_pid(seq1, seq2) -> Float
101
+ #
102
def calculate_pid(seq1, seq2)
  # Percentage identity (PID) between two aligned sequences.
  #
  # The sequences are walked column by column (seq1 drives the length):
  #   * a column where both sides are '-' is ignored;
  #   * every other column is counted as compared, i.e. columns with two
  #     residues plus columns with exactly one gap;
  #   * a compared column with two equal residues is counted as identical.
  #
  #   PID = 100 * identical / compared
  #
  # seq1:: first aligned sequence (anything responding to split(''))
  # seq2:: second aligned sequence
  #
  # Returns a Float between 0.0 and 100.0. When no column can be compared
  # (empty input, or only gap/gap columns) 0.0 is returned rather than the
  # NaN that a 0.0 / 0 division would yield.
  identical = 0
  compared  = 0

  seq1.split('').zip(seq2.split('')).each do |res1, res2|
    gap1 = (res1 == '-')
    gap2 = (res2 == '-')
    next if gap1 && gap2

    compared  += 1
    identical += 1 if !gap1 && !gap2 && res1 == res2
  end

  # Guard the division: nothing comparable means no measurable identity.
  return 0.0 if compared.zero?

  100.0 * identical / compared
end
124
+
125
+ # :nodoc:
126
+ def execute(arguments=[])
127
+ #
128
+ # * Abbreviations in the codes
129
+ #
130
+ # env: environment
131
+ # tem: (FUGUE) template
132
+ # classdef: (environment) class definition
133
+ # aa: amino acid
134
+ # aa: weighted amino acid
135
+ # tot: total
136
+ # rel: relative
137
+ # jnt: joint
138
+ # cnt: count
139
+ # mut: mutation
140
+ # mutb: mutability
141
+ # freq: frequency
142
+ # prob: probability
143
+ # logo: log odds ratio
144
+ # opts: options
145
+ # fh: file handle
146
+ # ff: flat file
147
+ # ali: alignment
148
+ # mat: matrix
149
+ # arr: array
150
+
151
+
152
+ # Part 1.
153
+ #
154
+ # Global variables and their default values
155
+ #
156
+
157
+ $logger = Logger.new(STDOUT)
158
+ $logger.level = Logger::WARN
159
+
160
+ # default set of 21 amino acids including J (Cysteine, the free thiol form)
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file hanfle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalue = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
197
+
198
+ $aa_tot_cnt = Hash.new(0)
199
+ $aa_mut_cnt = Hash.new(0)
200
+ $aa_mutb = {}
201
+ $aa_rel_mutb = {}
202
+ $aa_tot_freq = {}
203
+ $aa_env_cnt = Hash.new(0)
204
+ $smooth_prob = {}
205
+ $tot_cnt_mat = nil
206
+ $tot_prob_mat = nil
207
+ $tot_logo_mat = nil
208
+ $tot_smooth_prob = {}
209
+
210
+ # minimum ratio of amino acid count to sigma value
211
+ $min_cnt_sigma_ratio = 500.0
212
+
213
+ #
214
+ # Part 1 END
215
+ #
216
+
217
+ # Part 2.
218
+ #
219
+ # Parsing options
220
+ #
221
+
222
+ opts = GetoptLong.new(
223
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
224
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
225
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
226
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
228
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
229
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
230
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
231
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
233
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-value', GetoptLong::NO_ARGUMENT ],
240
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
244
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
245
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
246
+ [ '--version', GetoptLong::NO_ARGUMENT ]
247
+ )
248
+
249
+ begin
250
+ opts.each do |opt, arg|
251
+ case opt
252
+ when '--help'
253
+ print_usage
254
+ exit 0
255
+ when '--tem-list'
256
+ $tem_list = arg
257
+ when '--tem-file'
258
+ $tem_file = arg
259
+ when '--classdef'
260
+ $classdef = arg
261
+ when '--output'
262
+ $output = arg.to_i
263
+ when '--outfile'
264
+ $outfile = arg
265
+ when '--cys'
266
+ $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
269
+ when '--weight'
270
+ $weight = arg.to_i
271
+ when '--sigma'
272
+ $sigma = arg.to_f
273
+ when '--autosigma'
274
+ $autosigma = true
275
+ when '--pidmin'
276
+ $pidmin = arg.to_f
277
+ when '--pidmax'
278
+ $pidmax = arg.to_f
279
+ when '--noweight'
280
+ $noweight = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
283
+ when '--smooth'
284
+ $smooth = (arg.to_i == 1) ? :full : :partial
285
+ when '--nosmooth'
286
+ $nosmooth = true
287
+ when '--p1smooth'
288
+ $p1smooth = true
289
+ when '--scale'
290
+ $scale = arg.to_f
291
+ when '--add'
292
+ $add = arg.to_f
293
+ when '--penv'
294
+ warn "--penv option is not supported."
295
+ exit 1
296
+ $penv = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-value'
320
+ $heatmapvalue = true
321
+ when '--verbose'
322
+ $logger.level = case arg.to_i
323
+ when 0 then Logger::ERROR
324
+ when 1 then Logger::WARN
325
+ when 2 then Logger::INFO
326
+ when 3 then Logger::DEBUG
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
330
+ end
331
+ when '--version'
332
+ print_version
333
+ exit 0
334
+ end
335
+ end
336
+ rescue
337
+ # invalid option
338
+ exit 1
339
+ end
340
+
341
+ # when arguments are nonsense, print usage
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
345
+ print_usage
346
+ exit 1
347
+ end
348
+
349
+ # warn if any input file is missing
350
+ if $tem_list && !File.exist?($tem_list)
351
+ warn "Cannot find template list file, #{$tem_list}"
352
+ exit 1
353
+ end
354
+
355
+ if $tem_file && !File.exist?($tem_file)
356
+ warn "Cannot find template file, #{$tem_file}"
357
+ exit 1
358
+ end
359
+
360
+ if $classdef && !File.exist?($classdef)
361
+ warn "Cannot find environment class definition file, #{$classdef}"
362
+ exit 1
363
+ end
364
+
365
+ #
366
+ # Part 2 END
367
+ #
368
+
369
+
370
+ # Part 3.
371
+ #
372
+ # Reading Environment Class Definition File
373
+ #
374
+
375
+ # check --cys option and modify amino_acids set if necessary
376
+ if $cys == 2
377
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
378
+ end
379
+
380
+ # create an EnvironmentFeatureArray object for storing all environment
381
+ # features
382
+ $env_features = EnvironmentFeatureArray.new
383
+
384
+ # an array for storing indexes of constrained environment features
385
+ $cst_features = []
386
+
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
394
+
395
+ # read environment class definition file and store them into
396
+ # the hash prepared above
397
+ env_index = 1
398
+
399
+ IO.foreach($classdef) do |line|
400
+ line.chomp!
401
+ if line.start_with?('#')
402
+ next
403
+ elsif (env_ftr = line.chomp.split(/;/)).length == 5
404
+ $logger.info "An environment feature, #{line} detected."
405
+ if env_ftr[-1] == 'T'
406
+ # skip silenced environment feature
407
+ $logger.warn "The environment feature, #{line} silent."
408
+ next
409
+ end
410
+ if env_ftr[-2] == 'T'
411
+ $cst_features << env_index
412
+ $logger.warn "The environment feature, #{line} constrained."
413
+ end
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
419
+ env_index += 1
420
+ else
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
+ "a environment class definition."
423
+ exit 1
424
+ end
425
+ end
426
+
427
+ # a hash for storing all environment classes
428
+ $env_classes = EnvironmentClassHash.new
429
+
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
433
+ $env_features.label_combinations.each_with_index { |e, i|
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
437
+ }
438
+
439
+ #
440
+ # Part 3 END
441
+ #
442
+
443
+
444
+ # Part 4.
445
+ #
446
+ # Reading TEM file or TEMLIST list file and counting substitutions
447
+ #
448
+
449
+ # a global file handle for output
450
+ $outfh = File.open($outfile, 'w')
451
+
452
+ if $tem_file
453
+ $tem_list_io = StringIO.new($tem_file)
454
+ end
455
+
456
+ if $tem_list
457
+ $tem_list_io = File.open($tem_list)
458
+ end
459
+
460
+ $tem_list_io.each_line do |tem_file|
461
+ tem_file.chomp!
462
+
463
+ ali = Bio::Alignment::OriginalAlignment.new
464
+ ff = Bio::FlatFile.auto(tem_file)
465
+
466
+ ff.each_entry do |pir|
467
+ if (pir.definition == 'sequence') || (pir.definition == 'structure')
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
469
+ end
470
+ end
471
+
472
+ if ali.size < 2
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
474
+ next
475
+ end
476
+
477
+ $ali_size += 1
478
+ env_labels = {}
479
+ disulphide = {}
480
+
481
+ ali.each_pair do |key, seq|
482
+ # check disulphide bond environment first!
483
+ ff.rewind
484
+ ff.each_entry do |pir|
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
489
+ end
490
+ end
491
+
492
+ $env_features.each_with_index do |ec, ei|
493
+ env_labels[key] = [] unless env_labels.has_key?(key)
494
+
495
+ ff.rewind
496
+ ff.each_entry do |pir|
497
+ if (pir.entry_id == key) && (pir.definition == ec.name)
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
499
+ if sym == '-'
500
+ '-'
501
+ elsif sym == 'X' || sym == 'x'
502
+ 'X'
503
+ else
504
+ if ei == 0 # Amino Acid Environment Feature
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
508
+ else
509
+ ec.labels[ec.symbols.index(sym)]
510
+ end
511
+ end
512
+ end
513
+
514
+ if env_labels[key].empty?
515
+ env_labels[key] = labels
516
+ else
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
520
+ end
521
+ end
522
+ end
523
+ end
524
+ end
525
+
526
+ if $noweight
527
+ ali.each_pair do |id1, seq1|
528
+ ali.each_pair do |id2, seq2|
529
+ if id1 != id2
530
+ pid = calculate_pid(seq1, seq2)
531
+ s1 = seq1.split('')
532
+ s2 = seq2.split('')
533
+
534
+ # check PID_MIN
535
+ if $pidmin && (pid < $pidmin)
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
538
+ next
539
+ end
540
+
541
+ # check PID_MAX
542
+ if $pidmax && (pid > $pidmax)
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
545
+ next
546
+ end
547
+
548
+ s1.each_with_index do |aa1, pos|
549
+ aa1.upcase!
550
+ aa2 = s2[pos].upcase
551
+
552
+ if env_labels[id1][pos].include?('X')
553
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
554
+ next
555
+ end
556
+
557
+ if env_labels[id2][pos].include?('X')
558
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
559
+ next
560
+ end
561
+
562
+ unless $amino_acids.include?(aa1)
563
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
564
+ next
565
+ end
566
+
567
+ unless $amino_acids.include?(aa2)
568
+ $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
569
+ next
570
+ end
571
+
572
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
573
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
574
+
575
+ if $cst_features.empty?
576
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
577
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
578
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
579
+ else
580
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
581
+ next
582
+ end
583
+
584
+ grp_label = env_labels[id1][pos][1..-1]
585
+
586
+ if $aa_env_cnt.has_key? grp_label
587
+ if $aa_env_cnt[grp_label].has_key? aa1
588
+ $aa_env_cnt[grp_label][aa1] += 1
589
+ else
590
+ $aa_env_cnt[grp_label][aa1] = 1
591
+ end
592
+ else
593
+ $aa_env_cnt[grp_label] = Hash.new(0)
594
+ $aa_env_cnt[grp_label][aa1] = 1
595
+ end
596
+
597
+ if $aa_tot_cnt.has_key? aa1
598
+ $aa_tot_cnt[aa1] += 1
599
+ else
600
+ $aa_tot_cnt[aa1] = 1
601
+ end
602
+
603
+ if aa1 != aa2
604
+ if $aa_mut_cnt.has_key? aa1
605
+ $aa_mut_cnt[aa1] += 1
606
+ else
607
+ $aa_mut_cnt[aa1] = 1
608
+ end
609
+ end
610
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
611
+ end
612
+ end
613
+ end
614
+ end
615
+ else
616
+ # BLOSUM-like weighting
617
+ clusters = []
618
+ ali.each_pair { |i, s| clusters << [i] }
619
+
620
+ # a loop for single linkage clustering
621
+ begin
622
+ continue = false
623
+ 0.upto(clusters.size - 2) do |i|
624
+ indexes = []
625
+ (i + 1).upto(clusters.size - 1) do |j|
626
+ found = false
627
+ clusters[i].each do |c1|
628
+ clusters[j].each do |c2|
629
+ if calculate_pid(ali[c1], ali[c2]) >= $weight
630
+ indexes << j
631
+ found = true
632
+ break
633
+ end
634
+ end
635
+ break if found
636
+ end
637
+ end
638
+
639
+ unless indexes.empty?
640
+ continue = true
641
+ group = clusters[i]
642
+ indexes.each do |k|
643
+ group = group.concat(clusters[k])
644
+ clusters[k] = nil
645
+ end
646
+ clusters[i] = group
647
+ clusters.compact!
648
+ end
649
+ end
650
+ end while(continue)
651
+
652
+ if clusters.size < 2
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
654
+ next
655
+ end
656
+
657
+ clusters.combination(2).each do |cluster1, cluster2|
658
+ cluster1.each do |id1|
659
+ cluster2.each do |id2|
660
+ seq1 = ali[id1].split('')
661
+ seq2 = ali[id2].split('')
662
+
663
+ seq1.each_with_index do |aa1, pos|
664
+ aa1.upcase!
665
+ aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
666
+
667
+ if env_labels[id1][pos].include?('X')
668
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
669
+ next
670
+ end
671
+
672
+ if env_labels[id2][pos].include?('X')
673
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
674
+ next
675
+ end
676
+
677
+ unless $amino_acids.include?(aa1)
678
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
679
+ next
680
+ end
681
+
682
+ unless $amino_acids.include?(aa2)
683
+ $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
684
+ next
685
+ end
686
+
687
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
688
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
692
+
693
+ if $cst_features.empty?
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
696
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
699
+ else
700
+ $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
701
+ next
702
+ end
703
+
704
+ grp_label1 = env_labels[id1][pos][1..-1]
705
+ grp_label2 = env_labels[id2][pos][1..-1]
706
+
707
+ if $aa_env_cnt.has_key? grp_label1
708
+ if $aa_env_cnt[grp_label1].has_key? aa1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
710
+ else
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
712
+ end
713
+ else
714
+ $aa_env_cnt[grp_label1] = Hash.new(0.0)
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
716
+ end
717
+
718
+ if $aa_env_cnt.has_key? grp_label2
719
+ if $aa_env_cnt[grp_label2].has_key? aa2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
721
+ else
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
723
+ end
724
+ else
725
+ $aa_env_cnt[grp_label2] = Hash.new(0.0)
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
727
+ end
728
+
729
+ if $aa_tot_cnt.has_key? aa1
730
+ $aa_tot_cnt[aa1] += cnt1
731
+ else
732
+ $aa_tot_cnt[aa1] = cnt1
733
+ end
734
+
735
+ if $aa_tot_cnt.has_key? aa2
736
+ $aa_tot_cnt[aa2] += cnt2
737
+ else
738
+ $aa_tot_cnt[aa2] = cnt2
739
+ end
740
+
741
+ if aa1 != aa2
742
+ if $aa_mut_cnt.has_key? aa1
743
+ $aa_mut_cnt[aa1] += cnt1
744
+ else
745
+ $aa_mut_cnt[aa1] = cnt1
746
+ end
747
+ if $aa_mut_cnt.has_key? aa2
748
+ $aa_mut_cnt[aa2] += cnt2
749
+ else
750
+ $aa_mut_cnt[aa2] = cnt2
751
+ end
752
+ end
753
+
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
756
+ end
757
+ end
758
+ end
759
+ end
760
+ end
761
+ $logger.info "Analysing #{tem_file} done."
762
+ end
763
+
764
+ # print out default header
765
+ $outfh.puts <<HEADER
766
+ # Environment-specific amino acid substitution matrices
767
+ # Creator: egor version #{Egor::VERSION}
768
+ # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
769
+ #
770
+ # Definitions for structural environments:
771
+ # #{$env_features.size - 1} features used
772
+ #
773
+ HEADER
774
+
775
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
776
+
777
+ $outfh.puts <<HEADER
778
+ # (read in from #{$classdef})
779
+ #
780
+ # Number of alignments: #{$ali_size}
781
+ # (list of .tem files read in from #{$tem_list})
782
+ #
783
+ # Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
784
+ #
785
+ # There are #{$amino_acids.size} amino acids considered.
786
+ # #{$amino_acids.join}
787
+ #
788
+ HEADER
789
+
790
+ if $amino_acids.include? 'J'
791
+ $outfh.puts <<HEADER
792
+ # C: Cystine (the disulfide-bonded form)
793
+ # J: Cysteine (the free thiol form)
794
+ #
795
+ HEADER
796
+ end
797
+
798
+ if $noweight
799
+ $outfh.puts '# Weighting scheme: none'
800
+ else
801
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
802
+ end
803
+
804
+ # calculate amino acid frequencies and mutabilities, and
805
+ # print them as default statistics in the header part
806
+ ala_factor = if $aa_tot_cnt['A'] == 0
807
+ 0.0
808
+ elsif $aa_mut_cnt['A'] == 0
809
+ 0.0
810
+ else
811
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
812
+ end
813
+ $tot_aa = $aa_tot_cnt.values.sum
814
+
815
+ $outfh.puts '#'
816
+ $outfh.puts "# Total amino acid frequencies:\n"
817
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
818
+
819
+ min_cnt = -1
820
+ min_sigma = nil
821
+
822
+ $amino_acids.each do |res|
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
830
+ end
831
+
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
833
+ end
834
+
835
+ $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
836
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
837
+ $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
838
+ end
839
+
840
+ $amino_acids.each do |res|
841
+ if $noweight
842
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
843
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
844
+ else
845
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
846
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
847
+ end
848
+ end
849
+
850
+ if min_cnt > -1
851
+ $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
852
+ if $autosigma
853
+ $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
854
+ $sigma = min_sigma
855
+ end
856
+ end
857
+
858
+ $outfh.puts '#'
859
+ $outfh.puts '# RES: Amino acid one letter code'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
862
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
864
+ $outfh.puts '# REL_FREQ: Relative frequency'
865
+ $outfh.puts '#'
866
+
867
+ #
868
+ # Part 4. END
869
+ #
870
+
871
+
872
+ # Part 5.
873
+ #
874
+ # Generating substitution frequency matrices
875
+ #
876
+
877
+ # calculating probabilities for each environment
878
+ $env_classes.values.each do |e|
879
+ if e.freq_array.sum != 0
880
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
881
+ end
882
+ end
883
+
884
+ # count raw frequencies
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
887
+
888
+ # for each combination of environment features
889
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
890
+ grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
891
+
892
+ $amino_acids.each_with_index do |aa, aj|
893
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
894
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
895
+ end
896
+
897
+ $tot_cnt_mat += grp_cnt_mat
898
+ group_matrices << [group[0], grp_cnt_mat]
899
+ end
900
+
901
+ $logger.info "Counting substitutions done."
902
+
903
+ if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalue,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
933
+ :row_header => $amino_acids,
934
+ :rvg_width => $rvg_width,
935
+ :rvg_height => $rvg_height - 50,
936
+ :canvas_width => $canvas_width,
937
+ :canvas_height => $canvas_height - 50,
938
+ :max_val => grp_max_val.ceil,
939
+ :min_val => 0,
940
+ :print_value => $heatmapvalue,
941
+ :print_gradient => false,
942
+ :title => stem,
943
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
944
+ end
945
+ end
946
+
947
+ if $heatmap == 1 or $heatmap == 2
948
+ file = "#{$heatmapstem}.#{$heatmapformat}"
949
+ heatmaps.heatmap(:columns => $heatmapcol,
950
+ :rvg_width => $rvg_width,
951
+ :max_val => grp_max_val.ceil,
952
+ :min_val => 0).write(file)
953
+
954
+ $logger.info "Generating heat maps in a file, #{file} done."
955
+ end
956
+
957
+ # total
958
+ $outfh.puts '>Total'
959
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
960
+ :row_header => $amino_acids)
961
+
962
+ if $heatmap == 0 or $heatmap == 2
963
+ stem = "#{group_matrices.size}. TOTAL"
964
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
965
+ :row_header => $amino_acids,
966
+ :rvg_width => $rvg_width,
967
+ :rvg_height => $rvg_height,
968
+ :canvas_width => $canvas_width,
969
+ :canvas_height => $canvas_height,
970
+ :max_val => $tot_cnt_mat.max.ceil,
971
+ :min_val => 0,
972
+ :print_value => $heatmapvalue,
973
+ :title => stem).write("#{stem}.#{$heatmapformat}")
974
+
975
+ $logger.info "Generating a heat map for #{stem} table done."
976
+ end
977
+ exit 0
978
+ end
979
+
980
+ #
981
+ # Part 5. END
982
+ #
983
+
984
+
985
+ # Part 6.
986
+ #
987
+ # Calculating substitution probability tables
988
+ #
989
+
990
+ if $output == 1
991
+ $outfh.puts <<HEADER
992
+ #
993
+ # Each column (j) represents the probability distribution for the
994
+ # likelihood of acceptance of a mutational event by a residue type j in
995
+ # a particular structural environment (specified after >) leading to
996
+ # any other residue type (i) and sums up to 100.
997
+ #
998
+ HEADER
999
+ end
1000
+
1001
+ # when nosmoothing !!!
1002
+ if ($output > 0) && $nosmooth
1003
+ # reinitialize $tot_cnt_mat for pseudocounts
1004
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1005
+
1006
+ # for each combination of environment features
1007
+ pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1008
+
1009
+ # add pseudo counts for each frequency vector
1010
+ $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1011
+
1012
+ # re-calculate probability vector for each environment class
1013
+ $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1014
+
1015
+ group_matrices = []
1016
+
1017
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1018
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1019
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1020
+
1021
+ $amino_acids.each_with_index do |aa, aj|
1022
+ env_class = group[1].find { |e| e.label.start_with?(aa) }
1023
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
1024
+ 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
1025
+ end
1026
+
1027
+ $tot_cnt_mat += grp_cnt_mat
1028
+ group_matrices << [group[0], grp_prob_mat]
1029
+ end
1030
+
1031
+ if $output == 1
1032
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1033
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1034
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1035
+
1036
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1037
+ # for a matrix file
1038
+ stem = "#{grp_no}. #{grp_label}"
1039
+ $outfh.puts ">#{grp_label} #{grp_no}"
1040
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1041
+ :row_header => $amino_acids)
1042
+
1043
+
1044
+ # for a heat map
1045
+ if $heatmap == 0 or $heatmap == 2
1046
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1047
+ :row_header => $amino_acids,
1048
+ :rvg_width => $rvg_width,
1049
+ :rvg_height => $rvg_height,
1050
+ :canvas_width => $canvas_width,
1051
+ :canvas_height => $canvas_height,
1052
+ :max_val => grp_max_val.ceil,
1053
+ :min_val => 0,
1054
+ :print_value => $heatmapvalue,
1055
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1056
+
1057
+ $logger.info "Generating a heat map for #{stem} table done."
1058
+ end
1059
+
1060
+ if $heatmap == 1 or $heatmap == 2
1061
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1062
+ :row_header => $amino_acids,
1063
+ :rvg_width => $rvg_width,
1064
+ :rvg_height => $rvg_height - 50,
1065
+ :canvas_width => $canvas_width,
1066
+ :canvas_height => $canvas_height - 50,
1067
+ :max_val => grp_max_val.ceil,
1068
+ :min_val => 0,
1069
+ :print_value => $heatmapvalue,
1070
+ :print_gradient => false,
1071
+ :title => stem,
1072
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1073
+ end
1074
+ end
1075
+
1076
+ # for heat maps in a single file
1077
+ if $heatmap == 1 or $heatmap == 2
1078
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1079
+ heatmaps.heatmap(:columns => $heatmapcol,
1080
+ :rvg_width => $rvg_width,
1081
+ :max_val => grp_max_val.ceil,
1082
+ :min_val => 0).write(file)
1083
+
1084
+ $logger.info "Generating heat maps in a file, #{file} done."
1085
+ end
1086
+ end
1087
+
1088
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1089
+
1090
+ 0.upto($amino_acids.size - 1) do |aj|
1091
+ col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
1092
+ 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
1093
+ end
1094
+
1095
+ if $output == 1
1096
+ $outfh.puts '>Total'
1097
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1098
+ :row_header => $amino_acids)
1099
+ $outfh.close
1100
+
1101
+ # for a heat map
1102
+ if $heatmap == 0 or $heatmap == 2
1103
+ stem = "#{group_matrices.size}. TOTAL"
1104
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1105
+ :row_header => $amino_acids,
1106
+ :rvg_width => $rvg_width,
1107
+ :rvg_height => $rvg_height,
1108
+ :canvas_width => $canvas_width,
1109
+ :canvas_height => $canvas_height,
1110
+ :max_val => $tot_prob_mat.max.ceil,
1111
+ :min_val => 0,
1112
+ :print_value => $heatmapvalue,
1113
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1114
+
1115
+ $logger.info "Generating a heat map for #{stem} table done."
1116
+ end
1117
+ exit 0
1118
+ end
1119
+
1120
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
1121
+ end
1122
+
1123
+ # when smoothing!!!
1124
+ if ($output > 0) && !$nosmooth
1125
+ #
1126
+ # p1 probabilities
1127
+ #
1128
+ p1 = NArray.float($amino_acids.size)
1129
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
1130
+ big_N = $tot_aa.to_f
1131
+ small_n = $amino_acids.size.to_f
1132
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1133
+ omega2 = 1.0 - omega1
1134
+
1135
+ if ($smooth == :full) || $p1smooth
1136
+ # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
1137
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1138
+ $smooth_prob[1] = p1
1139
+ elsif ($smooth == :partial)
1140
+ # no smoothing for p1 probabilities just as Kenji's subst
1141
+ # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1142
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1143
+ $smooth_prob[1] = p1
1144
+ end
1145
+
1146
+ #
1147
+ # p2 and above
1148
+ #
1149
+ env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1150
+
1151
+ if $smooth == :partial
1152
+ $outfh.puts <<HEADER
1153
+ #
1154
+ # Partial Smoothing:
1155
+ #
1156
+ HEADER
1157
+ if $p1smooth
1158
+ $outfh.puts <<HEADER
1159
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1160
+ # each row in all matrices and smoothing them with A0 (a uniform distribution)
1161
+ # ^^^^^^^^^
1162
+ HEADER
1163
+ else
1164
+ $outfh.puts <<HEADER
1165
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1166
+ # each row in all matrices without smoothing
1167
+ # ^^^^^^^^^^^^^^^^^
1168
+ HEADER
1169
+ end
1170
+
1171
+ $outfh.puts <<HEADER
1172
+ # p2(ri|Rj) is estimated as:
1173
+ # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
1174
+ #
1175
+ # p3(ri|Rj,fq) is estimated as:
1176
+ # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
1177
+ # where
1178
+ # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
1179
+ #
1180
+ # The smoothing procedure is curtailed here and finally
1181
+ # ^^^^^^^^^
1182
+ # p5(ri|Rj,...) is estimated as:
1183
+ # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
1184
+ # where
1185
+ # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
1186
+ #
1187
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1188
+ #
1189
+ # sigma value used is: #{$sigma}
1190
+ #
1191
+ HEADER
1192
+ 1.upto($env_features.size) do |ci|
1193
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
1194
+ if (ci > 2) && (ci < $env_features.size)
1195
+ $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1196
+ next
1197
+ end
1198
+
1199
+ env_labels.combination(ci) do |c1|
1200
+ c1[0].product(*c1[1..-1]).each do |labels|
1201
+ pattern = '.' * $env_features.size
1202
+
1203
+ labels.each do |label|
1204
+ i = label[0].chr.to_i
1205
+ l = label[1].chr
1206
+ pattern[i] = l
1207
+ end
1208
+
1209
+ if pattern =~ /^\./
1210
+ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
1211
+ next
1212
+ end
1213
+
1214
+ # get environments matching the pattern created above
1215
+ # and calculate amino acid frequencies and their probabilities for all the environments
1216
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1217
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1218
+ prob_arr = NArray.float($amino_acids.size)
1219
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1220
+
1221
+ # # assess whether a residue type j is compatible with a particular combination of structural features
1222
+ # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
1223
+ # if ci == $env_features.size
1224
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
1225
+ # sub_pattern = '.' * $env_features.size
1226
+ # sub_pattern[0] = aa_label
1227
+ # sub_freq_sum = 0
1228
+ #
1229
+ # labels[1..-1].each do |label|
1230
+ # next if label.start_with?('0')
1231
+ # i = label[0].chr.to_i
1232
+ # l = label[1].chr
1233
+ # sub_pattern[i] = l
1234
+ # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1235
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1236
+ # sub_freq_sum += sub_freq_arr.sum
1237
+ # end
1238
+ #
1239
+ # if sub_freq_sum == 0
1240
+ # if $smooth_prob.has_key?(ci + 1)
1241
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1242
+ # else
1243
+ # $smooth_prob[ci + 1] = {}
1244
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1245
+ # end
1246
+ # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
1247
+ # next
1248
+ # end
1249
+ # end
1250
+
1251
+ # collect priors
1252
+ priors = []
1253
+
1254
+ if ci == 1
1255
+ priors << $smooth_prob[1]
1256
+ elsif ci == 2
1257
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1258
+ priors << $smooth_prob[2][c3.to_set]
1259
+ }
1260
+ elsif ci == $env_features.size
1261
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1262
+ priors << $smooth_prob[3][c3.to_set]
1263
+ }
1264
+ end
1265
+
1266
+ # entropy based prior weighting step
1267
+ entropy_max = Math::log($amino_acids.size)
1268
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1269
+ begin
1270
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
1271
+ rescue
1272
+ #puts "P: #{p}"
1273
+ end
1274
+ } }
1275
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1276
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1277
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1278
+
1279
+ # smoothing step
1280
+ smooth_prob_arr = NArray.float($amino_acids.size)
1281
+ big_N = freq_arr.sum.to_f
1282
+ small_n = $amino_acids.size.to_f
1283
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1284
+ omega2 = 1.0 - omega1
1285
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1286
+
1287
+ # normalization step
1288
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1289
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1290
+
1291
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1292
+ if $smooth_prob.has_key?(ci + 1)
1293
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1294
+ else
1295
+ $smooth_prob[ci + 1] = {}
1296
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1297
+ end
1298
+ end
1299
+ end
1300
+ end
1301
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1302
+ else
1303
+ $outfh.puts <<HEADER
1304
+ #
1305
+ # Full Smoothing:
1306
+ #
1307
+ # p1(ri) is estimated as:
1308
+ # p1(ri) = omega1 * A0 + omega2 * W1(ri)
1309
+ #
1310
+ # p2(ri|f1q) is estimated as:
1311
+ # p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
1312
+ #
1313
+ # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
1314
+ #
1315
+ # p3(ri|f1q,f2q) is estimated as:
1316
+ # p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
1317
+ # where
1318
+ # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
1319
+ #
1320
+ # The smoothing procedure is NOT curtailed here and it goes upto
1321
+ # ^^^^^^^^^^^^^
1322
+ #
1323
+ # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
1324
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
1325
+ # where
1326
+ # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
1327
+ #
1328
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1329
+ #
1330
+ # sigma value used is: #{$sigma}
1331
+ #
1332
+ HEADER
1333
+ # full smoothing
1334
+ 1.upto($env_features.size) do |ci|
1335
+ env_labels.combination(ci) do |c1|
1336
+ c1[0].product(*c1[1..-1]).each do |labels|
1337
+ pattern = '.' * $env_features.size
1338
+ labels.each do |label|
1339
+ j = label[0].chr.to_i
1340
+ l = label[1].chr
1341
+ pattern[j] = l
1342
+ end
1343
+
1344
+ # get environments, frequencies, and probabilities
1345
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1346
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1347
+ prob_arr = NArray.float($amino_acids.size)
1348
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1349
+
1350
+ # collect priors
1351
+ priors = []
1352
+ if ci > 1
1353
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1354
+ else
1355
+ priors << $smooth_prob[1]
1356
+ end
1357
+
1358
+ # entropy based weighting priors
1359
+ entropy_max = Math::log($amino_acids.size)
1360
+ entropies = priors.map do |prior|
1361
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1362
+ end
1363
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1364
+
1365
+ # smoothing step
1366
+ smooth_prob_arr = NArray.float($amino_acids.size)
1367
+ big_N = freq_arr.sum.to_f
1368
+ small_n = $amino_acids.size.to_f
1369
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1370
+ omega2 = 1.0 - omega1
1371
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1372
+
1373
+ # normalization step
1374
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1375
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1376
+
1377
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1378
+ if $smooth_prob.has_key?(ci + 1)
1379
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1380
+ else
1381
+ $smooth_prob[ci + 1] = {}
1382
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1383
+ end
1384
+ end
1385
+ end
1386
+ end
1387
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1388
+ end
1389
+
1390
+ # updating smoothed probability array for each environment
1391
+ $env_classes.values.each do |env|
1392
+ env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1393
+ end
1394
+
1395
+ # sorting environments and build 21X21 substitution matrices
1396
+ group_matrices = []
1397
+
1398
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1399
+ # calculating 21X21 substitution probability matrix for each environment
1400
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1401
+
1402
+ $amino_acids.each_with_index do |aa, ai|
1403
+ smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1404
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1405
+ end
1406
+
1407
+ group_matrices << [group[0], grp_prob_mat]
1408
+ end
1409
+
1410
+ if $output == 1
1411
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1412
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1413
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1414
+
1415
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1416
+ # for a matrix file
1417
+ stem = "#{grp_no}. #{grp_label}"
1418
+ $outfh.puts ">#{grp_label} #{grp_no}"
1419
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1420
+ :row_header => $amino_acids)
1421
+
1422
+ # for heat map generation
1423
+ if $heatmap == 0 or $heatmap == 2
1424
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1425
+ :row_header => $amino_acids,
1426
+ :rvg_width => $rvg_width,
1427
+ :rvg_height => $rvg_height,
1428
+ :canvas_width => $canvas_width,
1429
+ :canvas_height => $canvas_height,
1430
+ :max_val => grp_max_val.ceil,
1431
+ :min_val => 0,
1432
+ :print_value => $heatmapvalue,
1433
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1434
+
1435
+ $logger.info "Generating a heat map for #{stem} table done."
1436
+ end
1437
+
1438
+ if $heatmap == 1 or $heatmap == 2
1439
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1440
+ :row_header => $amino_acids,
1441
+ :rvg_width => $rvg_width,
1442
+ :rvg_height => $rvg_height - 50,
1443
+ :canvas_width => $canvas_width,
1444
+ :canvas_height => $canvas_height - 50,
1445
+ :max_val => grp_max_val.ceil,
1446
+ :min_val => 0,
1447
+ :print_value => $heatmapvalue,
1448
+ :print_gradient => false,
1449
+ :title => stem,
1450
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1451
+ end
1452
+ end
1453
+
1454
+ # for heat maps in a single file
1455
+ if $heatmap == 1 or $heatmap == 2
1456
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1457
+ heatmaps.heatmap(:columns => $heatmapcol,
1458
+ :rvg_width => $rvg_width,
1459
+ :max_val => grp_max_val.ceil,
1460
+ :min_val => 0).write(file)
1461
+
1462
+ $logger.info "Generating heat maps in a file, #{file} done."
1463
+ end
1464
+ end
1465
+
1466
+ # for a total substitution probability matrix
1467
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1468
+
1469
+ $amino_acids.each_with_index do |aa, aj|
1470
+ 0.upto($amino_acids.size - 1) do |ai|
1471
+ $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
1472
+ end
1473
+ end
1474
+
1475
+ if $output == 1
1476
+ $outfh.puts '>Total'
1477
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1478
+ :row_header => $amino_acids)
1479
+ $outfh.close
1480
+
1481
+ # for a heat map
1482
+ if $heatmap == 0 or $heatmap == 2
1483
+ stem = "#{group_matrices.size}. TOTAL"
1484
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1485
+ :row_header => $amino_acids,
1486
+ :rvg_width => $rvg_width,
1487
+ :rvg_height => $rvg_height,
1488
+ :canvas_width => $canvas_width,
1489
+ :canvas_height => $canvas_height,
1490
+ :max_val => $tot_prob_mat.max.ceil,
1491
+ :min_val => 0,
1492
+ :print_value => $heatmapvalue,
1493
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1494
+
1495
+ $logger.info "Generating a heat map for #{stem} table done."
1496
+ end
1497
+ exit 0
1498
+ end
1499
+ end
1500
+
1501
+ #
1502
+ # Part 6. END
1503
+ #
1504
+
1505
+
1506
+ # Part 7.
1507
+ #
1508
+ # Calculating log odds ratio scoring matrices
1509
+ #
1510
+ if $output == 2
1511
+ $outfh.puts <<HEADER
1512
+ #
1513
+ # The probabilities were then divided by the background probabilities
1514
+ HEADER
1515
+ if $penv
1516
+ $outfh.puts <<HEADER
1517
+ # which were derived from the environment-dependent amino acid frequencies.
1518
+ # ^^^^^^^^^^^^^^^^^^^^^
1519
+ HEADER
1520
+ else
1521
+ $outfh.puts <<HEADER
1522
+ # which were derived from the environment-independent amino acid frequencies.
1523
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1524
+ HEADER
1525
+ end
1526
+
1527
+ grp_logo_mats = []
1528
+ factor = $scale / Math::log(2)
1529
+
1530
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1531
+ # calculating substitution probability matrix for each environment
1532
+ grp_label = group[0]
1533
+ grp_envs = group[1]
1534
+ grp_logo_mat = $cys == 0 ?
1535
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1536
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1537
+
1538
+ $amino_acids.each_with_index do |aa, aj|
1539
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1540
+ env.logo_array = $cys == 0 ?
1541
+ NArray.float($amino_acids.size + 1) :
1542
+ NArray.float($amino_acids.size)
1543
+
1544
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1545
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1546
+ odds = prob / pai
1547
+ env.logo_array[ai] = factor * Math::log(odds)
1548
+ grp_logo_mat[aj, ai] = env.logo_array[ai]
1549
+ end
1550
+
1551
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1552
+ if $cys == 0
1553
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1554
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1555
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1556
+ odds = prob / pai
1557
+ env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1558
+ grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1559
+ end
1560
+ end
1561
+
1562
+ grp_logo_mats << [grp_label, grp_logo_mat]
1563
+ end
1564
+
1565
+ $tot_logo_mat = $cys == 0 ?
1566
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1567
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1568
+
1569
+ $amino_acids.each_with_index do |aa1, aj|
1570
+ $amino_acids.each_with_index do |aa2, ai|
1571
+ prob = $tot_prob_mat[aj, ai]
1572
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1573
+ odds = prob / pai
1574
+ $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1575
+ end
1576
+
1577
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1578
+ if $cys == 0
1579
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1580
+ prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1581
+ odds = prob / pai
1582
+ $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1583
+ end
1584
+ end
1585
+
1586
+
1587
+ # calculating relative entropy for each amino acid pair H and
1588
+ # the expected score E in bit units
1589
+ tot_E = 0.0
1590
+ tot_H = 0.0
1591
+
1592
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1593
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1594
+ if j != i
1595
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1596
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1597
+ else
1598
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1599
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1600
+ end
1601
+ end
1602
+ end
1603
+
1604
+ $outfh.puts <<HEADER
1605
+ #
1606
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1607
+ HEADER
1608
+ unless $noroundoff
1609
+ $outfh.puts <<HEADER
1610
+ # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1611
+ HEADER
1612
+ end
1613
+
1614
+ $outfh.puts <<HEADER
1615
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1616
+ #
1617
+ HEADER
1618
+
1619
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1620
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1621
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1622
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1623
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1624
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1625
+
1626
+ grp_logo_mats.each_with_index do |arr, grp_no|
1627
+ grp_label = arr[0]
1628
+ grp_logo_mat = arr[1]
1629
+ stem = "#{grp_no}. #{grp_label}"
1630
+
1631
+ unless $noroundoff
1632
+ grp_logo_mat = grp_logo_mat.round
1633
+ end
1634
+
1635
+ # for a matrix file
1636
+ $outfh.puts ">#{grp_label} #{grp_no}"
1637
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1638
+ :row_header => row_header)
1639
+ # for a heat map
1640
+ if $heatmap == 0 or $heatmap == 2
1641
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1642
+ :row_header => row_header,
1643
+ :rvg_width => $rvg_width,
1644
+ :rvg_height => $rvg_height,
1645
+ :canvas_width => $canvas_width,
1646
+ :canvas_height => $canvas_height,
1647
+ :gradient_beg_color => '#0000FF',
1648
+ :gradient_mid_color => '#FFFFFF',
1649
+ :gradient_end_color => '#FF0000',
1650
+ :max_val => abs_max_val.ceil,
1651
+ :mid_val => 0,
1652
+ :min_val => -1 * abs_max_val.ceil,
1653
+ :print_value => $heatmapvalue,
1654
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1655
+
1656
+ $logger.info "Generating a heat map for #{stem} table done."
1657
+ end
1658
+
1659
+ if $heatmap == 1 or $heatmap == 2
1660
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1661
+ :row_header => row_header,
1662
+ :rvg_width => $rvg_width,
1663
+ :rvg_height => $rvg_height - 50,
1664
+ :canvas_width => $canvas_width,
1665
+ :canvas_height => $canvas_height - 50,
1666
+ :gradient_beg_color => '#0000FF',
1667
+ :gradient_mid_color => '#FFFFFF',
1668
+ :gradient_end_color => '#FF0000',
1669
+ :max_val => abs_max_val.ceil,
1670
+ :mid_val => 0,
1671
+ :min_val => -1 * abs_max_val.ceil,
1672
+ :print_value => $heatmapvalue,
1673
+ :print_gradient => false,
1674
+ :title => stem,
1675
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1676
+ end
1677
+ end
1678
+
1679
+ # for heat maps in a single file
1680
+ if $heatmap == 1 or $heatmap == 2
1681
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1682
+ heatmaps.heatmap(:columns => $heatmapcol,
1683
+ :rvg_width => $rvg_width,
1684
+ :gradient_beg_color => '#0000FF',
1685
+ :gradient_mid_color => '#FFFFFF',
1686
+ :gradient_end_color => '#FF0000',
1687
+ :max_val => abs_max_val.ceil,
1688
+ :mid_val => 0,
1689
+ :min_val => -1 * abs_max_val.ceil).write(file)
1690
+
1691
+ $logger.info "Generating heat maps in a file, #{file} done."
1692
+ end
1693
+
1694
+ # for a matrix file
1695
+ unless $noroundoff
1696
+ $tot_logo_mat = $tot_logo_mat.round
1697
+ end
1698
+
1699
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1700
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1701
+ :row_header => row_header)
1702
+
1703
+ # for a heat map
1704
+ if $heatmap == 0 or $heatmap == 2
1705
+ stem = "#{group_matrices.size}. TOTAL"
1706
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1707
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1708
+ :row_header => row_header,
1709
+ :rvg_width => $rvg_width,
1710
+ :rvg_height => $rvg_height,
1711
+ :canvas_width => $canvas_width,
1712
+ :canvas_height => $canvas_height,
1713
+ :gradient_beg_color => '#0000FF',
1714
+ :gradient_mid_color => '#FFFFFF',
1715
+ :gradient_end_color => '#FF0000',
1716
+ :max_val => tot_abs_max_val.ceil,
1717
+ :mid_val => 0,
1718
+ :min_val => -1 * tot_abs_max_val.ceil,
1719
+ :print_value => $heatmapvalue,
1720
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1721
+
1722
+ $logger.info "Generating a heat map for #{stem} table done."
1723
+ end
1724
+
1725
+ $logger.info "Calculating log odds ratios done."
1726
+ end
1727
+
1728
+ #
1729
+ # Part 7. END
1730
+ #
1731
+
1732
+ $outfh.close
1733
+ exit 0
1734
+ end
1735
+ end
1736
+
1737
+ end # class CLI
1738
+ end # module Egor