egor 0.0.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.5'
5
+ VERSION = '0.9.0'
6
6
  end
@@ -5,16 +5,17 @@ require 'narray'
5
5
  require 'bio'
6
6
  require 'set'
7
7
  require 'facets'
8
- require 'simple_memoize'
9
8
 
9
+ require 'math_extensions'
10
+ require 'string_extensions'
10
11
  require 'narray_extensions'
11
12
  require 'nmatrix_extensions'
12
- require 'enumerable_extensions'
13
- require 'math_extensions'
14
- require 'environment'
15
- require 'environment_class_hash'
16
- require 'environment_feature'
17
- require 'environment_feature_array'
13
+
14
+ require 'egor/environment'
15
+ require 'egor/environment_class_hash'
16
+ require 'egor/environment_feature'
17
+ require 'egor/environment_feature_array'
18
+ require 'egor/heatmap_array'
18
19
 
19
20
  # This is a module for an actual command line interpreter for Egor
20
21
  # ---
@@ -25,7 +26,7 @@ module Egor
25
26
 
26
27
  # :nodoc:
27
28
  def print_version
28
- puts Egor::VERSION
29
+ puts VERSION
29
30
  end
30
31
 
31
32
  # Print Egor's Usage on the screen
@@ -62,14 +63,26 @@ Options:
62
63
  0 for raw counts (no smoothing performed)
63
64
  1 for probabilities
64
65
  2 for log-odds (default)
65
- --noround: do not round off log odds ratio
66
+ --noroundoff: do not round off log odds ratio
66
67
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
67
68
  --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
68
69
  --autosigma: automatically adjust the sigma value for smoothing
69
70
  --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
70
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
71
71
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
72
72
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
73
+ --heatmap INTEGER:
74
+ 0 create a heat map file for each substitution table
75
+ 1 create one big file containing all heat maps from substitution tables
76
+ 2 do both 0 and 1
77
+ --heatmap-format INTEGER:
78
+ 0 for Portable Network Graphics (PNG) Format (default)
79
+ 1 for Graphics Interchange Format (GIF)
80
+ 2 for Joint Photographic Experts Group (JPEG) Format
81
+ 3 for Microsoft Windows bitmap (BMP) Format
82
+ 4 for Portable Document Format (PDF)
83
+ --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
84
+ --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
85
+ --heatmap-values: print values in the cells when generating heat maps
73
86
  --verbose (-v) INTEGER
74
87
  0 for ERROR level
75
88
  1 for WARN or above level (default)
@@ -87,12 +100,12 @@ Options:
87
100
  # Egor::CLI::calculate_pid(seq1, seq2) -> Float
88
101
  #
89
102
  def calculate_pid(seq1, seq2)
90
- s1 = seq1.split('')
91
- s2 = seq2.split('')
92
- cols = s1.zip(s2)
93
- align = 0
94
- ident = 0
95
- intgp = 0
103
+ aas1 = seq1.split('')
104
+ aas2 = seq2.split('')
105
+ cols = aas1.zip(aas2)
106
+ align = 0 # no. of aligned columns
107
+ ident = 0 # no. of identical columns
108
+ intgp = 0 # no. of internal gaps
96
109
 
97
110
  cols.each do |col|
98
111
  if (col[0] != '-') && (col[1] != '-')
@@ -100,14 +113,14 @@ Options:
100
113
  if col[0] == col[1]
101
114
  ident += 1
102
115
  end
103
- elsif (((col[0] == '-') && (col[1] != '-')) || ((col[0] != '-') && (col[1] == '-')))
116
+ elsif (((col[0] == '-') && (col[1] != '-')) ||
117
+ ((col[0] != '-') && (col[1] == '-')))
104
118
  intgp += 1
105
119
  end
106
120
  end
107
121
 
108
122
  pid = 100.0 * ident.to_f / (align + intgp)
109
123
  end
110
- memoize :calculate_pid
111
124
 
112
125
  # :nodoc:
113
126
  def execute(arguments=[])
@@ -121,7 +134,7 @@ Options:
121
134
  # aa: weighted amino acid
122
135
  # tot: total
123
136
  # rel: relative
124
- # obs: observation
137
+ # jnt: joint
125
138
  # cnt: count
126
139
  # mut: mutation
127
140
  # mutb: mutability
@@ -145,31 +158,42 @@ Options:
145
158
  $logger.level = Logger::WARN
146
159
 
147
160
  # default set of 21 amino acids including J (Cysteine, the free thiol form)
148
- $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
149
-
150
- $tem_list = nil
151
- $tem_file = nil
152
- $classdef = 'classdef.dat'
153
- $outfile = 'allmat.dat'
154
- $outfh = nil # file hanfle for outfile
155
- $output = 2 # default: log odds matrix
156
- $ali_size = 0
157
- $tot_aa = 0
158
- $sigma = 5.0
159
- $autosigma = false
160
- $weight = 60
161
- $noweight = false
162
- $smooth = :partial
163
- $nosmooth = false
164
- $noround = false
165
- $p1smooth = false
166
- $scale = 3
167
- $pidmin = nil
168
- $pidmax = nil
169
- $scale = 3
170
- $add = nil
171
- $cys = 0
172
- $penv = false
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file handle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalues = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
173
197
 
174
198
  $aa_tot_cnt = Hash.new(0)
175
199
  $aa_mut_cnt = Hash.new(0)
@@ -184,7 +208,7 @@ Options:
184
208
  $tot_smooth_prob = {}
185
209
 
186
210
  # minimum ratio of amino acid count to sigma value
187
- $min_obs_sigma_ratio = 500.0
211
+ $min_cnt_sigma_ratio = 500.0
188
212
 
189
213
  #
190
214
  # Part 1 END
@@ -205,11 +229,16 @@ Options:
205
229
  [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
206
230
  [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
207
231
  [ '--noweight', GetoptLong::NO_ARGUMENT ],
208
- [ '--noround', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
209
233
  [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
210
234
  [ '--autosigma', GetoptLong::NO_ARGUMENT ],
211
- #[ '--heatmap', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
212
240
  [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
213
242
  [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
214
243
  [ '--penv', GetoptLong::NO_ARGUMENT ],
215
244
  [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
@@ -235,6 +264,8 @@ Options:
235
264
  $outfile = arg
236
265
  when '--cys'
237
266
  $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
238
269
  when '--weight'
239
270
  $weight = arg.to_i
240
271
  when '--sigma'
@@ -247,8 +278,8 @@ Options:
247
278
  $pidmax = arg.to_f
248
279
  when '--noweight'
249
280
  $noweight = true
250
- when '--noround'
251
- $noround = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
252
283
  when '--smooth'
253
284
  $smooth = (arg.to_i == 1) ? :full : :partial
254
285
  when '--nosmooth'
@@ -260,18 +291,42 @@ Options:
260
291
  when '--add'
261
292
  $add = arg.to_f
262
293
  when '--penv'
263
- warn "--penv option is not supported yet."
294
+ warn "--penv option is not supported."
264
295
  exit 1
265
296
  $penv = true
266
- # when '--heatmap'
267
- # $heatmap = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-values'
320
+ $heatmapvalues = true
268
321
  when '--verbose'
269
322
  $logger.level = case arg.to_i
270
323
  when 0 then Logger::ERROR
271
324
  when 1 then Logger::WARN
272
325
  when 2 then Logger::INFO
273
326
  when 3 then Logger::DEBUG
274
- else Logger::WARN
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
275
330
  end
276
331
  when '--version'
277
332
  print_version
@@ -284,7 +339,9 @@ Options:
284
339
  end
285
340
 
286
341
  # when arguments are nonsense, print usage
287
- if ((ARGV.length != 0) || (!$tem_list && !$tem_file) || ($tem_list && $tem_file))
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
288
345
  print_usage
289
346
  exit 1
290
347
  end
@@ -305,7 +362,6 @@ Options:
305
362
  exit 1
306
363
  end
307
364
 
308
-
309
365
  #
310
366
  # Part 2 END
311
367
  #
@@ -316,23 +372,28 @@ Options:
316
372
  # Reading Environment Class Definition File
317
373
  #
318
374
 
319
- $logger.info "Egor START."
320
-
321
375
  # check --cys option and modify amino_acids set if necessary
322
376
  if $cys == 2
323
377
  $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
324
378
  end
325
379
 
326
- # create an EnvironmentFeatureList object for storing all environment features
380
+ # create an EnvironmentFeatureArray object for storing all environment
381
+ # features
327
382
  $env_features = EnvironmentFeatureArray.new
328
383
 
329
384
  # an array for storing indexes of constrained environment features
330
385
  $cst_features = []
331
386
 
332
- # add substituted amino acid (aa1) in a substitution to the environment feature list
333
- $env_features << EnvironmentFeature.new('sequence', $amino_acids, $amino_acids, 'F', 'F')
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
334
394
 
335
- # read environment class definiton file and store them into the hash prepared above
395
+ # read environment class definition file and store them into
396
+ # the hash prepared above
336
397
  env_index = 1
337
398
 
338
399
  IO.foreach($classdef) do |line|
@@ -350,10 +411,15 @@ Options:
350
411
  $cst_features << env_index
351
412
  $logger.warn "The environment feature, #{line} constrained."
352
413
  end
353
- $env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(''), env_ftr[2].split(''), env_ftr[3], env_ftr[4])
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
354
419
  env_index += 1
355
420
  else
356
- $logger.error "\"#{line}\" doesn't seem to be a proper format for a environment class definition."
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for " +
422
+ "a environment class definition."
357
423
  exit 1
358
424
  end
359
425
  end
@@ -361,9 +427,13 @@ Options:
361
427
  # a hash for storing all environment classes
362
428
  $env_classes = EnvironmentClassHash.new
363
429
 
364
- # generate all possible combinations of environment labels, and store every environment class into the hash prepared above with the label as a key
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
365
433
  $env_features.label_combinations.each_with_index { |e, i|
366
- $env_classes[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
367
437
  }
368
438
 
369
439
  #
@@ -390,19 +460,17 @@ Options:
390
460
  $tem_list_io.each_line do |tem_file|
391
461
  tem_file.chomp!
392
462
 
393
- $logger.info "Analysing #{tem_file} ..."
394
-
395
463
  ali = Bio::Alignment::OriginalAlignment.new
396
464
  ff = Bio::FlatFile.auto(tem_file)
397
465
 
398
466
  ff.each_entry do |pir|
399
467
  if (pir.definition == 'sequence') || (pir.definition == 'structure')
400
- ali.add_seq(pir.data.gsub("\n", ''), pir.entry_id)
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
401
469
  end
402
470
  end
403
471
 
404
472
  if ali.size < 2
405
- $logger.warn "Skipped #{tem_file}, there is only one unique entry."
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
406
474
  next
407
475
  end
408
476
 
@@ -414,8 +482,10 @@ Options:
414
482
  # check disulphide bond environment first!
415
483
  ff.rewind
416
484
  ff.each_entry do |pir|
417
- if (pir.entry_id == key) && ((pir.definition == "disulphide") || (pir.definition == "disulfide"))
418
- disulphide[key] = pir.data.gsub("\n", '').split('')
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
419
489
  end
420
490
  end
421
491
 
@@ -425,14 +495,16 @@ Options:
425
495
  ff.rewind
426
496
  ff.each_entry do |pir|
427
497
  if (pir.entry_id == key) && (pir.definition == ec.name)
428
- labels = pir.data.gsub("\n", '').split('').map_with_index do |sym, pos|
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
429
499
  if sym == '-'
430
500
  '-'
431
501
  elsif sym == 'X' || sym == 'x'
432
502
  'X'
433
503
  else
434
504
  if ei == 0 # Amino Acid Environment Feature
435
- (disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C')) ? 'J' : sym
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
436
508
  else
437
509
  ec.labels[ec.symbols.index(sym)]
438
510
  end
@@ -442,7 +514,9 @@ Options:
442
514
  if env_labels[key].empty?
443
515
  env_labels[key] = labels
444
516
  else
445
- env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
446
520
  end
447
521
  end
448
522
  end
@@ -459,13 +533,15 @@ Options:
459
533
 
460
534
  # check PID_MIN
461
535
  if $pidmin && (pid < $pidmin)
462
- $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
463
538
  next
464
539
  end
465
540
 
466
541
  # check PID_MAX
467
542
  if $pidmax && (pid > $pidmax)
468
- $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
469
545
  next
470
546
  end
471
547
 
@@ -574,7 +650,7 @@ Options:
574
650
  end while(continue)
575
651
 
576
652
  if clusters.size < 2
577
- $logger.debug "Skipped #{tem_file} because there is only one cluster at the #{$weight} PID level."
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
578
654
  next
579
655
  end
580
656
 
@@ -610,16 +686,16 @@ Options:
610
686
 
611
687
  aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
612
688
  aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
613
- obs1 = 1.0 / cluster1.size
614
- obs2 = 1.0 / cluster2.size
615
- obs_cnt = obs1 * obs2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
616
692
 
617
693
  if $cst_features.empty?
618
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
619
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
620
696
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
621
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
622
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
623
699
  else
624
700
  $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
625
701
  next
@@ -630,64 +706,65 @@ Options:
630
706
 
631
707
  if $aa_env_cnt.has_key? grp_label1
632
708
  if $aa_env_cnt[grp_label1].has_key? aa1
633
- $aa_env_cnt[grp_label1][aa1] += obs1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
634
710
  else
635
- $aa_env_cnt[grp_label1][aa1] = obs1
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
636
712
  end
637
713
  else
638
714
  $aa_env_cnt[grp_label1] = Hash.new(0.0)
639
- $aa_env_cnt[grp_label1][aa1] = obs1
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
640
716
  end
641
717
 
642
718
  if $aa_env_cnt.has_key? grp_label2
643
719
  if $aa_env_cnt[grp_label2].has_key? aa2
644
- $aa_env_cnt[grp_label2][aa2] += obs2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
645
721
  else
646
- $aa_env_cnt[grp_label2][aa2] = obs2
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
647
723
  end
648
724
  else
649
725
  $aa_env_cnt[grp_label2] = Hash.new(0.0)
650
- $aa_env_cnt[grp_label2][aa2] = obs2
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
651
727
  end
652
728
 
653
729
  if $aa_tot_cnt.has_key? aa1
654
- $aa_tot_cnt[aa1] += obs1
730
+ $aa_tot_cnt[aa1] += cnt1
655
731
  else
656
- $aa_tot_cnt[aa1] = obs1
732
+ $aa_tot_cnt[aa1] = cnt1
657
733
  end
658
734
 
659
735
  if $aa_tot_cnt.has_key? aa2
660
- $aa_tot_cnt[aa2] += obs2
736
+ $aa_tot_cnt[aa2] += cnt2
661
737
  else
662
- $aa_tot_cnt[aa2] = obs2
738
+ $aa_tot_cnt[aa2] = cnt2
663
739
  end
664
740
 
665
741
  if aa1 != aa2
666
742
  if $aa_mut_cnt.has_key? aa1
667
- $aa_mut_cnt[aa1] += obs1
743
+ $aa_mut_cnt[aa1] += cnt1
668
744
  else
669
- $aa_mut_cnt[aa1] = obs1
745
+ $aa_mut_cnt[aa1] = cnt1
670
746
  end
671
747
  if $aa_mut_cnt.has_key? aa2
672
- $aa_mut_cnt[aa2] += obs2
748
+ $aa_mut_cnt[aa2] += cnt2
673
749
  else
674
- $aa_mut_cnt[aa2] = obs2
750
+ $aa_mut_cnt[aa2] = cnt2
675
751
  end
676
752
  end
677
753
 
678
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
679
- $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
680
756
  end
681
757
  end
682
758
  end
683
759
  end
684
760
  end
761
+ $logger.info "Analysing #{tem_file} done."
685
762
  end
686
763
 
687
764
  # print out default header
688
765
  $outfh.puts <<HEADER
689
766
  # Environment-specific amino acid substitution matrices
690
- # Creator: egor version #{Egor::VERSION}
767
+ # Creator: egor version #{VERSION}
691
768
  # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
692
769
  #
693
770
  # Definitions for structural environments:
@@ -739,20 +816,20 @@ HEADER
739
816
  $outfh.puts "# Total amino acid frequencies:\n"
740
817
  $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
741
818
 
742
- min_obs = -1
819
+ min_cnt = -1
743
820
  min_sigma = nil
744
821
 
745
822
  $amino_acids.each do |res|
746
- if ($aa_tot_cnt[res] / $sigma) < $min_obs_sigma_ratio
747
- if min_obs < 0
748
- min_obs = $aa_tot_cnt[res]
749
- min_sigma = min_obs / $min_obs_sigma_ratio
750
- elsif (min_obs > 0) && (min_obs > $aa_tot_cnt[res])
751
- min_obs = $aa_tot_cnt[res]
752
- min_sigma = min_obs / $min_obs_sigma_ratio
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
753
830
  end
754
831
 
755
- $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total observation (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
756
833
  end
757
834
 
758
835
  $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
@@ -770,7 +847,7 @@ HEADER
770
847
  end
771
848
  end
772
849
 
773
- if min_obs > -1
850
+ if min_cnt > -1
774
851
  $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
775
852
  if $autosigma
776
853
  $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
@@ -780,12 +857,13 @@ HEADER
780
857
 
781
858
  $outfh.puts '#'
782
859
  $outfh.puts '# RES: Amino acid one letter code'
783
- $outfh.puts '# TOT_OBS: Total observations of incidence'
784
- $outfh.puts '# MUT_OBS: Total observations of mutation'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
785
862
  $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
786
- $outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
787
864
  $outfh.puts '# REL_FREQ: Relative frequency'
788
865
  $outfh.puts '#'
866
+
789
867
  #
790
868
  # Part 4. END
791
869
  #
@@ -804,7 +882,8 @@ HEADER
804
882
  end
805
883
 
806
884
  # count raw frequencies
807
- $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
808
887
 
809
888
  # for each combination of environment features
810
889
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
@@ -816,22 +895,88 @@ HEADER
816
895
  end
817
896
 
818
897
  $tot_cnt_mat += grp_cnt_mat
819
-
820
- if $output == 0
821
- $outfh.puts ">#{group[0]} #{group_no}"
822
- $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
823
- end
898
+ group_matrices << [group[0], grp_cnt_mat]
824
899
  end
825
900
 
901
+ $logger.info "Counting substitutions done."
902
+
826
903
  if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalues,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
933
+ :row_header => $amino_acids,
934
+ :rvg_width => $rvg_width,
935
+ :rvg_height => $rvg_height - 50,
936
+ :canvas_width => $canvas_width,
937
+ :canvas_height => $canvas_height - 50,
938
+ :max_val => grp_max_val.ceil,
939
+ :min_val => 0,
940
+ :print_value => $heatmapvalues,
941
+ :print_gradient => false,
942
+ :title => stem,
943
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
944
+ end
945
+ end
946
+
947
+ if $heatmap == 1 or $heatmap == 2
948
+ file = "#{$heatmapstem}.#{$heatmapformat}"
949
+ heatmaps.heatmap(:columns => $heatmapcol,
950
+ :rvg_width => $rvg_width,
951
+ :max_val => grp_max_val.ceil,
952
+ :min_val => 0).write(file)
953
+
954
+ $logger.info "Generating heat maps in a file, #{file} done."
955
+ end
956
+
957
+ # total
827
958
  $outfh.puts '>Total'
828
- $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
829
- $logger.info 'Egor END.'
959
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
960
+ :row_header => $amino_acids)
961
+
962
+ if $heatmap == 0 or $heatmap == 2
963
+ stem = "#{group_matrices.size}. TOTAL"
964
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
965
+ :row_header => $amino_acids,
966
+ :rvg_width => $rvg_width,
967
+ :rvg_height => $rvg_height,
968
+ :canvas_width => $canvas_width,
969
+ :canvas_height => $canvas_height,
970
+ :max_val => $tot_cnt_mat.max.ceil,
971
+ :min_val => 0,
972
+ :print_value => $heatmapvalues,
973
+ :title => stem).write("#{stem}.#{$heatmapformat}")
974
+
975
+ $logger.info "Generating a heat map for #{stem} table done."
976
+ end
830
977
  exit 0
831
978
  end
832
979
 
833
- $logger.info "Counting substitutions is done."
834
-
835
980
  #
836
981
  # Part 5. END
837
982
  #
@@ -867,6 +1012,8 @@ HEADER
867
1012
  # re-calculate probability vector for each environment class
868
1013
  $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
869
1014
 
1015
+ group_matrices = []
1016
+
870
1017
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
871
1018
  grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
872
1019
  grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
@@ -878,10 +1025,63 @@ HEADER
878
1025
  end
879
1026
 
880
1027
  $tot_cnt_mat += grp_cnt_mat
1028
+ group_matrices << [group[0], grp_prob_mat]
1029
+ end
1030
+
1031
+ if $output == 1
1032
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1033
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1034
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1035
+
1036
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1037
+ # for a matrix file
1038
+ stem = "#{grp_no}. #{grp_label}"
1039
+ $outfh.puts ">#{grp_label} #{grp_no}"
1040
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1041
+ :row_header => $amino_acids)
1042
+
1043
+
1044
+ # for a heat map
1045
+ if $heatmap == 0 or $heatmap == 2
1046
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1047
+ :row_header => $amino_acids,
1048
+ :rvg_width => $rvg_width,
1049
+ :rvg_height => $rvg_height,
1050
+ :canvas_width => $canvas_width,
1051
+ :canvas_height => $canvas_height,
1052
+ :max_val => grp_max_val.ceil,
1053
+ :min_val => 0,
1054
+ :print_value => $heatmapvalues,
1055
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1056
+
1057
+ $logger.info "Generating a heat map for #{stem} table done."
1058
+ end
1059
+
1060
+ if $heatmap == 1 or $heatmap == 2
1061
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1062
+ :row_header => $amino_acids,
1063
+ :rvg_width => $rvg_width,
1064
+ :rvg_height => $rvg_height - 50,
1065
+ :canvas_width => $canvas_width,
1066
+ :canvas_height => $canvas_height - 50,
1067
+ :max_val => grp_max_val.ceil,
1068
+ :min_val => 0,
1069
+ :print_value => $heatmapvalues,
1070
+ :print_gradient => false,
1071
+ :title => stem,
1072
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1073
+ end
1074
+ end
881
1075
 
882
- if ($output == 1)
883
- $outfh.puts ">#{group[0]} #{group_no}"
884
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1076
+ # for heat maps in a single file
1077
+ if $heatmap == 1 or $heatmap == 2
1078
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1079
+ heatmaps.heatmap(:columns => $heatmapcol,
1080
+ :rvg_width => $rvg_width,
1081
+ :max_val => grp_max_val.ceil,
1082
+ :min_val => 0).write(file)
1083
+
1084
+ $logger.info "Generating heat maps in a file, #{file} done."
885
1085
  end
886
1086
  end
887
1087
 
@@ -892,15 +1092,32 @@ HEADER
892
1092
  0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
893
1093
  end
894
1094
 
895
- $logger.info 'Calculating substitution probabilities is done (no smoothing)'
896
-
897
- if ($output == 1)
1095
+ if $output == 1
898
1096
  $outfh.puts '>Total'
899
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1097
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1098
+ :row_header => $amino_acids)
900
1099
  $outfh.close
901
- $logger.info 'Egor END.'
1100
+
1101
+ # for a heat map
1102
+ if $heatmap == 0 or $heatmap == 2
1103
+ stem = "#{group_matrices.size}. TOTAL"
1104
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1105
+ :row_header => $amino_acids,
1106
+ :rvg_width => $rvg_width,
1107
+ :rvg_height => $rvg_height,
1108
+ :canvas_width => $canvas_width,
1109
+ :canvas_height => $canvas_height,
1110
+ :max_val => $tot_prob_mat.max.ceil,
1111
+ :min_val => 0,
1112
+ :print_value => $heatmapvalues,
1113
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1114
+
1115
+ $logger.info "Generating a heat map for #{stem} table done."
1116
+ end
902
1117
  exit 0
903
1118
  end
1119
+
1120
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
904
1121
  end
905
1122
 
906
1123
  # when smoothing!!!
@@ -980,7 +1197,7 @@ HEADER
980
1197
  end
981
1198
 
982
1199
  env_labels.combination(ci) do |c1|
983
- Enumerable.cart_prod(*c1).each do |labels|
1200
+ c1[0].product(*c1[1..-1]).each do |labels|
984
1201
  pattern = '.' * $env_features.size
985
1202
 
986
1203
  labels.each do |label|
@@ -1081,7 +1298,7 @@ HEADER
1081
1298
  end
1082
1299
  end
1083
1300
  end
1084
- $logger.info 'Calculating substitution probabilities is done (partial smoothing).'
1301
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1085
1302
  else
1086
1303
  $outfh.puts <<HEADER
1087
1304
  #
@@ -1116,7 +1333,7 @@ HEADER
1116
1333
  # full smoothing
1117
1334
  1.upto($env_features.size) do |ci|
1118
1335
  env_labels.combination(ci) do |c1|
1119
- Enumerable.cart_prod(*c1).each do |labels|
1336
+ c1[0].product(*c1[1..-1]).each do |labels|
1120
1337
  pattern = '.' * $env_features.size
1121
1338
  labels.each do |label|
1122
1339
  j = label[0].chr.to_i
@@ -1167,7 +1384,7 @@ HEADER
1167
1384
  end
1168
1385
  end
1169
1386
  end
1170
- $logger.info 'Calculating substitution probabilities is done (full smoothing).'
1387
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1171
1388
  end
1172
1389
 
1173
1390
  # updating smoothed probability array for each envrionment
@@ -1176,7 +1393,9 @@ HEADER
1176
1393
  end
1177
1394
 
1178
1395
  # sorting environments and build 21X21 substitution matrices
1179
- $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1396
+ group_matrices = []
1397
+
1398
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1180
1399
  # calculating 21X21 substitution probability matrix for each envrionment
1181
1400
  grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1182
1401
 
@@ -1185,9 +1404,62 @@ HEADER
1185
1404
  0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1186
1405
  end
1187
1406
 
1188
- if $output == 1
1189
- $outfh.puts ">#{group[0]} #{group_no}"
1190
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1407
+ group_matrices << [group[0], grp_prob_mat]
1408
+ end
1409
+
1410
+ if $output == 1
1411
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1412
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1413
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1414
+
1415
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1416
+ # for a matrix file
1417
+ stem = "#{grp_no}. #{grp_label}"
1418
+ $outfh.puts ">#{grp_label} #{grp_no}"
1419
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1420
+ :row_header => $amino_acids)
1421
+
1422
+ # for heat map generation
1423
+ if $heatmap == 0 or $heatmap == 2
1424
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1425
+ :row_header => $amino_acids,
1426
+ :rvg_width => $rvg_width,
1427
+ :rvg_height => $rvg_height,
1428
+ :canvas_width => $canvas_width,
1429
+ :canvas_height => $canvas_height,
1430
+ :max_val => grp_max_val.ceil,
1431
+ :min_val => 0,
1432
+ :print_value => $heatmapvalues,
1433
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1434
+
1435
+ $logger.info "Generating a heat map for #{stem} table done."
1436
+ end
1437
+
1438
+ if $heatmap == 1 or $heatmap == 2
1439
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1440
+ :row_header => $amino_acids,
1441
+ :rvg_width => $rvg_width,
1442
+ :rvg_height => $rvg_height - 50,
1443
+ :canvas_width => $canvas_width,
1444
+ :canvas_height => $canvas_height - 50,
1445
+ :max_val => grp_max_val.ceil,
1446
+ :min_val => 0,
1447
+ :print_value => $heatmapvalues,
1448
+ :print_gradient => false,
1449
+ :title => stem,
1450
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1451
+ end
1452
+ end
1453
+
1454
+ # for heat maps in a single file
1455
+ if $heatmap == 1 or $heatmap == 2
1456
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1457
+ heatmaps.heatmap(:columns => $heatmapcol,
1458
+ :rvg_width => $rvg_width,
1459
+ :max_val => grp_max_val.ceil,
1460
+ :min_val => 0).write(file)
1461
+
1462
+ $logger.info "Generating heat maps in a file, #{file} done."
1191
1463
  end
1192
1464
  end
1193
1465
 
@@ -1202,9 +1474,26 @@ HEADER
1202
1474
 
1203
1475
  if $output == 1
1204
1476
  $outfh.puts '>Total'
1205
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1477
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1478
+ :row_header => $amino_acids)
1206
1479
  $outfh.close
1207
- $logger.info 'Egor END.'
1480
+
1481
+ # for a heat map
1482
+ if $heatmap == 0 or $heatmap == 2
1483
+ stem = "#{group_matrices.size}. TOTAL"
1484
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1485
+ :row_header => $amino_acids,
1486
+ :rvg_width => $rvg_width,
1487
+ :rvg_height => $rvg_height,
1488
+ :canvas_width => $canvas_width,
1489
+ :canvas_height => $canvas_height,
1490
+ :max_val => $tot_prob_mat.max.ceil,
1491
+ :min_val => 0,
1492
+ :print_value => $heatmapvalues,
1493
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1494
+
1495
+ $logger.info "Generating a heat map for #{stem} table done."
1496
+ end
1208
1497
  exit 0
1209
1498
  end
1210
1499
  end
@@ -1242,16 +1531,18 @@ HEADER
1242
1531
  # calculating substitution probability matrix for each envrionment
1243
1532
  grp_label = group[0]
1244
1533
  grp_envs = group[1]
1245
- grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1534
+ grp_logo_mat = $cys == 0 ?
1535
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1536
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1246
1537
 
1247
1538
  $amino_acids.each_with_index do |aa, aj|
1248
1539
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1249
- #paj = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').sum / $tot_cnt_mat.sum
1250
- env.logo_array = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1540
+ env.logo_array = $cys == 0 ?
1541
+ NArray.float($amino_acids.size + 1) :
1542
+ NArray.float($amino_acids.size)
1251
1543
 
1252
1544
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1253
1545
  pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1254
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1255
1546
  odds = prob / pai
1256
1547
  env.logo_array[ai] = factor * Math::log(odds)
1257
1548
  grp_logo_mat[aj, ai] = env.logo_array[ai]
@@ -1262,7 +1553,6 @@ HEADER
1262
1553
  pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1263
1554
  prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1264
1555
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1265
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1266
1556
  odds = prob / pai
1267
1557
  env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1268
1558
  grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
@@ -1272,13 +1562,14 @@ HEADER
1272
1562
  grp_logo_mats << [grp_label, grp_logo_mat]
1273
1563
  end
1274
1564
 
1275
- $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1565
+ $tot_logo_mat = $cys == 0 ?
1566
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1567
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1276
1568
 
1277
1569
  $amino_acids.each_with_index do |aa1, aj|
1278
1570
  $amino_acids.each_with_index do |aa2, ai|
1279
1571
  prob = $tot_prob_mat[aj, ai]
1280
1572
  pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1281
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1282
1573
  odds = prob / pai
1283
1574
  $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1284
1575
  end
@@ -1287,7 +1578,6 @@ HEADER
1287
1578
  if $cys == 0
1288
1579
  pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1289
1580
  prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1290
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1291
1581
  odds = prob / pai
1292
1582
  $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1293
1583
  end
@@ -1315,7 +1605,7 @@ HEADER
1315
1605
  #
1316
1606
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1317
1607
  HEADER
1318
- unless $noround
1608
+ unless $noroundoff
1319
1609
  $outfh.puts <<HEADER
1320
1610
  # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1321
1611
  HEADER
@@ -1326,43 +1616,120 @@ HEADER
1326
1616
  #
1327
1617
  HEADER
1328
1618
 
1619
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1620
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1621
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1622
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1623
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1624
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1625
+
1329
1626
  grp_logo_mats.each_with_index do |arr, grp_no|
1330
1627
  grp_label = arr[0]
1331
1628
  grp_logo_mat = arr[1]
1629
+ stem = "#{grp_no}. #{grp_label}"
1332
1630
 
1333
- unless $noround
1631
+ unless $noroundoff
1334
1632
  grp_logo_mat = grp_logo_mat.round
1335
1633
  end
1336
1634
 
1635
+ # for a matrix file
1337
1636
  $outfh.puts ">#{grp_label} #{grp_no}"
1338
- if $cys
1339
- $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1340
- else
1341
- $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1637
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1638
+ :row_header => row_header)
1639
+ # for a heat map
1640
+ if $heatmap == 0 or $heatmap == 2
1641
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1642
+ :row_header => row_header,
1643
+ :rvg_width => $rvg_width,
1644
+ :rvg_height => $rvg_height,
1645
+ :canvas_width => $canvas_width,
1646
+ :canvas_height => $canvas_height,
1647
+ :gradient_beg_color => '#0000FF',
1648
+ :gradient_mid_color => '#FFFFFF',
1649
+ :gradient_end_color => '#FF0000',
1650
+ :max_val => abs_max_val.ceil,
1651
+ :mid_val => 0,
1652
+ :min_val => -1 * abs_max_val.ceil,
1653
+ :print_value => $heatmapvalues,
1654
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1655
+
1656
+ $logger.info "Generating a heat map for #{stem} table done."
1657
+ end
1658
+
1659
+ if $heatmap == 1 or $heatmap == 2
1660
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1661
+ :row_header => row_header,
1662
+ :rvg_width => $rvg_width,
1663
+ :rvg_height => $rvg_height - 50,
1664
+ :canvas_width => $canvas_width,
1665
+ :canvas_height => $canvas_height - 50,
1666
+ :gradient_beg_color => '#0000FF',
1667
+ :gradient_mid_color => '#FFFFFF',
1668
+ :gradient_end_color => '#FF0000',
1669
+ :max_val => abs_max_val.ceil,
1670
+ :mid_val => 0,
1671
+ :min_val => -1 * abs_max_val.ceil,
1672
+ :print_value => $heatmapvalues,
1673
+ :print_gradient => false,
1674
+ :title => stem,
1675
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1342
1676
  end
1343
1677
  end
1344
1678
 
1345
- $outfh.puts ">Total #{grp_logo_mats.size}"
1679
+ # for heat maps in a single file
1680
+ if $heatmap == 1 or $heatmap == 2
1681
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1682
+ heatmaps.heatmap(:columns => $heatmapcol,
1683
+ :rvg_width => $rvg_width,
1684
+ :gradient_beg_color => '#0000FF',
1685
+ :gradient_mid_color => '#FFFFFF',
1686
+ :gradient_end_color => '#FF0000',
1687
+ :max_val => abs_max_val.ceil,
1688
+ :mid_val => 0,
1689
+ :min_val => -1 * abs_max_val.ceil).write(file)
1690
+
1691
+ $logger.info "Generating heat maps in a file, #{file} done."
1692
+ end
1346
1693
 
1347
- unless $noround
1694
+ # for a matrix file
1695
+ unless $noroundoff
1348
1696
  $tot_logo_mat = $tot_logo_mat.round
1349
1697
  end
1350
1698
 
1351
- if $cys == 0
1352
- $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1353
- else
1354
- $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1699
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1700
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1701
+ :row_header => row_header)
1702
+
1703
+ # for a heat map
1704
+ if $heatmap == 0 or $heatmap == 2
1705
+ stem = "#{group_matrices.size}. TOTAL"
1706
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1707
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1708
+ :row_header => row_header,
1709
+ :rvg_width => $rvg_width,
1710
+ :rvg_height => $rvg_height,
1711
+ :canvas_width => $canvas_width,
1712
+ :canvas_height => $canvas_height,
1713
+ :gradient_beg_color => '#0000FF',
1714
+ :gradient_mid_color => '#FFFFFF',
1715
+ :gradient_end_color => '#FF0000',
1716
+ :max_val => tot_abs_max_val.ceil,
1717
+ :mid_val => 0,
1718
+ :min_val => -1 * tot_abs_max_val.ceil,
1719
+ :print_value => $heatmapvalues,
1720
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1721
+
1722
+ $logger.info "Generating a heat map for #{stem} table done."
1355
1723
  end
1356
1724
 
1357
- $logger.info "Calculating log odds ratio is done."
1358
-
1359
- #
1360
- # Part 7. END
1361
- #
1725
+ $logger.info "Calculating log odds ratios done."
1362
1726
  end
1363
1727
 
1728
+ #
1729
+ # Part 7. END
1730
+ #
1731
+
1364
1732
  $outfh.close
1365
- $logger.info "Egor END."
1366
1733
  exit 0
1367
1734
  end
1368
1735
  end