egor 0.0.5 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.5'
5
+ VERSION = '0.9.0'
6
6
  end
@@ -5,16 +5,17 @@ require 'narray'
5
5
  require 'bio'
6
6
  require 'set'
7
7
  require 'facets'
8
- require 'simple_memoize'
9
8
 
9
+ require 'math_extensions'
10
+ require 'string_extensions'
10
11
  require 'narray_extensions'
11
12
  require 'nmatrix_extensions'
12
- require 'enumerable_extensions'
13
- require 'math_extensions'
14
- require 'environment'
15
- require 'environment_class_hash'
16
- require 'environment_feature'
17
- require 'environment_feature_array'
13
+
14
+ require 'egor/environment'
15
+ require 'egor/environment_class_hash'
16
+ require 'egor/environment_feature'
17
+ require 'egor/environment_feature_array'
18
+ require 'egor/heatmap_array'
18
19
 
19
20
  # This is a module for an actual command line interpreter for Egor
20
21
  # ---
@@ -25,7 +26,7 @@ module Egor
25
26
 
26
27
  # :nodoc:
27
28
  def print_version
28
- puts Egor::VERSION
29
+ puts VERSION
29
30
  end
30
31
 
31
32
  # Print Egor's Usage on the screen
@@ -62,14 +63,26 @@ Options:
62
63
  0 for raw counts (no smoothing performed)
63
64
  1 for probabilities
64
65
  2 for log-odds (default)
65
- --noround: do not round off log odds ratio
66
+ --noroundoff: do not round off log odds ratio
66
67
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
67
68
  --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
68
69
  --autosigma: automatically adjust the sigma value for smoothing
69
70
  --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
70
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
71
71
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
72
72
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
73
+ --heatmap INTEGER:
74
+ 0 create a heat map file for each substitution table
75
+ 1 create one big file containing all heat maps from substitution tables
76
+ 2 do both 0 and 1
77
+ --heatmap-format INTEGER:
78
+ 0 for Portable Network Graphics (PNG) Format (default)
79
+ 1 for Graphics Interchange Format (GIF)
80
+ 2 for Joint Photographic Experts Group (JPEG) Format
81
+ 3 for Microsoft Windows bitmap (BMP) Format
82
+ 4 for Portable Document Format (PDF)
83
+ --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
84
+ --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
85
+ --heatmap-values: print values in the cells when generating heat maps
73
86
  --verbose (-v) INTEGER
74
87
  0 for ERROR level
75
88
  1 for WARN or above level (default)
@@ -87,12 +100,12 @@ Options:
87
100
  # Egor::CLI::calculate_pid(seq1, seq2) -> Float
88
101
  #
89
102
  def calculate_pid(seq1, seq2)
90
- s1 = seq1.split('')
91
- s2 = seq2.split('')
92
- cols = s1.zip(s2)
93
- align = 0
94
- ident = 0
95
- intgp = 0
103
+ aas1 = seq1.split('')
104
+ aas2 = seq2.split('')
105
+ cols = aas1.zip(aas2)
106
+ align = 0 # no. of aligned columns
107
+ ident = 0 # no. of identical columns
108
+ intgp = 0 # no. of internal gaps
96
109
 
97
110
  cols.each do |col|
98
111
  if (col[0] != '-') && (col[1] != '-')
@@ -100,14 +113,14 @@ Options:
100
113
  if col[0] == col[1]
101
114
  ident += 1
102
115
  end
103
- elsif (((col[0] == '-') && (col[1] != '-')) || ((col[0] != '-') && (col[1] == '-')))
116
+ elsif (((col[0] == '-') && (col[1] != '-')) ||
117
+ ((col[0] != '-') && (col[1] == '-')))
104
118
  intgp += 1
105
119
  end
106
120
  end
107
121
 
108
122
  pid = 100.0 * ident.to_f / (align + intgp)
109
123
  end
110
- memoize :calculate_pid
111
124
 
112
125
  # :nodoc:
113
126
  def execute(arguments=[])
@@ -121,7 +134,7 @@ Options:
121
134
  # aa: weighted amino acid
122
135
  # tot: total
123
136
  # rel: relative
124
- # obs: observation
137
+ # jnt: joint
125
138
  # cnt: count
126
139
  # mut: mutation
127
140
  # mutb: mutability
@@ -145,31 +158,42 @@ Options:
145
158
  $logger.level = Logger::WARN
146
159
 
147
160
  # default set of 21 amino acids including J (Cysteine, the free thiol form)
148
- $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
149
-
150
- $tem_list = nil
151
- $tem_file = nil
152
- $classdef = 'classdef.dat'
153
- $outfile = 'allmat.dat'
154
- $outfh = nil # file hanfle for outfile
155
- $output = 2 # default: log odds matrix
156
- $ali_size = 0
157
- $tot_aa = 0
158
- $sigma = 5.0
159
- $autosigma = false
160
- $weight = 60
161
- $noweight = false
162
- $smooth = :partial
163
- $nosmooth = false
164
- $noround = false
165
- $p1smooth = false
166
- $scale = 3
167
- $pidmin = nil
168
- $pidmax = nil
169
- $scale = 3
170
- $add = nil
171
- $cys = 0
172
- $penv = false
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file handle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalues = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
173
197
 
174
198
  $aa_tot_cnt = Hash.new(0)
175
199
  $aa_mut_cnt = Hash.new(0)
@@ -184,7 +208,7 @@ Options:
184
208
  $tot_smooth_prob = {}
185
209
 
186
210
  # minimum ratio of amino acid count to sigma value
187
- $min_obs_sigma_ratio = 500.0
211
+ $min_cnt_sigma_ratio = 500.0
188
212
 
189
213
  #
190
214
  # Part 1 END
@@ -205,11 +229,16 @@ Options:
205
229
  [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
206
230
  [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
207
231
  [ '--noweight', GetoptLong::NO_ARGUMENT ],
208
- [ '--noround', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
209
233
  [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
210
234
  [ '--autosigma', GetoptLong::NO_ARGUMENT ],
211
- #[ '--heatmap', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
212
240
  [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
213
242
  [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
214
243
  [ '--penv', GetoptLong::NO_ARGUMENT ],
215
244
  [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
@@ -235,6 +264,8 @@ Options:
235
264
  $outfile = arg
236
265
  when '--cys'
237
266
  $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
238
269
  when '--weight'
239
270
  $weight = arg.to_i
240
271
  when '--sigma'
@@ -247,8 +278,8 @@ Options:
247
278
  $pidmax = arg.to_f
248
279
  when '--noweight'
249
280
  $noweight = true
250
- when '--noround'
251
- $noround = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
252
283
  when '--smooth'
253
284
  $smooth = (arg.to_i == 1) ? :full : :partial
254
285
  when '--nosmooth'
@@ -260,18 +291,42 @@ Options:
260
291
  when '--add'
261
292
  $add = arg.to_f
262
293
  when '--penv'
263
- warn "--penv option is not supported yet."
294
+ warn "--penv option is not supported."
264
295
  exit 1
265
296
  $penv = true
266
- # when '--heatmap'
267
- # $heatmap = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit 1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-values'
320
+ $heatmapvalues = true
268
321
  when '--verbose'
269
322
  $logger.level = case arg.to_i
270
323
  when 0 then Logger::ERROR
271
324
  when 1 then Logger::WARN
272
325
  when 2 then Logger::INFO
273
326
  when 3 then Logger::DEBUG
274
- else Logger::WARN
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
275
330
  end
276
331
  when '--version'
277
332
  print_version
@@ -284,7 +339,9 @@ Options:
284
339
  end
285
340
 
286
341
  # when arguments are nonsense, print usage
287
- if ((ARGV.length != 0) || (!$tem_list && !$tem_file) || ($tem_list && $tem_file))
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
288
345
  print_usage
289
346
  exit 1
290
347
  end
@@ -305,7 +362,6 @@ Options:
305
362
  exit 1
306
363
  end
307
364
 
308
-
309
365
  #
310
366
  # Part 2 END
311
367
  #
@@ -316,23 +372,28 @@ Options:
316
372
  # Reading Environment Class Definition File
317
373
  #
318
374
 
319
- $logger.info "Egor START."
320
-
321
375
  # check --cys option and modify amino_acids set if necessary
322
376
  if $cys == 2
323
377
  $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
324
378
  end
325
379
 
326
- # create an EnvironmentFeatureList object for storing all environment features
380
+ # create an EnvironmentFeatureArray object for storing all environment
381
+ # features
327
382
  $env_features = EnvironmentFeatureArray.new
328
383
 
329
384
  # an array for storing indexes of constrained environment features
330
385
  $cst_features = []
331
386
 
332
- # add substituted amino acid (aa1) in a substitution to the environment feature list
333
- $env_features << EnvironmentFeature.new('sequence', $amino_acids, $amino_acids, 'F', 'F')
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
334
394
 
335
- # read environment class definiton file and store them into the hash prepared above
395
+ # read environment class definition file and store them into
396
+ # the hash prepared above
336
397
  env_index = 1
337
398
 
338
399
  IO.foreach($classdef) do |line|
@@ -350,10 +411,15 @@ Options:
350
411
  $cst_features << env_index
351
412
  $logger.warn "The environment feature, #{line} constrained."
352
413
  end
353
- $env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(''), env_ftr[2].split(''), env_ftr[3], env_ftr[4])
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
354
419
  env_index += 1
355
420
  else
356
- $logger.error "\"#{line}\" doesn't seem to be a proper format for a environment class definition."
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for " +
422
+ "a environment class definition."
357
423
  exit 1
358
424
  end
359
425
  end
@@ -361,9 +427,13 @@ Options:
361
427
  # a hash for storing all environment classes
362
428
  $env_classes = EnvironmentClassHash.new
363
429
 
364
- # generate all possible combinations of environment labels, and store every environment class into the hash prepared above with the label as a key
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
365
433
  $env_features.label_combinations.each_with_index { |e, i|
366
- $env_classes[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
367
437
  }
368
438
 
369
439
  #
@@ -390,19 +460,17 @@ Options:
390
460
  $tem_list_io.each_line do |tem_file|
391
461
  tem_file.chomp!
392
462
 
393
- $logger.info "Analysing #{tem_file} ..."
394
-
395
463
  ali = Bio::Alignment::OriginalAlignment.new
396
464
  ff = Bio::FlatFile.auto(tem_file)
397
465
 
398
466
  ff.each_entry do |pir|
399
467
  if (pir.definition == 'sequence') || (pir.definition == 'structure')
400
- ali.add_seq(pir.data.gsub("\n", ''), pir.entry_id)
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
401
469
  end
402
470
  end
403
471
 
404
472
  if ali.size < 2
405
- $logger.warn "Skipped #{tem_file}, there is only one unique entry."
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
406
474
  next
407
475
  end
408
476
 
@@ -414,8 +482,10 @@ Options:
414
482
  # check disulphide bond environment first!
415
483
  ff.rewind
416
484
  ff.each_entry do |pir|
417
- if (pir.entry_id == key) && ((pir.definition == "disulphide") || (pir.definition == "disulfide"))
418
- disulphide[key] = pir.data.gsub("\n", '').split('')
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
419
489
  end
420
490
  end
421
491
 
@@ -425,14 +495,16 @@ Options:
425
495
  ff.rewind
426
496
  ff.each_entry do |pir|
427
497
  if (pir.entry_id == key) && (pir.definition == ec.name)
428
- labels = pir.data.gsub("\n", '').split('').map_with_index do |sym, pos|
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
429
499
  if sym == '-'
430
500
  '-'
431
501
  elsif sym == 'X' || sym == 'x'
432
502
  'X'
433
503
  else
434
504
  if ei == 0 # Amino Acid Environment Feature
435
- (disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C')) ? 'J' : sym
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
436
508
  else
437
509
  ec.labels[ec.symbols.index(sym)]
438
510
  end
@@ -442,7 +514,9 @@ Options:
442
514
  if env_labels[key].empty?
443
515
  env_labels[key] = labels
444
516
  else
445
- env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
446
520
  end
447
521
  end
448
522
  end
@@ -459,13 +533,15 @@ Options:
459
533
 
460
534
  # check PID_MIN
461
535
  if $pidmin && (pid < $pidmin)
462
- $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
463
538
  next
464
539
  end
465
540
 
466
541
  # check PID_MAX
467
542
  if $pidmax && (pid > $pidmax)
468
- $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
469
545
  next
470
546
  end
471
547
 
@@ -574,7 +650,7 @@ Options:
574
650
  end while(continue)
575
651
 
576
652
  if clusters.size < 2
577
- $logger.debug "Skipped #{tem_file} because there is only one cluster at the #{$weight} PID level."
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
578
654
  next
579
655
  end
580
656
 
@@ -610,16 +686,16 @@ Options:
610
686
 
611
687
  aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
612
688
  aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
613
- obs1 = 1.0 / cluster1.size
614
- obs2 = 1.0 / cluster2.size
615
- obs_cnt = obs1 * obs2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
616
692
 
617
693
  if $cst_features.empty?
618
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
619
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
620
696
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
621
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
622
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
623
699
  else
624
700
  $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
625
701
  next
@@ -630,64 +706,65 @@ Options:
630
706
 
631
707
  if $aa_env_cnt.has_key? grp_label1
632
708
  if $aa_env_cnt[grp_label1].has_key? aa1
633
- $aa_env_cnt[grp_label1][aa1] += obs1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
634
710
  else
635
- $aa_env_cnt[grp_label1][aa1] = obs1
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
636
712
  end
637
713
  else
638
714
  $aa_env_cnt[grp_label1] = Hash.new(0.0)
639
- $aa_env_cnt[grp_label1][aa1] = obs1
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
640
716
  end
641
717
 
642
718
  if $aa_env_cnt.has_key? grp_label2
643
719
  if $aa_env_cnt[grp_label2].has_key? aa2
644
- $aa_env_cnt[grp_label2][aa2] += obs2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
645
721
  else
646
- $aa_env_cnt[grp_label2][aa2] = obs2
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
647
723
  end
648
724
  else
649
725
  $aa_env_cnt[grp_label2] = Hash.new(0.0)
650
- $aa_env_cnt[grp_label2][aa2] = obs2
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
651
727
  end
652
728
 
653
729
  if $aa_tot_cnt.has_key? aa1
654
- $aa_tot_cnt[aa1] += obs1
730
+ $aa_tot_cnt[aa1] += cnt1
655
731
  else
656
- $aa_tot_cnt[aa1] = obs1
732
+ $aa_tot_cnt[aa1] = cnt1
657
733
  end
658
734
 
659
735
  if $aa_tot_cnt.has_key? aa2
660
- $aa_tot_cnt[aa2] += obs2
736
+ $aa_tot_cnt[aa2] += cnt2
661
737
  else
662
- $aa_tot_cnt[aa2] = obs2
738
+ $aa_tot_cnt[aa2] = cnt2
663
739
  end
664
740
 
665
741
  if aa1 != aa2
666
742
  if $aa_mut_cnt.has_key? aa1
667
- $aa_mut_cnt[aa1] += obs1
743
+ $aa_mut_cnt[aa1] += cnt1
668
744
  else
669
- $aa_mut_cnt[aa1] = obs1
745
+ $aa_mut_cnt[aa1] = cnt1
670
746
  end
671
747
  if $aa_mut_cnt.has_key? aa2
672
- $aa_mut_cnt[aa2] += obs2
748
+ $aa_mut_cnt[aa2] += cnt2
673
749
  else
674
- $aa_mut_cnt[aa2] = obs2
750
+ $aa_mut_cnt[aa2] = cnt2
675
751
  end
676
752
  end
677
753
 
678
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
679
- $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
680
756
  end
681
757
  end
682
758
  end
683
759
  end
684
760
  end
761
+ $logger.info "Analysing #{tem_file} done."
685
762
  end
686
763
 
687
764
  # print out default header
688
765
  $outfh.puts <<HEADER
689
766
  # Environment-specific amino acid substitution matrices
690
- # Creator: egor version #{Egor::VERSION}
767
+ # Creator: egor version #{VERSION}
691
768
  # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
692
769
  #
693
770
  # Definitions for structural environments:
@@ -739,20 +816,20 @@ HEADER
739
816
  $outfh.puts "# Total amino acid frequencies:\n"
740
817
  $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
741
818
 
742
- min_obs = -1
819
+ min_cnt = -1
743
820
  min_sigma = nil
744
821
 
745
822
  $amino_acids.each do |res|
746
- if ($aa_tot_cnt[res] / $sigma) < $min_obs_sigma_ratio
747
- if min_obs < 0
748
- min_obs = $aa_tot_cnt[res]
749
- min_sigma = min_obs / $min_obs_sigma_ratio
750
- elsif (min_obs > 0) && (min_obs > $aa_tot_cnt[res])
751
- min_obs = $aa_tot_cnt[res]
752
- min_sigma = min_obs / $min_obs_sigma_ratio
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
753
830
  end
754
831
 
755
- $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total observation (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
756
833
  end
757
834
 
758
835
  $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
@@ -770,7 +847,7 @@ HEADER
770
847
  end
771
848
  end
772
849
 
773
- if min_obs > -1
850
+ if min_cnt > -1
774
851
  $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
775
852
  if $autosigma
776
853
  $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
@@ -780,12 +857,13 @@ HEADER
780
857
 
781
858
  $outfh.puts '#'
782
859
  $outfh.puts '# RES: Amino acid one letter code'
783
- $outfh.puts '# TOT_OBS: Total observations of incidence'
784
- $outfh.puts '# MUT_OBS: Total observations of mutation'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
785
862
  $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
786
- $outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
787
864
  $outfh.puts '# REL_FREQ: Relative frequency'
788
865
  $outfh.puts '#'
866
+
789
867
  #
790
868
  # Part 4. END
791
869
  #
@@ -804,7 +882,8 @@ HEADER
804
882
  end
805
883
 
806
884
  # count raw frequencies
807
- $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
808
887
 
809
888
  # for each combination of environment features
810
889
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
@@ -816,22 +895,88 @@ HEADER
816
895
  end
817
896
 
818
897
  $tot_cnt_mat += grp_cnt_mat
819
-
820
- if $output == 0
821
- $outfh.puts ">#{group[0]} #{group_no}"
822
- $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
823
- end
898
+ group_matrices << [group[0], grp_cnt_mat]
824
899
  end
825
900
 
901
+ $logger.info "Counting substitutions done."
902
+
826
903
  if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalues,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
933
+ :row_header => $amino_acids,
934
+ :rvg_width => $rvg_width,
935
+ :rvg_height => $rvg_height - 50,
936
+ :canvas_width => $canvas_width,
937
+ :canvas_height => $canvas_height - 50,
938
+ :max_val => grp_max_val.ceil,
939
+ :min_val => 0,
940
+ :print_value => $heatmapvalues,
941
+ :print_gradient => false,
942
+ :title => stem,
943
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
944
+ end
945
+ end
946
+
947
+ if $heatmap == 1 or $heatmap == 2
948
+ file = "#{$heatmapstem}.#{$heatmapformat}"
949
+ heatmaps.heatmap(:columns => $heatmapcol,
950
+ :rvg_width => $rvg_width,
951
+ :max_val => grp_max_val.ceil,
952
+ :min_val => 0).write(file)
953
+
954
+ $logger.info "Generating heat maps in a file, #{file} done."
955
+ end
956
+
957
+ # total
827
958
  $outfh.puts '>Total'
828
- $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
829
- $logger.info 'Egor END.'
959
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
960
+ :row_header => $amino_acids)
961
+
962
+ if $heatmap == 0 or $heatmap == 2
963
+ stem = "#{group_matrices.size}. TOTAL"
964
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
965
+ :row_header => $amino_acids,
966
+ :rvg_width => $rvg_width,
967
+ :rvg_height => $rvg_height,
968
+ :canvas_width => $canvas_width,
969
+ :canvas_height => $canvas_height,
970
+ :max_val => $tot_cnt_mat.max.ceil,
971
+ :min_val => 0,
972
+ :print_value => $heatmapvalues,
973
+ :title => stem).write("#{stem}.#{$heatmapformat}")
974
+
975
+ $logger.info "Generating a heat map for #{stem} table done."
976
+ end
830
977
  exit 0
831
978
  end
832
979
 
833
- $logger.info "Counting substitutions is done."
834
-
835
980
  #
836
981
  # Part 5. END
837
982
  #
@@ -867,6 +1012,8 @@ HEADER
867
1012
  # re-calculate probability vector for each environment class
868
1013
  $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
869
1014
 
1015
+ group_matrices = []
1016
+
870
1017
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
871
1018
  grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
872
1019
  grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
@@ -878,10 +1025,63 @@ HEADER
878
1025
  end
879
1026
 
880
1027
  $tot_cnt_mat += grp_cnt_mat
1028
+ group_matrices << [group[0], grp_prob_mat]
1029
+ end
1030
+
1031
+ if $output == 1
1032
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1033
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1034
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1035
+
1036
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1037
+ # for a matrix file
1038
+ stem = "#{grp_no}. #{grp_label}"
1039
+ $outfh.puts ">#{grp_label} #{grp_no}"
1040
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1041
+ :row_header => $amino_acids)
1042
+
1043
+
1044
+ # for a heat map
1045
+ if $heatmap == 0 or $heatmap == 2
1046
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1047
+ :row_header => $amino_acids,
1048
+ :rvg_width => $rvg_width,
1049
+ :rvg_height => $rvg_height,
1050
+ :canvas_width => $canvas_width,
1051
+ :canvas_height => $canvas_height,
1052
+ :max_val => grp_max_val.ceil,
1053
+ :min_val => 0,
1054
+ :print_value => $heatmapvalues,
1055
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1056
+
1057
+ $logger.info "Generating a heat map for #{stem} table done."
1058
+ end
1059
+
1060
+ if $heatmap == 1 or $heatmap == 2
1061
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1062
+ :row_header => $amino_acids,
1063
+ :rvg_width => $rvg_width,
1064
+ :rvg_height => $rvg_height - 50,
1065
+ :canvas_width => $canvas_width,
1066
+ :canvas_height => $canvas_height - 50,
1067
+ :max_val => grp_max_val.ceil,
1068
+ :min_val => 0,
1069
+ :print_value => $heatmapvalues,
1070
+ :print_gradient => false,
1071
+ :title => stem,
1072
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1073
+ end
1074
+ end
881
1075
 
882
- if ($output == 1)
883
- $outfh.puts ">#{group[0]} #{group_no}"
884
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1076
+ # for heat maps in a single file
1077
+ if $heatmap == 1 or $heatmap == 2
1078
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1079
+ heatmaps.heatmap(:columns => $heatmapcol,
1080
+ :rvg_width => $rvg_width,
1081
+ :max_val => grp_max_val.ceil,
1082
+ :min_val => 0).write(file)
1083
+
1084
+ $logger.info "Generating heat maps in a file, #{file} done."
885
1085
  end
886
1086
  end
887
1087
 
@@ -892,15 +1092,32 @@ HEADER
892
1092
  0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
893
1093
  end
894
1094
 
895
- $logger.info 'Calculating substitution probabilities is done (no smoothing)'
896
-
897
- if ($output == 1)
1095
+ if $output == 1
898
1096
  $outfh.puts '>Total'
899
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1097
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1098
+ :row_header => $amino_acids)
900
1099
  $outfh.close
901
- $logger.info 'Egor END.'
1100
+
1101
+ # for a heat map
1102
+ if $heatmap == 0 or $heatmap == 2
1103
+ stem = "#{group_matrices.size}. TOTAL"
1104
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1105
+ :row_header => $amino_acids,
1106
+ :rvg_width => $rvg_width,
1107
+ :rvg_height => $rvg_height,
1108
+ :canvas_width => $canvas_width,
1109
+ :canvas_height => $canvas_height,
1110
+ :max_val => $tot_prob_mat.max.ceil,
1111
+ :min_val => 0,
1112
+ :print_value => $heatmapvalues,
1113
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1114
+
1115
+ $logger.info "Generating a heat map for #{stem} table done."
1116
+ end
902
1117
  exit 0
903
1118
  end
1119
+
1120
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
904
1121
  end
905
1122
 
906
1123
  # when smoothing!!!
@@ -980,7 +1197,7 @@ HEADER
980
1197
  end
981
1198
 
982
1199
  env_labels.combination(ci) do |c1|
983
- Enumerable.cart_prod(*c1).each do |labels|
1200
+ c1[0].product(*c1[1..-1]).each do |labels|
984
1201
  pattern = '.' * $env_features.size
985
1202
 
986
1203
  labels.each do |label|
@@ -1081,7 +1298,7 @@ HEADER
1081
1298
  end
1082
1299
  end
1083
1300
  end
1084
- $logger.info 'Calculating substitution probabilities is done (partial smoothing).'
1301
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1085
1302
  else
1086
1303
  $outfh.puts <<HEADER
1087
1304
  #
@@ -1116,7 +1333,7 @@ HEADER
1116
1333
  # full smoothing
1117
1334
  1.upto($env_features.size) do |ci|
1118
1335
  env_labels.combination(ci) do |c1|
1119
- Enumerable.cart_prod(*c1).each do |labels|
1336
+ c1[0].product(*c1[1..-1]).each do |labels|
1120
1337
  pattern = '.' * $env_features.size
1121
1338
  labels.each do |label|
1122
1339
  j = label[0].chr.to_i
@@ -1167,7 +1384,7 @@ HEADER
1167
1384
  end
1168
1385
  end
1169
1386
  end
1170
- $logger.info 'Calculating substitution probabilities is done (full smoothing).'
1387
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1171
1388
  end
1172
1389
 
1173
1390
  # updating smoothed probability array for each environment
@@ -1176,7 +1393,9 @@ HEADER
1176
1393
  end
1177
1394
 
1178
1395
  # sorting environments and build 21X21 substitution matrices
1179
- $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1396
+ group_matrices = []
1397
+
1398
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1180
1399
  # calculating 21X21 substitution probability matrix for each environment
1181
1400
  grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1182
1401
 
@@ -1185,9 +1404,62 @@ HEADER
1185
1404
  0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1186
1405
  end
1187
1406
 
1188
- if $output == 1
1189
- $outfh.puts ">#{group[0]} #{group_no}"
1190
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1407
+ group_matrices << [group[0], grp_prob_mat]
1408
+ end
1409
+
1410
+ if $output == 1
1411
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1412
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1413
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1414
+
1415
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1416
+ # for a matrix file
1417
+ stem = "#{grp_no}. #{grp_label}"
1418
+ $outfh.puts ">#{grp_label} #{grp_no}"
1419
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1420
+ :row_header => $amino_acids)
1421
+
1422
+ # for heat map generation
1423
+ if $heatmap == 0 or $heatmap == 2
1424
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1425
+ :row_header => $amino_acids,
1426
+ :rvg_width => $rvg_width,
1427
+ :rvg_height => $rvg_height,
1428
+ :canvas_width => $canvas_width,
1429
+ :canvas_height => $canvas_height,
1430
+ :max_val => grp_max_val.ceil,
1431
+ :min_val => 0,
1432
+ :print_value => $heatmapvalues,
1433
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1434
+
1435
+ $logger.info "Generating a heat map for #{stem} table done."
1436
+ end
1437
+
1438
+ if $heatmap == 1 or $heatmap == 2
1439
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1440
+ :row_header => $amino_acids,
1441
+ :rvg_width => $rvg_width,
1442
+ :rvg_height => $rvg_height - 50,
1443
+ :canvas_width => $canvas_width,
1444
+ :canvas_height => $canvas_height - 50,
1445
+ :max_val => grp_max_val.ceil,
1446
+ :min_val => 0,
1447
+ :print_value => $heatmapvalues,
1448
+ :print_gradient => false,
1449
+ :title => stem,
1450
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1451
+ end
1452
+ end
1453
+
1454
+ # for heat maps in a single file
1455
+ if $heatmap == 1 or $heatmap == 2
1456
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1457
+ heatmaps.heatmap(:columns => $heatmapcol,
1458
+ :rvg_width => $rvg_width,
1459
+ :max_val => grp_max_val.ceil,
1460
+ :min_val => 0).write(file)
1461
+
1462
+ $logger.info "Generating heat maps in a file, #{file} done."
1191
1463
  end
1192
1464
  end
1193
1465
 
@@ -1202,9 +1474,26 @@ HEADER
1202
1474
 
1203
1475
  if $output == 1
1204
1476
  $outfh.puts '>Total'
1205
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1477
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1478
+ :row_header => $amino_acids)
1206
1479
  $outfh.close
1207
- $logger.info 'Egor END.'
1480
+
1481
+ # for a heat map
1482
+ if $heatmap == 0 or $heatmap == 2
1483
+ stem = "#{group_matrices.size}. TOTAL"
1484
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1485
+ :row_header => $amino_acids,
1486
+ :rvg_width => $rvg_width,
1487
+ :rvg_height => $rvg_height,
1488
+ :canvas_width => $canvas_width,
1489
+ :canvas_height => $canvas_height,
1490
+ :max_val => $tot_prob_mat.max.ceil,
1491
+ :min_val => 0,
1492
+ :print_value => $heatmapvalues,
1493
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1494
+
1495
+ $logger.info "Generating a heat map for #{stem} table done."
1496
+ end
1208
1497
  exit 0
1209
1498
  end
1210
1499
  end
@@ -1242,16 +1531,18 @@ HEADER
1242
1531
  # calculating substitution probability matrix for each environment
1243
1532
  grp_label = group[0]
1244
1533
  grp_envs = group[1]
1245
- grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1534
+ grp_logo_mat = $cys == 0 ?
1535
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1536
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1246
1537
 
1247
1538
  $amino_acids.each_with_index do |aa, aj|
1248
1539
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1249
- #paj = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').sum / $tot_cnt_mat.sum
1250
- env.logo_array = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1540
+ env.logo_array = $cys == 0 ?
1541
+ NArray.float($amino_acids.size + 1) :
1542
+ NArray.float($amino_acids.size)
1251
1543
 
1252
1544
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1253
1545
  pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1254
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1255
1546
  odds = prob / pai
1256
1547
  env.logo_array[ai] = factor * Math::log(odds)
1257
1548
  grp_logo_mat[aj, ai] = env.logo_array[ai]
@@ -1262,7 +1553,6 @@ HEADER
1262
1553
  pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1263
1554
  prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1264
1555
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1265
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1266
1556
  odds = prob / pai
1267
1557
  env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1268
1558
  grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
@@ -1272,13 +1562,14 @@ HEADER
1272
1562
  grp_logo_mats << [grp_label, grp_logo_mat]
1273
1563
  end
1274
1564
 
1275
- $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1565
+ $tot_logo_mat = $cys == 0 ?
1566
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1567
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1276
1568
 
1277
1569
  $amino_acids.each_with_index do |aa1, aj|
1278
1570
  $amino_acids.each_with_index do |aa2, ai|
1279
1571
  prob = $tot_prob_mat[aj, ai]
1280
1572
  pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1281
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1282
1573
  odds = prob / pai
1283
1574
  $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1284
1575
  end
@@ -1287,7 +1578,6 @@ HEADER
1287
1578
  if $cys == 0
1288
1579
  pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1289
1580
  prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1290
- #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1291
1581
  odds = prob / pai
1292
1582
  $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1293
1583
  end
@@ -1315,7 +1605,7 @@ HEADER
1315
1605
  #
1316
1606
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1317
1607
  HEADER
1318
- unless $noround
1608
+ unless $noroundoff
1319
1609
  $outfh.puts <<HEADER
1320
1610
  # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1321
1611
  HEADER
@@ -1326,43 +1616,120 @@ HEADER
1326
1616
  #
1327
1617
  HEADER
1328
1618
 
1619
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1620
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1621
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1622
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1623
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1624
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1625
+
1329
1626
  grp_logo_mats.each_with_index do |arr, grp_no|
1330
1627
  grp_label = arr[0]
1331
1628
  grp_logo_mat = arr[1]
1629
+ stem = "#{grp_no}. #{grp_label}"
1332
1630
 
1333
- unless $noround
1631
+ unless $noroundoff
1334
1632
  grp_logo_mat = grp_logo_mat.round
1335
1633
  end
1336
1634
 
1635
+ # for a matrix file
1337
1636
  $outfh.puts ">#{grp_label} #{grp_no}"
1338
- if $cys
1339
- $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1340
- else
1341
- $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1637
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1638
+ :row_header => row_header)
1639
+ # for a heat map
1640
+ if $heatmap == 0 or $heatmap == 2
1641
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1642
+ :row_header => row_header,
1643
+ :rvg_width => $rvg_width,
1644
+ :rvg_height => $rvg_height,
1645
+ :canvas_width => $canvas_width,
1646
+ :canvas_height => $canvas_height,
1647
+ :gradient_beg_color => '#0000FF',
1648
+ :gradient_mid_color => '#FFFFFF',
1649
+ :gradient_end_color => '#FF0000',
1650
+ :max_val => abs_max_val.ceil,
1651
+ :mid_val => 0,
1652
+ :min_val => -1 * abs_max_val.ceil,
1653
+ :print_value => $heatmapvalues,
1654
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1655
+
1656
+ $logger.info "Generating a heat map for #{stem} table done."
1657
+ end
1658
+
1659
+ if $heatmap == 1 or $heatmap == 2
1660
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1661
+ :row_header => row_header,
1662
+ :rvg_width => $rvg_width,
1663
+ :rvg_height => $rvg_height - 50,
1664
+ :canvas_width => $canvas_width,
1665
+ :canvas_height => $canvas_height - 50,
1666
+ :gradient_beg_color => '#0000FF',
1667
+ :gradient_mid_color => '#FFFFFF',
1668
+ :gradient_end_color => '#FF0000',
1669
+ :max_val => abs_max_val.ceil,
1670
+ :mid_val => 0,
1671
+ :min_val => -1 * abs_max_val.ceil,
1672
+ :print_value => $heatmapvalues,
1673
+ :print_gradient => false,
1674
+ :title => stem,
1675
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
1342
1676
  end
1343
1677
  end
1344
1678
 
1345
- $outfh.puts ">Total #{grp_logo_mats.size}"
1679
+ # for heat maps in a single file
1680
+ if $heatmap == 1 or $heatmap == 2
1681
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1682
+ heatmaps.heatmap(:columns => $heatmapcol,
1683
+ :rvg_width => $rvg_width,
1684
+ :gradient_beg_color => '#0000FF',
1685
+ :gradient_mid_color => '#FFFFFF',
1686
+ :gradient_end_color => '#FF0000',
1687
+ :max_val => abs_max_val.ceil,
1688
+ :mid_val => 0,
1689
+ :min_val => -1 * abs_max_val.ceil).write(file)
1690
+
1691
+ $logger.info "Generating heat maps in a file, #{file} done."
1692
+ end
1346
1693
 
1347
- unless $noround
1694
+ # for a matrix file
1695
+ unless $noroundoff
1348
1696
  $tot_logo_mat = $tot_logo_mat.round
1349
1697
  end
1350
1698
 
1351
- if $cys == 0
1352
- $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1353
- else
1354
- $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1699
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1700
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1701
+ :row_header => row_header)
1702
+
1703
+ # for a heat map
1704
+ if $heatmap == 0 or $heatmap == 2
1705
+ stem = "#{group_matrices.size}. TOTAL"
1706
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1707
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1708
+ :row_header => row_header,
1709
+ :rvg_width => $rvg_width,
1710
+ :rvg_height => $rvg_height,
1711
+ :canvas_width => $canvas_width,
1712
+ :canvas_height => $canvas_height,
1713
+ :gradient_beg_color => '#0000FF',
1714
+ :gradient_mid_color => '#FFFFFF',
1715
+ :gradient_end_color => '#FF0000',
1716
+ :max_val => tot_abs_max_val.ceil,
1717
+ :mid_val => 0,
1718
+ :min_val => -1 * tot_abs_max_val.ceil,
1719
+ :print_value => $heatmapvalues,
1720
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1721
+
1722
+ $logger.info "Generating a heat map for #{stem} table done."
1355
1723
  end
1356
1724
 
1357
- $logger.info "Calculating log odds ratio is done."
1358
-
1359
- #
1360
- # Part 7. END
1361
- #
1725
+ $logger.info "Calculating log odds ratios done."
1362
1726
  end
1363
1727
 
1728
+ #
1729
+ # Part 7. END
1730
+ #
1731
+
1364
1732
  $outfh.close
1365
- $logger.info "Egor END."
1366
1733
  exit 0
1367
1734
  end
1368
1735
  end