ulla 0.9.6 → 0.9.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,10 @@
1
+ == 0.9.7 30/05/2009
2
+
3
+ * Added --environment option to consider not only substituted amino acids' environments but also substituting amino acids' environments
4
+ * Changed the default pseudocount from (1 / # of environment classes) to 0; a pseudocount now needs to be explicitly provided with the --add option
5
+ * Changed the fixed column size (7) of raw count matrices to vary depending on the number of digits of a maximum amino acid count
6
+ * Fixed a bug in reading an environment feature definition file (empty line)
7
+
1
8
  == 0.9.6 18/03/2009
2
9
 
3
10
  * Warns if Rmagick is not properly installed or missing.
@@ -18,7 +18,7 @@ http://www-cryst.bioc.cam.ac.uk/ulla
18
18
 
19
19
  == Requirements
20
20
 
21
- * ruby 1.8.7 or above (http://www.ruby-lang.org)
21
+ * ruby 1.8.7 or above (1.9.0 or above recommended, http://www.ruby-lang.org)
22
22
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems)
23
23
 
24
24
  Following RubyGems will be automatically installed if you have rubygems installed on your machine
@@ -4,16 +4,19 @@ require 'facets'
4
4
 
5
5
  module NArrayExtensions
6
6
 
7
- def pretty_string(opts={})
8
- { :col_header => nil,
9
- :row_header => nil }.merge!(opts)
7
+ def pretty_string(options={})
8
+ opts = {:col_header => nil,
9
+ :row_header => nil,
10
+ :col_size => 7}.merge(options)
10
11
 
11
- ("%-3s" % "#") + opts[:col_header].inject("") { |s, a| s + ("%7s" % a) } + "\n" +
12
+ ("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
13
+ s + ("%#{opts[:col_size]}s" % a)
14
+ } + "\n" +
12
15
  self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
13
16
  if v.is_a? Float
14
- s + ("%7.2f" % v)
17
+ s + ("%#{opts[:col_size]}.2f" % v)
15
18
  else
16
- s + ("%7d" % v)
19
+ s + ("%#{opts[:col_size]}d" % v)
17
20
  end
18
21
  }
19
22
  end
@@ -13,17 +13,18 @@ end
13
13
  module NMatrixExtensions
14
14
 
15
15
  def pretty_string(options={})
16
- opts = {:col_header => nil,
17
- :row_header => nil }.merge(options)
16
+ opts = {:col_header => nil,
17
+ :row_header => nil,
18
+ :col_size => 7}.merge(options)
18
19
 
19
20
  ("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
20
- s + ("%7s" % a)
21
+ s + ("%#{opts[:col_size]}s" % a)
21
22
  } + "\n" + self.to_a.map_with_index { |a, i|
22
23
  ("%-3s" % opts[:row_header][i]) + a.inject("") { |s, v|
23
24
  if v.is_a? Float
24
- s + ("%7.2f" % v)
25
+ s + ("%#{opts[:col_size]}.2f" % v)
25
26
  else
26
- s + ("%7d" % v)
27
+ s + ("%#{opts[:col_size]}d" % v)
27
28
  end
28
29
  }
29
30
  }.join("\n")
@@ -67,6 +68,7 @@ module NMatrixExtensions
67
68
  :title? => true,
68
69
  :title => '',
69
70
  :title_font_size => 35,
71
+ :title_font_scale => 1.0,
70
72
  :print_value => false,
71
73
  :key_font_size => 15,
72
74
  :value_font_size => 8,
@@ -75,15 +77,15 @@ module NMatrixExtensions
75
77
  RVG::dpi = opts[:dpi]
76
78
 
77
79
  rvg = RVG.new(opts[:rvg_width], opts[:rvg_height]) do |canvas|
78
- title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * 0.6) / 2.0
79
- title_y = opts[:header_height] - opts[:title_font_size] * 0.7
80
+ title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * opts[:title_font_scale] / 2.0) / 2.0
81
+ title_y = opts[:header_height] - opts[:title_font_size] * opts[:title_font_scale]
80
82
 
81
83
  canvas.viewbox(0, 0, opts[:canvas_width], opts[:canvas_height])
82
84
  canvas.background_fill = opts[:background]
83
85
  canvas.desc = opts[:title]
84
86
 
85
87
  if opts[:title?]
86
- canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size])
88
+ canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size] * opts[:title_font_scale])
87
89
  end
88
90
 
89
91
  # border for whole matrix
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Ulla
5
- VERSION = '0.9.6'
5
+ VERSION = '0.9.7'
6
6
  end
@@ -39,6 +39,9 @@ Options:
39
39
  --outfile (-o) FILE: output filename (default 'allmat.dat')
40
40
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
41
41
  --noweight: calculate substitution counts with no weights
42
+ --environment (-e) INTEGER:
43
+ 0 for considering only substituted amino acids' environments (default)
44
+ 1 for considering both substituted and substituting amino acids' environments
42
45
  --smooth (-s) INTEGER:
43
46
  0 for partial smoothing (default)
44
47
  1 for full smoothing
@@ -56,7 +59,7 @@ Options:
56
59
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
57
60
  --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
58
61
  --autosigma: automatically adjust the sigma value for smoothing
59
- --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
62
+ --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
60
63
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
61
64
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
62
65
  --heatmap INTEGER:
@@ -88,22 +91,23 @@ Options:
88
91
  # :call-seq:
89
92
  # Ulla::CLI::calculate_pid(seq1, seq2) -> Float
90
93
  #
91
- def calculate_pid(seq1, seq2)
92
- aas1 = seq1.split('')
93
- aas2 = seq2.split('')
94
+ def calculate_pid(seq1, seq2, unit)
95
+ aas1 = seq1.scan(/\S{#{unit}}/) # \S (not \w) so that gap characters are kept as columns
96
+ aas2 = seq2.scan(/\S{#{unit}}/) # must tokenise exactly like the callers do
94
97
  cols = aas1.zip(aas2)
98
+ gap = ($gap || '-') * unit
95
99
  align = 0 # no. of aligned columns
96
100
  ident = 0 # no. of identical columns
97
101
  intgp = 0 # no. of internal gaps
98
102
 
99
103
  cols.each do |col|
100
- if (col[0] != '-') && (col[1] != '-')
104
+ if (col[0] != gap) && (col[1] != gap)
101
105
  align += 1
102
106
  if col[0] == col[1]
103
107
  ident += 1
104
108
  end
105
- elsif (((col[0] == '-') && (col[1] != '-')) ||
106
- ((col[0] != '-') && (col[1] == '-')))
109
+ elsif (((col[0] == gap) && (col[1] != gap)) ||
110
+ ((col[0] != gap) && (col[1] == gap)))
107
111
  intgp += 1
108
112
  end
109
113
  end
@@ -148,8 +152,11 @@ Options:
148
152
 
149
153
  # default set of 21 amino acids including J (Cysteine, the free thiol form)
150
154
  $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
155
+ $gap = '-'
151
156
  $tem_list = nil
152
157
  $tem_file = nil
158
+ $environment = 0
159
+ $col_size = nil
153
160
  $classdef = 'classdef.dat'
154
161
  $outfile = 'allmat.dat'
155
162
  $outfh = nil # file hanfle for outfile
@@ -176,7 +183,7 @@ Options:
176
183
  $heatmapcol = nil
177
184
  $heatmapformat = 'png'
178
185
  $heatmapstem = 'heatmaps'
179
- $heatmapvalues = false
186
+ $heatmapvalues = false
180
187
  $rvg_width = 550
181
188
  $rvg_height = 650
182
189
  $canvas_width = 550
@@ -189,7 +196,6 @@ Options:
189
196
  $aa_mutb = {}
190
197
  $aa_rel_mutb = {}
191
198
  $aa_tot_freq = {}
192
- $aa_env_cnt = Hash.new(0)
193
199
  $smooth_prob = {}
194
200
  $tot_cnt_mat = nil
195
201
  $tot_prob_mat = nil
@@ -209,30 +215,32 @@ Options:
209
215
  #
210
216
 
211
217
  opts = GetoptLong.new(
212
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
213
- [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
214
- [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
215
- [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
216
- [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
217
- [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
218
- [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
219
- [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
220
- [ '--noweight', GetoptLong::NO_ARGUMENT ],
221
- [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
222
- [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
223
- [ '--autosigma', GetoptLong::NO_ARGUMENT ],
224
- [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
225
- [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
226
- [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
227
- [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
228
- [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
229
- [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
230
- [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
231
- [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
232
- [ '--penv', GetoptLong::NO_ARGUMENT ],
233
- [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
234
- [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
235
- [ '--version', GetoptLong::NO_ARGUMENT ]
218
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
219
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
220
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
221
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
222
+ [ '--environment', '-e', GetoptLong::REQUIRED_ARGUMENT ],
223
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
224
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
225
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
226
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
228
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
229
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
230
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
231
+ [ '--add', GetoptLong::REQUIRED_ARGUMENT ],
232
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
233
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
235
+ [ '--heatmap-columns', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
237
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--targetenv', '-t', GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
240
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
241
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--version', GetoptLong::NO_ARGUMENT ]
236
244
  )
237
245
 
238
246
  begin
@@ -247,6 +255,8 @@ Options:
247
255
  $tem_file = arg
248
256
  when '--classdef'
249
257
  $classdef = arg
258
+ when '--environment'
259
+ $environment = arg.to_i
250
260
  when '--output'
251
261
  $output = arg.to_i
252
262
  when '--outfile'
@@ -335,7 +345,7 @@ Options:
335
345
  exit 1
336
346
  end
337
347
 
338
- # warn if any input file is missing
348
+ # warn if any mandatory input file is missing
339
349
  if $tem_list && !File.exist?($tem_list)
340
350
  warn "Cannot find template list file, #{$tem_list}"
341
351
  exit 1
@@ -372,12 +382,12 @@ Options:
372
382
  # Reading Environment Class Definition File
373
383
  #
374
384
 
375
- # check --cys option and modify amino_acids set if necessary
385
+ # if --cys option is 2, then we don't use 'J' ('C' stands for both Cysteine and Cystine)
376
386
  if $cys == 2
377
- $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
387
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('') # keep an Array of one-letter codes, like the default set
378
388
  end
379
389
 
380
- # create an EnvironmentFeatureList object for storing all environment
390
+ # create an EnvironmentFeatureArray object for storing all environment
381
391
  # features
382
392
  $env_features = EnvironmentFeatureArray.new
383
393
 
@@ -398,9 +408,9 @@ Options:
398
408
 
399
409
  IO.foreach($classdef) do |line|
400
410
  line.chomp!
401
- if line.start_with?('#')
411
+ if line.start_with?('#') || line.blank?
402
412
  next
403
- elsif (env_ftr = line.chomp.split(/;/)).length == 5
413
+ elsif (env_ftr = line.split(/;/)).length == 5
404
414
  $logger.info "An environment feature, #{line} detected."
405
415
  if env_ftr[-1] == 'T'
406
416
  # skip silenced environment feature
@@ -418,23 +428,39 @@ Options:
418
428
  env_ftr[4])
419
429
  env_index += 1
420
430
  else
421
- $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
- "a environment class definition."
431
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for " +
432
+ "an environment class definition."
423
433
  exit 1
424
434
  end
425
435
  end
426
436
 
437
+ # set the size of amino acid column unit, extended gap
438
+ # and extended amino acid labels
439
+ $col_size = $environment == 1 ? $env_features.size : 1
440
+ $ext_gap = $gap * $col_size
441
+ $ext_amino_acids = []
442
+
427
443
  # a hash for storing all environment classes
428
444
  $env_classes = EnvironmentClassHash.new
429
445
 
430
446
  # generate all possible combinations of environment labels, and store
431
447
  # every environment class into the hash prepared above with the label
432
448
  # as a key
433
- $env_features.label_combinations.each_with_index { |e, i|
434
- $env_classes[e.flatten.join] = Environment.new(i,
435
- e.flatten.join,
436
- $amino_acids)
437
- }
449
+ $env_features.label_combinations.each_with_index do |ef1, i|
450
+ key1 = ef1.flatten.join
451
+ $ext_amino_acids << key1
452
+
453
+ if $environment == 0
454
+ $env_classes[key1] = Environment.new(i, key1, $amino_acids)
455
+ else
456
+ # when considering both substituted and substituting amino acids' environments,
457
+ # add target (substituting) aa's environment label
458
+ $env_features.label_combinations_without_aa_type.each_with_index do |ef2, j|
459
+ key2 = key1 + "-" + ef2.flatten.join
460
+ $env_classes[key2] = Environment.new(i + j, key2, $amino_acids)
461
+ end
462
+ end
463
+ end
438
464
 
439
465
  #
440
466
  # Part 3 END
@@ -512,9 +538,7 @@ Options:
512
538
  if env_labels[key].empty?
513
539
  env_labels[key] = labels
514
540
  else
515
- env_labels[key].each_with_index { |e, i|
516
- env_labels[key][i] = e + labels[i]
517
- }
541
+ env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
518
542
  end
519
543
  end
520
544
  end
@@ -523,97 +547,92 @@ Options:
523
547
 
524
548
  if $noweight
525
549
  ali.each_pair do |id1, seq1|
550
+ if $environment == 1
551
+ seq1 = seq1.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id1][pos] }.join
552
+ end
553
+
526
554
  ali.each_pair do |id2, seq2|
527
555
  if id1 != id2
528
- pid = calculate_pid(seq1, seq2)
529
- s1 = seq1.split('')
530
- s2 = seq2.split('')
556
+ if $environment == 1
557
+ seq2 = seq2.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
558
+ end
559
+
560
+ pid = calculate_pid(seq1, seq2, $col_size)
561
+ s1 = seq1.scan(/\S{#{$col_size}}/)
562
+ s2 = seq2.scan(/\S{#{$col_size}}/)
531
563
 
532
564
  # check PID_MIN
533
565
  if $pidmin && (pid < $pidmin)
534
- $logger.info "Skip alignment between #{id1} and #{id2} " +
535
- "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
566
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
536
567
  next
537
568
  end
538
569
 
539
570
  # check PID_MAX
540
571
  if $pidmax && (pid > $pidmax)
541
- $logger.info "Skip alignment between #{id1} and #{id2} " +
542
- "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
572
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
543
573
  next
544
574
  end
545
575
 
546
576
  s1.each_with_index do |aa1, pos|
547
- aa1.upcase!
548
- aa2 = s2[pos].upcase
577
+ aa2 = s2[pos]
549
578
 
550
579
  if env_labels[id1][pos].include?('X')
551
- $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
580
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1[0].chr} were masked."
552
581
  next
553
582
  end
554
583
 
555
584
  if env_labels[id2][pos].include?('X')
556
- $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
585
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2[0].chr} were masked."
557
586
  next
558
587
  end
559
588
 
560
- unless $amino_acids.include?(aa1)
561
- $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
589
+ unless $amino_acids.include?(aa1[0].chr)
590
+ $logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not a standard amino acid." unless aa1 == $ext_gap
562
591
  next
563
592
  end
564
593
 
565
- unless $amino_acids.include?(aa2)
566
- $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
594
+ unless $amino_acids.include?(aa2[0].chr)
595
+ $logger.warn "#{id2}-#{pos}-#{aa2[0].chr} is not a standard amino acid." unless aa2 == $ext_gap
567
596
  next
568
597
  end
569
598
 
570
- aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1
571
- aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2
599
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
600
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
601
+ env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
572
602
 
573
603
  if $cst_features.empty?
574
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
604
+ $env_classes[env_label].increase_residue_count(aa2[0].chr)
575
605
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
576
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
606
+ $env_classes[env_label].increase_residue_count(aa2[0].chr)
577
607
  else
578
- $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
608
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
579
609
  next
580
610
  end
581
611
 
582
- grp_label = env_labels[id1][pos][1..-1]
583
-
584
- if $aa_env_cnt.has_key? grp_label
585
- if $aa_env_cnt[grp_label].has_key? aa1
586
- $aa_env_cnt[grp_label][aa1] += 1
587
- else
588
- $aa_env_cnt[grp_label][aa1] = 1
589
- end
590
- else
591
- $aa_env_cnt[grp_label] = Hash.new(0)
592
- $aa_env_cnt[grp_label][aa1] = 1
593
- end
594
-
595
- if $aa_tot_cnt.has_key? aa1
596
- $aa_tot_cnt[aa1] += 1
597
- else
598
- $aa_tot_cnt[aa1] = 1
599
- end
612
+ $aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += 1 : $aa_tot_cnt[aa1] = 1
613
+ $aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += 1 : $aa_mut_cnt[aa1] = 1 if aa1 != aa2
600
614
 
601
- if aa1 != aa2
602
- if $aa_mut_cnt.has_key? aa1
603
- $aa_mut_cnt[aa1] += 1
604
- else
605
- $aa_mut_cnt[aa1] = 1
606
- end
607
- end
608
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
615
+ $logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (1) was added to the environments class, #{env_label}."
609
616
  end
610
617
  end
611
618
  end
612
619
  end
613
620
  else
614
621
  # BLOSUM-like weighting
615
- clusters = []
616
- ali.each_pair { |i, s| clusters << [i] }
622
+ clusters = []
623
+ ext_ali = Bio::Alignment::OriginalAlignment.new
624
+
625
+ ali.each_pair do |key, seq|
626
+ clusters << [key]
627
+ if $environment == 1
628
+ ext_seq = seq.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[key][pos] }.join
629
+ ext_ali.add_seq(ext_seq, key)
630
+ end
631
+ end
632
+
633
+ if $environment == 1
634
+ ali = ext_ali
635
+ end
617
636
 
618
637
  # a loop for single linkage clustering
619
638
  begin
@@ -624,7 +643,7 @@ Options:
624
643
  found = false
625
644
  clusters[i].each do |c1|
626
645
  clusters[j].each do |c2|
627
- if calculate_pid(ali[c1], ali[c2]) >= $weight
646
+ if calculate_pid(ali[c1], ali[c2], $col_size) >= $weight
628
647
  indexes << j
629
648
  found = true
630
649
  break
@@ -655,102 +674,58 @@ Options:
655
674
  clusters.combination(2).each do |cluster1, cluster2|
656
675
  cluster1.each do |id1|
657
676
  cluster2.each do |id2|
658
- seq1 = ali[id1].split('')
659
- seq2 = ali[id2].split('')
677
+ seq1 = ali[id1].scan(/\S{#{$col_size}}/)
678
+ seq2 = ali[id2].scan(/\S{#{$col_size}}/)
660
679
 
661
680
  seq1.each_with_index do |aa1, pos|
662
- aa1.upcase!
663
- aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
681
+ aa2 = seq2[pos]
664
682
 
665
683
  if env_labels[id1][pos].include?('X')
666
- $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
684
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
667
685
  next
668
686
  end
669
687
 
670
688
  if env_labels[id2][pos].include?('X')
671
- $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
689
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
672
690
  next
673
691
  end
674
692
 
675
- unless $amino_acids.include?(aa1)
676
- $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
693
+ unless $amino_acids.include?(aa1[0].chr)
694
+ $logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not standard amino acid." unless aa1 == $ext_gap
677
695
  next
678
696
  end
679
697
 
680
- unless $amino_acids.include?(aa2)
681
- $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
698
+ unless $amino_acids.include?(aa2[0].chr)
699
+ $logger.warn "#{id2}-#{pos}-#{aa2[0].chr} is not standard amino acid." unless aa2 == $ext_gap
682
700
  next
683
701
  end
684
702
 
685
- aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1
686
- aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2
687
- cnt1 = 1.0 / cluster1.size
688
- cnt2 = 1.0 / cluster2.size
689
- jnt_cnt = cnt1 * cnt2
703
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
704
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
705
+ cnt1 = 1.0 / cluster1.size.to_f
706
+ cnt2 = 1.0 / cluster2.size.to_f
707
+ jnt_cnt = cnt1 * cnt2
708
+ env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
709
+ env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
690
710
 
691
711
  if $cst_features.empty?
692
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
693
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
712
+ $env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
713
+ $env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
694
714
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
695
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
696
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
715
+ $env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
716
+ $env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt) # first character is the residue type
697
717
  else
698
- $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
718
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
699
719
  next
700
720
  end
701
721
 
702
- grp_label1 = env_labels[id1][pos][1..-1]
703
- grp_label2 = env_labels[id2][pos][1..-1]
722
+ $aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
723
+ $aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
724
+ $aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1 != aa2 # mutations only
725
+ $aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1 != aa2 # mutations only
704
726
 
705
- if $aa_env_cnt.has_key? grp_label1
706
- if $aa_env_cnt[grp_label1].has_key? aa1
707
- $aa_env_cnt[grp_label1][aa1] += cnt1
708
- else
709
- $aa_env_cnt[grp_label1][aa1] = cnt1
710
- end
711
- else
712
- $aa_env_cnt[grp_label1] = Hash.new(0.0)
713
- $aa_env_cnt[grp_label1][aa1] = cnt1
714
- end
715
-
716
- if $aa_env_cnt.has_key? grp_label2
717
- if $aa_env_cnt[grp_label2].has_key? aa2
718
- $aa_env_cnt[grp_label2][aa2] += cnt2
719
- else
720
- $aa_env_cnt[grp_label2][aa2] = cnt2
721
- end
722
- else
723
- $aa_env_cnt[grp_label2] = Hash.new(0.0)
724
- $aa_env_cnt[grp_label2][aa2] = cnt2
725
- end
726
-
727
- if $aa_tot_cnt.has_key? aa1
728
- $aa_tot_cnt[aa1] += cnt1
729
- else
730
- $aa_tot_cnt[aa1] = cnt1
731
- end
732
-
733
- if $aa_tot_cnt.has_key? aa2
734
- $aa_tot_cnt[aa2] += cnt2
735
- else
736
- $aa_tot_cnt[aa2] = cnt2
737
- end
738
-
739
- if aa1 != aa2
740
- if $aa_mut_cnt.has_key? aa1
741
- $aa_mut_cnt[aa1] += cnt1
742
- else
743
- $aa_mut_cnt[aa1] = cnt1
744
- end
745
- if $aa_mut_cnt.has_key? aa2
746
- $aa_mut_cnt[aa2] += cnt2
747
- else
748
- $aa_mut_cnt[aa2] = cnt2
749
- end
750
- end
751
-
752
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
753
- $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
727
+ $logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
728
+ $logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
754
729
  end
755
730
  end
756
731
  end
@@ -799,66 +774,108 @@ HEADER
799
774
  $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
800
775
  end
801
776
 
777
+ if $environment == 0
778
+ $outfh.puts '#'
779
+ $outfh.puts '# Considered environments: substituted a.a.'
780
+ else
781
+ $outfh.puts '#'
782
+ $outfh.puts '# Considered environments: substituted a.a. and substituting a.a.'
783
+ end
784
+
802
785
  # calculate amino acid frequencies and mutabilities, and
803
786
  # print them as default statistics in the header part
804
- ala_factor = if $aa_tot_cnt['A'] == 0
805
- 0.0
806
- elsif $aa_mut_cnt['A'] == 0
807
- 0.0
808
- else
809
- 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
810
- end
811
- $tot_aa = $aa_tot_cnt.values.sum
787
+ if $environment == 0
788
+ ala_factor = if $aa_tot_cnt['A'] == 0
789
+ 0.0
790
+ elsif $aa_mut_cnt['A'] == 0
791
+ 0.0
792
+ else
793
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
794
+ end
795
+ end
796
+
797
+ $tot_aa = $aa_tot_cnt.values.sum
812
798
 
813
799
  $outfh.puts '#'
814
800
  $outfh.puts "# Total amino acid frequencies:\n"
815
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
816
801
 
817
- min_cnt = -1
818
- min_sigma = nil
802
+ if $environment == 0
803
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
804
+ else
805
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9s %9s %8s" % %w[RES ENV TOT_OBS MUT_OBS REL_FREQ]
806
+ end
819
807
 
820
- $amino_acids.each do |res|
821
- if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
822
- if min_cnt < 0
823
- min_cnt = $aa_tot_cnt[res]
824
- min_sigma = min_cnt / $min_cnt_sigma_ratio
825
- elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
826
- min_cnt = $aa_tot_cnt[res]
827
- min_sigma = min_cnt / $min_cnt_sigma_ratio
808
+ min_cnt = 0
809
+ min_sigma = nil
810
+ aas = $environment == 0 ? $amino_acids : $ext_amino_acids
811
+
812
+ aas.each do |aa|
813
+ if ($aa_tot_cnt[aa] / $sigma) < $min_cnt_sigma_ratio
814
+ if $aa_tot_cnt[aa] > 0 and min_cnt > $aa_tot_cnt[aa]
815
+ min_cnt = $aa_tot_cnt[aa]
816
+ elsif min_cnt == 0
817
+ min_cnt = 1
828
818
  end
829
819
 
830
- $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
831
- end
820
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
832
821
 
833
- $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
834
- $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
835
- $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
836
- end
822
+ if $environment == 0
823
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
824
+ "the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa}."
825
+ else
826
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
827
+ "the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa[0].chr} under the environment class #{aa[1..-1]}."
828
+ end
829
+ end
837
830
 
838
- $amino_acids.each do |res|
839
- if $noweight
840
- $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
841
- [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
842
- else
843
- $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
844
- [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
831
+ if $environment == 0
832
+ $aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 1.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
833
+ $aa_rel_mutb[aa] = $aa_mutb[aa] * ala_factor
845
834
  end
835
+
836
+ $aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
846
837
  end
847
838
 
848
- if min_cnt > -1
839
+ if min_cnt > 0
849
840
  $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
841
+
850
842
  if $autosigma
851
843
  $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
852
844
  $sigma = min_sigma
853
845
  end
854
846
  end
855
847
 
848
+ aas.each do |aa|
849
+ columns = $environment == 0 ?
850
+ [aa, $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_mutb[aa], $aa_rel_mutb[aa], $aa_tot_freq[aa]] :
851
+ [aa[0].chr, aa[1..-1], $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_tot_freq[aa]]
852
+
853
+ if $noweight
854
+ if $environment == 0
855
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % columns
856
+ else
857
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
858
+ end
859
+ else
860
+ if $environment == 0
861
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % columns
862
+ else
863
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9.2f %9.2f %8.4f" % columns
864
+ end
865
+ end
866
+ end
867
+
856
868
  $outfh.puts '#'
857
869
  $outfh.puts '# RES: Amino acid one letter code'
870
+ $outfh.puts '# ENV: Environment label of amino acid'
858
871
  $outfh.puts '# TOT_OBS: Total count of incidence'
859
872
  $outfh.puts '# MUT_OBS: Total count of mutation'
860
- $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
861
- $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
873
+
874
+ if $environment == 0
875
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
876
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
877
+ end
878
+
862
879
  $outfh.puts '# REL_FREQ: Relative frequency'
863
880
  $outfh.puts '#'
864
881
 
@@ -872,7 +889,7 @@ HEADER
872
889
  # Generating substitution frequency matrices
873
890
  #
874
891
 
875
- # calculating probabilities for each environment
892
+ # calculating probabilities for each environment class
876
893
  $env_classes.values.each do |e|
877
894
  if e.freq_array.sum != 0
878
895
  e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
@@ -880,12 +897,12 @@ HEADER
880
897
  end
881
898
 
882
899
  # count raw frequencies
883
- $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
900
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
884
901
  group_matrices = []
885
902
 
886
903
  # for each combination of environment features
887
904
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
888
- grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
905
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
889
906
 
890
907
  $amino_acids.each_with_index do |aa, aj|
891
908
  freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
@@ -901,6 +918,8 @@ HEADER
901
918
  if $output == 0
902
919
  heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
903
920
  grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
921
+ aa_max_cnt = $aa_tot_cnt.to_a.map { |k, v| v }.max
922
+ mat_col_size = aa_max_cnt.floor.to_s.size + 4
904
923
  $heatmapcol ||= Math::sqrt(group_matrices.size).round
905
924
 
906
925
  group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
@@ -908,7 +927,8 @@ HEADER
908
927
  stem = "#{grp_no}. #{grp_label}"
909
928
  $outfh.puts ">#{grp_label} #{grp_no}"
910
929
  $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
911
- :row_header => $amino_acids)
930
+ :row_header => $amino_acids,
931
+ :col_size => mat_col_size > 7 ? mat_col_size : 7)
912
932
 
913
933
  # for a heat map
914
934
  if $heatmap == 0 or $heatmap == 2
@@ -956,7 +976,8 @@ HEADER
956
976
  # total
957
977
  $outfh.puts '>Total'
958
978
  $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
959
- :row_header => $amino_acids)
979
+ :row_header => $amino_acids,
980
+ :col_size => mat_col_size > 7 ? mat_col_size : 7)
960
981
 
961
982
  if $heatmap == 0 or $heatmap == 2
962
983
  stem = "#{group_matrices.size}. TOTAL"
@@ -999,23 +1020,28 @@ HEADER
999
1020
 
1000
1021
  # when nosmoothing !!!
1001
1022
  if ($output > 0) && $nosmooth
1002
- # reinitialize $tot_cnt_mat for pseudocounts
1003
1023
  $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1004
1024
 
1005
- # for each combination of environment features
1006
- pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1007
-
1008
- # add pseudo counts for each frequency vector
1009
- $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1025
+ # if pseudo count provided, reinitialize $tot_cnt_mat by adding pseudocounts
1026
+ if $add
1027
+ $env_classes.values.each { |e| e.freq_array += $add }
1028
+ end
1010
1029
 
1011
1030
  # re-calculate probability vector for each environment class
1012
- $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1031
+ $env_classes.values.each do |e|
1032
+ if e.freq_array.sum == 0
1033
+ # if no observation, then probabilities are zeros, too
1034
+ e.prob_array = e.freq_array
1035
+ else
1036
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum.to_f
1037
+ end
1038
+ end
1013
1039
 
1014
1040
  group_matrices = []
1015
1041
 
1016
1042
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1017
- grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1018
- grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1043
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1044
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1019
1045
 
1020
1046
  $amino_acids.each_with_index do |aa, aj|
1021
1047
  env_class = group[1].find { |e| e.label.start_with?(aa) }
@@ -1039,7 +1065,6 @@ HEADER
1039
1065
  $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1040
1066
  :row_header => $amino_acids)
1041
1067
 
1042
-
1043
1068
  # for a heat map
1044
1069
  if $heatmap == 0 or $heatmap == 2
1045
1070
  grp_prob_mat.heatmap(:col_header => $amino_acids,
@@ -1134,12 +1159,24 @@ HEADER
1134
1159
 
1135
1160
  if ($smooth == :full) || $p1smooth
1136
1161
  # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
1137
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1162
+ 0.upto($amino_acids.size - 1) do |i|
1163
+ if $environment == 0
1164
+ p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]])
1165
+ else
1166
+ p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum)
1167
+ end
1168
+ end
1138
1169
  $smooth_prob[1] = p1
1139
1170
  elsif ($smooth == :partial)
1140
1171
  # no smoothing for p1 probabilities just as Kenji's subst
1141
1172
  # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1142
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1173
+ 0.upto($amino_acids.size - 1) do |i|
1174
+ if $environment == 0
1175
+ p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]]
1176
+ else
1177
+ p1[i] = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
1178
+ end
1179
+ end
1143
1180
  $smooth_prob[1] = p1
1144
1181
  end
1145
1182
 
@@ -1148,6 +1185,10 @@ HEADER
1148
1185
  #
1149
1186
  env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1150
1187
 
1188
+ if $environment == 1
1189
+ env_labels += $env_features[1..-1].map_with_index { |ef, ei| ef.labels.map { |l| "#{ei + $env_features.size}#{l}" } }
1190
+ end
1191
+
1151
1192
  if $smooth == :partial
1152
1193
  $outfh.puts <<HEADER
1153
1194
  #
@@ -1189,9 +1230,9 @@ HEADER
1189
1230
  # sigma value used is: #{$sigma}
1190
1231
  #
1191
1232
  HEADER
1192
- 1.upto($env_features.size) do |ci|
1233
+ 1.upto(env_labels.size) do |ci|
1193
1234
  # for partial smoothing, only P1 ~ P3, and Pn are considered
1194
- if (ci > 2) && (ci < $env_features.size)
1235
+ if (ci > 2) && (ci < env_labels.size)
1195
1236
  $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1196
1237
  next
1197
1238
  end
@@ -1200,6 +1241,10 @@ HEADER
1200
1241
  c1[0].product(*c1[1..-1]).each do |labels|
1201
1242
  pattern = '.' * $env_features.size
1202
1243
 
1244
+ if $environment == 1
1245
+ pattern += '.' * ($env_features.size - 1)
1246
+ end
1247
+
1203
1248
  labels.each do |label|
1204
1249
  i = label[0].chr.to_i
1205
1250
  l = label[1].chr
@@ -1211,12 +1256,22 @@ HEADER
1211
1256
  next
1212
1257
  end
1213
1258
 
1259
+ if $environment == 1
1260
+ pattern[$env_features.size, 0] = "-"
1261
+ end
1262
+
1214
1263
  # get environments matching the pattern created above
1215
1264
  # and calculate amino acid frequencies and their probabilities for all the environments
1216
- envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1265
+ envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
1217
1266
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1218
1267
  prob_arr = NArray.float($amino_acids.size)
1219
- 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1268
+ 0.upto($amino_acids.size - 1) do |i|
1269
+ if freq_arr.sum == 0
1270
+ prob_arr[i] = 0
1271
+ else
1272
+ prob_arr[i] = freq_arr[i] / freq_arr.sum.to_f
1273
+ end
1274
+ end
1220
1275
 
1221
1276
  # # assess whether a residue type j is compatible with a particular combination of structural features
1222
1277
  # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
@@ -1254,29 +1309,23 @@ HEADER
1254
1309
  if ci == 1
1255
1310
  priors << $smooth_prob[1]
1256
1311
  elsif ci == 2
1257
- labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1312
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each do |c3|
1258
1313
  priors << $smooth_prob[2][c3.to_set]
1259
- }
1260
- elsif ci == $env_features.size
1261
- labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1314
+ end
1315
+ elsif ci == env_labels.size
1316
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each do |c3|
1262
1317
  priors << $smooth_prob[3][c3.to_set]
1263
- }
1318
+ end
1264
1319
  end
1265
1320
 
1266
- # entropy based prior weighting step
1267
- entropy_max = Math::log($amino_acids.size)
1268
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1269
- begin
1270
- p == 0.0 ? s - 1 : s + p * Math::log(p)
1271
- rescue
1272
- #puts "P: #{p}"
1273
- end
1274
- } }
1275
- mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1321
+ # entropy based weighting prior step
1322
+ entropy_max = NMath::log($amino_acids.size)
1323
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
1324
+ mod_entropies = entropies.map { |entropy| (entropy_max - entropy) / entropy_max }
1276
1325
  weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1277
1326
  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1278
1327
 
1279
- # smoothing step
1328
+ # actual smoothing step
1280
1329
  smooth_prob_arr = NArray.float($amino_acids.size)
1281
1330
  big_N = freq_arr.sum.to_f
1282
1331
  small_n = $amino_acids.size.to_f
@@ -1285,8 +1334,8 @@ HEADER
1285
1334
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1286
1335
 
1287
1336
  # normalization step
1288
- smooth_prob_arr_sum = smooth_prob_arr.sum
1289
- 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1337
+ total = smooth_prob_arr.sum
1338
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
1290
1339
 
1291
1340
  # store smoothed probabilties in a hash using a set of envrionment labels as a key
1292
1341
  if $smooth_prob.has_key?(ci + 1)
@@ -1331,36 +1380,47 @@ HEADER
1331
1380
  #
1332
1381
  HEADER
1333
1382
  # full smooting
1334
- 1.upto($env_features.size) do |ci|
1383
+ 1.upto(env_labels.size) do |ci|
1335
1384
  env_labels.combination(ci) do |c1|
1336
1385
  c1[0].product(*c1[1..-1]).each do |labels|
1386
+
1337
1387
  pattern = '.' * $env_features.size
1388
+
1389
+ if $environment == 1
1390
+ pattern += '.' * ($env_features.size - 1)
1391
+ end
1392
+
1338
1393
  labels.each do |label|
1339
1394
  j = label[0].chr.to_i
1340
1395
  l = label[1].chr
1341
1396
  pattern[j] = l
1342
1397
  end
1343
1398
 
1399
+ if $environment == 1
1400
+ pattern[$env_features.size, 0] = "-"
1401
+ end
1402
+
1344
1403
  # get environmetns, frequencies, and probabilities
1345
- envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1404
+ envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
1346
1405
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1347
1406
  prob_arr = NArray.float($amino_acids.size)
1348
1407
  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1349
1408
 
1350
1409
  # collect priors
1351
- priors = []
1410
+ priors = []
1411
+
1352
1412
  if ci > 1
1353
1413
  labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1354
1414
  else
1355
1415
  priors << $smooth_prob[1]
1356
1416
  end
1357
1417
 
1358
- # entropy based weighting priors
1359
- entropy_max = Math::log($amino_acids.size)
1360
- entropies = priors.map do |prior|
1361
- (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1362
- end
1363
- weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1418
+ # entropy based weighting priors step
1419
+ entropy_max = NMath::log($amino_acids.size)
1420
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
1421
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1422
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1423
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1364
1424
 
1365
1425
  # smoothing step
1366
1426
  smooth_prob_arr = NArray.float($amino_acids.size)
@@ -1371,8 +1431,8 @@ HEADER
1371
1431
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1372
1432
 
1373
1433
  # normalization step
1374
- smooth_prob_arr_sum = smooth_prob_arr.sum
1375
- 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1434
+ total = smooth_prob_arr.sum
1435
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
1376
1436
 
1377
1437
  # store smoothed probabilties in a hash using a set of envrionment labels as a key
1378
1438
  if $smooth_prob.has_key?(ci + 1)
@@ -1389,7 +1449,7 @@ HEADER
1389
1449
 
1390
1450
  # updating smoothed probability array for each envrionment
1391
1451
  $env_classes.values.each do |env|
1392
- env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1452
+ env.smooth_prob_array = $smooth_prob[env_labels.size + 1][env.label_set]
1393
1453
  end
1394
1454
 
1395
1455
  # sorting environments and build 21X21 substitution matrices
@@ -1526,7 +1586,7 @@ HEADER
1526
1586
  end
1527
1587
 
1528
1588
  grp_logo_mats = []
1529
- factor = $scale / Math::log(2)
1589
+ factor = $scale / NMath::log(2)
1530
1590
 
1531
1591
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1532
1592
  # calculating substitution probability matrix for each envrionment
@@ -1536,6 +1596,11 @@ HEADER
1536
1596
  NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1537
1597
  NMatrix.float($amino_acids.size, $amino_acids.size)
1538
1598
 
1599
+ if $environment == 1
1600
+ # parse substituting aa's environment label
1601
+ tgt_label = grp_label.split('-').last
1602
+ end
1603
+
1539
1604
  $amino_acids.each_with_index do |aa, aj|
1540
1605
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1541
1606
  env.logo_array = $cys == 0 ?
@@ -1543,19 +1608,29 @@ HEADER
1543
1608
  NArray.float($amino_acids.size)
1544
1609
 
1545
1610
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1546
- pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1611
+ if $environment == 0
1612
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1613
+ else
1614
+ pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
1615
+ end
1616
+
1547
1617
  odds = prob / pai
1548
- env.logo_array[ai] = factor * Math::log(odds)
1618
+ env.logo_array[ai] = factor * NMath::log(odds)
1549
1619
  grp_logo_mat[aj, ai] = env.logo_array[ai]
1550
1620
  end
1551
1621
 
1552
1622
  # adding log odds ratio for 'U' (J or C) when --cyc is 0
1553
1623
  if $cys == 0
1554
- pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1555
- prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1556
- env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1557
- odds = prob / pai
1558
- env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1624
+ if $environment == 0
1625
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1626
+ else
1627
+ pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
1628
+ $aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
1629
+ end
1630
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1631
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1632
+ odds = prob / pai
1633
+ env.logo_array[$amino_acids.size] = factor * NMath::log(odds)
1559
1634
  grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1560
1635
  end
1561
1636
  end
@@ -1569,22 +1644,32 @@ HEADER
1569
1644
 
1570
1645
  $amino_acids.each_with_index do |aa1, aj|
1571
1646
  $amino_acids.each_with_index do |aa2, ai|
1572
- prob = $tot_prob_mat[aj, ai]
1573
- pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1574
- odds = prob / pai
1575
- $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1647
+ prob = $tot_prob_mat[aj, ai]
1648
+
1649
+ if $environment == 0
1650
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1651
+ else
1652
+ pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
1653
+ end
1654
+
1655
+ odds = prob / pai
1656
+ $tot_logo_mat[aj, ai] = factor * NMath::log(odds)
1576
1657
  end
1577
1658
 
1578
1659
  # adding log odds ratio for 'U' (J or C) when --cyc is 0
1579
1660
  if $cys == 0
1580
- pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1661
+ if $environment == 0
1662
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1663
+ else
1664
+ pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
1665
+ $aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
1666
+ end
1581
1667
  prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1582
1668
  odds = prob / pai
1583
- $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1669
+ $tot_logo_mat[aj, $amino_acids.size] = factor * NMath::log(odds)
1584
1670
  end
1585
1671
  end
1586
1672
 
1587
-
1588
1673
  # calculating relative entropy for each amino acid pair H and
1589
1674
  # the expected score E in bit units
1590
1675
  tot_E = 0.0
@@ -1593,10 +1678,22 @@ HEADER
1593
1678
  0.upto($tot_logo_mat.shape[0] - 1) do |j|
1594
1679
  0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1595
1680
  if j != i
1596
- tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1681
+ if $environment == 0
1682
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1683
+ else
1684
+ tot_E += $tot_logo_mat[j, i] *
1685
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
1686
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum / 2.0
1687
+ end
1597
1688
  tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1598
1689
  else
1599
- tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1690
+ if $environment == 0
1691
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1692
+ else
1693
+ tot_E += $tot_logo_mat[j, i] *
1694
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
1695
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
1696
+ end
1600
1697
  tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1601
1698
  end
1602
1699
  end
@@ -1662,9 +1759,9 @@ HEADER
1662
1759
  heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1663
1760
  :row_header => row_header,
1664
1761
  :rvg_width => $rvg_width,
1665
- :rvg_height => $rvg_height - 50,
1762
+ :rvg_height => $rvg_height,
1666
1763
  :canvas_width => $canvas_width,
1667
- :canvas_height => $canvas_height - 50,
1764
+ :canvas_height => $canvas_height,
1668
1765
  :gradient_beg_color => '#0000FF',
1669
1766
  :gradient_mid_color => '#FFFFFF',
1670
1767
  :gradient_end_color => '#FF0000',
@@ -1674,6 +1771,7 @@ HEADER
1674
1771
  :print_value => $heatmapvalues,
1675
1772
  :print_gradient => false,
1676
1773
  :title => stem,
1774
+ :title_font_scale => 1.0,
1677
1775
  :title_font_size => title_font_size)
1678
1776
  end
1679
1777
  end