ulla 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,10 @@
1
+ == 0.9.7 30/05/2009
2
+
3
+ * Added --environment option to consider not only substituted amino acids' environments but also substituting amino acids' environments
4
+ * Changed the default pseudocount from (1 / # of environment classes) to 0; a pseudocount now needs to be explicitly provided with the --add option
5
+ * Changed the fixed column size (7) of raw count matrices to vary depending on the number of digits of the maximum amino acid count
6
+ * Fixed a bug in reading an environment feature definition file (empty line)
7
+
1
8
  == 0.9.6 18/03/2009
2
9
 
3
10
  * Warns if Rmagick is not properly installed or missing.
@@ -18,7 +18,7 @@ http://www-cryst.bioc.cam.ac.uk/ulla
18
18
 
19
19
  == Requirements
20
20
 
21
- * ruby 1.8.7 or above (http://www.ruby-lang.org)
21
+ * ruby 1.8.7 or above (1.9.0 or above recommended, http://www.ruby-lang.org)
22
22
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems)
23
23
 
24
24
  The following RubyGems will be automatically installed if you have rubygems installed on your machine
@@ -4,16 +4,19 @@ require 'facets'
4
4
 
5
5
  module NArrayExtensions
6
6
 
7
- def pretty_string(opts={})
8
- { :col_header => nil,
9
- :row_header => nil }.merge!(opts)
7
+ def pretty_string(options={})
8
+ opts = {:col_header => nil,
9
+ :row_header => nil,
10
+ :col_size => 7}.merge(options)
10
11
 
11
- ("%-3s" % "#") + opts[:col_header].inject("") { |s, a| s + ("%7s" % a) } + "\n" +
12
+ ("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
13
+ s + ("%#{opts[:col_size]}s" % a)
14
+ } + "\n" +
12
15
  self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
13
16
  if v.is_a? Float
14
- s + ("%7.2f" % v)
17
+ s + ("%#{opts[:col_size]}.2f" % v)
15
18
  else
16
- s + ("%7d" % v)
19
+ s + ("%#{opts[:col_size]}d" % v)
17
20
  end
18
21
  }
19
22
  end
@@ -13,17 +13,18 @@ end
13
13
  module NMatrixExtensions
14
14
 
15
15
  def pretty_string(options={})
16
- opts = {:col_header => nil,
17
- :row_header => nil }.merge(options)
16
+ opts = {:col_header => nil,
17
+ :row_header => nil,
18
+ :col_size => 7}.merge(options)
18
19
 
19
20
  ("%-3s" % "#") + opts[:col_header].inject("") { |s, a|
20
- s + ("%7s" % a)
21
+ s + ("%#{opts[:col_size]}s" % a)
21
22
  } + "\n" + self.to_a.map_with_index { |a, i|
22
23
  ("%-3s" % opts[:row_header][i]) + a.inject("") { |s, v|
23
24
  if v.is_a? Float
24
- s + ("%7.2f" % v)
25
+ s + ("%#{opts[:col_size]}.2f" % v)
25
26
  else
26
- s + ("%7d" % v)
27
+ s + ("%#{opts[:col_size]}d" % v)
27
28
  end
28
29
  }
29
30
  }.join("\n")
@@ -67,6 +68,7 @@ module NMatrixExtensions
67
68
  :title? => true,
68
69
  :title => '',
69
70
  :title_font_size => 35,
71
+ :title_font_scale => 1.0,
70
72
  :print_value => false,
71
73
  :key_font_size => 15,
72
74
  :value_font_size => 8,
@@ -75,15 +77,15 @@ module NMatrixExtensions
75
77
  RVG::dpi = opts[:dpi]
76
78
 
77
79
  rvg = RVG.new(opts[:rvg_width], opts[:rvg_height]) do |canvas|
78
- title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * 0.6) / 2.0
79
- title_y = opts[:header_height] - opts[:title_font_size] * 0.7
80
+ title_x = (opts[:canvas_width] - opts[:title].length * opts[:title_font_size] * opts[:title_font_scale] / 2.0) / 2.0
81
+ title_y = opts[:header_height] - opts[:title_font_size] * opts[:title_font_scale]
80
82
 
81
83
  canvas.viewbox(0, 0, opts[:canvas_width], opts[:canvas_height])
82
84
  canvas.background_fill = opts[:background]
83
85
  canvas.desc = opts[:title]
84
86
 
85
87
  if opts[:title?]
86
- canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size])
88
+ canvas.text(title_x, title_y, opts[:title]).styles(:font_size => opts[:title_font_size] * opts[:title_font_scale])
87
89
  end
88
90
 
89
91
  # border for whole matrix
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Ulla
5
- VERSION = '0.9.6'
5
+ VERSION = '0.9.7'
6
6
  end
@@ -39,6 +39,9 @@ Options:
39
39
  --outfile (-o) FILE: output filename (default 'allmat.dat')
40
40
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
41
41
  --noweight: calculate substitution counts with no weights
42
+ --environment (-e) INTEGER:
43
+ 0 for considering only substituted amino acids' environments (default)
44
+ 1 for considering both substituted and substituting amino acids' environments
42
45
  --smooth (-s) INTEGER:
43
46
  0 for partial smoothing (default)
44
47
  1 for full smoothing
@@ -56,7 +59,7 @@ Options:
56
59
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
57
60
  --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
58
61
  --autosigma: automatically adjust the sigma value for smoothing
59
- --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
62
+ --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
60
63
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
61
64
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
62
65
  --heatmap INTEGER:
@@ -88,22 +91,23 @@ Options:
88
91
  # :call-seq:
89
92
  # Ulla::CLI::calculate_pid(seq1, seq2) -> Float
90
93
  #
91
- def calculate_pid(seq1, seq2)
92
- aas1 = seq1.split('')
93
- aas2 = seq2.split('')
94
+ def calculate_pid(seq1, seq2, unit)
95
+ aas1 = seq1.scan(/\w{#{unit}}/)
96
+ aas2 = seq2.scan(/\w{#{unit}}/)
94
97
  cols = aas1.zip(aas2)
98
+ gap = ($gap || '-') * unit
95
99
  align = 0 # no. of aligned columns
96
100
  ident = 0 # no. of identical columns
97
101
  intgp = 0 # no. of internal gaps
98
102
 
99
103
  cols.each do |col|
100
- if (col[0] != '-') && (col[1] != '-')
104
+ if (col[0] != gap) && (col[1] != gap)
101
105
  align += 1
102
106
  if col[0] == col[1]
103
107
  ident += 1
104
108
  end
105
- elsif (((col[0] == '-') && (col[1] != '-')) ||
106
- ((col[0] != '-') && (col[1] == '-')))
109
+ elsif (((col[0] == gap) && (col[1] != gap)) ||
110
+ ((col[0] != gap) && (col[1] == gap)))
107
111
  intgp += 1
108
112
  end
109
113
  end
@@ -148,8 +152,11 @@ Options:
148
152
 
149
153
  # default set of 21 amino acids including J (Cysteine, the free thiol form)
150
154
  $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
155
+ $gap = '-'
151
156
  $tem_list = nil
152
157
  $tem_file = nil
158
+ $environment = 0
159
+ $col_size = nil
153
160
  $classdef = 'classdef.dat'
154
161
  $outfile = 'allmat.dat'
155
162
  $outfh = nil # file handle for outfile
@@ -176,7 +183,7 @@ Options:
176
183
  $heatmapcol = nil
177
184
  $heatmapformat = 'png'
178
185
  $heatmapstem = 'heatmaps'
179
- $heatmapvalues = false
186
+ $heatmapvalues = false
180
187
  $rvg_width = 550
181
188
  $rvg_height = 650
182
189
  $canvas_width = 550
@@ -189,7 +196,6 @@ Options:
189
196
  $aa_mutb = {}
190
197
  $aa_rel_mutb = {}
191
198
  $aa_tot_freq = {}
192
- $aa_env_cnt = Hash.new(0)
193
199
  $smooth_prob = {}
194
200
  $tot_cnt_mat = nil
195
201
  $tot_prob_mat = nil
@@ -209,30 +215,32 @@ Options:
209
215
  #
210
216
 
211
217
  opts = GetoptLong.new(
212
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
213
- [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
214
- [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
215
- [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
216
- [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
217
- [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
218
- [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
219
- [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
220
- [ '--noweight', GetoptLong::NO_ARGUMENT ],
221
- [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
222
- [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
223
- [ '--autosigma', GetoptLong::NO_ARGUMENT ],
224
- [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
225
- [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
226
- [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
227
- [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
228
- [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
229
- [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
230
- [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
231
- [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
232
- [ '--penv', GetoptLong::NO_ARGUMENT ],
233
- [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
234
- [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
235
- [ '--version', GetoptLong::NO_ARGUMENT ]
218
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
219
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
220
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
221
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
222
+ [ '--environment', '-e', GetoptLong::REQUIRED_ARGUMENT ],
223
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
224
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
225
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
226
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
228
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
229
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
230
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
231
+ [ '--add', GetoptLong::REQUIRED_ARGUMENT ],
232
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
233
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
235
+ [ '--heatmap-columns', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
237
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--targetenv', '-t', GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
240
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
241
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--version', GetoptLong::NO_ARGUMENT ]
236
244
  )
237
245
 
238
246
  begin
@@ -247,6 +255,8 @@ Options:
247
255
  $tem_file = arg
248
256
  when '--classdef'
249
257
  $classdef = arg
258
+ when '--environment'
259
+ $environment = arg.to_i
250
260
  when '--output'
251
261
  $output = arg.to_i
252
262
  when '--outfile'
@@ -335,7 +345,7 @@ Options:
335
345
  exit 1
336
346
  end
337
347
 
338
- # warn if any input file is missing
348
+ # warn if any mandatory input file is missing
339
349
  if $tem_list && !File.exist?($tem_list)
340
350
  warn "Cannot find template list file, #{$tem_list}"
341
351
  exit 1
@@ -372,12 +382,12 @@ Options:
372
382
  # Reading Environment Class Definition File
373
383
  #
374
384
 
375
- # check --cys option and modify amino_acids set if necessary
385
+ # if --cys option is 2, then we don't care about 'J' (for both Cysteine and Cystine)
376
386
  if $cys == 2
377
- $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
387
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.delete('J')
378
388
  end
379
389
 
380
- # create an EnvironmentFeatureList object for storing all environment
390
+ # create an EnvironmentFeatureArray object for storing all environment
381
391
  # features
382
392
  $env_features = EnvironmentFeatureArray.new
383
393
 
@@ -398,9 +408,9 @@ Options:
398
408
 
399
409
  IO.foreach($classdef) do |line|
400
410
  line.chomp!
401
- if line.start_with?('#')
411
+ if line.start_with?('#') || line.blank?
402
412
  next
403
- elsif (env_ftr = line.chomp.split(/;/)).length == 5
413
+ elsif (env_ftr = line.split(/;/)).length == 5
404
414
  $logger.info "An environment feature, #{line} detected."
405
415
  if env_ftr[-1] == 'T'
406
416
  # skip silenced environment feature
@@ -418,23 +428,39 @@ Options:
418
428
  env_ftr[4])
419
429
  env_index += 1
420
430
  else
421
- $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
- "a environment class definition."
431
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for " +
432
+ "an environment class definition."
423
433
  exit 1
424
434
  end
425
435
  end
426
436
 
437
+ # set the size of amino acid column unit, extended gap
438
+ # and extended amino acid labels
439
+ $col_size = $environment == 1 ? $env_features.size : 1
440
+ $ext_gap = $gap * $col_size
441
+ $ext_amino_acids = []
442
+
427
443
  # a hash for storing all environment classes
428
444
  $env_classes = EnvironmentClassHash.new
429
445
 
430
446
  # generate all possible combinations of environment labels, and store
431
447
  # every environment class into the hash prepared above with the label
432
448
  # as a key
433
- $env_features.label_combinations.each_with_index { |e, i|
434
- $env_classes[e.flatten.join] = Environment.new(i,
435
- e.flatten.join,
436
- $amino_acids)
437
- }
449
+ $env_features.label_combinations.each_with_index do |ef1, i|
450
+ key1 = ef1.flatten.join
451
+ $ext_amino_acids << key1
452
+
453
+ if $environment == 0
454
+ $env_classes[key1] = Environment.new(i, key1, $amino_acids)
455
+ else
456
+ # when considering both substituted and substituting amino acids' environments,
457
+ # add target (substituting) aa's environment label
458
+ $env_features.label_combinations_without_aa_type.each_with_index do |ef2, j|
459
+ key2 = key1 + "-" + ef2.flatten.join
460
+ $env_classes[key2] = Environment.new(i + j, key2, $amino_acids)
461
+ end
462
+ end
463
+ end
438
464
 
439
465
  #
440
466
  # Part 3 END
@@ -512,9 +538,7 @@ Options:
512
538
  if env_labels[key].empty?
513
539
  env_labels[key] = labels
514
540
  else
515
- env_labels[key].each_with_index { |e, i|
516
- env_labels[key][i] = e + labels[i]
517
- }
541
+ env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
518
542
  end
519
543
  end
520
544
  end
@@ -523,97 +547,92 @@ Options:
523
547
 
524
548
  if $noweight
525
549
  ali.each_pair do |id1, seq1|
550
+ if $environment == 1
551
+ seq1 = seq1.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id1][pos] }.join
552
+ end
553
+
526
554
  ali.each_pair do |id2, seq2|
527
555
  if id1 != id2
528
- pid = calculate_pid(seq1, seq2)
529
- s1 = seq1.split('')
530
- s2 = seq2.split('')
556
+ if $environment == 1
557
+ seq2 = seq2.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
558
+ end
559
+
560
+ pid = calculate_pid(seq1, seq2, $col_size)
561
+ s1 = seq1.scan(/\S{#{$col_size}}/)
562
+ s2 = seq2.scan(/\S{#{$col_size}}/)
531
563
 
532
564
  # check PID_MIN
533
565
  if $pidmin && (pid < $pidmin)
534
- $logger.info "Skip alignment between #{id1} and #{id2} " +
535
- "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
566
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
536
567
  next
537
568
  end
538
569
 
539
570
  # check PID_MAX
540
571
  if $pidmax && (pid > $pidmax)
541
- $logger.info "Skip alignment between #{id1} and #{id2} " +
542
- "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
572
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
543
573
  next
544
574
  end
545
575
 
546
576
  s1.each_with_index do |aa1, pos|
547
- aa1.upcase!
548
- aa2 = s2[pos].upcase
577
+ aa2 = s2[pos]
549
578
 
550
579
  if env_labels[id1][pos].include?('X')
551
- $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
580
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1[0].chr} were masked."
552
581
  next
553
582
  end
554
583
 
555
584
  if env_labels[id2][pos].include?('X')
556
- $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
585
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2[0].chr} were masked."
557
586
  next
558
587
  end
559
588
 
560
- unless $amino_acids.include?(aa1)
561
- $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
589
+ unless $amino_acids.include?(aa1[0].chr)
590
+ $logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not a standard amino acid." unless aa1 == $ext_gap
562
591
  next
563
592
  end
564
593
 
565
- unless $amino_acids.include?(aa2)
566
- $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
594
+ unless $amino_acids.include?(aa2[0].chr)
595
+ $logger.warn "#{id1}-#{pos}-#{aa2[0].chr} is not a standard amino acid." unless aa2 == $ext_gap
567
596
  next
568
597
  end
569
598
 
570
- aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1
571
- aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2
599
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
600
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
601
+ env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
572
602
 
573
603
  if $cst_features.empty?
574
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
604
+ $env_classes[env_label].increase_residue_count(aa2[0].chr)
575
605
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
576
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
606
+ $env_classes[env_label].increase_residue_count(aa2[0].chr)
577
607
  else
578
- $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
608
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
579
609
  next
580
610
  end
581
611
 
582
- grp_label = env_labels[id1][pos][1..-1]
583
-
584
- if $aa_env_cnt.has_key? grp_label
585
- if $aa_env_cnt[grp_label].has_key? aa1
586
- $aa_env_cnt[grp_label][aa1] += 1
587
- else
588
- $aa_env_cnt[grp_label][aa1] = 1
589
- end
590
- else
591
- $aa_env_cnt[grp_label] = Hash.new(0)
592
- $aa_env_cnt[grp_label][aa1] = 1
593
- end
594
-
595
- if $aa_tot_cnt.has_key? aa1
596
- $aa_tot_cnt[aa1] += 1
597
- else
598
- $aa_tot_cnt[aa1] = 1
599
- end
612
+ $aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += 1 : $aa_tot_cnt[aa1] = 1
613
+ $aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += 1 : $aa_mut_cnt[aa1] = 1 if aa1 != aa2
600
614
 
601
- if aa1 != aa2
602
- if $aa_mut_cnt.has_key? aa1
603
- $aa_mut_cnt[aa1] += 1
604
- else
605
- $aa_mut_cnt[aa1] = 1
606
- end
607
- end
608
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
615
+ $logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (1) was added to the environments class, #{env_label}."
609
616
  end
610
617
  end
611
618
  end
612
619
  end
613
620
  else
614
621
  # BLOSUM-like weighting
615
- clusters = []
616
- ali.each_pair { |i, s| clusters << [i] }
622
+ clusters = []
623
+ ext_ali = Bio::Alignment::OriginalAlignment.new
624
+
625
+ ali.each_pair do |key, seq|
626
+ clusters << [key]
627
+ if $environment == 1
628
+ ext_seq = seq.split('').map_with_index { |aa, pos| aa == $gap ? $ext_gap : env_labels[key][pos] }.join
629
+ ext_ali.add_seq(ext_seq, key)
630
+ end
631
+ end
632
+
633
+ if $environment == 1
634
+ ali = ext_ali
635
+ end
617
636
 
618
637
  # a loop for single linkage clustering
619
638
  begin
@@ -624,7 +643,7 @@ Options:
624
643
  found = false
625
644
  clusters[i].each do |c1|
626
645
  clusters[j].each do |c2|
627
- if calculate_pid(ali[c1], ali[c2]) >= $weight
646
+ if calculate_pid(ali[c1], ali[c2], $col_size) >= $weight
628
647
  indexes << j
629
648
  found = true
630
649
  break
@@ -655,102 +674,58 @@ Options:
655
674
  clusters.combination(2).each do |cluster1, cluster2|
656
675
  cluster1.each do |id1|
657
676
  cluster2.each do |id2|
658
- seq1 = ali[id1].split('')
659
- seq2 = ali[id2].split('')
677
+ seq1 = ali[id1].scan(/\S{#{$col_size}}/)
678
+ seq2 = ali[id2].scan(/\S{#{$col_size}}/)
660
679
 
661
680
  seq1.each_with_index do |aa1, pos|
662
- aa1.upcase!
663
- aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
681
+ aa2 = seq2[pos]
664
682
 
665
683
  if env_labels[id1][pos].include?('X')
666
- $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
684
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
667
685
  next
668
686
  end
669
687
 
670
688
  if env_labels[id2][pos].include?('X')
671
- $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
689
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
672
690
  next
673
691
  end
674
692
 
675
- unless $amino_acids.include?(aa1)
676
- $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
693
+ unless $amino_acids.include?(aa1[0].chr)
694
+ $logger.warn "#{id1}-#{pos}-#{aa1[0].chr} is not standard amino acid." unless aa1 == $ext_gap
677
695
  next
678
696
  end
679
697
 
680
- unless $amino_acids.include?(aa2)
681
- $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
698
+ unless $amino_acids.include?(aa2[0].chr)
699
+ $logger.warn "#{id2}-#{pos}-#{aa2[0].chr} is not standard amino acid." unless aa2 == $ext_gap
682
700
  next
683
701
  end
684
702
 
685
- aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1
686
- aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2
687
- cnt1 = 1.0 / cluster1.size
688
- cnt2 = 1.0 / cluster2.size
689
- jnt_cnt = cnt1 * cnt2
703
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
704
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
705
+ cnt1 = 1.0 / cluster1.size.to_f
706
+ cnt2 = 1.0 / cluster2.size.to_f
707
+ jnt_cnt = cnt1 * cnt2
708
+ env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
709
+ env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
690
710
 
691
711
  if $cst_features.empty?
692
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
693
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
712
+ $env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
713
+ $env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
694
714
  elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
695
- $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
696
- $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
715
+ $env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
716
+ $env_classes[env_label2].increase_residue_count(aa1[1].chr, jnt_cnt)
697
717
  else
698
- $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
718
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
699
719
  next
700
720
  end
701
721
 
702
- grp_label1 = env_labels[id1][pos][1..-1]
703
- grp_label2 = env_labels[id2][pos][1..-1]
722
+ $aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
723
+ $aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
724
+ $aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1 == aa2
725
+ $aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1 == aa2
704
726
 
705
- if $aa_env_cnt.has_key? grp_label1
706
- if $aa_env_cnt[grp_label1].has_key? aa1
707
- $aa_env_cnt[grp_label1][aa1] += cnt1
708
- else
709
- $aa_env_cnt[grp_label1][aa1] = cnt1
710
- end
711
- else
712
- $aa_env_cnt[grp_label1] = Hash.new(0.0)
713
- $aa_env_cnt[grp_label1][aa1] = cnt1
714
- end
715
-
716
- if $aa_env_cnt.has_key? grp_label2
717
- if $aa_env_cnt[grp_label2].has_key? aa2
718
- $aa_env_cnt[grp_label2][aa2] += cnt2
719
- else
720
- $aa_env_cnt[grp_label2][aa2] = cnt2
721
- end
722
- else
723
- $aa_env_cnt[grp_label2] = Hash.new(0.0)
724
- $aa_env_cnt[grp_label2][aa2] = cnt2
725
- end
726
-
727
- if $aa_tot_cnt.has_key? aa1
728
- $aa_tot_cnt[aa1] += cnt1
729
- else
730
- $aa_tot_cnt[aa1] = cnt1
731
- end
732
-
733
- if $aa_tot_cnt.has_key? aa2
734
- $aa_tot_cnt[aa2] += cnt2
735
- else
736
- $aa_tot_cnt[aa2] = cnt2
737
- end
738
-
739
- if aa1 != aa2
740
- if $aa_mut_cnt.has_key? aa1
741
- $aa_mut_cnt[aa1] += cnt1
742
- else
743
- $aa_mut_cnt[aa1] = cnt1
744
- end
745
- if $aa_mut_cnt.has_key? aa2
746
- $aa_mut_cnt[aa2] += cnt2
747
- else
748
- $aa_mut_cnt[aa2] = cnt2
749
- end
750
- end
751
-
752
- $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
753
- $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
727
+ $logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
728
+ $logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
754
729
  end
755
730
  end
756
731
  end
@@ -799,66 +774,108 @@ HEADER
799
774
  $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
800
775
  end
801
776
 
777
+ if $environment == 0
778
+ $outfh.puts '#'
779
+ $outfh.puts '# Considered environments: substituted a.a.'
780
+ else
781
+ $outfh.puts '#'
782
+ $outfh.puts '# Considered environments: substituted a.a. and substituting a.a.'
783
+ end
784
+
802
785
  # calculate amino acid frequencies and mutabilities, and
803
786
  # print them as default statistics in the header part
804
- ala_factor = if $aa_tot_cnt['A'] == 0
805
- 0.0
806
- elsif $aa_mut_cnt['A'] == 0
807
- 0.0
808
- else
809
- 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
810
- end
811
- $tot_aa = $aa_tot_cnt.values.sum
787
+ if $environment == 0
788
+ ala_factor = if $aa_tot_cnt['A'] == 0
789
+ 0.0
790
+ elsif $aa_mut_cnt['A'] == 0
791
+ 0.0
792
+ else
793
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
794
+ end
795
+ end
796
+
797
+ $tot_aa = $aa_tot_cnt.values.sum
812
798
 
813
799
  $outfh.puts '#'
814
800
  $outfh.puts "# Total amino acid frequencies:\n"
815
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
816
801
 
817
- min_cnt = -1
818
- min_sigma = nil
802
+ if $environment == 0
803
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
804
+ else
805
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9s %9s %8s" % %w[RES ENV TOT_OBS MUT_OBS REL_FREQ]
806
+ end
819
807
 
820
- $amino_acids.each do |res|
821
- if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
822
- if min_cnt < 0
823
- min_cnt = $aa_tot_cnt[res]
824
- min_sigma = min_cnt / $min_cnt_sigma_ratio
825
- elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
826
- min_cnt = $aa_tot_cnt[res]
827
- min_sigma = min_cnt / $min_cnt_sigma_ratio
808
+ min_cnt = 0
809
+ min_sigma = nil
810
+ aas = $environment == 0 ? $amino_acids : $ext_amino_acids
811
+
812
+ aas.each do |aa|
813
+ if ($aa_tot_cnt[aa] / $sigma) < $min_cnt_sigma_ratio
814
+ if $aa_tot_cnt[aa] > 0 and min_cnt > $aa_tot_cnt[aa]
815
+ min_cnt = $aa_tot_cnt[aa]
816
+ elsif min_cnt == 0
817
+ min_cnt = 1
828
818
  end
829
819
 
830
- $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
831
- end
820
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
832
821
 
833
- $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
834
- $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
835
- $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
836
- end
822
+ if $environment == 0
823
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
824
+ "the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa}."
825
+ else
826
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for " +
827
+ "the total count (#{"%.2f" % $aa_tot_cnt[aa]}) of amino acid, #{aa[0].chr} under the environment class #{aa[1..-1]}."
828
+ end
829
+ end
837
830
 
838
- $amino_acids.each do |res|
839
- if $noweight
840
- $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
841
- [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
842
- else
843
- $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
844
- [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
831
+ if $environment == 0
832
+ $aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 1.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
833
+ $aa_rel_mutb[aa] = $aa_mutb[aa] * ala_factor
845
834
  end
835
+
836
+ $aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
846
837
  end
847
838
 
848
- if min_cnt > -1
839
+ if min_cnt > 0
849
840
  $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
841
+
850
842
  if $autosigma
851
843
  $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
852
844
  $sigma = min_sigma
853
845
  end
854
846
  end
855
847
 
848
+ aas.each do |aa|
849
+ columns = $environment == 0 ?
850
+ [aa, $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_mutb[aa], $aa_rel_mutb[aa], $aa_tot_freq[aa]] :
851
+ [aa[0].chr, aa[1..-1], $aa_tot_cnt[aa], $aa_mut_cnt[aa], $aa_tot_freq[aa]]
852
+
853
+ if $noweight
854
+ if $environment == 0
855
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % columns
856
+ else
857
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
858
+ end
859
+ else
860
+ if $environment == 0
861
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % columns
862
+ else
863
+ $outfh.puts "# %-3s %-#{$env_features.size}s %9.2f %9.2f %8.4f" % columns
864
+ end
865
+ end
866
+ end
867
+
856
868
  $outfh.puts '#'
857
869
  $outfh.puts '# RES: Amino acid one letter code'
870
+ $outfh.puts '# ENV: Environment label of amino acid'
858
871
  $outfh.puts '# TOT_OBS: Total count of incidence'
859
872
  $outfh.puts '# MUT_OBS: Total count of mutation'
860
- $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
861
- $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
873
+
874
+ if $environment == 0
875
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
876
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
877
+ end
878
+
862
879
  $outfh.puts '# REL_FREQ: Relative frequency'
863
880
  $outfh.puts '#'
864
881
 
@@ -872,7 +889,7 @@ HEADER
872
889
  # Generating substitution frequency matrices
873
890
  #
874
891
 
875
- # calculating probabilities for each environment
892
+ # calculating probabilities for each environment class
876
893
  $env_classes.values.each do |e|
877
894
  if e.freq_array.sum != 0
878
895
  e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
@@ -880,12 +897,12 @@ HEADER
880
897
  end
881
898
 
882
899
  # count raw frequencies
883
- $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
900
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
884
901
  group_matrices = []
885
902
 
886
903
  # for each combination of environment features
887
904
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
888
- grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
905
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
889
906
 
890
907
  $amino_acids.each_with_index do |aa, aj|
891
908
  freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
@@ -901,6 +918,8 @@ HEADER
901
918
  if $output == 0
902
919
  heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
903
920
  grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
921
+ aa_max_cnt = $aa_tot_cnt.to_a.map { |k, v| v }.max
922
+ mat_col_size = aa_max_cnt.floor.to_s.size + 4
904
923
  $heatmapcol ||= Math::sqrt(group_matrices.size).round
905
924
 
906
925
  group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
@@ -908,7 +927,8 @@ HEADER
908
927
  stem = "#{grp_no}. #{grp_label}"
909
928
  $outfh.puts ">#{grp_label} #{grp_no}"
910
929
  $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
911
- :row_header => $amino_acids)
930
+ :row_header => $amino_acids,
931
+ :col_size => mat_col_size > 7 ? mat_col_size : 7)
912
932
 
913
933
  # for a heat map
914
934
  if $heatmap == 0 or $heatmap == 2
@@ -956,7 +976,8 @@ HEADER
956
976
  # total
957
977
  $outfh.puts '>Total'
958
978
  $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
959
- :row_header => $amino_acids)
979
+ :row_header => $amino_acids,
980
+ :col_size => mat_col_size > 7 ? mat_col_size : 7)
960
981
 
961
982
  if $heatmap == 0 or $heatmap == 2
962
983
  stem = "#{group_matrices.size}. TOTAL"
@@ -999,23 +1020,28 @@ HEADER
999
1020
 
1000
1021
  # when nosmoothing !!!
1001
1022
  if ($output > 0) && $nosmooth
1002
- # reinitialize $tot_cnt_mat for pseudocounts
1003
1023
  $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1004
1024
 
1005
- # for each combination of environment features
1006
- pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1007
-
1008
- # add pseudo counts for each frequency vector
1009
- $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1025
+ # if pseudo count provided, reinitialize $tot_cnt_mat by adding pseudocounts
1026
+ if $add
1027
+ $env_classes.values.each { |e| e.freq_array += $add }
1028
+ end
1010
1029
 
1011
1030
  # re-calculate probability vector for each environment class
1012
- $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1031
+ $env_classes.values.each do |e|
1032
+ if e.freq_array.sum == 0
1033
+ # if no observation, then probabilities are zeros, too
1034
+ e.prob_array = e.freq_array
1035
+ else
1036
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum.to_f
1037
+ end
1038
+ end
1013
1039
 
1014
1040
  group_matrices = []
1015
1041
 
1016
1042
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1017
- grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1018
- grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1043
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1044
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1019
1045
 
1020
1046
  $amino_acids.each_with_index do |aa, aj|
1021
1047
  env_class = group[1].find { |e| e.label.start_with?(aa) }
@@ -1039,7 +1065,6 @@ HEADER
1039
1065
  $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1040
1066
  :row_header => $amino_acids)
1041
1067
 
1042
-
1043
1068
  # for a heat map
1044
1069
  if $heatmap == 0 or $heatmap == 2
1045
1070
  grp_prob_mat.heatmap(:col_header => $amino_acids,
@@ -1134,12 +1159,24 @@ HEADER
1134
1159
 
1135
1160
  if ($smooth == :full) || $p1smooth
1136
1161
  # smooth p1 probabilities if it is full smoothing, or if --p1smooth is on for the partial smoothing procedure
1137
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1162
+ 0.upto($amino_acids.size - 1) do |i|
1163
+ if $environment == 0
1164
+ p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]])
1165
+ else
1166
+ p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum)
1167
+ end
1168
+ end
1138
1169
  $smooth_prob[1] = p1
1139
1170
  elsif ($smooth == :partial)
1140
1171
  # no smoothing for p1 probabilities just as Kenji's subst
1141
1172
  # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1142
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1173
+ 0.upto($amino_acids.size - 1) do |i|
1174
+ if $environment == 0
1175
+ p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]]
1176
+ else
1177
+ p1[i] = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
1178
+ end
1179
+ end
1143
1180
  $smooth_prob[1] = p1
1144
1181
  end
1145
1182
 
@@ -1148,6 +1185,10 @@ HEADER
1148
1185
  #
1149
1186
  env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1150
1187
 
1188
+ if $environment == 1
1189
+ env_labels += $env_features[1..-1].map_with_index { |ef, ei| ef.labels.map { |l| "#{ei + $env_features.size}#{l}" } }
1190
+ end
1191
+
1151
1192
  if $smooth == :partial
1152
1193
  $outfh.puts <<HEADER
1153
1194
  #
@@ -1189,9 +1230,9 @@ HEADER
1189
1230
  # sigma value used is: #{$sigma}
1190
1231
  #
1191
1232
  HEADER
1192
- 1.upto($env_features.size) do |ci|
1233
+ 1.upto(env_labels.size) do |ci|
1193
1234
  # for partial smoothing, only P1 ~ P3, and Pn are considered
1194
- if (ci > 2) && (ci < $env_features.size)
1235
+ if (ci > 2) && (ci < env_labels.size)
1195
1236
  $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1196
1237
  next
1197
1238
  end
@@ -1200,6 +1241,10 @@ HEADER
1200
1241
  c1[0].product(*c1[1..-1]).each do |labels|
1201
1242
  pattern = '.' * $env_features.size
1202
1243
 
1244
+ if $environment == 1
1245
+ pattern += '.' * ($env_features.size - 1)
1246
+ end
1247
+
1203
1248
  labels.each do |label|
1204
1249
  i = label[0].chr.to_i
1205
1250
  l = label[1].chr
@@ -1211,12 +1256,22 @@ HEADER
1211
1256
  next
1212
1257
  end
1213
1258
 
1259
+ if $environment == 1
1260
+ pattern[$env_features.size, 0] = "-"
1261
+ end
1262
+
1214
1263
  # get environments matching the pattern created above
1215
1264
  # and calculate amino acid frequencies and their probabilities for all the environments
1216
- envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1265
+ envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
1217
1266
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1218
1267
  prob_arr = NArray.float($amino_acids.size)
1219
- 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1268
+ 0.upto($amino_acids.size - 1) do |i|
1269
+ if freq_arr.sum == 0
1270
+ prob_arr[i] = 0
1271
+ else
1272
+ prob_arr[i] = freq_arr[i] / freq_arr.sum.to_f
1273
+ end
1274
+ end
1220
1275
 
1221
1276
  # # assess whether a residue type j is compatible with a particular combination of structural features
1222
1277
  # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
@@ -1254,29 +1309,23 @@ HEADER
1254
1309
  if ci == 1
1255
1310
  priors << $smooth_prob[1]
1256
1311
  elsif ci == 2
1257
- labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1312
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each do |c3|
1258
1313
  priors << $smooth_prob[2][c3.to_set]
1259
- }
1260
- elsif ci == $env_features.size
1261
- labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1314
+ end
1315
+ elsif ci == env_labels.size
1316
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each do |c3|
1262
1317
  priors << $smooth_prob[3][c3.to_set]
1263
- }
1318
+ end
1264
1319
  end
1265
1320
 
1266
- # entropy based prior weighting step
1267
- entropy_max = Math::log($amino_acids.size)
1268
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1269
- begin
1270
- p == 0.0 ? s - 1 : s + p * Math::log(p)
1271
- rescue
1272
- #puts "P: #{p}"
1273
- end
1274
- } }
1275
- mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1321
+ # entropy based weighting prior step
1322
+ entropy_max = NMath::log($amino_acids.size)
1323
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
1324
+ mod_entropies = entropies.map { |entropy| (entropy_max - entropy) / entropy_max }
1276
1325
  weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1277
1326
  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1278
1327
 
1279
- # smoothing step
1328
+ # actual smoothing step
1280
1329
  smooth_prob_arr = NArray.float($amino_acids.size)
1281
1330
  big_N = freq_arr.sum.to_f
1282
1331
  small_n = $amino_acids.size.to_f
@@ -1285,8 +1334,8 @@ HEADER
1285
1334
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1286
1335
 
1287
1336
  # normalization step
1288
- smooth_prob_arr_sum = smooth_prob_arr.sum
1289
- 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1337
+ total = smooth_prob_arr.sum
1338
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
1290
1339
 
1291
1340
  # store smoothed probabilities in a hash using a set of environment labels as a key
1292
1341
  if $smooth_prob.has_key?(ci + 1)
@@ -1331,36 +1380,47 @@ HEADER
1331
1380
  #
1332
1381
  HEADER
1333
1382
  # full smoothing
1334
- 1.upto($env_features.size) do |ci|
1383
+ 1.upto(env_labels.size) do |ci|
1335
1384
  env_labels.combination(ci) do |c1|
1336
1385
  c1[0].product(*c1[1..-1]).each do |labels|
1386
+
1337
1387
  pattern = '.' * $env_features.size
1388
+
1389
+ if $environment == 1
1390
+ pattern += '.' * ($env_features.size - 1)
1391
+ end
1392
+
1338
1393
  labels.each do |label|
1339
1394
  j = label[0].chr.to_i
1340
1395
  l = label[1].chr
1341
1396
  pattern[j] = l
1342
1397
  end
1343
1398
 
1399
+ if $environment == 1
1400
+ pattern[$env_features.size, 0] = "-"
1401
+ end
1402
+
1344
1403
  # get environments, frequencies, and probabilities
1345
- envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1404
+ envs = $env_classes.values.select { |env| env.label.match(/^#{pattern}/) }
1346
1405
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1347
1406
  prob_arr = NArray.float($amino_acids.size)
1348
1407
  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1349
1408
 
1350
1409
  # collect priors
1351
- priors = []
1410
+ priors = []
1411
+
1352
1412
  if ci > 1
1353
1413
  labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1354
1414
  else
1355
1415
  priors << $smooth_prob[1]
1356
1416
  end
1357
1417
 
1358
- # entropy based weighting priors
1359
- entropy_max = Math::log($amino_acids.size)
1360
- entropies = priors.map do |prior|
1361
- (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1362
- end
1363
- weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1418
+ # entropy based weighting priors step
1419
+ entropy_max = NMath::log($amino_acids.size)
1420
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0 ? s : s + p * Math::log(p) } }
1421
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1422
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1423
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1364
1424
 
1365
1425
  # smoothing step
1366
1426
  smooth_prob_arr = NArray.float($amino_acids.size)
@@ -1371,8 +1431,8 @@ HEADER
1371
1431
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1372
1432
 
1373
1433
  # normalization step
1374
- smooth_prob_arr_sum = smooth_prob_arr.sum
1375
- 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1434
+ total = smooth_prob_arr.sum
1435
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / total) }
1376
1436
 
1377
1437
  # store smoothed probabilities in a hash using a set of environment labels as a key
1378
1438
  if $smooth_prob.has_key?(ci + 1)
@@ -1389,7 +1449,7 @@ HEADER
1389
1449
 
1390
1450
  # updating smoothed probability array for each environment
1391
1451
  $env_classes.values.each do |env|
1392
- env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1452
+ env.smooth_prob_array = $smooth_prob[env_labels.size + 1][env.label_set]
1393
1453
  end
1394
1454
 
1395
1455
  # sorting environments and build 21X21 substitution matrices
@@ -1526,7 +1586,7 @@ HEADER
1526
1586
  end
1527
1587
 
1528
1588
  grp_logo_mats = []
1529
- factor = $scale / Math::log(2)
1589
+ factor = $scale / NMath::log(2)
1530
1590
 
1531
1591
  $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1532
1592
  # calculating substitution probability matrix for each environment
@@ -1536,6 +1596,11 @@ HEADER
1536
1596
  NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1537
1597
  NMatrix.float($amino_acids.size, $amino_acids.size)
1538
1598
 
1599
+ if $environment == 1
1600
+ # parse substituting aa's environment label
1601
+ tgt_label = grp_label.split('-').last
1602
+ end
1603
+
1539
1604
  $amino_acids.each_with_index do |aa, aj|
1540
1605
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1541
1606
  env.logo_array = $cys == 0 ?
@@ -1543,19 +1608,29 @@ HEADER
1543
1608
  NArray.float($amino_acids.size)
1544
1609
 
1545
1610
  env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1546
- pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1611
+ if $environment == 0
1612
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1613
+ else
1614
+ pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
1615
+ end
1616
+
1547
1617
  odds = prob / pai
1548
- env.logo_array[ai] = factor * Math::log(odds)
1618
+ env.logo_array[ai] = factor * NMath::log(odds)
1549
1619
  grp_logo_mat[aj, ai] = env.logo_array[ai]
1550
1620
  end
1551
1621
 
1552
1622
  # adding log odds ratio for 'U' (J or C) when --cys is 0
1553
1623
  if $cys == 0
1554
- pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1555
- prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1556
- env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1557
- odds = prob / pai
1558
- env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1624
+ if $environment == 0
1625
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1626
+ else
1627
+ pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
1628
+ $aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
1629
+ end
1630
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1631
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1632
+ odds = prob / pai
1633
+ env.logo_array[$amino_acids.size] = factor * NMath::log(odds)
1559
1634
  grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1560
1635
  end
1561
1636
  end
@@ -1569,22 +1644,32 @@ HEADER
1569
1644
 
1570
1645
  $amino_acids.each_with_index do |aa1, aj|
1571
1646
  $amino_acids.each_with_index do |aa2, ai|
1572
- prob = $tot_prob_mat[aj, ai]
1573
- pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1574
- odds = prob / pai
1575
- $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1647
+ prob = $tot_prob_mat[aj, ai]
1648
+
1649
+ if $environment == 0
1650
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1651
+ else
1652
+ pai = 100.0 * $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[ai]) }.map { |k, v| v }.sum
1653
+ end
1654
+
1655
+ odds = prob / pai
1656
+ $tot_logo_mat[aj, ai] = factor * NMath::log(odds)
1576
1657
  end
1577
1658
 
1578
1659
  # adding log odds ratio for 'U' (J or C) when --cys is 0
1579
1660
  if $cys == 0
1580
- pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1661
+ if $environment == 0
1662
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1663
+ else
1664
+ pai = 100.0 * ($aa_tot_freq.select { |k, v| k.start_with?('C') }.map { |k, v| v }.sum +
1665
+ $aa_tot_freq.select { |k, v| k.start_with?('J') }.map { |k, v| v }.sum)
1666
+ end
1581
1667
  prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1582
1668
  odds = prob / pai
1583
- $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1669
+ $tot_logo_mat[aj, $amino_acids.size] = factor * NMath::log(odds)
1584
1670
  end
1585
1671
  end
1586
1672
 
1587
-
1588
1673
  # calculating relative entropy for each amino acid pair H and
1589
1674
  # the expected score E in bit units
1590
1675
  tot_E = 0.0
@@ -1593,10 +1678,22 @@ HEADER
1593
1678
  0.upto($tot_logo_mat.shape[0] - 1) do |j|
1594
1679
  0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1595
1680
  if j != i
1596
- tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1681
+ if $environment == 0
1682
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1683
+ else
1684
+ tot_E += $tot_logo_mat[j, i] *
1685
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
1686
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum / 2.0
1687
+ end
1597
1688
  tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1598
1689
  else
1599
- tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1690
+ if $environment == 0
1691
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1692
+ else
1693
+ tot_E += $tot_logo_mat[j, i] *
1694
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[j]) }.map { |k, v| v }.sum *
1695
+ $aa_tot_freq.select { |k, v| k.start_with?($amino_acids[i]) }.map { |k, v| v }.sum
1696
+ end
1600
1697
  tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1601
1698
  end
1602
1699
  end
@@ -1662,9 +1759,9 @@ HEADER
1662
1759
  heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1663
1760
  :row_header => row_header,
1664
1761
  :rvg_width => $rvg_width,
1665
- :rvg_height => $rvg_height - 50,
1762
+ :rvg_height => $rvg_height,
1666
1763
  :canvas_width => $canvas_width,
1667
- :canvas_height => $canvas_height - 50,
1764
+ :canvas_height => $canvas_height,
1668
1765
  :gradient_beg_color => '#0000FF',
1669
1766
  :gradient_mid_color => '#FFFFFF',
1670
1767
  :gradient_end_color => '#FF0000',
@@ -1674,6 +1771,7 @@ HEADER
1674
1771
  :print_value => $heatmapvalues,
1675
1772
  :print_gradient => false,
1676
1773
  :title => stem,
1774
+ :title_font_scale => 1.0,
1677
1775
  :title_font_size => title_font_size)
1678
1776
  end
1679
1777
  end