RubyGems - egor - Versions diffs - 0.0.2 → 0.0.3 - Mend

egor 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,9 @@
+== 0.0.3 2008-12-09
+* 2 major enhancements:
+  * Added an option '--cys (-y) 2' that does not distinguish J from C, so the 'disulphide bond' environment feature is no longer a prerequisite
+  * Masking works for target amino acid, too
 == 0.0.2 2008-11-13
 * 2 major enhancement:

data/README.rdoc CHANGED Viewed

@@ -2,10 +2,12 @@
 * http://egor.rubyforge.org
 == DESCRIPTION:
 egor: Esst GeneratOR, a program for calculating environment-specific substitution tables
 == FEATURES/PROBLEMS:
 * No more segmentation fault
@@ -14,34 +16,42 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
 * Full smoothing supported
 * In theory, infinite number of environment features can be handled
+== INSTALL:
+    $ sudo gem install egor
 == BASIC USAGE:
     $ egor -l TEMLIST-file -c classdef.dat
         or
     $ egor -f TEM-file -c classdef.dat
 == OPTIONS:
-    --tem-file (-f) STRING: a tem file
-    --tem-list (-l) STRING: a list for tem files
-    --classdef (-c) STRING: a file for the defintion of environments (default: 'classdef.dat')
-    --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
-    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (not supported yet)
+    --tem-file (-f) FILE: a tem file
+    --tem-list (-l) FILE: a list for tem files
+    --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
+    --outfile (-o) FILE: output filename (default 'allmat.dat')
+    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
     --noweight: calculate substitution counts with no weights (default)
     --smooth (-s) INTEGER:
         0 for partial smoothing (default)
         1 for full smoothing
     --nosmooth: perform no smoothing operation
-    --cys (-y) INTEGER: (!!!not implemented yet!!!)
-        0 for using C and J only for structure
-        1 for both structure and sequence (default)
+    --cys (-y) INTEGER:
+        0 for using C and J only for structure (default)
+        1 for both structure and sequence
+        2 for using only C for both
     --output INTEGER:
         0 for raw counts (no-smoothing performed)
         1 for probabilities
         2 for log-odds (default)
     --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
     --sigma DOUBLE: change the sigma value for smoothing (default 5)
-    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
-    --penv: use environment-dependent frequencies for log-odds calculation (default false) (!!!not implemented yet!!!)
+    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/=classes)
+    --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
     --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
     --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
     --verbose (-v) INTEGER
@@ -52,17 +62,19 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
     --version: print version
     --help (-h): show help
 == REQUIREMENTS:
 * ruby 1.8.6 or above (http://www.ruby-lang.org)
 * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
+The following RubyGems will be installed automatically if you have rubygems installed on your machine
 * narray (http://narray.rubyforge.org/)
 * facets (http://facets.rubyforge.org/)
 * bio (http://bioruby.open-bio.org/)
+* simple_memoize (http://github.com/JackDanger/simple_memoize/tree/master)
-== INSTALL:
-    $ sudo gem install egor
 == LICENSE:

data/egor.gemspec CHANGED Viewed

@@ -2,11 +2,12 @@
 Gem::Specification.new do |s|
   s.name = %q{egor}
-  s.version = "0.0.1"
+  s.version = "0.0.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Semin Lee"]
-  s.date = %q{2008-11-10}
+  s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
+  s.date = %q{2008-12-09}
   s.default_executable = %q{egor}
   s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
   s.email = ["seminlee@gmail.com"]
@@ -20,8 +21,9 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
   s.rubyforge_project = %q{egor}
   s.rubygems_version = %q{1.3.1}
+  s.signing_key = %q{/Users/semin/.gem/gem-private_key.pem}
   s.summary = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
-  s.test_files = ["test/test_helper.rb", "test/test_egor.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_nmatrix_extensions.rb", "test/test_egor_cli.rb"]
+  s.test_files = ["test/test_egor.rb", "test/test_egor_cli.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_helper.rb", "test/test_nmatrix_extensions.rb"]
   if s.respond_to? :specification_version then
     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
@@ -32,14 +34,14 @@ Gem::Specification.new do |s|
       s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
       s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
       s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
-      s.add_development_dependency(%q<newgem>, [">= 1.0.7"])
+      s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
       s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
     else
       s.add_dependency(%q<narray>, [">= 0.5.9.5"])
       s.add_dependency(%q<bio>, [">= 1.2.1"])
       s.add_dependency(%q<facets>, [">= 2.4.5"])
       s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
-      s.add_dependency(%q<newgem>, [">= 1.0.7"])
+      s.add_dependency(%q<newgem>, [">= 1.1.0"])
       s.add_dependency(%q<hoe>, [">= 1.8.0"])
     end
   else
@@ -47,7 +49,7 @@ Gem::Specification.new do |s|
     s.add_dependency(%q<bio>, [">= 1.2.1"])
     s.add_dependency(%q<facets>, [">= 2.4.5"])
     s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
-    s.add_dependency(%q<newgem>, [">= 1.0.7"])
+    s.add_dependency(%q<newgem>, [">= 1.1.0"])
     s.add_dependency(%q<hoe>, [">= 1.8.0"])
   end
 end

data/lib/egor/cli.rb CHANGED Viewed

@@ -44,7 +44,7 @@ Options:
     --tem-file (-f) FILE: a tem file
     --tem-list (-l) FILE: a list for tem files
     --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
-    --outfile (-o) FILE: output filename ("allmat.dat" if not specified)
+    --outfile (-o) FILE: output filename (default 'allmat.dat')
     --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
     --noweight: calculate substitution counts with no weights (default)
     --smooth (-s) INTEGER:
@@ -54,6 +54,7 @@ Options:
     --cys (-y) INTEGER:
         0 for using C and J only for structure (default)
         1 for both structure and sequence
+        2 for using only C for both
     --output INTEGER:
         0 for raw counts (no-smoothing performed)
         1 for probabilities
@@ -152,12 +153,12 @@ Options:
         $cys          = 0
         $penv         = false
-        $aa_tot_obs   = {}
-        $aa_mut_obs   = {}
+        $aa_tot_obs   = Hash.new(0)
+        $aa_mut_obs   = Hash.new(0)
         $aa_mutb      = {}
         $aa_rel_mutb  = {}
         $aa_rel_freq  = {}
-        $env_aa_obs   = {}
+        $env_aa_obs   = Hash.new(0)
         $smooth_prob  = {}
         $tot_freq_mat = nil
         $tot_prob_mat = nil
@@ -200,7 +201,7 @@ Options:
           when '--outfile'
             $outfile      = arg
           when '--cys'
-            $cys          = (arg.to_i == 1 ? false : true)
+            $cys          = arg.to_i
           when '--weight'
             $weight       = arg.to_i
           when '--sigma'
@@ -255,10 +256,12 @@ Options:
         # Reading Environment Class Definition File
         #
+        # set amino_acids
+        $amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
         # an array for storing all environment feature objects
         $env_features = []
         # an array for storing indexes of constrained environment features
         $cst_features = []
@@ -310,7 +313,7 @@ Options:
         }.inject { |pro, lb|
           pro.product(lb)
         }.each_with_index { |e, i|
-          $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
+          $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
         }
         # Part 4.
@@ -322,291 +325,308 @@ Options:
         $outfh = File.open($outfile, "w")
         if $tem_file
-          $tem_list = [$tem_file]
+          $tem_list_io = StringIO.new($tem_file)
         end
         if $tem_list
-          IO.foreach($tem_list) do |tem_file|
-            tem_file.chomp!
+          $tem_list_io = File.open($tem_list)
+        end
+        $tem_list_io.each_line do |tem_file|
+          tem_file.chomp!
+          $logger.info ">>> Analysing #{tem_file} ..."
+          ali = Bio::Alignment::OriginalAlignment.new
+          ff  = Bio::FlatFile.auto(tem_file)
+          ff.each_entry do |pir|
+            if pir.definition == "sequence"
+              ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
+            end
+          end
+          if ali.size < 2
+            $logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
+            next
+          end
-            $logger.info ">>> Analysing #{tem_file} ..."
+          $ali_size   += 1
+          env_labels  = {}
+          disulphide  = {}
-            ali = Bio::Alignment::OriginalAlignment.new
-            ff  = Bio::FlatFile.auto(tem_file)
+          ali.each_pair do |key, seq|
+            # check disulphide bond environment first!
+            ff.rewind
             ff.each_entry do |pir|
-              if pir.definition == "sequence"
-                ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
+              if (pir.entry_id == key) && (pir.definition == "disulphide")
+                disulphide[key] = pir.data.gsub("\n", "").split("")
               end
             end
-            $ali_size   += 1
-            env_labels  = {}
-            disulphide  = {}
+            $env_features.each_with_index do |ec, ei|
+              env_labels[key] = [] unless env_labels.has_key?(key)
-            ali.each_pair do |key, seq|
-              # check disulphide bond environment first!
               ff.rewind
               ff.each_entry do |pir|
-                if (pir.entry_id == key) && (pir.definition == "disulphide")
-                  disulphide[key] = pir.data.gsub("\n", "").split("")
-                end
-              end
-              $env_features.each_with_index do |ec, ei|
-                env_labels[key] = [] unless env_labels.has_key?(key)
-                ff.rewind
-                ff.each_entry do |pir|
-                  if (pir.entry_id == key) && (pir.definition == ec.name)
-                    labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
-                      if sym == "-"
-                        "-"
-                      elsif sym == "X" || sym == "x"
-                        "X"
+                if (pir.entry_id == key) && (pir.definition == ec.name)
+                  labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
+                    if sym == "-"
+                      "-"
+                    elsif sym == "X" || sym == "x"
+                      "X"
+                    else
+                      if ei == 0 # Amino Acid Environment Feature
+                        (( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
                       else
-                        if ei == 0 # Amino Acid Environment Feature
-                          ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
-                        else
-                          ec.labels[ec.symbols.index(sym)]
-                        end
+                        ec.labels[ec.symbols.index(sym)]
                       end
                     end
+                  end
-                    if env_labels[key].empty?
-                      env_labels[key] = labels
-                    else
-                      env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
-                    end
+                  if env_labels[key].empty?
+                    env_labels[key] = labels
+                  else
+                    env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
                   end
                 end
               end
             end
+          end
+          if $noweight
+            ali.each_pair do |id1, seq1|
+              ali.each_pair do |id2, seq2|
+                if id1 != id2
+                  pid  = calc_pid(seq1, seq2)
+                  s1 = seq1.split("")
+                  s2 = seq2.split("")
+                  # check PID_MIN
+                  if $pidmin && (pid < $pidmin)
+                    $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
+                    next
+                  end
-            if $noweight
-              ali.each_pair do |id1, seq1|
-                ali.each_pair do |id2, seq2|
-                  if id1 != id2
-                    pid  = calc_pid(seq1, seq2)
-                    s1 = seq1.split("")
-                    s2 = seq2.split("")
-                    # check PID_MIN
-                    if $pidmin && (pid < $pidmin)
-                      $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
+                  # check PID_MAX
+                  if $pidmax && (pid > $pidmax)
+                    $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
+                    next
+                  end
+                  s1.each_with_index do |aa1, pos|
+                    aa1.upcase!
+                    aa2 = s2[pos].upcase
+                    if env_labels[id1][pos].include?("X")
+                      $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
                       next
                     end
-                    # check PID_MAX
-                    if $pidmax && (pid > $pidmax)
-                      $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
+                    if env_labels[id2][pos].include?("X")
+                      $logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
                       next
                     end
-                    s1.each_with_index do |aa1, pos|
-                      if env_labels[id1][pos].include?("X")
-                        $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
-                        next
-                      end
-                      aa1.upcase!
-                      aa2 = s2[pos].upcase
-                      if !$amino_acids.include?(aa1)
-                        $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
-                        next
-                      end
+                    if !$amino_acids.include?(aa1)
+                      $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
+                      next
+                    end
-                      if !$amino_acids.include?(aa2)
-                        $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
-                        next
-                      end
+                    if !$amino_acids.include?(aa2)
+                      $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
+                      next
+                    end
-                      aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
-                      aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
+                    aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
+                    aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
-                      if $cst_features.empty?
-                        $envs[env_labels[id1][pos]].increase_residue_count(aa2)
-                      elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
-                             env_labels[id2][pos].split("").values_at(*$cst_features))
-                        $envs[env_labels[id1][pos]].increase_residue_count(aa2)
-                      else
-                        $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
-                        next
-                      end
+                    if $cst_features.empty?
+                      $envs[env_labels[id1][pos]].increase_residue_count(aa2)
+                    elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
+                      $envs[env_labels[id1][pos]].increase_residue_count(aa2)
+                    else
+                      $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
+                      next
+                    end
-                      grp_label = env_labels[id1][pos][1..-1]
+                    grp_label = env_labels[id1][pos][1..-1]
-                      if $env_aa_obs.has_key? grp_label
-                        if $env_aa_obs[grp_label].has_key? aa1
-                          $env_aa_obs[grp_label][aa1] += 1
-                        else
-                          $env_aa_obs[grp_label][aa1] = 1
-                        end
+                    if $env_aa_obs.has_key? grp_label
+                      if $env_aa_obs[grp_label].has_key? aa1
+                        $env_aa_obs[grp_label][aa1] += 1
                       else
-                        $env_aa_obs[grp_label] = Hash.new(0)
                         $env_aa_obs[grp_label][aa1] = 1
                       end
+                    else
+                      $env_aa_obs[grp_label] = Hash.new(0)
+                      $env_aa_obs[grp_label][aa1] = 1
+                    end
-                      if $aa_tot_obs.has_key? aa1
-                        $aa_tot_obs[aa1] += 1
-                      else
-                        $aa_tot_obs[aa1] = 1
-                      end
+                    if $aa_tot_obs.has_key? aa1
+                      $aa_tot_obs[aa1] += 1
+                    else
+                      $aa_tot_obs[aa1] = 1
+                    end
-                      if aa1 != aa2
-                        if $aa_mut_obs.has_key? aa1
-                          $aa_mut_obs[aa1] += 1
-                        else
-                          $aa_mut_obs[aa1] = 1
-                        end
+                    if aa1 != aa2
+                      if $aa_mut_obs.has_key? aa1
+                        $aa_mut_obs[aa1] += 1
+                      else
+                        $aa_mut_obs[aa1] = 1
                       end
-                      $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
                     end
+                    $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
                   end
                 end
               end
-            else
-              # BLOSUM-like weighting
-              clusters = []
-              ali.each_pair { |i, s| clusters << [i] }
-              # a loop for single linkage clustering
-              begin
-                continue = false
-                0.upto(clusters.size - 2) do |i|
-                  indexes = []
-                  (i + 1).upto(clusters.size - 1) do |j|
-                    found = false
-                    clusters[i].each do |c1|
-                      clusters[j].each do |c2|
-                        if calc_pid(ali[c1], ali[c2]) >= $weight
-                          indexes << j
-                          found = true
-                          break
-                        end
+            end
+          else
+            # BLOSUM-like weighting
+            clusters = []
+            ali.each_pair { |i, s| clusters << [i] }
+            # a loop for single linkage clustering
+            begin
+              continue = false
+              0.upto(clusters.size - 2) do |i|
+                indexes = []
+                (i + 1).upto(clusters.size - 1) do |j|
+                  found = false
+                  clusters[i].each do |c1|
+                    clusters[j].each do |c2|
+                      if calc_pid(ali[c1], ali[c2]) >= $weight
+                        indexes << j
+                        found = true
+                        break
                       end
-                      break if found
                     end
+                    break if found
                   end
+                end
-                  unless indexes.empty?
-                    continue  = true
-                    group     = clusters[i]
-                    indexes.each do |k|
-                      group       = group.concat(clusters[k])
-                      clusters[k] = nil
-                    end
-                    clusters[i] = group
-                    clusters.compact!
+                unless indexes.empty?
+                  continue  = true
+                  group     = clusters[i]
+                  indexes.each do |k|
+                    group       = group.concat(clusters[k])
+                    clusters[k] = nil
                   end
+                  clusters[i] = group
+                  clusters.compact!
                 end
-              end while(continue)
-              clusters.combination(2).each do |cluster1, cluster2|
-                cluster1.each do |id1|
-                  cluster2.each do |id2|
-                    seq1 = ali[id1].split("")
-                    seq2 = ali[id2].split("")
-                    seq1.each_with_index do |aa1, pos|
-                      if env_labels[id1][pos].include?("X")
-                        $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
-                        next
-                      end
+              end
+            end while(continue)
-                      aa1.upcase!
-                      aa2 = seq2[pos].upcase
+            clusters.combination(2).each do |cluster1, cluster2|
+              cluster1.each do |id1|
+                cluster2.each do |id2|
+                  seq1 = ali[id1].split("")
+                  seq2 = ali[id2].split("")
-                      if !$amino_acids.include?(aa1)
-                        $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
-                        next
-                      end
+                  seq1.each_with_index do |aa1, pos|
+                    aa1.upcase!
+                    aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
-                      if !$amino_acids.include?(aa2)
-                        $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
-                        next
-                      end
+                    if env_labels[id1][pos].include?("X")
+                      $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
+                      next
+                    end
-                      aa1   = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
-                      aa2   = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
-                      size1 = cluster1.size
-                      size2 = cluster2.size
-                      obs1  = 1.0 / size1
-                      obs2  = 1.0 / size2
-                      if $cst_features.empty?
-                        $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
-                        $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
-                      elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
-                             env_labels[id2][pos].split("").values_at(*$cst_features))
-                        $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
-                        $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
-                      else
-                        $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
-                        next
-                      end
+                    if env_labels[id2][pos].include?("X")
+                      $logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
+                      next
+                    end
+                    if !$amino_acids.include?(aa1)
+                      $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
+                      next
+                    end
+                    if !$amino_acids.include?(aa2)
+                      $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
+                      next
+                    end
+                    aa1   = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
+                    aa2   = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
+                    size1 = cluster1.size
+                    size2 = cluster2.size
+                    obs1  = 1.0 / size1
+                    obs2  = 1.0 / size2
+                    if $cst_features.empty?
+                      $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
+                      $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
+                    elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
+                      $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
+                      $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
+                    else
+                      $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
+                      next
+                    end
-                      grp_label1 = env_labels[id1][pos][1..-1]
-                      grp_label2 = env_labels[id2][pos][1..-1]
+                    grp_label1 = env_labels[id1][pos][1..-1]
+                    grp_label2 = env_labels[id2][pos][1..-1]
-                      if $env_aa_obs.has_key? grp_label1
-                        if $env_aa_obs[grp_label1].has_key? aa1
-                          $env_aa_obs[grp_label1][aa1] += obs1
-                        else
-                          $env_aa_obs[grp_label1][aa1] = obs1
-                        end
+                    if $env_aa_obs.has_key? grp_label1
+                      if $env_aa_obs[grp_label1].has_key? aa1
+                        $env_aa_obs[grp_label1][aa1] += obs1
                       else
-                        $env_aa_obs[grp_label1] = Hash.new(0.0)
                         $env_aa_obs[grp_label1][aa1] = obs1
                       end
+                    else
+                      $env_aa_obs[grp_label1] = Hash.new(0.0)
+                      $env_aa_obs[grp_label1][aa1] = obs1
+                    end
-                      if $env_aa_obs.has_key? grp_label2
-                        if $env_aa_obs[grp_label2].has_key? aa2
-                          $env_aa_obs[grp_label2][aa2] += obs2
-                        else
-                          $env_aa_obs[grp_label2][aa2] = obs2
-                        end
+                    if $env_aa_obs.has_key? grp_label2
+                      if $env_aa_obs[grp_label2].has_key? aa2
+                        $env_aa_obs[grp_label2][aa2] += obs2
                       else
-                        $env_aa_obs[grp_label2] = Hash.new(0.0)
                         $env_aa_obs[grp_label2][aa2] = obs2
                       end
+                    else
+                      $env_aa_obs[grp_label2] = Hash.new(0.0)
+                      $env_aa_obs[grp_label2][aa2] = obs2
+                    end
-                      if $aa_tot_obs.has_key? aa1
-                        $aa_tot_obs[aa1] += obs1
-                      else
-                        $aa_tot_obs[aa1] = obs1
-                      end
+                    if $aa_tot_obs.has_key? aa1
+                      $aa_tot_obs[aa1] += obs1
+                    else
+                      $aa_tot_obs[aa1] = obs1
+                    end
-                      if $aa_tot_obs.has_key? aa2
-                        $aa_tot_obs[aa2] += obs2
+                    if $aa_tot_obs.has_key? aa2
+                      $aa_tot_obs[aa2] += obs2
+                    else
+                      $aa_tot_obs[aa2] = obs2
+                    end
+                    if aa1 != aa2
+                      if $aa_mut_obs.has_key? aa1
+                        $aa_mut_obs[aa1] += obs1
                       else
-                        $aa_tot_obs[aa2] = obs2
+                        $aa_mut_obs[aa1] = obs1
                       end
-                      if aa1 != aa2
-                        if $aa_mut_obs.has_key? aa1
-                          $aa_mut_obs[aa1] += obs1
-                        else
-                          $aa_mut_obs[aa1] = obs1
-                        end
-                        if $aa_mut_obs.has_key? aa2
-                          $aa_mut_obs[aa2] += obs2
-                        else
-                          $aa_mut_obs[aa2] = obs2
-                        end
+                      if $aa_mut_obs.has_key? aa2
+                        $aa_mut_obs[aa2] += obs2
+                      else
+                        $aa_mut_obs[aa2] = obs2
                       end
-                      $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
-                      $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
                     end
+                    $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
+                    $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
                   end
                 end
               end
-            end # if !$nosmooth
-          end # IO.foreach($tem_list)
+            end
+          end # if !$nosmooth
+        end
-          # print out default header
-          $outfh.puts <<HEADER
+        # print out default header
+        $outfh.puts <<HEADER
+#
 # Environment-specific amino acid substitution matrices
 # Creator: egor version #{Egor::VERSION}
 # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
@@ -616,9 +636,9 @@ Options:
 #
 HEADER
-          $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
+        $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
-          $outfh.puts <<HEADER
+        $outfh.puts <<HEADER
 #
 # (read in from #{$classdef})
 #
@@ -632,164 +652,164 @@ HEADER
 #
 HEADER
+        if $noweight
+          $outfh.puts "# Weighting scheme: none"
+        else
+          $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
+        end
+        # calculate amino acid frequencies and mutabilities, and
+        # print them as default statistics in the header part
+        ala_factor  = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
+        $tot_aa     = $aa_tot_obs.values.sum
+        $outfh.puts "#"
+        $outfh.puts "# Total amino acid frequencies:\n"
+        $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
+        $amino_acids.each do |res|
+          $aa_mutb[res]     = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
+          $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
+          $aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
+        end
+        $amino_acids.each do |res|
           if $noweight
-            $outfh.puts "# Weighting scheme: none"
+            $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
+              [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
           else
-            $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
+            $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
+              [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
           end
+        end
-          # calculate amino acid frequencies and mutabilities, and
-          # print them as default statistics in the header part
-          ala_factor  = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
-          $tot_aa     = $aa_tot_obs.values.sum
-          $outfh.puts "#"
-          $outfh.puts "# Total amino acid frequencies:\n"
-          $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
+        # Part 5.
+        #
+        # Calculating substitution frequency tables
+        #
-          $aa_tot_obs.each_pair do |res, freq|
-            $aa_mutb[res]      = $aa_mut_obs[res] / freq.to_f
-            $aa_rel_mutb[res]  = $aa_mutb[res] * ala_factor
-            $aa_rel_freq[res]  = freq / $tot_aa.to_f
+        # calculating probabilities for each environment
+        $envs.values.each do |e|
+          if e.freq_array.sum != 0
+            e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
           end
+        end
-          $amino_acids.each do |res|
-            if $noweight
-              $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
-                [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
-            else
-              $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
-                [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
-            end
-          end
+        # count raw frequencies
+        $tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
+        # for each combination of environment features
+        env_groups = $envs.values.group_by { |env| env.label[1..-1] }
-          # Part 5.
-          #
-          # Calculating substitution frequency tables
-          #
+        env_groups.to_a.sort_by { |env_group|
+          # a bit clumsy sorting here...
+          env_group[0].split("").map_with_index { |l, i|
+            $env_features[i + 1].labels.index(l)
+          }
+        }.each_with_index do |group, group_no|
+          grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
-          # calculating probabilities for each environment
-          $envs.values.each do |e|
-            if e.freq_array.sum != 0
-              e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
-            end
+          $amino_acids.each_with_index do |aa, ai|
+            freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
+            0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
+          end
+          $tot_freq_mat += grp_freq_mat
+          if $output == 0
+            $outfh.puts ">#{group[0]} #{group_no}"
+            $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
           end
+        end
+        if $output == 0
+          $outfh.puts ">Total"
+          $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+          exit 0
+        end
-          # count raw frequencies
-          $tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
+        # Part 6.
+        #
+        # Calculating substitution probability tables
+        #
+        if $output == 1
+          $outfh.puts <<HEADER
+#
+# Each column (j) represents the probability distribution for the
+# likelihood of acceptance of a mutational event by a residue type j in
+# a particular structural environment (specified after >) leading to
+# any other residue type (i) and sums up to 100.
+#
+HEADER
+        end
+        if ($output > 0) && $nosmooth
+          # Probability matrices
+          $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
           # for each combination of environment features
           env_groups = $envs.values.group_by { |env| env.label[1..-1] }
           env_groups.to_a.sort_by { |env_group|
             # a bit clumsy sorting here...
             env_group[0].split("").map_with_index { |l, i|
               $env_features[i + 1].labels.index(l)
             }
           }.each_with_index do |group, group_no|
-            grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
+            grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
             $amino_acids.each_with_index do |aa, ai|
-              freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
-              0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
+              prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
+              0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
             end
-            $tot_freq_mat += grp_freq_mat
+            $tot_prob_mat += grp_prob_mat
-            if $output == 0
+            if ($output == 1)
               $outfh.puts ">#{group[0]} #{group_no}"
-              $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+              $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
             end
           end
-          if $output == 0
+          if ($output == 1)
             $outfh.puts ">Total"
-            $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+            $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+            $outfh.close
             exit 0
           end
+        end
-          # Part 6.
+        # for smoothing...
+        if ($output > 0) && !$nosmooth
           #
-          # Calculating substitution probability tables
+          # p1 probability
           #
-          if $output == 1
-            $outfh.puts <<HEADER
-#
-# Each column (j) represents the probability distribution for the
-# likelihood of acceptance of a mutational event by a residue type j in
-# a particular structural environment (specified after >) leading to
-# any other residue type (i) and sums up to 100.
-#
-HEADER
-          end
-          if ($output > 0) && $nosmooth
-            # Probability matrices
-            $tot_prob_mat = NMatrix.float(21, 21)
-            # for each combination of environment features
-            env_groups = $envs.values.group_by { |env| env.label[1..-1] }
-            env_groups.to_a.sort_by { |env_group|
-              # a bit clumsy sorting here...
-              env_group[0].split("").map_with_index { |l, i|
-                $env_features[i + 1].labels.index(l)
-              }
-            }.each_with_index do |group, group_no|
-              grp_prob_mat = NMatrix.float(21,21)
-              $amino_acids.each_with_index do |aa, ai|
-                prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
-                0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
-              end
-              $tot_prob_mat += grp_prob_mat
-              if ($output == 1)
-                $outfh.puts ">#{group[0]} #{group_no}"
-                $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
-              end
-            end
-            if ($output == 1)
-              $outfh.puts ">Total"
-              $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
-              $outfh.close
-              exit 0
-            end
+          p1      = NArray.float($amino_acids.size)
+          a0      = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
+          big_N   = $tot_aa.to_f
+          small_n = $amino_acids.size.to_f
+          omega1  = 1.0 / (1 + big_N / ($sigma * small_n))
+          omega2  = 1.0 - omega1
+          if $smooth == :partial
+            # for partial smoothing, p1 probability is not smoothed!
+            0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
+            $smooth_prob[1] = p1
+          else
+            # for full smoothing, p1 probability is smoothed
+            0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
+            $smooth_prob[1] = p1
           end
-          # for smoothing...
-          if ($output > 0) && !$nosmooth
-            #
-            # p1 probability
-            #
-            p1      = NArray.float(21)
-            a0      = NArray.float(21).fill(1 / 21.0)
-            big_N   = $tot_aa.to_f
-            small_n = 21.0
-            omega1  = 1.0 / (1 + big_N / ($sigma * small_n))
-            omega2  = 1.0 - omega1
-            if $smooth == :partial
-              # for partial smoothing, p1 probability is not smoothed!
-              0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
-              $smooth_prob[1] = p1
-            else
-              # for full smoothing, p1 probability is smoothed
-              0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
-              $smooth_prob[1] = p1
-            end
-            #
-            # p2 and above
-            #
-            env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
+          #
+          # p2 and above
+          #
+          env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
-            if $smooth == :partial
-              $outfh.puts <<HEADER
+          if $smooth == :partial
+            $outfh.puts <<HEADER
 #
 # Partial Smoothing:
 #
@@ -813,106 +833,107 @@ HEADER
 # Weights (omegas) are calculated as in Topham et al. 1993)
 #
 # sigma value used is:  5.00
+#
 HEADER
-              1.upto($env_features.size) do |ci|
-                # for partial smoothing, only P1 ~ P3, and Pn are considered
-                next if (ci > 2) && (ci < $env_features.size)
-                env_labels.combination(ci) do |c1|
-                  Enumerable.cart_prod(*c1).each do |labels|
-                    pattern = "." * $env_features.size
-                    labels.each do |label|
-                      i = label[0].chr.to_i
-                      l = label[1].chr
-                      pattern[i] = l
-                    end
+            1.upto($env_features.size) do |ci|
+              # for partial smoothing, only P1 ~ P3, and Pn are considered
+              next if (ci > 2) && (ci < $env_features.size)
+              env_labels.combination(ci) do |c1|
+                Enumerable.cart_prod(*c1).each do |labels|
+                  pattern = "." * $env_features.size
+                  labels.each do |label|
+                    i = label[0].chr.to_i
+                    l = label[1].chr
+                    pattern[i] = l
+                  end
-                    if pattern =~ /^\./
-                      $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
-                      next
-                    end
+                  if pattern =~ /^\./
+                    $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
+                    next
+                  end
-                    # get environmetns, frequencies, and probabilities
-                    envs      = $envs.values.select { |env| env.label.match(pattern.to_re) }
-                    freq_arr  = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
-                    prob_arr  = NArray.float(21)
-                    0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
-  #                  # assess whether a residue type j is compatible with a particular combination of structural features
-  #                  # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
-  #                  if ci == $env_features.size
-  #                    aa_label        = labels.find { |l| l.match(/^0/) }[1].chr
-  #                    sub_pattern     = "." * $env_features.size
-  #                    sub_pattern[0]  = aa_label
-  #                    sub_freq_sum    = 0
-  #
-  #                    labels[1..-1].each do |label|
-  #                      next if label.start_with?("0")
-  #                      i               = label[0].chr.to_i
-  #                      l               = label[1].chr
-  #                      sub_pattern[i]  = l
-  #                      sub_envs        = $envs.values.select { |env| env.label.match(pattern.to_re) }
-  #                      sub_freq_arr    = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
-  #                      sub_freq_sum    += sub_freq_arr.sum
-  #                    end
-  #
-  #                    if sub_freq_sum == 0
-  #                      if $smooth_prob.has_key?(ci + 1)
-  #                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
-  #                      else
-  #                        $smooth_prob[ci + 1] = {}
-  #                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
-  #                      end
-  #                      $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
-  #                      next
-  #                    end
-  #                  end
-                    # collect priors if ci > 1
-                    priors  = []
-                    if ci == 2
-                      labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
-                        priors << $smooth_prob[2][c3.to_set]
-                      }
-                    elsif ci == $env_features.size
-                      labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
-                        priors << $smooth_prob[3][c3.to_set]
-                      }
-                    end
+                  # get environments, frequencies, and probabilities
+                  envs      = $envs.values.select { |env| env.label.match(pattern.to_re) }
+                  freq_arr  = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+                  prob_arr  = NArray.float($amino_acids.size)
+                  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
+#                  # assess whether a residue type j is compatible with a particular combination of structural features
+#                  # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
+#                  if ci == $env_features.size
+#                    aa_label        = labels.find { |l| l.match(/^0/) }[1].chr
+#                    sub_pattern     = "." * $env_features.size
+#                    sub_pattern[0]  = aa_label
+#                    sub_freq_sum    = 0
+#
+#                    labels[1..-1].each do |label|
+#                      next if label.start_with?("0")
+#                      i               = label[0].chr.to_i
+#                      l               = label[1].chr
+#                      sub_pattern[i]  = l
+#                      sub_envs        = $envs.values.select { |env| env.label.match(pattern.to_re) }
+#                      sub_freq_arr    = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+#                      sub_freq_sum    += sub_freq_arr.sum
+#                    end
+#
+#                    if sub_freq_sum == 0
+#                      if $smooth_prob.has_key?(ci + 1)
+#                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
+#                      else
+#                        $smooth_prob[ci + 1] = {}
+#                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
+#                      end
+#                      $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
+#                      next
+#                    end
+#                  end
+                  # collect priors if ci > 1
+                  priors  = []
+                  if ci == 2
+                    labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
+                      priors << $smooth_prob[2][c3.to_set]
+                    }
+                  elsif ci == $env_features.size
+                    labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
+                      priors << $smooth_prob[3][c3.to_set]
+                    }
+                  end
-                    # entropy based weighting priors
-                    entropy_max     = Math::log(21)
-                    entropies       = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
-                    mod_entropies   = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
-                    weights         = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
-                    weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
-                    # smoothing step
-                    smooth_prob_arr = NArray.float(21)
-                    big_N           = freq_arr.sum.to_f
-                    small_n         = 21.0
-                    omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
-                    omega2          = 1.0 - omega1
-                    0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
-                    # normalization step
-                    smooth_prob_arr_sum = smooth_prob_arr.sum
-                    0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
-                    # store smoothed probabilties in a hash using a set of envrionment labels as a key
-                    if !$smooth_prob.has_key?(ci + 1)
-                      $smooth_prob[ci + 1] = {}
-                      $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
-                    else
-                      $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
-                    end
+                  # entropy based weighting priors
+                  entropy_max     = Math::log($amino_acids.size)
+                  entropies       = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
+                  mod_entropies   = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
+                  weights         = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
+                  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
+                  # smoothing step
+                  smooth_prob_arr = NArray.float($amino_acids.size)
+                  big_N           = freq_arr.sum.to_f
+                  small_n         = $amino_acids.size.to_f
+                  omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
+                  omega2          = 1.0 - omega1
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
+                  # normalization step
+                  smooth_prob_arr_sum = smooth_prob_arr.sum
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
+                  # store smoothed probabilities in a hash using a set of environment labels as a key
+                  if !$smooth_prob.has_key?(ci + 1)
+                    $smooth_prob[ci + 1] = {}
+                    $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
+                  else
+                    $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
                   end
                 end
               end
-            else
-              $outfh.puts <<HEADER
+            end
+          else
+            $outfh.puts <<HEADER
 #
 # Full Smoothing:
 #
@@ -939,193 +960,194 @@ HEADER
 # Weights (omegas) are calculated as in Topham et al. 1993)
 #
 # sigma value used is:  5.00
+#
 HEADER
-              # full smooting
-              1.upto($env_features.size) do |ci|
-                env_labels.combination(ci) do |c1|
-                  Enumerable.cart_prod(*c1).each do |labels|
-                    pattern = "." * $env_features.size
-                    labels.each do |label|
-                      j = label[0].chr.to_i
-                      l = label[1].chr
-                      pattern[j] = l
-                    end
+            # full smoothing
+            1.upto($env_features.size) do |ci|
+              env_labels.combination(ci) do |c1|
+                Enumerable.cart_prod(*c1).each do |labels|
+                  pattern = "." * $env_features.size
+                  labels.each do |label|
+                    j = label[0].chr.to_i
+                    l = label[1].chr
+                    pattern[j] = l
+                  end
-                    # get environmetns, frequencies, and probabilities
-                    envs      = $envs.values.select { |env| env.label.match(pattern.to_re) }
-                    freq_arr  = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
-                    prob_arr  = NArray.float(21)
-                    0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
+                  # get environments, frequencies, and probabilities
+                  envs      = $envs.values.select { |env| env.label.match(pattern.to_re) }
+                  freq_arr  = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+                  prob_arr  = NArray.float($amino_acids.size)
+                  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
-                    # collect priors
-                    priors  = []
-                    if ci > 1
-                      labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
-                    else
-                      priors << $smooth_prob[1]
-                    end
+                  # collect priors
+                  priors  = []
+                  if ci > 1
+                    labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
+                  else
+                    priors << $smooth_prob[1]
+                  end
-                    # entropy based weighting priors
-                    entropy_max = Math::log(21)
-                    entropies = priors.map do |prior|
-                      (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
-                    end
-                    weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
-                    # smoothing step
-                    smooth_prob_arr = NArray.float(21)
-                    big_N           = freq_arr.sum.to_f
-                    small_n         = 21.0
-                    omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
-                    omega2          = 1.0 - omega1
-                    0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
-                    # normalization step
-                    smooth_prob_arr_sum = smooth_prob_arr.sum
-                    0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
-                    # store smoothed probabilties in a hash using a set of envrionment labels as a key
-                    if !$smooth_prob.has_key?(ci + 1)
-                      $smooth_prob[ci + 1] = {}
-                      $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
-                    else
-                      $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
-                    end
+                  # entropy based weighting priors
+                  entropy_max = Math::log($amino_acids.size)
+                  entropies = priors.map do |prior|
+                    (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
+                  end
+                  weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
+                  # smoothing step
+                  smooth_prob_arr = NArray.float($amino_acids.size)
+                  big_N           = freq_arr.sum.to_f
+                  small_n         = $amino_acids.size.to_f
+                  omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
+                  omega2          = 1.0 - omega1
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
+                  # normalization step
+                  smooth_prob_arr_sum = smooth_prob_arr.sum
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
+                  # store smoothed probabilities in a hash using a set of environment labels as a key
+                  if !$smooth_prob.has_key?(ci + 1)
+                    $smooth_prob[ci + 1] = {}
+                    $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
+                  else
+                    $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
                   end
                 end
               end
             end
+          end
-            # updating smoothed probability array for each envrionment
-            $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
-            # for a total substitution probability matrix
-            $tot_prob_mat = NMatrix.float(21,21)
-            # grouping environments by its environment labels but amino acid label
-            env_groups = $envs.values.group_by { |env| env.label[1..-1] }
+          # updating smoothed probability array for each environment
+          $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
-            # sorting environments and build 21X21 substitution matrices
-            env_groups.to_a.sort_by { |env_group|
-              # a bit clumsy sorting here...
-              env_group[0].split("").map_with_index { |l, i|
-                $env_features[i + 1].labels.index(l)
-              }
-            }.each_with_index do |group, group_no|
-              # calculating 21X21 substitution probability matrix for each envrionment
-              grp_prob_mat = NMatrix.float(21,21)
+          # for a total substitution probability matrix
+          $tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
-              $amino_acids.each_with_index do |aa, ai|
-                smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
-                0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
-              end
+          # grouping environments by their environment labels, excluding the amino acid label
+          env_groups = $envs.values.group_by { |env| env.label[1..-1] }
-              $tot_prob_mat += grp_prob_mat
+          # sorting environments and building NxN substitution matrices (N = $amino_acids.size)
+          env_groups.to_a.sort_by { |env_group|
+            # a bit clumsy sorting here...
+            env_group[0].split("").map_with_index { |l, i|
+              $env_features[i + 1].labels.index(l)
+            }
+          }.each_with_index do |group, group_no|
+            # calculating the NxN substitution probability matrix (N = $amino_acids.size) for each environment
+            grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
-              if $output == 1
-                $outfh.puts ">#{group[0]} #{group_no}"
-                $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
-              end
+            $amino_acids.each_with_index do |aa, ai|
+              smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
+              0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
             end
-            $tot_prob_mat /= env_groups.size
+            $tot_prob_mat += grp_prob_mat
             if $output == 1
-              $outfh.puts ">Total"
-              $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
-              $outfh.close
-              exit 0
+              $outfh.puts ">#{group[0]} #{group_no}"
+              $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
             end
+          end
+          $tot_prob_mat /= env_groups.size
-            # Part 7.
-            #
-            # Calculating log-add ratio scoring matrices
-            #
-            if $output == 2
-              $outfh.puts <<HEADER
+          if $output == 1
+            $outfh.puts ">Total"
+            $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+            $outfh.close
+            exit 0
+          end
+          # Part 7.
+          #
+          # Calculating log odds ratio scoring matrices
+          #
+          if $output == 2
+            $outfh.puts <<HEADER
 #
 # The probabilities were then divided by the background probabilities
 HEADER
-              if $penv
-                $outfh.puts <<HEADER
+            if $penv
+              $outfh.puts <<HEADER
 # which were derived from the environment-independent amino acid frequencies.
 #                             ^^^^^^^^^^^^^^^^^^^^^^^
 HEADER
-              else
-                $outfh.puts <<HEADER
+            else
+              $outfh.puts <<HEADER
 # which were derived from the environment-dependent amino acid frequencies.
 #                             ^^^^^^^^^^^^^^^^^^^^^
 HEADER
-              end
+            end
-              $tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
-              grp_logo_mats = []
-              factor        = $scale / Math::log(2)
-              # grouping environments by its environment labels but amino acid label
-              env_groups = $envs.values.group_by { |env| env.label[1..-1] }
-              # sorting environments and build 21X21 substitution matrices
-              env_groups.to_a.sort_by { |env_group|
-                # a bit clumsy sorting here...
-                env_group[0].split("").map_with_index { |l, i|
-                  $env_features[i + 1].labels.index(l)
-                }
-              }.each_with_index do |group, group_no|
-                # calculating 21X21 substitution probability matrix for each envrionment
-                grp_label     = group[0]
-                grp_envs      = group[1]
-                grp_logo_mat  = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
-                $amino_acids.each_with_index do |aa, ai|
-                  env       = grp_envs.detect { |e| e.label.start_with?(aa) }
-                  logo_arr  = $cys ? NArray.float(22) : NArray.float(21)
-                  env.smooth_prob_array.to_a.each_with_index do |prob, j|
-                    paj         = 100.0 * $aa_rel_freq[$amino_acids[j]]
-                    odds        = prob == 0.0 ? 0.000001 / paj : prob / paj
-                    logo_arr[j] = factor * Math::log(odds)
-                  end
+            $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
+            grp_logo_mats = []
+            factor        = $scale / Math::log(2)
-                  0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
+            # grouping environments by their environment labels, excluding the leading amino acid label
+            env_groups = $envs.values.group_by { |env| env.label[1..-1] }
-                  # adding log odds ratio for "U" (J or C) when --cyc is ON
-                  if $cys
-                    paj   = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
-                    prob  = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
-                    odds  = prob == 0.0 ? 0.000001 / paj : prob / paj
-                    logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
-                    grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
-                  end
+            # sorting environments and building 21X21 substitution matrices
+            env_groups.to_a.sort_by { |env_group|
+              # a bit clumsy sorting here...
+              env_group[0].split("").map_with_index { |l, i|
+                $env_features[i + 1].labels.index(l)
+              }
+            }.each_with_index do |group, group_no|
+              # calculating 21X21 substitution probability matrix for each environment
+              grp_label     = group[0]
+              grp_envs      = group[1]
+              grp_logo_mat  = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
+              $amino_acids.each_with_index do |aa, ai|
+                env       = grp_envs.detect { |e| e.label.start_with?(aa) }
+                logo_arr  = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
+                env.smooth_prob_array.to_a.each_with_index do |prob, j|
+                  paj         = 100.0 * $aa_rel_freq[$amino_acids[j]]
+                  odds        = prob == 0.0 ? 0.000001 / paj : prob / paj
+                  logo_arr[j] = factor * Math::log(odds)
                 end
-                $tot_logo_mat += grp_logo_mat
-                grp_logo_mats << [grp_label, grp_logo_mat]
+                0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
+                # adding log odds ratio for "U" (J or C) when --cys is 0
+                if $cys == 0
+                  paj   = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
+                  prob  = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
+                  odds  = prob == 0.0 ? 0.000001 / paj : prob / paj
+                  logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
+                  grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
+                end
               end
-              $tot_logo_mat /= env_groups.size
+              $tot_logo_mat += grp_logo_mat
+              grp_logo_mats << [grp_label, grp_logo_mat]
+            end
-              # calculating relative entropy for each amino acid pair H and
-              # the expected score E in bit units
-              #
-              # I'm a bit suspicious about this part...
-              tot_E = 0.0
-              tot_H = 0.0
+            $tot_logo_mat /= env_groups.size
-              0.upto($tot_logo_mat.shape[0] - 1) do |i|
-                0.upto($tot_logo_mat.shape[0] - 1) do |j|
-                  if i != j
-                    tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
-                    tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
-                  else
-                    tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
-                    tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
-                  end
+            # calculating relative entropy for each amino acid pair H and
+            # the expected score E in bit units
+            #
+            # I'm a bit suspicious about this part...
+            tot_E = 0.0
+            tot_H = 0.0
+            0.upto($tot_logo_mat.shape[0] - 1) do |i|
+              0.upto($tot_logo_mat.shape[0] - 1) do |j|
+                if i != j
+                  tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
+                  tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
+                else
+                  tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
+                  tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
                 end
               end
+            end
-              $outfh.puts <<HEADER
+            $outfh.puts <<HEADER
 #
 # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
 # rounded to the nearest integer (log-odds scores in 1/3 bit units).
@@ -1134,27 +1156,27 @@ HEADER
 #
 HEADER
-              grp_logo_mats.each_with_index do |arr, grp_no|
-                grp_label     = arr[0]
-                grp_logo_mat  = arr[1]
+            grp_logo_mats.each_with_index do |arr, grp_no|
+              grp_label     = arr[0]
+              grp_logo_mat  = arr[1]
-                $outfh.puts ">#{grp_label} #{grp_no}"
-                if $cys
-                  $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
-                else
-                  $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
-                end
-              end
-              $outfh.puts ">Total #{grp_logo_mats.size}"
+              $outfh.puts ">#{grp_label} #{grp_no}"
               if $cys
-                $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
+                $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
               else
-                $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+                $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
               end
-              $outfh.close
-              exit 0
             end
+            $outfh.puts ">Total #{grp_logo_mats.size}"
+            if $cys == 0
+              $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
+            else
+              $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
+            end
+            $outfh.close
+            exit 0
           end
         end
       end