RubyGems - semin-egor - Versions diffs - 0.9.0 - Mend

semin-egor 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/History.txt +49 -0
data/Manifest.txt +37 -0
data/PostInstall.txt +5 -0
data/Rakefile +32 -0
data/bin/egor +10 -0
data/config/website.yml +2 -0
data/config/website.yml.sample +2 -0
data/egor.gemspec +51 -0
data/lib/egor/cli.rb +1738 -0
data/lib/egor/environment.rb +34 -0
data/lib/egor/environment_class_hash.rb +20 -0
data/lib/egor/environment_feature.rb +16 -0
data/lib/egor/environment_feature_array.rb +12 -0
data/lib/egor/heatmap_array.rb +111 -0
data/lib/egor.rb +6 -0
data/lib/math_extensions.rb +7 -0
data/lib/narray_extensions.rb +22 -0
data/lib/nmatrix_extensions.rb +245 -0
data/lib/string_extensions.rb +17 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/script/txt2html +71 -0
data/test/test_egor.rb +11 -0
data/test/test_egor_cli.rb +9 -0
data/test/test_egor_environment_class_hash.rb +25 -0
data/test/test_egor_environment_feature.rb +29 -0
data/test/test_helper.rb +2 -0
data/test/test_math_extensions.rb +11 -0
data/test/test_narray_extensions.rb +14 -0
data/test/test_nmatrix_extensions.rb +16 -0
data/test/test_string_extensions.rb +11 -0
data/website/index.html +16 -0
data/website/index.txt +217 -0
data/website/javascripts/rounded_corners_lite.inc.js +285 -0
data/website/stylesheets/screen.css +158 -0
data/website/template.html.erb +57 -0
metadata +155 -0

data/lib/egor/cli.rb ADDED Viewed

@@ -0,0 +1,1738 @@
+require 'rubygems'
+require 'getoptlong'
+require 'logger'
+require 'narray'
+require 'bio'
+require 'set'
+require 'facets'
+require 'math_extensions'
+require 'string_extensions'
+require 'narray_extensions'
+require 'nmatrix_extensions'
+require 'egor/environment'
+require 'egor/environment_class_hash'
+require 'egor/environment_feature'
+require 'egor/environment_feature_array'
+require 'egor/heatmap_array'
+# This is a module for an actual command line interpreter for Egor
+# ---
+# Copyright (C) 2008-9 Semin Lee
+module Egor
+  class CLI
+    class << self
+      # :nodoc:
+      def print_version
+        puts VERSION
+      end
+      # Print Egor's Usage on the screen
+      #
+      # :call-seq:
+      #   Egor::CLI::print_usage
+      #
+      def print_usage
+        puts <<-USAGE
+egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.
+Usage:
+    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
+        or
+    egor [ options ] -f TEM-file -c CLASSDEF-file
+Options:
+    --tem-file (-f) FILE: a tem file
+    --tem-list (-l) FILE: a list for tem files
+    --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
+    --outfile (-o) FILE: output filename (default 'allmat.dat')
+    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
+    --noweight: calculate substitution counts with no weights
+    --smooth (-s) INTEGER:
+        0 for partial smoothing (default)
+        1 for full smoothing
+    --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
+    --nosmooth: perform no smoothing operation
+    --cys (-y) INTEGER:
+        0 for using C and J only for structure (default)
+        1 for both structure and sequence
+        2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
+    --output INTEGER:
+        0 for raw counts (no smoothing performed)
+        1 for probabilities
+        2 for log-odds (default)
+    --noroundoff: do not round off log odds ratio
+    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
+    --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
+    --autosigma: automatically adjust the sigma value for smoothing
+    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
+    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
+    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
+    --heatmap INTEGER:
+        0 create a heat map file for each substitution table
+        1 create one big file containing all substitution tables
+        2 do both 0 and 1
+    --heatmap-format INTEGER:
+        0 for Portable Network Graphics (PNG) Format (default)
+        1 for Graphics Interchange Format (GIF)
+        2 for Joint Photographic Experts Group (JPEG) Format
+        3 for Microsoft Windows bitmap (BMP) Format
+        4 for Portable Document Format (PDF)
+    --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
+    --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap')
+    --heatmap-value: print values in the cells when generating heat maps
+    --verbose (-v) INTEGER
+        0 for ERROR level
+        1 for WARN or above level (default)
+        2 for INFO or above level
+        3 for DEBUG or above level
+    --version: print version
+    --help (-h): show help
+        USAGE
+      end
+      # Calculate PID between two sequences
+      #
+      # :call-seq:
+      #   Egor::CLI::calculate_pid(seq1, seq2) -> Float
+      #
+      def calculate_pid(seq1, seq2)
+        aas1  = seq1.split('')
+        aas2  = seq2.split('')
+        cols  = aas1.zip(aas2)
+        align = 0 # no. of aligned columns
+        ident = 0 # no. of identical columns
+        intgp = 0 # no. of internal gaps
+        cols.each do |col|
+          if (col[0] != '-') && (col[1] != '-')
+            align += 1
+            if col[0] == col[1]
+              ident += 1
+            end
+          elsif (((col[0] == '-') && (col[1] != '-')) ||
+                 ((col[0] != '-') && (col[1] == '-')))
+            intgp += 1
+          end
+        end
+        pid = 100.0 * ident.to_f / (align + intgp)
+      end
+      # :nodoc:
+      def execute(arguments=[])
+        #
+        # * Abbreviations in the codes
+        #
+        # env: environment
+        # tem: (FUGUE) template
+        # classdef: (environment) class definition
+        # aa: amino acid
+        # aa: weighted amino acid
+        # tot: total
+        # rel: relative
+        # jnt: joint
+        # cnt: count
+        # mut: mutation
+        # mutb: mutability
+        # freq: frequency
+        # prob: probability
+        # logo: log odds ratio
+        # opts: options
+        # fh: file handle
+        # ff: flat file
+        # ali: alignment
+        # mat: matrix
+        # arr: array
+        # Part 1.
+        #
+        # Global variables and their default values
+        #
+        $logger       = Logger.new(STDOUT)
+        $logger.level = Logger::WARN
+        # default set of 21 amino acids including J (Cysteine, the free thiol form)
+        $amino_acids    = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
+        $tem_list       = nil
+        $tem_file       = nil
+        $classdef       = 'classdef.dat'
+        $outfile        = 'allmat.dat'
+        $outfh          = nil # file hanfle for outfile
+        $output         = 2 # default: log odds matrix
+        $ali_size       = 0
+        $tot_aa         = 0
+        $sigma          = 5.0
+        $autosigma      = false
+        $weight         = 60
+        $noweight       = false
+        $smooth         = :partial
+        $nosmooth       = false
+        $noroundoff     = false
+        $p1smooth       = false
+        $scale          = 3
+        $pidmin         = nil
+        $pidmax         = nil
+        $scale          = 3
+        $add            = nil
+        $cys            = 0
+        $targetenv      = false
+        $penv           = false
+        $heatmap        = nil
+        $heatmapcol     = nil
+        $heatmapformat  = 'png'
+        $heatmapstem    = 'heatmaps'
+        $heatmapvalue   = false
+        $rvg_width      = 550
+        $rvg_height     = 650
+        $canvas_width   = 550
+        $canvas_height  = 650
+        $cell_width     = 20
+        $cell_height    = 20
+        $aa_tot_cnt   = Hash.new(0)
+        $aa_mut_cnt   = Hash.new(0)
+        $aa_mutb      = {}
+        $aa_rel_mutb  = {}
+        $aa_tot_freq  = {}
+        $aa_env_cnt   = Hash.new(0)
+        $smooth_prob  = {}
+        $tot_cnt_mat  = nil
+        $tot_prob_mat = nil
+        $tot_logo_mat = nil
+        $tot_smooth_prob = {}
+        # minimum ratio of amino acid count to sigma value
+        $min_cnt_sigma_ratio = 500.0
+        #
+        # Part 1 END
+        #
+        # Part 2.
+        #
+        # Parsing options
+        #
+        opts = GetoptLong.new(
+          [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
+          [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--smooth',   '-s', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--nosmooth',       GetoptLong::NO_ARGUMENT ],
+          [ '--p1smooth',       GetoptLong::NO_ARGUMENT ],
+          [ '--weight',   '-w', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--noweight',       GetoptLong::NO_ARGUMENT ],
+          [ '--noroundoff',     GetoptLong::NO_ARGUMENT ],
+          [ '--sigma',          GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--autosigma',      GetoptLong::NO_ARGUMENT ],
+          [ '--heatmap',        GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--heatmap-stem',   GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--heatmap-value',  GetoptLong::NO_ARGUMENT ],
+          [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--penv',           GetoptLong::NO_ARGUMENT ],
+          [ '--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
+          [ '--version',        GetoptLong::NO_ARGUMENT ]
+        )
+        begin
+          opts.each do |opt, arg|
+            case opt
+            when '--help'
+              print_usage
+              exit 0
+            when '--tem-list'
+              $tem_list     = arg
+            when '--tem-file'
+              $tem_file     = arg
+            when '--classdef'
+              $classdef     = arg
+            when '--output'
+              $output       = arg.to_i
+            when '--outfile'
+              $outfile      = arg
+            when '--cys'
+              $cys          = arg.to_i
+            when '--targetenv'
+              $targetenv    = (arg.to_i == 1) ? true : false
+            when '--weight'
+              $weight       = arg.to_i
+            when '--sigma'
+              $sigma        = arg.to_f
+            when '--autosigma'
+              $autosigma    = true
+            when '--pidmin'
+              $pidmin       = arg.to_f
+            when '--pidmax'
+              $pidmax       = arg.to_f
+            when '--noweight'
+              $noweight     = true
+            when '--noroundoff'
+              $noroundoff   = true
+            when '--smooth'
+              $smooth       = (arg.to_i == 1) ? :full : :partial
+            when '--nosmooth'
+              $nosmooth     = true
+            when '--p1smooth'
+              $p1smooth     = true
+            when '--scale'
+              $scale        = arg.to_f
+            when '--add'
+              $add          = arg.to_f
+            when '--penv'
+              warn "--penv option is not supported."
+              exit 1
+              $penv         = true
+            when '--heatmap'
+              $heatmap      = case arg.to_i
+                              when (0..2) then arg.to_i
+                              else
+                                warn "--heatmap #{arg.to_i} is not allowed."
+                                exit1
+                              end
+            when '--heatmap-columns'
+              $heatmapcol   = arg.to_i
+            when '--heatmap-stem'
+              $heatmapstem  = arg.to_s
+            when '--heatmap-format'
+              $heatmapformat   = case arg.to_i
+                              when 0 then 'png'
+                              when 1 then 'gif'
+                              when 2 then 'jpg'
+                              when 3 then 'bmp'
+                              when 4 then 'pdf'
+                              else
+                                warn "--heatmap-format #{arg.to_i} is not supported."
+                                exit 1
+                              end
+            when '--heatmap-value'
+              $heatmapvalue   = true
+            when '--verbose'
+              $logger.level = case arg.to_i
+                              when 0 then Logger::ERROR
+                              when 1 then Logger::WARN
+                              when 2 then Logger::INFO
+                              when 3 then Logger::DEBUG
+                              else
+                                warn "--verbose (-v) #{arg.to_i} is not supported."
+                                exit 1
+                              end
+            when '--version'
+              print_version
+              exit 0
+            end
+          end
+        rescue
+          # invalid option
+          exit 1
+        end
+        # when arguments are nonsense, print usage
+        if ((ARGV.length != 0) ||
+            (!$tem_list && !$tem_file) ||
+            ($tem_list && $tem_file))
+          print_usage
+          exit 1
+        end
+        # warn if any input file is missing
+        if $tem_list && !File.exist?($tem_list)
+          warn "Cannot find template list file, #{$tem_list}"
+          exit 1
+        end
+        if $tem_file && !File.exist?($tem_file)
+          warn "Cannot find template file, #{$tem_file}"
+          exit 1
+        end
+        if $classdef && !File.exist?($classdef)
+          warn "Cannot find environment class definition file, #{$classdef}"
+          exit 1
+        end
+        #
+        # Part 2 END
+        #
+        # Part 3.
+        #
+        # Reading Environment Class Definition File
+        #
+        # check --cys option and modify amino_acids set if necessary
+        if $cys == 2
+          $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
+        end
+        # create an EnvironmentFeatureArray object for storing all environment
+        # features
+        $env_features = EnvironmentFeatureArray.new
+        # an array for storing indexes of constrained environment features
+        $cst_features = []
+        # add substituted amino acid (aa1) in a substitution to the environment
+        # feature list
+        $env_features << EnvironmentFeature.new('sequence',
+                                                $amino_acids,
+                                                $amino_acids,
+                                                'F',
+                                                'F')
+        # read the environment class definition file and store its features
+        # into the array prepared above
+        env_index = 1
+        IO.foreach($classdef) do |line|
+          line.chomp!
+          if line.start_with?('#')
+            next
+          elsif (env_ftr = line.chomp.split(/;/)).length == 5
+            $logger.info "An environment feature, #{line} detected."
+            if env_ftr[-1] == 'T'
+              # skip silenced environment feature
+              $logger.warn "The environment feature, #{line} silent."
+              next
+            end
+            if env_ftr[-2] == 'T'
+              $cst_features << env_index
+              $logger.warn "The environment feature, #{line} constrained."
+            end
+            $env_features << EnvironmentFeature.new(env_ftr[0],
+                                                    env_ftr[1].split(''),
+                                                    env_ftr[2].split(''),
+                                                    env_ftr[3],
+                                                    env_ftr[4])
+            env_index += 1
+          else
+            $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
+                          "a environment class definition."
+            exit 1
+          end
+        end
+        # a hash for storing all environment classes
+        $env_classes = EnvironmentClassHash.new
+        # generate all possible combinations of environment labels, and store
+        # every environment class into the hash prepared above with the label
+        # as a key
+        $env_features.label_combinations.each_with_index { |e, i|
+          $env_classes[e.flatten.join] = Environment.new(i,
+                                                         e.flatten.join,
+                                                         $amino_acids)
+        }
+        #
+        # Part 3 END
+        #
+        # Part 4.
+        #
+        # Reading TEM file or TEMLIST list file and counting substitutions
+        #
+        # a global file handle for output
+        $outfh = File.open($outfile, 'w')
+        if $tem_file
+          $tem_list_io = StringIO.new($tem_file)
+        end
+        if $tem_list
+          $tem_list_io = File.open($tem_list)
+        end
+        $tem_list_io.each_line do |tem_file|
+          tem_file.chomp!
+          ali = Bio::Alignment::OriginalAlignment.new
+          ff  = Bio::FlatFile.auto(tem_file)
+          ff.each_entry do |pir|
+            if (pir.definition == 'sequence') || (pir.definition == 'structure')
+              ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
+            end
+          end
+          if ali.size < 2
+            $logger.warn "Skipped #{tem_file} which has only one unique entry."
+            next
+          end
+          $ali_size   += 1
+          env_labels  = {}
+          disulphide  = {}
+          ali.each_pair do |key, seq|
+            # check disulphide bond environment first!
+            ff.rewind
+            ff.each_entry do |pir|
+              if ((pir.entry_id == key) &&
+                  ((pir.definition == "disulphide") ||
+                   (pir.definition == "disulfide")))
+                disulphide[key] = pir.data.remove_internal_spaces.split('')
+              end
+            end
+            $env_features.each_with_index do |ec, ei|
+              env_labels[key] = [] unless env_labels.has_key?(key)
+              ff.rewind
+              ff.each_entry do |pir|
+                if (pir.entry_id == key) && (pir.definition == ec.name)
+                  labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
+                    if sym == '-'
+                      '-'
+                    elsif sym == 'X' || sym == 'x'
+                      'X'
+                    else
+                      if ei == 0 # Amino Acid Environment Feature
+                        (disulphide.has_key?(key) &&
+                         (disulphide[key][pos] == 'F') &&
+                         (sym == 'C')) ? 'J' : sym
+                      else
+                        ec.labels[ec.symbols.index(sym)]
+                      end
+                    end
+                  end
+                  if env_labels[key].empty?
+                    env_labels[key] = labels
+                  else
+                    env_labels[key].each_with_index { |e, i|
+                      env_labels[key][i] = e + labels[i]
+                    }
+                  end
+                end
+              end
+            end
+          end
+          if $noweight
+            ali.each_pair do |id1, seq1|
+              ali.each_pair do |id2, seq2|
+                if id1 != id2
+                  pid  = calculate_pid(seq1, seq2)
+                  s1 = seq1.split('')
+                  s2 = seq2.split('')
+                  # check PID_MIN
+                  if $pidmin && (pid < $pidmin)
+                    $logger.info  "Skip alignment between #{id1} and #{id2} " +
+                                  "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
+                    next
+                  end
+                  # check PID_MAX
+                  if $pidmax && (pid > $pidmax)
+                    $logger.info  "Skip alignment between #{id1} and #{id2} " +
+                                  "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
+                    next
+                  end
+                  s1.each_with_index do |aa1, pos|
+                    aa1.upcase!
+                    aa2 = s2[pos].upcase
+                    if env_labels[id1][pos].include?('X')
+                      $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
+                      next
+                    end
+                    if env_labels[id2][pos].include?('X')
+                      $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
+                      next
+                    end
+                    unless $amino_acids.include?(aa1)
+                      $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
+                      next
+                    end
+                    unless $amino_acids.include?(aa2)
+                      $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
+                      next
+                    end
+                    aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
+                    aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
+                    if $cst_features.empty?
+                      $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
+                    elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
+                      $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
+                    else
+                      $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
+                      next
+                    end
+                    grp_label = env_labels[id1][pos][1..-1]
+                    if $aa_env_cnt.has_key? grp_label
+                      if $aa_env_cnt[grp_label].has_key? aa1
+                        $aa_env_cnt[grp_label][aa1] += 1
+                      else
+                        $aa_env_cnt[grp_label][aa1] = 1
+                      end
+                    else
+                      $aa_env_cnt[grp_label] = Hash.new(0)
+                      $aa_env_cnt[grp_label][aa1] = 1
+                    end
+                    if $aa_tot_cnt.has_key? aa1
+                      $aa_tot_cnt[aa1] += 1
+                    else
+                      $aa_tot_cnt[aa1] = 1
+                    end
+                    if aa1 != aa2
+                      if $aa_mut_cnt.has_key? aa1
+                        $aa_mut_cnt[aa1] += 1
+                      else
+                        $aa_mut_cnt[aa1] = 1
+                      end
+                    end
+                    $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
+                  end
+                end
+              end
+            end
+          else
+            # BLOSUM-like weighting
+            clusters = []
+            ali.each_pair { |i, s| clusters << [i] }
+            # a loop for single linkage clustering
+            begin
+              continue = false
+              0.upto(clusters.size - 2) do |i|
+                indexes = []
+                (i + 1).upto(clusters.size - 1) do |j|
+                  found = false
+                  clusters[i].each do |c1|
+                    clusters[j].each do |c2|
+                      if calculate_pid(ali[c1], ali[c2]) >= $weight
+                        indexes << j
+                        found = true
+                        break
+                      end
+                    end
+                    break if found
+                  end
+                end
+                unless indexes.empty?
+                  continue  = true
+                  group     = clusters[i]
+                  indexes.each do |k|
+                    group       = group.concat(clusters[k])
+                    clusters[k] = nil
+                  end
+                  clusters[i] = group
+                  clusters.compact!
+                end
+              end
+            end while(continue)
+            if clusters.size < 2
+              $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
+              next
+            end
+            clusters.combination(2).each do |cluster1, cluster2|
+              cluster1.each do |id1|
+                cluster2.each do |id2|
+                  seq1 = ali[id1].split('')
+                  seq2 = ali[id2].split('')
+                  seq1.each_with_index do |aa1, pos|
+                    aa1.upcase!
+                    aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
+                    if env_labels[id1][pos].include?('X')
+                      $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
+                      next
+                    end
+                    if env_labels[id2][pos].include?('X')
+                      $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
+                      next
+                    end
+                    unless $amino_acids.include?(aa1)
+                      $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
+                      next
+                    end
+                    unless $amino_acids.include?(aa2)
+                      $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
+                      next
+                    end
+                    aa1   = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
+                    aa2   = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
+                    cnt1  = 1.0 / cluster1.size
+                    cnt2  = 1.0 / cluster2.size
+                    jnt_cnt = cnt1 * cnt2
+                    if $cst_features.empty?
+                      $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
+                      $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
+                    elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
+                      $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
+                      $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
+                    else
+                      $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
+                      next
+                    end
+                    grp_label1 = env_labels[id1][pos][1..-1]
+                    grp_label2 = env_labels[id2][pos][1..-1]
+                    if $aa_env_cnt.has_key? grp_label1
+                      if $aa_env_cnt[grp_label1].has_key? aa1
+                        $aa_env_cnt[grp_label1][aa1] += cnt1
+                      else
+                        $aa_env_cnt[grp_label1][aa1] = cnt1
+                      end
+                    else
+                      $aa_env_cnt[grp_label1] = Hash.new(0.0)
+                      $aa_env_cnt[grp_label1][aa1] = cnt1
+                    end
+                    if $aa_env_cnt.has_key? grp_label2
+                      if $aa_env_cnt[grp_label2].has_key? aa2
+                        $aa_env_cnt[grp_label2][aa2] += cnt2
+                      else
+                        $aa_env_cnt[grp_label2][aa2] = cnt2
+                      end
+                    else
+                      $aa_env_cnt[grp_label2] = Hash.new(0.0)
+                      $aa_env_cnt[grp_label2][aa2] = cnt2
+                    end
+                    if $aa_tot_cnt.has_key? aa1
+                      $aa_tot_cnt[aa1] += cnt1
+                    else
+                      $aa_tot_cnt[aa1] = cnt1
+                    end
+                    if $aa_tot_cnt.has_key? aa2
+                      $aa_tot_cnt[aa2] += cnt2
+                    else
+                      $aa_tot_cnt[aa2] = cnt2
+                    end
+                    if aa1 != aa2
+                      if $aa_mut_cnt.has_key? aa1
+                        $aa_mut_cnt[aa1] += cnt1
+                      else
+                        $aa_mut_cnt[aa1] = cnt1
+                      end
+                      if $aa_mut_cnt.has_key? aa2
+                        $aa_mut_cnt[aa2] += cnt2
+                      else
+                        $aa_mut_cnt[aa2] = cnt2
+                      end
+                    end
+                    $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
+                    $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
+                  end
+                end
+              end
+            end
+          end
+          $logger.info "Analysing #{tem_file} done."
+        end
+        # print out default header
+        $outfh.puts <<HEADER
+# Environment-specific amino acid substitution matrices
+# Creator: egor version #{Egor::VERSION}
+# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
+#
+# Definitions for structural environments:
+# #{$env_features.size - 1} features used
+#
+HEADER
+        $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
+        $outfh.puts <<HEADER
+# (read in from #{$classdef})
+#
+# Number of alignments: #{$ali_size}
+# (list of .tem files read in from #{$tem_list})
+#
+# Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
+#
+# There are #{$amino_acids.size} amino acids considered.
+# #{$amino_acids.join}
+#
+HEADER
+        if $amino_acids.include? 'J'
+          $outfh.puts <<HEADER
+# C: Cystine (the disulfide-bonded form)
+# J: Cysteine (the free thiol form)
+#
+HEADER
+        end
+        if $noweight
+          $outfh.puts '# Weighting scheme: none'
+        else
+          $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
+        end
+        # calculate amino acid frequencies and mutabilities, and
+        # print them as default statistics in the header part
+        ala_factor  = if $aa_tot_cnt['A'] == 0
+                        0.0
+                      elsif $aa_mut_cnt['A'] == 0
+                        0.0
+                      else
+                        100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
+                      end
+        $tot_aa     = $aa_tot_cnt.values.sum
+        $outfh.puts '#'
+        $outfh.puts "# Total amino acid frequencies:\n"
+        $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
+        min_cnt = -1
+        min_sigma = nil
+        $amino_acids.each do |res|
+          if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
+            if min_cnt < 0
+              min_cnt = $aa_tot_cnt[res]
+              min_sigma = min_cnt / $min_cnt_sigma_ratio
+            elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
+              min_cnt = $aa_tot_cnt[res]
+              min_sigma = min_cnt / $min_cnt_sigma_ratio
+            end
+            $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
+          end
+          $aa_mutb[res]     = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
+          $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
+          $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
+        end
+        $amino_acids.each do |res|
+          if $noweight
+            $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
+              [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
+          else
+            $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
+              [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
+          end
+        end
+        if min_cnt > -1
+          $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
+          if $autosigma
+            $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
+            $sigma = min_sigma
+          end
+        end
+        $outfh.puts '#'
+        $outfh.puts '# RES: Amino acid one letter code'
+        $outfh.puts '# TOT_OBS: Total count of incidence'
+        $outfh.puts '# MUT_OBS: Total count of mutation'
+        $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
+        $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
+        $outfh.puts '# REL_FREQ: Relative frequency'
+        $outfh.puts '#'
+        #
+        # Part 4. END
+        #
+        # Part 5.
+        #
+        # Generating substitution frequency matrices
+        #
+        # calculating probabilities for each environment
+        $env_classes.values.each do |e|
+          if e.freq_array.sum != 0
+            e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
+          end
+        end
+        # count raw frequencies
+        $tot_cnt_mat    = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
+        group_matrices  = []
+        # for each combination of environment features
+        $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
+          grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
+          $amino_acids.each_with_index do |aa, aj|
+            freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
+            0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
+          end
+          $tot_cnt_mat += grp_cnt_mat
+          group_matrices << [group[0], grp_cnt_mat]
+        end
+        $logger.info "Counting substitutions done."
+        if $output == 0
+          heatmaps      = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
+          grp_max_val   = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
+          $heatmapcol ||= Math::sqrt(group_matrices.size).round
+          group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
+            # for a matrix file
+            stem = "#{grp_no}. #{grp_label}"
+            $outfh.puts ">#{grp_label} #{grp_no}"
+            $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
+                                                  :row_header => $amino_acids)
+            # for a heat map
+            if $heatmap == 0 or $heatmap == 2
+              grp_cnt_mat.heatmap(:col_header     => $amino_acids,
+                                  :row_header     => $amino_acids,
+                                  :rvg_width      => $rvg_width,
+                                  :rvg_height     => $rvg_height,
+                                  :canvas_width   => $canvas_width,
+                                  :canvas_height  => $canvas_height,
+                                  :max_val        => grp_max_val.ceil,
+                                  :min_val        => 0,
+                                  :print_value    => $heatmapvalue,
+                                  :title          => stem).write("#{stem}.#{$heatmapformat}")
+              $logger.info "Generating a heat map for #{stem} table done."
+            end
+            if $heatmap == 1 or $heatmap == 2
+              heatmaps << grp_cnt_mat.heatmap(:col_header       => $amino_acids,
+                                              :row_header       => $amino_acids,
+                                              :rvg_width        => $rvg_width,
+                                              :rvg_height       => $rvg_height - 50,
+                                              :canvas_width     => $canvas_width,
+                                              :canvas_height    => $canvas_height - 50,
+                                              :max_val          => grp_max_val.ceil,
+                                              :min_val          => 0,
+                                              :print_value      => $heatmapvalue,
+                                              :print_gradient   => false,
+                                              :title            => stem,
+                                              :title_font_size  => $rvg_width * $heatmapcol / 100.0)
+            end
+          end
+          if $heatmap == 1 or $heatmap == 2
+            file = "#{$heatmapstem}.#{$heatmapformat}"
+            heatmaps.heatmap(:columns   => $heatmapcol,
+                             :rvg_width => $rvg_width,
+                             :max_val   => grp_max_val.ceil,
+                             :min_val   => 0).write(file)
+            $logger.info "Generating heat maps in a file, #{file} done."
+          end
+          # total
+          $outfh.puts '>Total'
+          $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
+                                                 :row_header => $amino_acids)
+          if $heatmap == 0 or $heatmap == 2
+            stem    = "#{group_matrices.size}. TOTAL"
+            heatmap = $tot_cnt_mat.heatmap(:col_header    => $amino_acids,
+                                           :row_header    => $amino_acids,
+                                           :rvg_width     => $rvg_width,
+                                           :rvg_height    => $rvg_height,
+                                           :canvas_width  => $canvas_width,
+                                           :canvas_height => $canvas_height,
+                                           :max_val       => $tot_cnt_mat.max.ceil,
+                                           :min_val       => 0,
+                                           :print_value   => $heatmapvalue,
+                                           :title         => stem).write("#{stem}.#{$heatmapformat}")
+            $logger.info "Generating a heat map for #{stem} table done."
+          end
+          exit 0
+        end
+        #
+        # Part 5. END
+        #
+        # Part 6.
+        #
+        # Calculating substitution probability tables
+        #
+        if $output == 1
+          $outfh.puts <<HEADER
+#
+# Each column (j) represents the probability distribution for the
+# likelihood of acceptance of a mutational event by a residue type j in
+# a particular structural environment (specified after >) leading to
+# any other residue type (i) and sums up to 100.
+#
+HEADER
+        end
+        # when nosmoothing !!!
+        if ($output > 0) && $nosmooth
+          # reinitialize $tot_cnt_mat for pseudocounts
+          $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+          # for each combination of environment features
+          pseudo_cnt = $add || (1.0 / $env_classes.group_size)
+          # add pseudo counts for each frequency vector
+          $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
+          # re-calculate probability vector for each environment class
+          $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
+          group_matrices = []
+          $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
+            grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+            grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+            $amino_acids.each_with_index do |aa, aj|
+              env_class = group[1].find { |e| e.label.start_with?(aa) }
+              0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
+              0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
+            end
+            $tot_cnt_mat += grp_cnt_mat
+            group_matrices << [group[0], grp_prob_mat]
+          end
+          if $output == 1
+            heatmaps      = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
+            grp_max_val   = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
+            $heatmapcol ||= Math::sqrt(group_matrices.size).round
+            group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
+              # for a matrix file
+              stem = "#{grp_no}. #{grp_label}"
+              $outfh.puts ">#{grp_label} #{grp_no}"
+              $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
+                                                     :row_header => $amino_acids)
+              # for a heat map
+              if $heatmap == 0 or $heatmap == 2
+                grp_prob_mat.heatmap(:col_header    => $amino_acids,
+                                     :row_header    => $amino_acids,
+                                     :rvg_width     => $rvg_width,
+                                     :rvg_height    => $rvg_height,
+                                     :canvas_width  => $canvas_width,
+                                     :canvas_height => $canvas_height,
+                                     :max_val       => grp_max_val.ceil,
+                                     :min_val       => 0,
+                                     :print_value   => $heatmapvalue,
+                                     :title         => stem).write("#{stem}.#{$heatmapformat}")
+                $logger.info "Generating a heat map for #{stem} table done."
+              end
+              if $heatmap == 1 or $heatmap == 2
+                heatmaps << grp_prob_mat.heatmap(:col_header      => $amino_acids,
+                                                 :row_header      => $amino_acids,
+                                                 :rvg_width       => $rvg_width,
+                                                 :rvg_height      => $rvg_height - 50,
+                                                 :canvas_width    => $canvas_width,
+                                                 :canvas_height   => $canvas_height - 50,
+                                                 :max_val         => grp_max_val.ceil,
+                                                 :min_val         => 0,
+                                                 :print_value     => $heatmapvalue,
+                                                 :print_gradient  => false,
+                                                 :title           => stem,
+                                                 :title_font_size => $rvg_width * $heatmapcol / 100.0)
+              end
+            end
+            # for heat maps in a single file
+            if $heatmap == 1 or $heatmap == 2
+              file = "#{$heatmapstem}.#{$heatmapformat}"
+              heatmaps.heatmap(:columns   => $heatmapcol,
+                               :rvg_width => $rvg_width,
+                               :max_val   => grp_max_val.ceil,
+                               :min_val   => 0).write(file)
+              $logger.info "Generating heat maps in a file, #{file} done."
+            end
+          end
+          $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+          0.upto($amino_acids.size - 1) do |aj|
+            col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
+            0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
+          end
+          if $output == 1
+            $outfh.puts '>Total'
+            $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
+                                                    :row_header => $amino_acids)
+            $outfh.close
+            # for a heat map
+            if $heatmap == 0 or $heatmap == 2
+              stem = "#{group_matrices.size}. TOTAL"
+              $tot_prob_mat.heatmap(:col_header     => $amino_acids,
+                                    :row_header     => $amino_acids,
+                                    :rvg_width      => $rvg_width,
+                                    :rvg_height     => $rvg_height,
+                                    :canvas_width   => $canvas_width,
+                                    :canvas_height  => $canvas_height,
+                                    :max_val        => $tot_prob_mat.max.ceil,
+                                    :min_val        => 0,
+                                    :print_value    => $heatmapvalue,
+                                    :title          => stem).write("#{stem}.#{$heatmapformat}")
+              $logger.info "Generating a heat map for #{stem} table done."
+            end
+            exit 0
+          end
+          $logger.info 'Calculating substitution probabilities (no smoothing) done.'
+        end
+        # when smoothing!!!
+        if ($output > 0) && !$nosmooth
+          #
+          # p1 probabilities
+          #
+          p1      = NArray.float($amino_acids.size)
+          a0      = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
+          big_N   = $tot_aa.to_f
+          small_n = $amino_acids.size.to_f
+          omega1  = 1.0 / (1 + big_N / ($sigma * small_n))
+          omega2  = 1.0 - omega1
+          if ($smooth == :full) || $p1smooth
+            # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
+            0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
+            $smooth_prob[1] = p1
+          elsif ($smooth == :partial)
+            # no smoothing for p1 probabilities just as Kenji's subst
+            # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
+            0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
+            $smooth_prob[1] = p1
+          end
+          #
+          # p2 and above
+          #
+          env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
+          if $smooth == :partial
+            $outfh.puts <<HEADER
+#
+# Partial Smoothing:
+#
+HEADER
+            if $p1smooth
+              $outfh.puts <<HEADER
+# p1(ri) (i.e., amino acid composition) is estimated by summing over
+# each row in all matrices and smoothing them with A0 (a uniform distribution)
+#                              ^^^^^^^^^
+HEADER
+            else
+              $outfh.puts <<HEADER
+# p1(ri) (i.e., amino acid composition) is estimated by summing over
+# each row in all matrices without smoothing
+#                          ^^^^^^^^^^^^^^^^^
+HEADER
+            end
+            $outfh.puts <<HEADER
+# p2(ri|Rj) is estimated as:
+#    p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
+#
+# p3(ri|Rj,fq) is estimated as:
+#    p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
+# where
+#    A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
+#
+# The smoothing procedure is curtailed here and finally
+#                            ^^^^^^^^^
+# p5(ri|Rj,...) is estimated as:
+#    p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
+# where
+#    A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
+#
+# Weights (omegas) are calculated as in Topham et al. (1993)
+#
+# sigma value used is:  #{$sigma}
+#
+HEADER
+            1.upto($env_features.size) do |ci|
+              # for partial smoothing, only P1 ~ P3, and Pn are considered
+              if (ci > 2) && (ci < $env_features.size)
+                $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
+                next
+              end
+              env_labels.combination(ci) do |c1|
+                c1[0].product(*c1[1..-1]).each do |labels|
+                  pattern = '.' * $env_features.size
+                  labels.each do |label|
+                    i = label[0].chr.to_i
+                    l = label[1].chr
+                    pattern[i] = l
+                  end
+                  if pattern =~ /^\./
+                    $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
+                    next
+                  end
+                  # get environments matching the pattern created above
+                  # and calculate amino acid frequencies and their probabilities for all the environments
+                  # (the regexp and the frequency total do not change inside
+                  # the loops below, so they are built once)
+                  pattern_re  = pattern.to_re
+                  envs        = $env_classes.values.select { |env| env.label.match(pattern_re) }
+                  freq_arr    = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+                  freq_total  = freq_arr.sum.to_f
+                  prob_arr    = NArray.float($amino_acids.size)
+                  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_total)) }
+#                  # assess whether a residue type j is compatible with a particular combination of structural features
+#                  # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
+#                  if ci == $env_features.size
+#                    aa_label        = labels.find { |l| l.match(/^0/) }[1].chr
+#                    sub_pattern     = '.' * $env_features.size
+#                    sub_pattern[0]  = aa_label
+#                    sub_freq_sum    = 0
+#
+#                    labels[1..-1].each do |label|
+#                      next if label.start_with?('0')
+#                      i               = label[0].chr.to_i
+#                      l               = label[1].chr
+#                      sub_pattern[i]  = l
+#                      sub_envs        = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
+#                      sub_freq_arr    = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+#                      sub_freq_sum    += sub_freq_arr.sum
+#                    end
+#
+#                    if sub_freq_sum == 0
+#                      if $smooth_prob.has_key?(ci + 1)
+#                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
+#                      else
+#                        $smooth_prob[ci + 1] = {}
+#                        $smooth_prob[ci + 1][labels.to_set] = prob_arr
+#                      end
+#                      $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
+#                      next
+#                    end
+#                  end
+                  # collect priors
+                  priors = []
+                  if ci == 1
+                    priors << $smooth_prob[1]
+                  elsif ci == 2
+                    labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
+                      priors << $smooth_prob[2][c3.to_set]
+                    }
+                  elsif ci == $env_features.size
+                    labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
+                      priors << $smooth_prob[3][c3.to_set]
+                    }
+                  end
+                  # entropy based prior weighting step
+                  entropy_max     = Math::log($amino_acids.size)
+                  entropies       = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
+                    begin
+                      p == 0.0 ? s - 1 : s + p * Math::log(p)
+                    rescue
+                      #puts "P: #{p}"
+                    end
+                  } }
+                  mod_entropies   = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
+                  weights         = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
+                  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
+                  # smoothing step
+                  smooth_prob_arr = NArray.float($amino_acids.size)
+                  big_N           = freq_arr.sum.to_f
+                  small_n         = $amino_acids.size.to_f
+                  omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
+                  omega2          = 1.0 - omega1
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
+                  # normalization step
+                  smooth_prob_arr_sum = smooth_prob_arr.sum
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
+                  # file the smoothed probabilities under level ci + 1, keyed by
+                  # the set of environment labels they were computed for
+                  $smooth_prob[ci + 1] = {} unless $smooth_prob.has_key?(ci + 1)
+                  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
+                end
+              end
+            end
+            $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
+          else
+            $outfh.puts <<HEADER
+#
+# Full Smoothing:
+#
+# p1(ri) is estimated as:
+#     p1(ri) = omega1 * A0 + omega2 * W1(ri)
+#
+# p2(ri|f1q) is estimated as:
+#     p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
+#
+#     (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
+#
+# p3(ri|f1q,f2q) is estimated as:
+#    p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
+# where
+#    A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
+#
+# The smoothing procedure is NOT curtailed here and it goes upto
+#                            ^^^^^^^^^^^^^
+#
+# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
+#    pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
+# where
+#    An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
+#
+# Weights (omegas) are calculated as in Topham et al. (1993)
+#
+# sigma value used is:  #{$sigma}
+#
+HEADER
+            # full smoothing
+            1.upto($env_features.size) do |ci|
+              env_labels.combination(ci) do |c1|
+                c1[0].product(*c1[1..-1]).each do |labels|
+                  pattern = '.' * $env_features.size
+                  labels.each do |label|
+                    j = label[0].chr.to_i
+                    l = label[1].chr
+                    pattern[j] = l
+                  end
+                  # get environments, frequencies, and probabilities
+                  # (the regexp and the frequency total do not change inside
+                  # the loops below, so they are built once)
+                  pattern_re  = pattern.to_re
+                  envs        = $env_classes.values.select { |env| env.label.match(pattern_re) }
+                  freq_arr    = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
+                  freq_total  = freq_arr.sum.to_f
+                  prob_arr    = NArray.float($amino_acids.size)
+                  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_total }
+                  # collect priors
+                  priors  = []
+                  if ci > 1
+                    labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
+                  else
+                    priors << $smooth_prob[1]
+                  end
+                  # entropy based weighting priors
+                  entropy_max = Math::log($amino_acids.size)
+                  entropies = priors.map do |prior|
+                    (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
+                  end
+                  weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
+                  # smoothing step
+                  smooth_prob_arr = NArray.float($amino_acids.size)
+                  big_N           = freq_arr.sum.to_f
+                  small_n         = $amino_acids.size.to_f
+                  omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
+                  omega2          = 1.0 - omega1
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
+                  # normalization step
+                  smooth_prob_arr_sum = smooth_prob_arr.sum
+                  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
+                  # file the smoothed probabilities under level ci + 1, keyed by
+                  # the set of environment labels they were computed for
+                  $smooth_prob[ci + 1] = {} unless $smooth_prob.has_key?(ci + 1)
+                  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
+                end
+              end
+            end
+            $logger.info 'Calculating substitution probabilities (full smoothing) done.'
+          end
+          # updating smoothed probability array for each environment
+          $env_classes.values.each do |env|
+            env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
+          end
+          # sorting environments and build 21X21 substitution matrices
+          group_matrices = []
+          $env_classes.groups_sorted_by_residue_labels.each do |group|
+            # calculating 21X21 substitution probability matrix for each environment
+            grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+            $amino_acids.each_with_index do |aa, ai|
+              smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
+              0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
+            end
+            group_matrices << [group[0], grp_prob_mat]
+          end
+          if $output == 1
+            heatmaps      = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
+            grp_max_val   = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
+            $heatmapcol ||= Math::sqrt(group_matrices.size).round
+            group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
+              # for a matrix file
+              stem = "#{grp_no}. #{grp_label}"
+              $outfh.puts ">#{grp_label} #{grp_no}"
+              $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
+                                                     :row_header => $amino_acids)
+              # for heat map generation
+              if $heatmap == 0 or $heatmap == 2
+                grp_prob_mat.heatmap(:col_header    => $amino_acids,
+                                     :row_header    => $amino_acids,
+                                     :rvg_width     => $rvg_width,
+                                     :rvg_height    => $rvg_height,
+                                     :canvas_width  => $canvas_width,
+                                     :canvas_height => $canvas_height,
+                                     :max_val       => grp_max_val.ceil,
+                                     :min_val       => 0,
+                                     :print_value   => $heatmapvalue,
+                                     :title         => stem).write("#{stem}.#{$heatmapformat}")
+                $logger.info "Generating a heat map for #{stem} table done."
+              end
+              if $heatmap == 1 or $heatmap == 2
+                heatmaps << grp_prob_mat.heatmap(:col_header      => $amino_acids,
+                                                 :row_header      => $amino_acids,
+                                                 :rvg_width       => $rvg_width,
+                                                 :rvg_height      => $rvg_height - 50,
+                                                 :canvas_width    => $canvas_width,
+                                                 :canvas_height   => $canvas_height - 50,
+                                                 :max_val         => grp_max_val.ceil,
+                                                 :min_val         => 0,
+                                                 :print_value     => $heatmapvalue,
+                                                 :print_gradient  => false,
+                                                 :title           => stem,
+                                                 :title_font_size => $rvg_width * $heatmapcol / 100.0)
+              end
+            end
+            # for heat maps in a single file
+            if $heatmap == 1 or $heatmap == 2
+              file = "#{$heatmapstem}.#{$heatmapformat}"
+              heatmaps.heatmap(:columns   => $heatmapcol,
+                               :rvg_width => $rvg_width,
+                               :max_val   => grp_max_val.ceil,
+                               :min_val   => 0).write(file)
+              $logger.info "Generating heat maps in a file, #{file} done."
+            end
+          end
+          # for a total substitution probability matrix
+          $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
+          $amino_acids.each_with_index do |aa, aj|
+            0.upto($amino_acids.size - 1) do |ai|
+              $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
+            end
+          end
+          if $output == 1
+            $outfh.puts '>Total'
+            $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
+                                                    :row_header => $amino_acids)
+            $outfh.close
+            # for a heat map
+            if $heatmap == 0 or $heatmap == 2
+              stem = "#{group_matrices.size}. TOTAL"
+              $tot_prob_mat.heatmap(:col_header     => $amino_acids,
+                                    :row_header     => $amino_acids,
+                                    :rvg_width      => $rvg_width,
+                                    :rvg_height     => $rvg_height,
+                                    :canvas_width   => $canvas_width,
+                                    :canvas_height  => $canvas_height,
+                                    :max_val        => $tot_prob_mat.max.ceil,
+                                    :min_val        => 0,
+                                    :print_value    => $heatmapvalue,
+                                    :title          => stem).write("#{stem}.#{$heatmapformat}")
+              $logger.info "Generating a heat map for #{stem} table done."
+            end
+            exit 0
+          end
+        end
+        #
+        # Part 6. END
+        #
+        # Part 7.
+        #
+        # Calculating log odds ratio scoring matrices
+        #
+        if $output == 2
+          $outfh.puts <<HEADER
+#
+# The probabilities were then divided by the background probabilities
+HEADER
+          if $penv
+            $outfh.puts <<HEADER
+# which were derived from the environment-dependent amino acid frequencies.
+#                             ^^^^^^^^^^^^^^^^^^^^^
+HEADER
+          else
+            $outfh.puts <<HEADER
+# which were derived from the environment-independent amino acid frequencies.
+#                             ^^^^^^^^^^^^^^^^^^^^^^^
+HEADER
+          end
+          grp_logo_mats = []
+          factor        = $scale / Math::log(2)
+          $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
+            # calculating substitution probability matrix for each environment
+            grp_label     = group[0]
+            grp_envs      = group[1]
+            grp_logo_mat  = $cys == 0 ?
+                            NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
+                            NMatrix.float($amino_acids.size, $amino_acids.size)
+            $amino_acids.each_with_index do |aa, aj|
+              env             = grp_envs.detect { |e| e.label.start_with?(aa) }
+              env.logo_array  = $cys == 0 ?
+                                NArray.float($amino_acids.size + 1) :
+                                NArray.float($amino_acids.size)
+              env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
+                pai                   = 100.0 * $aa_tot_freq[$amino_acids[ai]]
+                odds                  = prob / pai
+                env.logo_array[ai]    = factor * Math::log(odds)
+                grp_logo_mat[aj, ai]  = env.logo_array[ai]
+              end
+              # adding log odds ratio for 'U' (J or C) when --cys is 0
+              if $cys == 0
+                pai                                 = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
+                prob                                = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
+                                                      env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
+                odds                                = prob / pai
+                env.logo_array[$amino_acids.size]   = factor * Math::log(odds)
+                grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
+              end
+            end
+            grp_logo_mats << [grp_label, grp_logo_mat]
+          end
+          $tot_logo_mat = $cys == 0 ?
+                          NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
+                          NMatrix.float($amino_acids.size, $amino_acids.size)
+          $amino_acids.each_with_index do |aa1, aj|
+            $amino_acids.each_with_index do |aa2, ai|
+              prob  = $tot_prob_mat[aj, ai]
+              pai   = 100.0 * $aa_tot_freq[$amino_acids[ai]]
+              odds  = prob / pai
+              $tot_logo_mat[aj, ai] = factor * Math::log(odds)
+            end
+            # adding log odds ratio for 'U' (J or C) when --cys is 0
+            if $cys == 0
+              pai   = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
+              prob  = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
+              odds  = prob / pai
+              $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
+            end
+          end
+          # calculating relative entropy for each amino acid pair H and
+          # the expected score E in bit units
+          tot_E = 0.0
+          tot_H = 0.0
+          0.upto($tot_logo_mat.shape[0] - 1) do |j|
+            0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
+              if j != i
+                tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
+                tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
+              else
+                tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
+                tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
+              end
+            end
+          end
+          $outfh.puts <<HEADER
+#
+# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
+HEADER
+          unless $noroundoff
+            $outfh.puts <<HEADER
+# rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
+HEADER
+          end
+          $outfh.puts <<HEADER
+# For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
+#
+HEADER
+          grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
+          grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
+          abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
+          row_header  = $cys ? $amino_acids + %w[U] : $amino_acids
+          heatmaps    = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
+          $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
+          grp_logo_mats.each_with_index do |arr, grp_no|
+            grp_label     = arr[0]
+            grp_logo_mat  = arr[1]
+            stem          = "#{grp_no}. #{grp_label}"
+            unless $noroundoff
+              grp_logo_mat = grp_logo_mat.round
+            end
+            # for a matrix file
+            $outfh.puts ">#{grp_label} #{grp_no}"
+            $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
+                                                   :row_header => row_header)
+            # for a heat map
+            if $heatmap == 0 or $heatmap == 2
+              grp_logo_mat.heatmap(:col_header          => $amino_acids,
+                                   :row_header          => row_header,
+                                   :rvg_width           => $rvg_width,
+                                   :rvg_height          => $rvg_height,
+                                   :canvas_width        => $canvas_width,
+                                   :canvas_height       => $canvas_height,
+                                   :gradient_beg_color  => '#0000FF',
+                                   :gradient_mid_color  => '#FFFFFF',
+                                   :gradient_end_color  => '#FF0000',
+                                   :max_val             => abs_max_val.ceil,
+                                   :mid_val             => 0,
+                                   :min_val             => -1 * abs_max_val.ceil,
+                                   :print_value         => $heatmapvalue,
+                                   :title               => stem).write("#{stem}.#{$heatmapformat}")
+              $logger.info "Generating a heat map for #{stem} table done."
+            end
+            if $heatmap == 1 or $heatmap == 2
+              heatmaps << grp_logo_mat.heatmap(:col_header          => $amino_acids,
+                                               :row_header          => row_header,
+                                               :rvg_width           => $rvg_width,
+                                               :rvg_height          => $rvg_height - 50,
+                                               :canvas_width        => $canvas_width,
+                                               :canvas_height       => $canvas_height - 50,
+                                               :gradient_beg_color  => '#0000FF',
+                                               :gradient_mid_color  => '#FFFFFF',
+                                               :gradient_end_color  => '#FF0000',
+                                               :max_val             => abs_max_val.ceil,
+                                               :mid_val             => 0,
+                                               :min_val             => -1 * abs_max_val.ceil,
+                                               :print_value         => $heatmapvalue,
+                                               :print_gradient      => false,
+                                               :title               => stem,
+                                               :title_font_size     => $rvg_width * $heatmapcol / 100.0)
+            end
+          end
+          # for heat maps in a single file
+          if $heatmap == 1 or $heatmap == 2
+            file = "#{$heatmapstem}.#{$heatmapformat}"
+            heatmaps.heatmap(:columns             => $heatmapcol,
+                             :rvg_width           => $rvg_width,
+                             :gradient_beg_color  => '#0000FF',
+                             :gradient_mid_color  => '#FFFFFF',
+                             :gradient_end_color  => '#FF0000',
+                             :max_val             => abs_max_val.ceil,
+                             :mid_val             => 0,
+                             :min_val             => -1 * abs_max_val.ceil).write(file)
+            $logger.info "Generating heat maps in a file, #{file} done."
+          end
+          # for a matrix file
+          unless $noroundoff
+            $tot_logo_mat = $tot_logo_mat.round
+          end
+          $outfh.puts ">Total #{grp_logo_mats.size}"
+          $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
+                                                  :row_header => row_header)
+          # for a heat map
+          if $heatmap == 0 or $heatmap == 2
+            stem            = "#{group_matrices.size}. TOTAL"
+            tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
+            $tot_logo_mat.heatmap(:col_header          => $amino_acids,
+                                  :row_header          => row_header,
+                                  :rvg_width           => $rvg_width,
+                                  :rvg_height          => $rvg_height,
+                                  :canvas_width        => $canvas_width,
+                                  :canvas_height       => $canvas_height,
+                                  :gradient_beg_color  => '#0000FF',
+                                  :gradient_mid_color  => '#FFFFFF',
+                                  :gradient_end_color  => '#FF0000',
+                                  :max_val             => tot_abs_max_val.ceil,
+                                  :mid_val             => 0,
+                                  :min_val             => -1 * tot_abs_max_val.ceil,
+                                  :print_value         => $heatmapvalue,
+                                  :title               => stem).write("#{stem}.#{$heatmapformat}")
+            $logger.info "Generating a heat map for #{stem} table done."
+          end
+          $logger.info "Calculating log odds ratios done."
+        end
+        #
+        # Part 7. END
+        #
+        $outfh.close
+        exit 0
+      end
+    end
+  end # class CLI
+end # module Egor