ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
data/lib/ai4r/classifiers/id3.rb
CHANGED
@@ -1,34 +1,34 @@
-#
+# frozen_string_literal: true
+
+# Author:: Sergio Fierens (Implementation, Quinlan is
 # the creator of the algorithm)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

-
-
+require_relative '../data/data_set'
+require_relative '../classifiers/classifier'

 module Ai4r
-
 module Classifiers
-
 # = Introduction
-# This is an implementation of the ID3 algorithm (Quinlan)
-# Given a set of preclassified examples, it builds a top-down
-# induction of decision tree, biased by the information gain and
+# This is an implementation of the ID3 algorithm (Quinlan)
+# Given a set of preclassified examples, it builds a top-down
+# induction of decision tree, biased by the information gain and
 # entropy measure.
 #
 # * http://en.wikipedia.org/wiki/Decision_tree
 # * http://en.wikipedia.org/wiki/ID3_algorithm
 #
 # = How to use it
-#
+#
 # DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
 #
-# DATA_ITEMS = [
+# DATA_ITEMS = [
 # ['New York', '<30', 'M', 'Y'],
 # ['Chicago', '<30', 'M', 'Y'],
 # ['Chicago', '<30', 'F', 'Y'],
@@ -45,286 +45,665 @@ module Ai4r
 # ['New York', '[50-80]', 'F', 'N'],
 # ['Chicago', '>80', 'F', 'Y']
 # ]
-#
+#
 # data_set = DataSet.new(:data_items=>DATA_SET, :data_labels=>DATA_LABELS)
 # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
-#
+#
 # id3.get_rules
 # # => if age_range=='<30' then marketing_target='Y'
 # elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
 # elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
 # elsif age_range=='[50-80]' then marketing_target='N'
 # elsif age_range=='>80' then marketing_target='Y'
-# else
-#
+# else
+# raise 'There was not enough information during training to do '
+# 'a proper induction for this data element'
+# end
+#
 # id3.eval(['New York', '<30', 'M'])
 # # => 'Y'
-#
-# = A better way to load the data
-#
+#
+# = A better way to load the data
+#
 # In the real life you will use lot more data training examples, with more
-# attributes. Consider moving your data to an external CSV (comma separate
+# attributes. Consider moving your data to an external CSV (comma separate
 # values) file.
-#
+#
 # data_file = "#{File.dirname(__FILE__)}/data_set.csv"
 # data_set = DataSet.load_csv_with_labels data_file
-# id3 = Ai4r::Classifiers::ID3.new.build(data_set)
-#
+# id3 = Ai4r::Classifiers::ID3.new.build(data_set)
+#
 # = A nice tip for data evaluation
-#
+#
 # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
 #
 # age_range = '<30'
 # marketing_target = nil
-# eval id3.get_rules
+# eval id3.get_rules
 # puts marketing_target
-# # => 'Y'
+# # => 'Y'
 #
 # = More about ID3 and decision trees
-#
+#
 # * http://en.wikipedia.org/wiki/Decision_tree
 # * http://en.wikipedia.org/wiki/ID3_algorithm
-#
+#
 # = About the project
 # Author:: Sergio Fierens
 # License:: MPL 1.1
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 class ID3 < Classifier
-
-
-
+attr_reader :data_set, :majority_class, :validation_set
+
+parameters_info max_depth: 'Maximum recursion depth. Default is nil (no limit).',
+min_gain: 'Minimum information gain required to split. Default is 0.',
+on_unknown: 'Behaviour when evaluating unseen attribute values: '
+
+# @return [Object]
+def initialize
+super()
+@max_depth = nil
+@min_gain = 0
+@on_unknown = :raise
+end
+
 # Create a new ID3 classifier. You must provide a DataSet instance
 # as parameter. The last attribute of each item is considered as the
 # item class.
-
+# @param data_set [Object]
+# @param options [Object]
+# @return [Object]
+def build(data_set, options = {})
 data_set.check_not_empty
 @data_set = data_set
+@validation_set = options[:validation_set]
 preprocess_data(@data_set.data_items)
-
+prune! if @validation_set
+self
 end

 # You can evaluate new data, predicting its category.
 # e.g.
 # id3.eval(['New York', '<30', 'F']) # => 'Y'
+# @param data [Object]
+# @return [Object]
 def eval(data)
-@tree
+@tree&.value(data, self)
 end

 # This method returns the generated rules in ruby code.
 # e.g.
-#
+#
 # id3.get_rules
 # # => if age_range=='<30' then marketing_target='Y'
 # elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
 # elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
 # elsif age_range=='[50-80]' then marketing_target='N'
 # elsif age_range=='>80' then marketing_target='Y'
-# else
+# else
+# raise 'There was not enough information during training to do '
+# 'a proper induction for this data element'
+# end
 #
-# It is a nice way to inspect induction results, and also to execute them:
+# It is a nice way to inspect induction results, and also to execute them:
 # age_range = '<30'
 # marketing_target = nil
-# eval id3.get_rules
+# eval id3.get_rules
 # puts marketing_target
 # # => 'Y'
+# @return [Object]
 def get_rules
-#return "Empty ID3 tree" if !@tree
+# return "Empty ID3 tree" if !@tree
 rules = @tree.get_rules
 rules = rules.collect do |rule|
-
+"#{rule[0..-2].join(' and ')} then #{rule.last}"
 end
-
+error_msg = 'There was not enough information during training to do a proper induction for this data element'
+"if #{rules.join("\nelsif ")}\nelse raise '#{error_msg}' end"
 end

-
+# Return a nested Hash representation of the decision tree. This
+# structure can easily be converted to JSON or other formats.
+# Leaf nodes are represented by their category value, while internal
+# nodes are hashes keyed by attribute value.
+# @return [Object]
+def to_h
+@tree&.to_h
+end
+
+# Generate GraphViz DOT syntax describing the decision tree. Nodes are
+# labeled with attribute names or category values and edges are labeled
+# with attribute values.
+# @return [Object]
+def to_graphviz
+return 'digraph G {}' unless @tree
+
+lines = ['digraph G {']
+@tree.to_graphviz(0, lines)
+lines << '}'
+lines.join("\n")
+end
+
+# Prune the decision tree using the validation set provided during build.
+# Subtrees are replaced by a single leaf when this increases the
+# classification accuracy on the validation data.
+# @return [Object]
+def prune!
+return self unless @validation_set
+
+@tree = prune_node(@tree, @validation_set.data_items)
+self
+end
+
+# @param data_examples [Object]
+# @return [Object]
 def preprocess_data(data_examples)
-@
+@majority_class = most_freq(data_examples, domain(data_examples))
+@tree = build_node(data_examples, [], 0)
 end

-
-
-
-
-
-
-
-
-return CategoryNode.new(@data_set.
-
-
+# @param data_examples [Object]
+# @param flag_att [Object]
+# @param depth [Object]
+# @return [Object]
+def build_node(data_examples, flag_att = [], depth = 0)
+return ErrorNode.new if data_examples.empty?
+
+domain = domain(data_examples)
+return CategoryNode.new(@data_set.category_label, domain.last[0]) if domain.last.length == 1
+
+if flag_att.length >= domain.length - 1
+return CategoryNode.new(@data_set.category_label,
+most_freq(data_examples,
+domain))
+end
+
+return CategoryNode.new(@data_set.category_label, most_freq(data_examples, domain)) if @max_depth && depth >= @max_depth
+
+best_index = nil
+best_entropy = nil
+best_split = nil
+best_threshold = nil
+numeric = false
+
+domain[0..-2].each_index do |index|
+next if flag_att.include?(index)
+
+if domain[index].all? { |v| v.is_a? Numeric }
+threshold, split, entropy = best_numeric_split(data_examples, index, domain)
+if best_entropy.nil? || entropy < best_entropy
+best_entropy = entropy
+best_index = index
+best_split = split
+best_threshold = threshold
+numeric = true
+end
+else
+freq_grid = freq_grid(index, data_examples, domain)
+entropy = entropy(freq_grid, data_examples.length)
+if best_entropy.nil? || entropy < best_entropy
+best_entropy = entropy
+best_index = index
+best_split = split_data_examples(data_examples, domain, index)
+numeric = false
+end
+end
+end
+
+gain = information_gain(data_examples, domain, best_index)
+if gain < @min_gain
+return CategoryNode.new(@data_set.category_label,
+most_freq(data_examples, domain))
+end
+if best_split.length == 1
+return CategoryNode.new(@data_set.category_label,
+most_freq(data_examples, domain))
+end
+
+nodes = best_split.collect do |partial_data_examples|
+build_node(partial_data_examples, numeric ? flag_att : [*flag_att, best_index], depth + 1)
+end
+majority = most_freq(data_examples, domain)
+
+if numeric
+EvaluationNode.new(@data_set.data_labels, best_index, best_threshold, nodes, true,
+majority)
+else
+EvaluationNode.new(@data_set.data_labels, best_index, domain[best_index], nodes, false,
+majority)
 end
-return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], nodes)
 end

-
+# @param values [Object]
+# @return [Object]
 def self.sum(values)
-values.
+values.sum
 end

-
+# @param z [Object]
+# @return [Object]
 def self.log2(z)
-return 0.0 if z
-
-
-
-private
-def most_freq(examples, domain)
-freqs = []
-domain.last.length.times { freqs << 0}
-examples.each do |example|
-cat_index = domain.last.index(example.last)
-freq = freqs[cat_index] + 1
-freqs[cat_index] = freq
-end
-max_freq = freqs.max
-max_freq_index = freqs.index(max_freq)
-domain.last[max_freq_index]
+return 0.0 if z.zero?
+
+Math.log(z) / LOG2
 end

 private
+
+# @param examples [Object]
+# @param domain [Object]
+# @return [Object]
+def most_freq(examples, _domain)
+examples.map(&:last).tally.max_by { _2 }&.first
+end
+
+# @param data_examples [Object]
+# @param att_index [Object]
+# @return [Object]
+def split_data_examples_by_value(data_examples, att_index)
+att_value_examples = Hash.new { |hsh, key| hsh[key] = [] }
+data_examples.each do |example|
+att_value = example[att_index]
+att_value_examples[att_value] << example
+end
+att_value_examples
+end
+
+# @param data_examples [Object]
+# @param domain [Object]
+# @param att_index [Object]
+# @return [Object]
 def split_data_examples(data_examples, domain, att_index)
+att_value_examples = split_data_examples_by_value(data_examples, att_index)
+attribute_domain = domain[att_index]
 data_examples_array = []
-att_value_examples
+att_value_examples.each do |att_value, example_set|
+att_value_index = attribute_domain.index(att_value)
+data_examples_array[att_value_index] = example_set
+end
+data_examples_array
+end
+
+# @param data_examples [Object]
+# @param att_index [Object]
+# @param threshold [Object]
+# @return [Object]
+def split_data_examples_numeric(data_examples, att_index, threshold)
+lower = []
+higher = []
 data_examples.each do |example|
-
-
-
-
+if example[att_index] <= threshold
+lower << example
+else
+higher << example
+end
 end
-
-
-
+[lower, higher]
+end
+
+# @param data_examples [Object]
+# @param att_index [Object]
+# @return [Object]
+def candidate_thresholds(data_examples, att_index)
+values = data_examples.collect { |d| d[att_index] }.uniq.sort
+thresholds = []
+values.each_cons(2) { |a, b| thresholds << ((a + b) / 2.0) }
+thresholds
+end
+
+# @param split_data [Object]
+# @param domain [Object]
+# @return [Object]
+def entropy_for_numeric_split(split_data, domain)
+category_domain = domain.last
+grid = split_data.collect do |subset|
+counts = Array.new(category_domain.length, 0)
+subset.each do |example|
+cat_idx = category_domain.index(example.last)
+counts[cat_idx] += 1
+end
+counts
+end
+entropy(grid, split_data[0].length + split_data[1].length)
+end
+
+# @param data_examples [Object]
+# @param att_index [Object]
+# @param domain [Object]
+# @return [Object]
+def best_numeric_split(data_examples, att_index, domain)
+best_threshold = nil
+best_entropy = nil
+best_split = nil
+candidate_thresholds(data_examples, att_index).each do |threshold|
+split = split_data_examples_numeric(data_examples, att_index, threshold)
+e = entropy_for_numeric_split(split, domain)
+next unless best_entropy.nil? || e < best_entropy
+
+best_entropy = e
+best_threshold = threshold
+best_split = split
 end
-
+[best_threshold, best_split, best_entropy]
 end

-
-
+# @param data_examples [Object]
+# @param domain [Object]
+# @param flag_att [Object]
+# @return [Object]
+def min_entropy_index(data_examples, domain, flag_att = [])
 min_entropy = nil
 min_index = 0
 domain[0..-2].each_index do |index|
+next if flag_att.include?(index)
+
 freq_grid = freq_grid(index, data_examples, domain)
 entropy = entropy(freq_grid, data_examples.length)
-if
-min_entropy = entropy
-min_index = index
+if !min_entropy || entropy < min_entropy
+min_entropy = entropy
+min_index = index
 end
 end
-
+min_index
 end

-
+# @param data_examples [Object]
+# @param domain [Object]
+# @param att_index [Object]
+# @return [Object]
+def information_gain(data_examples, domain, att_index)
+total_entropy = class_entropy(data_examples, domain)
+freq_grid_att = freq_grid(att_index, data_examples, domain)
+att_entropy = entropy(freq_grid_att, data_examples.length)
+total_entropy - att_entropy
+end
+
+# @param data_examples [Object]
+# @param domain [Object]
+# @return [Object]
+def class_entropy(data_examples, domain)
+category_domain = domain.last
+freqs = Array.new(category_domain.length, 0)
+data_examples.each do |ex|
+cat = ex.last
+idx = category_domain.index(cat)
+freqs[idx] += 1
+end
+entropy([freqs], data_examples.length)
+end
+
+# @param data_examples [Object]
+# @return [Object]
 def domain(data_examples)
-#return build_domains(data_examples)
-domain = []
-@data_set.data_labels.length.times { domain << [] }
+# return build_domains(data_examples)
+domain = Array.new(@data_set.data_labels.length) { [] }
 data_examples.each do |data|
-data.
-domain[i] <<
+data.each_with_index do |att_value, i|
+domain[i] << att_value if i < domain.length && !domain[i].include?(att_value)
 end
 end
-
+domain
 end
-
-
+
+# @param att_index [Object]
+# @param data_examples [Object]
+# @param domain [Object]
+# @return [Object]
 def freq_grid(att_index, data_examples, domain)
-#Initialize empty grid
-
-domain.last
-grid =
-
-#Fill frecuency with grid
+# Initialize empty grid
+feature_domain = domain[att_index]
+category_domain = domain.last
+grid = Array.new(feature_domain.length) { Array.new(category_domain.length, 0) }
+# Fill frecuency with grid
 data_examples.each do |example|
 att_val = example[att_index]
-att_val_index =
+att_val_index = feature_domain.index(att_val)
 category = example.last
-category_index =
-
-grid[att_val_index][category_index] = freq
+category_index = category_domain.index(category)
+grid[att_val_index][category_index] += 1
 end
-
+grid
 end

-
+# @param freq_grid [Object]
+# @param total_examples [Object]
+# @return [Object]
 def entropy(freq_grid, total_examples)
-#Calc entropy of each element
+# Calc entropy of each element
 entropy = 0
 freq_grid.each do |att_freq|
 att_total_freq = ID3.sum(att_freq)
 partial_entropy = 0
-
+unless att_total_freq.zero?
 att_freq.each do |freq|
-prop = freq.to_f/att_total_freq
-partial_entropy += (-1*prop*ID3.log2(prop))
+prop = freq.to_f / att_total_freq
+partial_entropy += (-1 * prop * ID3.log2(prop))
 end
 end
-entropy += (att_total_freq.to_f/total_examples) * partial_entropy
+entropy += (att_total_freq.to_f / total_examples) * partial_entropy
 end
-
+entropy
+end
+
+# @param node [Object]
+# @param examples [Object]
+# @return [Object]
+def prune_node(node, examples)
+return node if node.is_a?(CategoryNode) || node.is_a?(ErrorNode)
+
+subsets = split_examples(node, examples)
+
+node.nodes.each_with_index do |child, i|
+node.nodes[i] = prune_node(child, subsets[i])
+end
+
+leaf = CategoryNode.new(@data_set.category_label, node.majority)
+replace_with_leaf?(leaf, node, examples) ? leaf : node
+end
+
+def split_examples(node, examples)
+if node.numeric
+Array.new(2) { [] }.tap do |subsets|
+examples.each do |ex|
+idx = ex[node.index] <= node.threshold ? 0 : 1
+subsets[idx] << ex
+end
+end
+else
+Array.new(node.values.length) { [] }.tap do |subsets|
+examples.each do |ex|
+idx = node.values.index(ex[node.index])
+subsets[idx] << ex if idx
+end
+end
+end
+end
+
+def replace_with_leaf?(leaf, node, examples)
+before = accuracy_for_node(node, examples)
+after = accuracy_for_node(leaf, examples)
+after && before && after >= before
+end
+
+# @param node [Object]
+# @param examples [Object]
+# @return [Object]
+def accuracy_for_node(node, examples)
+return nil if examples.empty?
+
+correct = examples.count do |ex|
+node.value(ex[0..-2], self) == ex.last
+end
+correct.to_f / examples.length
 end

-private
 LOG2 = Math.log(2)
 end

-class EvaluationNode
-
-
-
-
+class EvaluationNode # :nodoc: all
+attr_reader :index, :values, :nodes, :numeric, :threshold, :majority
+
+# @param data_labels [Object]
+# @param index [Object]
+# @param values_or_threshold [Object]
+# @param nodes [Object]
+# @param numeric [Object]
+# @param majority [Object]
+# @return [Object]
+def initialize(data_labels, index, values_or_threshold, nodes, numeric = false,
+majority = nil)
 @index = index
-@
+@numeric = numeric
+if numeric
+@threshold = values_or_threshold
+@values = nil
+else
+@values = values_or_threshold
+end
 @nodes = nodes
+@majority = majority
 @data_labels = data_labels
 end
-
-
+
+# @param data [Object]
+# @param classifier [Object]
+# @return [Object]
+def value(data, classifier = nil)
 value = data[@index]
-
-
+if @numeric
+node = value <= @threshold ? @nodes[0] : @nodes[1]
+node.value(data, classifier)
+else
+unless @values.include?(value)
+return nil if classifier&.on_unknown == :nil
+return @majority if classifier&.on_unknown == :most_frequent
+
+return ErrorNode.new.value(data, classifier)
+end
+@nodes[@values.index(value)].value(data, classifier)
+end
 end
-
+
+# @return [Object]
 def get_rules
 rule_set = []
-@nodes.
-
-
+@nodes.each_with_index do |child_node, child_node_index|
+if @numeric
+op = child_node_index.zero? ? '<=' : '>'
+my_rule = "#{@data_labels[@index]} #{op} #{@threshold}"
+else
+my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
+end
 child_node_rules = child_node.get_rules
 child_node_rules.each do |child_rule|
 child_rule.unshift(my_rule)
 end
 rule_set += child_node_rules
 end
-
+rule_set
+end
+
+# @return [Object]
+def to_h
+hash = {}
+@nodes.each_with_index do |child, i|
+hash[@values[i]] = child.to_h
+end
+{ @data_labels[@index] => hash }
+end
+
+# @param id [Object]
+# @param lines [Object]
+# @param parent [Object]
+# @param edge_label [Object]
+# @return [Object]
+def to_graphviz(id, lines, parent = nil, edge_label = nil)
+my_id = id
+lines << " node#{my_id} [label=\"#{@data_labels[@index]}\"]"
+lines << " node#{parent} -> node#{my_id} [label=\"#{edge_label}\"]" if parent
+next_id = my_id
+@nodes.each_with_index do |child, idx|
+next_id += 1
+next_id = child.to_graphviz(next_id, lines, my_id, @values[idx])
+end
+next_id
 end
-
 end

-class CategoryNode
+class CategoryNode # :nodoc: all
+# @param label [Object]
+# @param value [Object]
+# @return [Object]
 def initialize(label, value)
 @label = label
 @value = value
 end
-
-
+
+# @param data [Object]
+# @param classifier [Object]
+# @return [Object]
+def value(_data, _classifier = nil)
+@value
 end
+
+# @return [Object]
 def get_rules
-
+[["#{@label}='#{@value}'"]]
+end
+
+# @return [Object]
+def to_h
+@value
+end
+
+# @param id [Object]
+# @param lines [Object]
+# @param parent [Object]
+# @param edge_label [Object]
+# @return [Object]
+def to_graphviz(id, lines, parent = nil, edge_label = nil)
+my_id = id
+lines << " node#{my_id} [label=\"#{@value}\", shape=box]"
+lines << " node#{parent} -> node#{my_id} [label=\"#{edge_label}\"]" if parent
+my_id
 end
 end

+# Raised when the training data is insufficient to build a model.
 class ModelFailureError < StandardError
-
+MSG = 'There was not enough information during training to do a proper ' \
+'induction for this data element.'
 end

-class ErrorNode
-
-
+class ErrorNode # :nodoc: all
+# @param data [Object]
+# @param classifier [Object]
+# @return [Object]
+def value(data, _classifier = nil)
+raise ModelFailureError, "#{ModelFailureError::MSG} for the data element #{data}."
 end
+
+# @return [Object]
 def get_rules
-
+[]
 end
-end

+# @return [Object]
+def to_h
+nil
+end
+
+# @param id [Object]
+# @param lines [Object]
+# @param parent [Object]
+# @param edge_label [Object]
+# @return [Object]
+def to_graphviz(id, lines, parent = nil, edge_label = nil)
+my_id = id
+lines << " node#{my_id} [label=\"?\", shape=box]"
+lines << " node#{parent} -> node#{my_id} [label=\"#{edge_label}\"]" if parent
+my_id
+end
+end
 end
 end
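The rewritten ID3 classifier above adds tuning parameters (max_depth, min_gain, on_unknown), optional post-pruning against a validation set passed to build, and to_h / to_graphviz exporters. The following is a minimal sketch of how those options could be exercised, not code taken from the gem: it assumes ai4r's existing Parameterizable#set_parameters helper and uses made-up sample data.

require 'ai4r'

# Hypothetical training data, shaped like the DATA_ITEMS sample in the class docs.
labels = %w[city age_range gender marketing_target]
items  = [['New York', '<30', 'M', 'Y'],
          ['Chicago',  '<30', 'F', 'Y'],
          ['New York', '[30-50)', 'M', 'N'],
          ['Chicago',  '>80', 'F', 'Y']]
train = Ai4r::Data::DataSet.new(data_items: items, data_labels: labels)

id3 = Ai4r::Classifiers::ID3.new
# New 2.0 parameters; :most_frequent falls back to the majority value on unseen attributes.
id3.set_parameters(max_depth: 3, min_gain: 0.0, on_unknown: :most_frequent)
# Passing :validation_set triggers prune! after training; the training set is reused
# here only to keep the sketch short.
id3.build(train, validation_set: train)

puts id3.eval(['New York', '<30', 'M'])  # predicted marketing_target
puts id3.get_rules                       # induced rules as Ruby code
puts id3.to_graphviz                     # GraphViz DOT output of the tree

The data/examples/classifiers/id3_graphviz_example.rb and parameter_tutorial.rb scripts added in this release presumably exercise the same API in more detail.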