rumale-tree 0.24.0
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +33 -0
- data/ext/rumale/tree/ext.c +575 -0
- data/ext/rumale/tree/ext.h +12 -0
- data/ext/rumale/tree/extconf.rb +32 -0
- data/lib/rumale/tree/base_decision_tree.rb +154 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +148 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +113 -0
- data/lib/rumale/tree/extra_tree_classifier.rb +89 -0
- data/lib/rumale/tree/extra_tree_regressor.rb +80 -0
- data/lib/rumale/tree/gradient_tree_regressor.rb +192 -0
- data/lib/rumale/tree/node.rb +39 -0
- data/lib/rumale/tree/version.rb +10 -0
- data/lib/rumale/tree.rb +11 -0
- metadata +93 -0
data/lib/rumale/tree/base_decision_tree.rb (@@ -0,0 +1,154 @@)

# frozen_string_literal: true

require 'rumale/base/estimator'
require 'rumale/validation'
require 'rumale/tree/ext'
require 'rumale/tree/node'

module Rumale
  module Tree
    # BaseDecisionTree is an abstract class for the implementation of decision tree-based estimators.
    # This class is used internally.
    class BaseDecisionTree < ::Rumale::Base::Estimator
      # Initialize a decision tree-based estimator.
      #
      # @param criterion [String] The function used to evaluate the splitting point.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, the decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
      #   If nil is given, the number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
      #   If nil is given, the split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(criterion: nil, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                     random_seed: nil)
        super()
        @params = {
          criterion: criterion,
          max_depth: max_depth,
          max_leaf_nodes: max_leaf_nodes,
          min_samples_leaf: min_samples_leaf,
          max_features: max_features,
          random_seed: random_seed || srand
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Return the index of the leaf that each sample reached.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Leaf index for each sample.
      def apply(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
      end

      private

      def partial_apply(tree, sample)
        node = tree
        until node.leaf
          node = if node.right.nil?
                   node.left
                 elsif node.left.nil?
                   node.right
                 else
                   sample[node.feature_id] <= node.threshold ? node.left : node.right
                 end
        end
        node.leaf_id
      end

      def build_tree(x, y)
        y = y.expand_dims(1).dup if y.shape[1].nil?
        @feature_ids = Array.new(x.shape[1]) { |v| v }
        @tree = grow_node(0, x, y, impurity(y))
        @feature_ids = nil
        nil
      end

      def grow_node(depth, x, y, impurity) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
        # initialize node.
        n_samples = x.shape[0]
        node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)

        # terminate growing.
        return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
        return nil if n_samples < @params[:min_samples_leaf]
        return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
        return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
        return put_leaf(node, y) if stop_growing?(y)

        # calculate optimal parameters.
        feature_id, left_imp, right_imp, threshold, gain =
          rand_ids.map { |n| [n, *best_split(x[true, n], y, impurity)] }.max_by(&:last)

        return put_leaf(node, y) if gain.nil? || gain.zero?

        left_ids = x[true, feature_id].le(threshold).where
        right_ids = x[true, feature_id].gt(threshold).where
        node.left = if y.ndim == 1
                      grow_node(depth + 1, x[left_ids, true], y[left_ids], left_imp)
                    else
                      grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
                    end
        node.right = if y.ndim == 1
                       grow_node(depth + 1, x[right_ids, true], y[right_ids], right_imp)
                     else
                       grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
                     end

        return put_leaf(node, y) if node.left.nil? && node.right.nil?

        node.feature_id = feature_id
        node.threshold = threshold
        node.leaf = false
        node
      end

      def stop_growing?(_y)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      def put_leaf(_node, _y)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      def rand_ids
        @feature_ids.sample(@params[:max_features], random: @sub_rng)
      end

      def best_split(_features, _y, _impurity)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      def impurity(_y)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      def eval_importance(n_samples, n_features)
        @feature_importances = Numo::DFloat.zeros(n_features)
        eval_importance_at_node(@tree)
        @feature_importances /= n_samples
        normalizer = @feature_importances.sum
        @feature_importances /= normalizer if normalizer > 0.0
        nil
      end

      def eval_importance_at_node(node)
        return nil if node.leaf
        return nil if node.left.nil? || node.right.nil?

        gain = node.n_samples * node.impurity -
               node.left.n_samples * node.left.impurity -
               node.right.n_samples * node.right.impurity
        @feature_importances[node.feature_id] += gain
        eval_importance_at_node(node.left)
        eval_importance_at_node(node.right)
      end
    end
  end
end

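The traversal in #partial_apply routes a sample left or right at each internal node by comparing the sample's value for the node's split feature against the node's threshold. Below is a minimal standalone sketch of that logic (not part of the gem), using a plain Struct in place of Rumale::Tree::Node and omitting the single-child fallbacks handled above:

# Sketch only: a hand-built one-split tree to illustrate leaf routing.
Node = Struct.new(:leaf, :leaf_id, :feature_id, :threshold, :left, :right, keyword_init: true)

left  = Node.new(leaf: true, leaf_id: 0)
right = Node.new(leaf: true, leaf_id: 1)
root  = Node.new(leaf: false, feature_id: 0, threshold: 2.5, left: left, right: right)

def partial_apply(tree, sample)
  node = tree
  # Descend until a leaf: go left when the feature value is at or below the threshold.
  node = sample[node.feature_id] <= node.threshold ? node.left : node.right until node.leaf
  node.leaf_id
end

p partial_apply(root, [1.0]) # => 0 (1.0 <= 2.5, goes left)
p partial_apply(root, [4.0]) # => 1 (4.0 >  2.5, goes right)
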
data/lib/rumale/tree/decision_tree_classifier.rb (@@ -0,0 +1,148 @@)

# frozen_string_literal: true

require 'rumale/tree/base_decision_tree'
require 'rumale/base/classifier'

module Rumale
  module Tree
    # DecisionTreeClassifier is a class that implements a decision tree for classification.
    #
    # @example
    #   require 'rumale/tree/decision_tree_classifier'
    #
    #   estimator =
    #     Rumale::Tree::DecisionTreeClassifier.new(
    #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    class DecisionTreeClassifier < BaseDecisionTree
      include ::Rumale::Base::Classifier
      include ::Rumale::Tree::ExtDecisionTreeClassifier

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the learned tree.
      # @return [Node]
      attr_reader :tree

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Return the labels assigned to each leaf.
      # @return [Numo::Int32] (size: n_leaves)
      attr_reader :leaf_labels

      # Create a new classifier with the decision tree algorithm.
      #
      # @param criterion [String] The function used to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, the decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
      #   If nil is given, the number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
      #   If nil is given, the split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                     random_seed: nil)
        super
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [DecisionTreeClassifier] The learned classifier itself.
      def fit(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        n_samples, n_features = x.shape
        @params[:max_features] = n_features if @params[:max_features].nil?
        @params[:max_features] = [@params[:max_features], n_features].min
        y = Numo::Int32.cast(y) unless y.is_a?(Numo::Int32)
        uniq_y = y.to_a.uniq.sort
        @classes = Numo::Int32.asarray(uniq_y)
        @n_leaves = 0
        @leaf_labels = []
        @feature_ids = Array.new(n_features) { |v| v }
        @sub_rng = @rng.dup
        build_tree(x, y.map { |v| uniq_y.index(v) })
        eval_importance(n_samples, n_features)
        @leaf_labels = Numo::Int32[*@leaf_labels]
        self
      end

      # Predict class labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        @leaf_labels[apply(x)].dup
      end

      # Predict probabilities for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        Numo::DFloat[*(Array.new(x.shape[0]) { |n| partial_predict_proba(@tree, x[n, true]) })]
      end

      private

      def partial_predict_proba(tree, sample)
        node = tree
        until node.leaf
          node = if node.right.nil?
                   node.left
                 elsif node.left.nil?
                   node.right
                 else
                   sample[node.feature_id] <= node.threshold ? node.left : node.right
                 end
        end
        node.probs
      end

      def build_tree(x, y)
        @tree = grow_node(0, x, y, impurity(y))
        nil
      end

      def put_leaf(node, y)
        node.probs = y.bincount(minlength: @classes.size) / node.n_samples.to_f
        node.leaf = true
        node.leaf_id = @n_leaves
        @n_leaves += 1
        @leaf_labels.push(@classes[node.probs.max_index])
        node
      end

      def best_split(features, y, whole_impurity)
        order = features.sort_index
        n_classes = @classes.size
        find_split_params(@params[:criterion], whole_impurity, order, features, y, n_classes)
      end

      def impurity(y)
        n_classes = @classes.size
        node_impurity(@params[:criterion], y, n_classes)
      end
    end
  end
end

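Putting the classifier's public API together, here is a hedged end-to-end sketch expanding the class's own @example; the toy data and parameter values are illustrative, and it assumes the numo-narray and rumale-tree gems are installed:

require 'numo/narray'
require 'rumale/tree/decision_tree_classifier'

# Two well-separated clusters, labeled 0 and 1.
x = Numo::DFloat[[0.0, 0.1], [0.2, 0.0], [0.1, 0.2], [3.0, 3.1], [3.2, 2.9], [2.9, 3.0]]
y = Numo::Int32[0, 0, 0, 1, 1, 1]

estimator = Rumale::Tree::DecisionTreeClassifier.new(criterion: 'gini', max_depth: 2, random_seed: 1)
estimator.fit(x, y)

p estimator.predict(x).to_a          # predicted label per sample
p estimator.predict_proba(x).to_a    # per-class probabilities stored at the reached leaf
p estimator.apply(x).to_a            # index of the leaf each sample reaches
p estimator.feature_importances.to_a # impurity-gain importances, normalized to sum to 1
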
data/lib/rumale/tree/decision_tree_regressor.rb (@@ -0,0 +1,113 @@)

# frozen_string_literal: true

require 'rumale/tree/base_decision_tree'
require 'rumale/base/regressor'

module Rumale
  module Tree
    # DecisionTreeRegressor is a class that implements a decision tree for regression.
    #
    # @example
    #   require 'rumale/tree/decision_tree_regressor'
    #
    #   estimator =
    #     Rumale::Tree::DecisionTreeRegressor.new(
    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    class DecisionTreeRegressor < BaseDecisionTree
      include ::Rumale::Base::Regressor
      include ::Rumale::Tree::ExtDecisionTreeRegressor

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the learned tree.
      # @return [Node]
      attr_reader :tree

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Return the values assigned to each leaf.
      # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
      attr_reader :leaf_values

      # Create a new regressor with the decision tree algorithm.
      #
      # @param criterion [String] The function used to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, the decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
      #   If nil is given, the number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
      #   If nil is given, the split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                     random_seed: nil)
        super
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
      # @return [DecisionTreeRegressor] The learned regressor itself.
      def fit(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_target_value_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        n_samples, n_features = x.shape
        @params[:max_features] = n_features if @params[:max_features].nil?
        @params[:max_features] = [@params[:max_features], n_features].min
        @n_leaves = 0
        @leaf_values = []
        @sub_rng = @rng.dup
        build_tree(x, y)
        eval_importance(n_samples, n_features)
        @leaf_values = Numo::DFloat.cast(@leaf_values)
        @leaf_values = @leaf_values.flatten.dup if @leaf_values.shape[1] == 1
        self
      end

      # Predict values for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        @leaf_values.shape[1].nil? ? @leaf_values[apply(x)].dup : @leaf_values[apply(x), true].dup
      end

      private

      def stop_growing?(y)
        y.to_a.uniq.size == 1
      end

      def put_leaf(node, y)
        node.probs = nil
        node.leaf = true
        node.leaf_id = @n_leaves
        @n_leaves += 1
        @leaf_values.push(y.mean(0))
        node
      end

      def best_split(f, y, impurity)
        find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
      end

      def impurity(y)
        node_impurity(@params[:criterion], y.to_a)
      end
    end
  end
end

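A hedged usage sketch for the regressor, again with illustrative toy data; note in put_leaf above that each leaf stores the mean of its training targets, which is what predict returns:

require 'numo/narray'
require 'rumale/tree/decision_tree_regressor'

# A noisy step function: y jumps from about 1 to about 5 at x = 2.
x = Numo::DFloat[[0.0], [0.5], [1.0], [3.0], [3.5], [4.0]]
y = Numo::DFloat[1.0, 1.1, 0.9, 5.0, 5.2, 4.9]

estimator = Rumale::Tree::DecisionTreeRegressor.new(criterion: 'mse', max_depth: 1, random_seed: 1)
estimator.fit(x, y)

# With a single split separating the two clusters, each leaf predicts
# the mean of its samples: roughly 1.0 on the left, 5.03 on the right.
p estimator.predict(Numo::DFloat[[0.2], [3.8]]).to_a
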
data/lib/rumale/tree/extra_tree_classifier.rb (@@ -0,0 +1,89 @@)

# frozen_string_literal: true

require 'rumale/tree/decision_tree_classifier'

module Rumale
  module Tree
    # ExtraTreeClassifier is a class that implements an extremely randomized tree for classification.
    #
    # @example
    #   require 'rumale/tree/extra_tree_classifier'
    #
    #   estimator =
    #     Rumale::Tree::ExtraTreeClassifier.new(
    #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
    class ExtraTreeClassifier < DecisionTreeClassifier
      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the learned tree.
      # @return [Node]
      attr_reader :tree

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Return the labels assigned to each leaf.
      # @return [Numo::Int32] (size: n_leaves)
      attr_reader :leaf_labels

      # Create a new classifier with the extremely randomized tree algorithm.
      #
      # @param criterion [String] The function used to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, the extra tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
      #   If nil is given, the number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
      #   If nil is given, the split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                     random_seed: nil)
        super
      end

      # Fit the model with given training data (inherited from DecisionTreeClassifier).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [ExtraTreeClassifier] The learned classifier itself.

      # Predict class labels for samples (inherited from DecisionTreeClassifier).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.

      # Predict probabilities for samples (inherited from DecisionTreeClassifier).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.

      private

      def best_split(features, y, whole_impurity)
        threshold = @sub_rng.rand(features.min..features.max)
        l_ids = features.le(threshold).where
        r_ids = features.gt(threshold).where
        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids])
        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids])
        gain = whole_impurity -
               l_impurity * l_ids.size.fdiv(y.size) -
               r_impurity * r_ids.size.fdiv(y.size)
        [l_impurity, r_impurity, threshold, gain]
      end
    end
  end
end

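The only behavioral change from DecisionTreeClassifier is the best_split override above: instead of scanning sorted feature values for the best cut, the extra tree draws a single threshold uniformly from the feature's observed range. A standalone sketch of that draw, with plain Ruby arrays standing in for Numo vectors:

rng = Random.new(1)
features = [0.2, 1.5, 3.1, 4.8]

# Draw one candidate threshold uniformly from [min, max] of the feature.
threshold = rng.rand(features.min..features.max)

left  = features.select { |v| v <= threshold }
right = features.reject { |v| v <= threshold }
p [threshold.round(3), left, right]
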
data/lib/rumale/tree/extra_tree_regressor.rb (@@ -0,0 +1,80 @@)

# frozen_string_literal: true

require 'rumale/tree/decision_tree_regressor'

module Rumale
  module Tree
    # ExtraTreeRegressor is a class that implements an extremely randomized tree for regression.
    #
    # @example
    #   require 'rumale/tree/extra_tree_regressor'
    #
    #   estimator =
    #     Rumale::Tree::ExtraTreeRegressor.new(
    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
    class ExtraTreeRegressor < DecisionTreeRegressor
      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the learned tree.
      # @return [Node]
      attr_reader :tree

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Return the values assigned to each leaf.
      # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
      attr_reader :leaf_values

      # Create a new regressor with the extremely randomized tree algorithm.
      #
      # @param criterion [String] The function used to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, the extra tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
      #   If nil is given, the number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
      #   If nil is given, the split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                     random_seed: nil)
        super
      end

      # Fit the model with given training data (inherited from DecisionTreeRegressor).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
      # @return [ExtraTreeRegressor] The learned regressor itself.

      # Predict values for samples (inherited from DecisionTreeRegressor).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.

      private

      def best_split(features, y, whole_impurity)
        threshold = @sub_rng.rand(features.min..features.max)
        l_ids = features.le(threshold).where
        r_ids = features.gt(threshold).where
        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
        gain = whole_impurity -
               l_impurity * l_ids.size.fdiv(y.shape[0]) -
               r_impurity * r_ids.size.fdiv(y.shape[0])
        [l_impurity, r_impurity, threshold, gain]
      end
    end
  end
end

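In both extra-tree variants, split quality is the impurity decrease weighted by child size: gain = impurity(parent) - impurity(left) * n_left/n - impurity(right) * n_right/n. A hand-worked sketch of that arithmetic, with variance standing in for the 'mse' node impurity (the actual impurity functions live in the C extension, so plain Ruby is used here):

variance = lambda do |values|
  mean = values.sum.fdiv(values.size)
  values.sum { |v| (v - mean)**2 }.fdiv(values.size)
end

y_all   = [1.0, 1.2, 5.0, 5.2]
y_left  = [1.0, 1.2] # samples with feature value <= threshold
y_right = [5.0, 5.2] # samples with feature value >  threshold

gain = variance.call(y_all) -
       variance.call(y_left)  * y_left.size.fdiv(y_all.size) -
       variance.call(y_right) * y_right.size.fdiv(y_all.size)
p gain # => 4.0: parent variance (4.01) minus the size-weighted child variances (0.01 each)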