RubyGems - rumale-ensemble - Versions diffs - 0.24.0 - Mend

rumale-ensemble 0.24.0

Files changed (19) hide show

checksums.yaml +7 -0
data/LICENSE.txt +27 -0
data/README.md +34 -0
data/lib/rumale/ensemble/ada_boost_classifier.rb +176 -0
data/lib/rumale/ensemble/ada_boost_regressor.rb +167 -0
data/lib/rumale/ensemble/extra_trees_classifier.rb +140 -0
data/lib/rumale/ensemble/extra_trees_regressor.rb +125 -0
data/lib/rumale/ensemble/gradient_boosting_classifier.rb +296 -0
data/lib/rumale/ensemble/gradient_boosting_regressor.rb +223 -0
data/lib/rumale/ensemble/random_forest_classifier.rb +184 -0
data/lib/rumale/ensemble/random_forest_regressor.rb +146 -0
data/lib/rumale/ensemble/stacking_classifier.rb +224 -0
data/lib/rumale/ensemble/stacking_regressor.rb +168 -0
data/lib/rumale/ensemble/value.rb +13 -0
data/lib/rumale/ensemble/version.rb +10 -0
data/lib/rumale/ensemble/voting_classifier.rb +129 -0
data/lib/rumale/ensemble/voting_regressor.rb +84 -0
data/lib/rumale/ensemble.rb +20 -0
metadata +152 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 71f67ae6338e6907a02b66affa8ad12b22254da82d6a1fdfea092844f8809a51
+  data.tar.gz: 7b301905c59c580ace8f17edc4dd2b526af267493f60f74c294652f6e137fc12
+SHA512:
+  metadata.gz: 65391ee173334b7b2bc41761fe4a66dd8bd0c1158c948187b9059b78b80c9343393e3a42d52e6906e54388e7e3ce86340eb479a3c443130bdf004b1954570853
+  data.tar.gz: 7f78362e3a06aacc18f1a71a0c0340a5322fd8d78a2acd74ac7e4a8b4bfcd9396b84cfa0dc2a01ad1f872ff057b6847b7cd6c06d3bbab45f0fc9087035715d11

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Copyright (c) 2022 Atsushi Tatsuma
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED Viewed

@@ -0,0 +1,34 @@
+# Rumale::Ensemble
+[![Gem Version](https://badge.fury.io/rb/rumale-ensemble.svg)](https://badge.fury.io/rb/rumale-ensemble)
+[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-ensemble/LICENSE.txt)
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+Rumale is a machine learning library in Ruby.
+Rumale::Ensemble provides ensemble learning algorithms,
+such as AdaBoost, Gradient Tree Boosting, and Random Forest,
+with Rumale interface.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'rumale-ensemble'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install rumale-ensemble
+## Documentation
+- [Rumale API Documentation - Ensemble](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+## License
+The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).

data/lib/rumale/ensemble/ada_boost_classifier.rb ADDED Viewed

@@ -0,0 +1,176 @@
+# frozen_string_literal: true
+require 'rumale/utils'
+require 'rumale/validation'
+require 'rumale/base/estimator'
+require 'rumale/base/classifier'
+require 'rumale/tree/decision_tree_classifier'
+require 'rumale/ensemble/value'
+module Rumale
+  module Ensemble
+    # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
+    # This class uses decision tree for a weak learner.
+    #
+    # @example
+    #   require 'rumale/ensemble/ada_boost_classifier'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::AdaBoostClassifier.new(
+    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Zhu, J., Rosset, S., Zou, H., and Hastie, T., "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
+    class AdaBoostClassifier < ::Rumale::Base::Estimator
+      include ::Rumale::Base::Classifier
+      # Return the set of estimators.
+      # @return [Array<DecisionTreeClassifier>]
+      attr_reader :estimators
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new classifier with AdaBoost.
+      #
+      # @param n_estimators [Integer] The number of decision trees for constructing AdaBoost classifier.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 50,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        super()
+        @params = {
+          n_estimators: n_estimators,
+          criterion: criterion,
+          max_depth: max_depth,
+          max_leaf_nodes: max_leaf_nodes,
+          min_samples_leaf: min_samples_leaf,
+          max_features: max_features,
+          random_seed: random_seed || srand
+        }
+        @rng = Random.new(@params[:random_seed])
+      end
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [AdaBoostClassifier] The learned classifier itself.
+      def fit(x, y) # rubocop:disable Metrics/AbcSize
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_label_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        ## Initialize some variables.
+        n_samples, n_features = x.shape
+        @estimators = []
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # A non-Integer max_features (e.g. nil) means "use all features"; the value is then kept within 1..n_features.
+        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+        n_classes = @classes.shape[0]
+        # Work on a copy so that fitting does not advance the generator exposed through #rng.
+        sub_rng = @rng.dup
+        ## Boosting.
+        classes_arr = @classes.to_a
+        # Label coding: 1 for the true class of a sample, -1 / (n_classes - 1) for every other class.
+        y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
+        n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
+        # Every sample starts with the same weight, 1 / n_samples.
+        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+        @params[:n_estimators].times do |_t|
+          # Fit classifier.
+          ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+          # Stop boosting when the drawn subset does not contain every class.
+          break if y[ids].to_a.uniq.size != n_classes
+          tree = ::Rumale::Tree::DecisionTreeClassifier.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+          )
+          tree.fit(x[ids, true], y[ids])
+          # Calculate estimator error.
+          # Probabilities are clipped away from zero so that their logarithm below stays finite.
+          proba = tree.predict_proba(x).clip(1.0e-15, nil)
+          pred = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
+          inds = pred.ne(y)
+          error = (observation_weights * inds).sum / observation_weights.sum
+          # Store model.
+          @estimators.push(tree)
+          @feature_importances += tree.feature_importances
+          break if error.zero?
+          # Update observation weights.
+          log_proba = Numo::NMath.log(proba)
+          observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(axis: 1))
+          observation_weights = observation_weights.clip(1.0e-15, nil)
+          sum_observation_weights = observation_weights.sum
+          break if sum_observation_weights.zero?
+          observation_weights /= sum_observation_weights
+        end
+        # Normalize the accumulated importances so they sum to one. When the sum is zero
+        # (no tree was stored, or the stored trees report no importance) the division would
+        # yield NaN, so the importances are left as zeros instead.
+        importance_sum = @feature_importances.sum
+        @feature_importances /= importance_sum unless importance_sum.zero?
+        self
+      end
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+      def decision_function(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_samples, = x.shape
+        n_classes = @classes.size
+        sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
+        @estimators.each do |tree|
+          log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
+          sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(axis: 1)].transpose)
+        end
+        sum_probs /= @estimators.size
+      end
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_samples, = x.shape
+        probs = decision_function(x)
+        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+      end
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_classes = @classes.size
+        probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
+        sum_probs = probs.sum(axis: 1)
+        probs /= Numo::DFloat[sum_probs].transpose
+        probs
+      end
+    end
+  end
+end

data/lib/rumale/ensemble/ada_boost_regressor.rb ADDED Viewed

@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+require 'rumale/utils'
+require 'rumale/validation'
+require 'rumale/base/estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/decision_tree_regressor'
+require 'rumale/ensemble/value'
+module Rumale
+  module Ensemble
+    # AdaBoostRegressor is a class that implements AdaBoost for regression.
+    # This class uses decision tree for a weak learner.
+    #
+    # @example
+    #   require 'rumale/ensemble/ada_boost_regressor'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::AdaBoostRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Shrestha, D. L., and Solomatine, D. P., "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
+    class AdaBoostRegressor < ::Rumale::Base::Estimator
+      include ::Rumale::Base::Regressor
+      # Return the set of estimators.
+      # @return [Array<DecisionTreeRegressor>]
+      attr_reader :estimators
+      # Return the weight for each weak learner.
+      # @return [Numo::DFloat] (size: n_estimators)
+      attr_reader :estimator_weights
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new regressor with AdaBoost.
+      #
+      # @param n_estimators [Integer] The number of decision trees for constructing AdaBoost regressor.
+      # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1].
+      # @param exponent [Float] The exponent for the weight of each weak learner.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
+                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        super()
+        @params = {
+          n_estimators: n_estimators,
+          threshold: threshold,
+          exponent: exponent,
+          criterion: criterion,
+          max_depth: max_depth,
+          max_leaf_nodes: max_leaf_nodes,
+          min_samples_leaf: min_samples_leaf,
+          max_features: max_features,
+          random_seed: random_seed || srand
+        }
+        @rng = Random.new(@params[:random_seed])
+      end
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+      # @return [AdaBoostRegressor] The learned regressor itself.
+      def fit(x, y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_target_value_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        unless y.ndim == 1
+          raise ArgumentError,
+                'AdaBoostRegressor supports only single-target variable regression; ' \
+                'the target value array is expected to be 1-D'
+        end
+        # Initialize some variables.
+        n_samples, n_features = x.shape
+        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+        @estimators = []
+        @estimator_weights = []
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        sub_rng = @rng.dup
+        # Construct forest.
+        @params[:n_estimators].times do |_t|
+          # Fit weak learner.
+          ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+          tree = ::Rumale::Tree::DecisionTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+          )
+          tree.fit(x[ids, true], y[ids])
+          pred = tree.predict(x)
+          # Calculate errors.
+          abs_err = ((pred - y) / y).abs
+          sum_target = abs_err.gt(@params[:threshold])
+          break if sum_target.count.zero?
+          err = observation_weights[sum_target].sum
+          break if err <= 0.0
+          # Calculate weight.
+          beta = err**@params[:exponent]
+          weight = Math.log(1.fdiv(beta))
+          # Store model.
+          @estimators.push(tree)
+          @estimator_weights.push(weight)
+          @feature_importances += weight * tree.feature_importances
+          # Update observation weights.
+          update = Numo::DFloat.ones(n_samples)
+          update_target = abs_err.le(@params[:threshold])
+          break if update_target.count.zero?
+          update[update_target] = beta
+          observation_weights *= update
+          observation_weights = observation_weights.clip(1.0e-15, nil)
+          sum_observation_weights = observation_weights.sum
+          break if sum_observation_weights.zero?
+          observation_weights /= sum_observation_weights
+        end
+        if @estimators.empty?
+          warn('Failed to converge, check hyper-parameters of AdaBoostRegressor.')
+          self
+        end
+        @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
+        @feature_importances /= @estimator_weights.sum
+        self
+      end
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
+      def predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_samples, = x.shape
+        predictions = Numo::DFloat.zeros(n_samples)
+        @estimators.size.times do |t|
+          predictions += @estimator_weights[t] * @estimators[t].predict(x)
+        end
+        sum_weight = @estimator_weights.sum
+        predictions / sum_weight
+      end
+    end
+  end
+end

data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED Viewed

@@ -0,0 +1,140 @@
+# frozen_string_literal: true
+require 'rumale/validation'
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/ensemble/random_forest_classifier'
+require 'rumale/ensemble/value'
+module Rumale
+  module Ensemble
+    # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   require 'rumale/ensemble/extra_trees_classifier'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesClassifier.new(
+    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesClassifier < RandomForestClassifier
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeClassifier>]
+      attr_reader :estimators
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new classifier with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers 'Math.sqrt(n_features)' features.
+      # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
+      #   If nil is given, the method does not execute in parallel.
+      #   If zero or less is given, it becomes equal to the number of processors.
+      #   This parameter is ignored if the Parallel gem is not loaded.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, n_jobs: nil, random_seed: nil)
+        super
+      end
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreesClassifier] The learned classifier itself.
+      def fit(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_label_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+        sub_rng = @rng.dup
+        # Construct trees.
+        rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+        @estimators = if enable_parallel?
+                        parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                      else
+                        Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                      end
+        @feature_importances =
+          if enable_parallel?
+            parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+          else
+            @estimators.sum(&:feature_importances)
+          end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        super
+      end
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        super
+      end
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        super
+      end
+      private
+      def plant_tree(rnd_seed)
+        ::Rumale::Tree::ExtraTreeClassifier.new(
+          criterion: @params[:criterion], max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: rnd_seed
+        )
+      end
+    end
+  end
+end

data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED Viewed

@@ -0,0 +1,125 @@
+# frozen_string_literal: true
+require 'rumale/validation'
+require 'rumale/tree/extra_tree_regressor'
+require 'rumale/ensemble/random_forest_regressor'
+require 'rumale/ensemble/value'
+module Rumale
+  module Ensemble
+    # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   require 'rumale/ensemble/extra_trees_regressor'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesRegressor < RandomForestRegressor
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeRegressor>]
+      attr_reader :estimators
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new regressor with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers 'Math.sqrt(n_features)' features.
+      # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
+      #   If nil is given, the methods do not execute in parallel.
+      #   If zero or less is given, it becomes equal to the number of processors.
+      #   This parameter is ignored if the Parallel gem is not loaded.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, n_jobs: nil, random_seed: nil)
+        super
+      end
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreesRegressor] The learned regressor itself.
+      def fit(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_target_value_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        sub_rng = @rng.dup
+        # Construct forest.
+        rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+        @estimators = if enable_parallel?
+                        parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                      else
+                        Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                      end
+        @feature_importances =
+          if enable_parallel?
+            parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+          else
+            @estimators.sum(&:feature_importances)
+          end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        super
+      end
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        super
+      end
+      private
+      def plant_tree(rnd_seed)
+        ::Rumale::Tree::ExtraTreeRegressor.new(
+          criterion: @params[:criterion], max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: rnd_seed
+        )
+      end
+    end
+  end
+end