rumale-ensemble 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 71f67ae6338e6907a02b66affa8ad12b22254da82d6a1fdfea092844f8809a51
+   data.tar.gz: 7b301905c59c580ace8f17edc4dd2b526af267493f60f74c294652f6e137fc12
+ SHA512:
+   metadata.gz: 65391ee173334b7b2bc41761fe4a66dd8bd0c1158c948187b9059b78b80c9343393e3a42d52e6906e54388e7e3ce86340eb479a3c443130bdf004b1954570853
+   data.tar.gz: 7f78362e3a06aacc18f1a71a0c0340a5322fd8d78a2acd74ac7e4a8b4bfcd9396b84cfa0dc2a01ad1f872ff057b6847b7cd6c06d3bbab45f0fc9087035715d11
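For reference, these digests can be checked against an unpacked copy of the gem with Ruby's standard Digest library. A minimal sketch (the file path is illustrative; run it in the directory containing the extracted archive):

```ruby
# Hedged sketch: verify the SHA256 digest listed above for data.tar.gz.
require 'digest'

expected = '7b301905c59c580ace8f17edc4dd2b526af267493f60f74c294652f6e137fc12'
actual = Digest::SHA256.file('data.tar.gz').hexdigest
puts(actual == expected ? 'data.tar.gz: checksum OK' : 'data.tar.gz: checksum mismatch')
```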
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
+ Copyright (c) 2022 Atsushi Tatsuma
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,34 @@
+ # Rumale::Ensemble
+
+ [![Gem Version](https://badge.fury.io/rb/rumale-ensemble.svg)](https://badge.fury.io/rb/rumale-ensemble)
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-ensemble/LICENSE.txt)
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+
+ Rumale is a machine learning library in Ruby.
+ Rumale::Ensemble provides ensemble learning algorithms,
+ such as AdaBoost, Gradient Tree Boosting, and Random Forest,
+ with the Rumale interface.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'rumale-ensemble'
+ ```
+
+ And then execute:
+
+     $ bundle install
+
+ Or install it yourself as:
+
+     $ gem install rumale-ensemble
+
+ ## Documentation
+
+ - [Rumale API Documentation - Ensemble](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+
+ ## License
+
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
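As a quick illustration of the interface the README describes, here is a minimal end-to-end sketch; the synthetic data and the use of the numo-narray gem are illustrative assumptions, not taken from the package:

```ruby
# Minimal usage sketch (assumes the numo-narray gem; the data is synthetic).
require 'numo/narray'
require 'rumale/ensemble/random_forest_classifier'

# 100 samples with 4 random features; label 1 when the feature sum exceeds 2.
x = Numo::DFloat.new(100, 4).rand
y = Numo::Int32.cast(x.sum(axis: 1).gt(2.0))

estimator = Rumale::Ensemble::RandomForestClassifier.new(n_estimators: 10, random_seed: 1)
estimator.fit(x, y)
puts estimator.predict(x).to_a.first(5).inspect # first five predicted labels
puts estimator.score(x, y)                      # mean accuracy on the training data
```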
data/lib/rumale/ensemble/ada_boost_classifier.rb ADDED
@@ -0,0 +1,176 @@
+ # frozen_string_literal: true
+
+ require 'rumale/utils'
+ require 'rumale/validation'
+ require 'rumale/base/estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/tree/decision_tree_classifier'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
+     # This class uses a decision tree as the weak learner.
+     #
+     # @example
+     #   require 'rumale/ensemble/ada_boost_classifier'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Zhu, J., Rosset, S., Zou, H., and Hastie, T., "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
+     class AdaBoostClassifier < ::Rumale::Base::Estimator
+       include ::Rumale::Base::Classifier
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 50,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         super()
+         @params = {
+           n_estimators: n_estimators,
+           criterion: criterion,
+           max_depth: max_depth,
+           max_leaf_nodes: max_leaf_nodes,
+           min_samples_leaf: min_samples_leaf,
+           max_features: max_features,
+           random_seed: random_seed || srand
+         }
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [AdaBoostClassifier] The learned classifier itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_label_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @estimators = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         n_classes = @classes.shape[0]
+         sub_rng = @rng.dup
+         # Boosting.
+         classes_arr = @classes.to_a
+         y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
+         n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @params[:n_estimators].times do |_t|
+           # Fit the classifier.
+           ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+           break if y[ids].to_a.uniq.size != n_classes
+
+           tree = ::Rumale::Tree::DecisionTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+           )
+           tree.fit(x[ids, true], y[ids])
+           # Calculate the estimator error.
+           proba = tree.predict_proba(x).clip(1.0e-15, nil)
+           pred = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
+           inds = pred.ne(y)
+           error = (observation_weights * inds).sum / observation_weights.sum
+           # Store the model.
+           @estimators.push(tree)
+           @feature_importances += tree.feature_importances
+           break if error.zero?
+
+           # Update observation weights.
+           log_proba = Numo::NMath.log(proba)
+           observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(axis: 1))
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+
+           observation_weights /= sum_observation_weights
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Calculate confidence scores for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The confidence score per sample.
+       def decision_function(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         n_classes = @classes.size
+         sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
+           sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(axis: 1)].transpose)
+         end
+         sum_probs /= @estimators.size
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) The predicted class label per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         probs = decision_function(x)
+         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+       end
+
+       # Predict probabilities for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The predicted probability of each class per sample.
+       def predict_proba(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_classes = @classes.size
+         probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
+         sum_probs = probs.sum(axis: 1)
+         probs /= Numo::DFloat[sum_probs].transpose
+         probs
+       end
+     end
+   end
+ end
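The core of the fit method above is the SAMME.R observation-weight update, w <- w * exp(-((K-1)/K) * sum_k(y_k * log p_k)), where y is the coded label vector and p the weak learner's predicted class probabilities. A standalone sketch of one update step with toy numbers (illustrative, not part of the package):

```ruby
# Standalone sketch of one SAMME.R weight update (toy numbers; assumes numo-narray).
require 'numo/narray'

n_classes = 3
# Coded labels: +1 for the true class, -1/(K-1) elsewhere (two samples shown).
y_codes = Numo::DFloat[[1.0, -0.5, -0.5], [-0.5, 1.0, -0.5]]
# Predicted class probabilities from a fitted weak learner, clipped away from zero.
proba = Numo::DFloat[[0.8, 0.1, 0.1], [0.2, 0.7, 0.1]].clip(1.0e-15, nil)

weights = Numo::DFloat.zeros(2) + 0.5
log_proba = Numo::NMath.log(proba)
weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(axis: 1))
weights /= weights.sum # renormalize, as in fit
puts weights.to_a.inspect # the more confidently correct sample ends up with the lower weight
```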
data/lib/rumale/ensemble/ada_boost_regressor.rb ADDED
@@ -0,0 +1,167 @@
+ # frozen_string_literal: true
+
+ require 'rumale/utils'
+ require 'rumale/validation'
+ require 'rumale/base/estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostRegressor is a class that implements AdaBoost for regression.
+     # This class uses a decision tree as the weak learner.
+     #
+     # @example
+     #   require 'rumale/ensemble/ada_boost_regressor'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Shrestha, D. L., and Solomatine, D. P., "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
+     class AdaBoostRegressor < ::Rumale::Base::Estimator
+       include ::Rumale::Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the weight for each weak learner.
+       # @return [Numo::DFloat] (size: n_estimators)
+       attr_reader :estimator_weights
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
+       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions, constrained to the interval [0, 1].
+       # @param exponent [Float] The exponent for the weight of each weak learner.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         super()
+         @params = {
+           n_estimators: n_estimators,
+           threshold: threshold,
+           exponent: exponent,
+           criterion: criterion,
+           max_depth: max_depth,
+           max_leaf_nodes: max_leaf_nodes,
+           min_samples_leaf: min_samples_leaf,
+           max_features: max_features,
+           random_seed: random_seed || srand
+         }
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+       # @return [AdaBoostRegressor] The learned regressor itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_target_value_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+         unless y.ndim == 1
+           raise ArgumentError,
+                 'AdaBoostRegressor supports only single-target variable regression; ' \
+                 'the target value array is expected to be 1-D'
+         end
+
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @estimators = []
+         @estimator_weights = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         sub_rng = @rng.dup
+         # Boosting.
+         @params[:n_estimators].times do |_t|
+           # Fit the weak learner.
+           ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+           tree = ::Rumale::Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+           )
+           tree.fit(x[ids, true], y[ids])
+           pred = tree.predict(x)
+           # Calculate errors.
+           abs_err = ((pred - y) / y).abs
+           sum_target = abs_err.gt(@params[:threshold])
+           break if sum_target.count.zero?
+
+           err = observation_weights[sum_target].sum
+           break if err <= 0.0
+
+           # Calculate the weight.
+           beta = err**@params[:exponent]
+           weight = Math.log(1.fdiv(beta))
+           # Store the model.
+           @estimators.push(tree)
+           @estimator_weights.push(weight)
+           @feature_importances += weight * tree.feature_importances
+           # Update observation weights.
+           update = Numo::DFloat.ones(n_samples)
+           update_target = abs_err.le(@params[:threshold])
+           break if update_target.count.zero?
+
+           update[update_target] = beta
+           observation_weights *= update
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+
+           observation_weights /= sum_observation_weights
+         end
+         if @estimators.empty?
+           warn('Failed to converge, check hyper-parameters of AdaBoostRegressor.')
+           return self
+         end
+         @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
+         @feature_importances /= @estimator_weights.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples]) The predicted value per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         predictions = Numo::DFloat.zeros(n_samples)
+         @estimators.size.times do |t|
+           predictions += @estimator_weights[t] * @estimators[t].predict(x)
+         end
+         sum_weight = @estimator_weights.sum
+         predictions / sum_weight
+       end
+     end
+   end
+ end
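In the AdaBoost.RT scheme implemented above, a tree's weight follows from the weighted rate of samples whose relative error exceeds `threshold`: beta = err**exponent and weight = ln(1/beta). A toy sketch of that computation (numbers are illustrative, not taken from the package):

```ruby
# Toy sketch of the AdaBoost.RT learner weight used in fit above (assumes numo-narray).
require 'numo/narray'

threshold = 0.2
exponent = 1.0
y    = Numo::DFloat[10.0, 20.0, 30.0, 40.0]
pred = Numo::DFloat[11.0, 15.0, 31.0, 41.0]
observation_weights = Numo::DFloat.zeros(4) + 0.25

abs_err = ((pred - y) / y).abs                        # relative errors: 0.1, 0.25, 0.033, 0.025
err = observation_weights[abs_err.gt(threshold)].sum  # weighted rate of "incorrect" samples
beta = err**exponent
weight = Math.log(1.fdiv(beta))
puts format('err=%.2f beta=%.2f weight=%.2f', err, beta, weight) # err=0.25 beta=0.25 weight=1.39
```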
data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED
@@ -0,0 +1,140 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/ensemble/random_forest_classifier'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # It differs in that it does not apply the bagging procedure and
+     # randomly selects the threshold for splitting the feature space.
+     #
+     # @example
+     #   require 'rumale/ensemble/extra_trees_classifier'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesClassifier < RandomForestClassifier
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing the extremely randomized trees.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers 'Math.sqrt(n_features)' features.
+       # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
+       #   If nil is given, the method does not execute in parallel.
+       #   If zero or less is given, it becomes equal to the number of processors.
+       #   This parameter is ignored if the Parallel gem is not loaded.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, n_jobs: nil, random_seed: nil)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreesClassifier] The learned classifier itself.
+       def fit(x, y)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_label_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         sub_rng = @rng.dup
+         # Construct trees.
+         rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+         @estimators = if enable_parallel?
+                         parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       else
+                         Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       end
+         @feature_importances =
+           if enable_parallel?
+             parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+           else
+             @estimators.sum(&:feature_importances)
+           end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) The predicted class label per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Predict probabilities for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The predicted probability of each class per sample.
+       def predict_proba(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be assigned to leaf nodes.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) The leaf index per sample.
+       def apply(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       private
+
+       def plant_tree(rnd_seed)
+         ::Rumale::Tree::ExtraTreeClassifier.new(
+           criterion: @params[:criterion], max_depth: @params[:max_depth],
+           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+           max_features: @params[:max_features], random_seed: rnd_seed
+         )
+       end
+     end
+   end
+ end
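Since ExtraTreesClassifier inherits the RandomForestClassifier interface shown above, training, probability prediction, and leaf assignment follow the same pattern. A minimal sketch with illustrative synthetic data (the numo-narray gem is assumed):

```ruby
# Minimal usage sketch (synthetic data is illustrative; assumes numo-narray).
require 'numo/narray'
require 'rumale/ensemble/extra_trees_classifier'

x = Numo::DFloat.new(60, 3).rand
y = Numo::Int32.cast(x[true, 0].gt(0.5)) # label derived from the first feature

estimator = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
estimator.fit(x, y)
probs = estimator.predict_proba(x)  # shape: [60, 2]
leaves = estimator.apply(x)         # shape: [60, 20], one leaf index per tree
puts probs.shape.inspect, leaves.shape.inspect
```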
data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED
@@ -0,0 +1,125 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/tree/extra_tree_regressor'
+ require 'rumale/ensemble/random_forest_regressor'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # It differs in that it does not apply the bagging procedure and
+     # randomly selects the threshold for splitting the feature space.
+     #
+     # @example
+     #   require 'rumale/ensemble/extra_trees_regressor'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesRegressor < RandomForestRegressor
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing the extremely randomized trees.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers 'Math.sqrt(n_features)' features.
+       # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
+       #   If nil is given, the methods do not execute in parallel.
+       #   If zero or less is given, it becomes equal to the number of processors.
+       #   This parameter is ignored if the Parallel gem is not loaded.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, n_jobs: nil, random_seed: nil)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreesRegressor] The learned regressor itself.
+       def fit(x, y)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_target_value_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         sub_rng = @rng.dup
+         # Construct forest.
+         rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+         @estimators = if enable_parallel?
+                         parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       else
+                         Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       end
+         @feature_importances =
+           if enable_parallel?
+             parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+           else
+             @estimators.sum(&:feature_importances)
+           end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) The predicted value per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be assigned to leaf nodes.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) The leaf index per sample.
+       def apply(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       private
+
+       def plant_tree(rnd_seed)
+         ::Rumale::Tree::ExtraTreeRegressor.new(
+           criterion: @params[:criterion], max_depth: @params[:max_depth],
+           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+           max_features: @params[:max_features], random_seed: rnd_seed
+         )
+       end
+     end
+   end
+ end
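The regressor counterpart follows the same pattern; a minimal sketch with illustrative synthetic data (the numo-narray gem is assumed, and `score` here is the coefficient of determination provided by the Rumale regressor interface):

```ruby
# Minimal usage sketch (synthetic data is illustrative; assumes numo-narray).
require 'numo/narray'
require 'rumale/ensemble/extra_trees_regressor'

x = Numo::DFloat.new(80, 2).rand
y = x[true, 0] * 2.0 + x[true, 1] # a simple linear target

estimator = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 20, random_seed: 1)
estimator.fit(x, y)
puts estimator.predict(x).shape.inspect # [80]
puts estimator.score(x, y)              # coefficient of determination
```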