rumale-ensemble 0.24.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 71f67ae6338e6907a02b66affa8ad12b22254da82d6a1fdfea092844f8809a51
+   data.tar.gz: 7b301905c59c580ace8f17edc4dd2b526af267493f60f74c294652f6e137fc12
+ SHA512:
+   metadata.gz: 65391ee173334b7b2bc41761fe4a66dd8bd0c1158c948187b9059b78b80c9343393e3a42d52e6906e54388e7e3ce86340eb479a3c443130bdf004b1954570853
+   data.tar.gz: 7f78362e3a06aacc18f1a71a0c0340a5322fd8d78a2acd74ac7e4a8b4bfcd9396b84cfa0dc2a01ad1f872ff057b6847b7cd6c06d3bbab45f0fc9087035715d11
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
+ Copyright (c) 2022 Atsushi Tatsuma
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,34 @@
+ # Rumale::Ensemble
+
+ [![Gem Version](https://badge.fury.io/rb/rumale-ensemble.svg)](https://badge.fury.io/rb/rumale-ensemble)
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-ensemble/LICENSE.txt)
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+
+ Rumale is a machine learning library in Ruby.
+ Rumale::Ensemble provides ensemble learning algorithms,
+ such as AdaBoost, Gradient Tree Boosting, and Random Forest,
+ with the Rumale interface.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'rumale-ensemble'
+ ```
+
+ And then execute:
+
+     $ bundle install
+
+ Or install it yourself as:
+
+     $ gem install rumale-ensemble
+
+ ## Documentation
+
+ - [Rumale API Documentation - Ensemble](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)
+
+ ## License
+
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
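Before the source files that follow, here is a minimal end-to-end sketch of the interface this README describes. It is illustrative only: it assumes `rumale-ensemble` and its Numo::NArray dependency are installed, and it uses the `RandomForestClassifier` from the API reference with made-up toy data.

```ruby
require 'numo/narray'
require 'rumale/ensemble/random_forest_classifier'

# Toy data: two well-separated clusters labeled 0 and 1.
x = Numo::DFloat[[0.0, 0.1], [0.2, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 5.0], [5.1, 5.2]]
y = Numo::Int32[0, 0, 0, 1, 1, 1]

estimator = Rumale::Ensemble::RandomForestClassifier.new(n_estimators: 10, random_seed: 1)
estimator.fit(x, y)
p estimator.predict(Numo::DFloat[[0.1, 0.1], [5.1, 5.1]]) # expected: labels 0 and 1
```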
data/lib/rumale/ensemble/ada_boost_classifier.rb ADDED
@@ -0,0 +1,176 @@
+ # frozen_string_literal: true
+
+ require 'rumale/utils'
+ require 'rumale/validation'
+ require 'rumale/base/estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/tree/decision_tree_classifier'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
+     # This class uses decision tree for a weak learner.
+     #
+     # @example
+     #   require 'rumale/ensemble/ada_boost_classifier'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Zhu, J., Rosset, S., Zou, H., and Hastie, T., "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
+     class AdaBoostClassifier < ::Rumale::Base::Estimator
+       include ::Rumale::Base::Classifier
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 50,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         super()
+         @params = {
+           n_estimators: n_estimators,
+           criterion: criterion,
+           max_depth: max_depth,
+           max_leaf_nodes: max_leaf_nodes,
+           min_samples_leaf: min_samples_leaf,
+           max_features: max_features,
+           random_seed: random_seed || srand
+         }
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [AdaBoostClassifier] The learned classifier itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_label_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @estimators = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         n_classes = @classes.shape[0]
+         sub_rng = @rng.dup
+         # Boosting: encode labels and give all observations equal weight.
+         classes_arr = @classes.to_a
+         y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
+         n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @params[:n_estimators].times do |_t|
+           # Fit classifier.
+           ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+           break if y[ids].to_a.uniq.size != n_classes
+
+           tree = ::Rumale::Tree::DecisionTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+           )
+           tree.fit(x[ids, true], y[ids])
+           # Calculate estimator error.
+           proba = tree.predict_proba(x).clip(1.0e-15, nil)
+           pred = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
+           inds = pred.ne(y)
+           error = (observation_weights * inds).sum / observation_weights.sum
+           # Store model.
+           @estimators.push(tree)
+           @feature_importances += tree.feature_importances
+           break if error.zero?
+
+           # Update observation weights.
+           log_proba = Numo::NMath.log(proba)
+           observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(axis: 1))
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+
+           observation_weights /= sum_observation_weights
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Calculate confidence scores for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+       def decision_function(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         n_classes = @classes.size
+         sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
+           sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(axis: 1)].transpose)
+         end
+         sum_probs /= @estimators.size
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         probs = decision_function(x)
+         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_classes = @classes.size
+         probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
+         sum_probs = probs.sum(axis: 1)
+         probs /= Numo::DFloat[sum_probs].transpose
+         probs
+       end
+     end
+   end
+ end
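As a reading aid, the SAMME.R quantities computed in `fit` and `decision_function` above can be transcribed into math directly from the code (here $K$ is `n_classes`, $T$ the number of fitted trees, $p^{(t)}(x)$ the clipped class-probability vector of tree $t$, and $y_i$ the coded label row of `y_codes`, which is $1$ for the true class and $-1/(K-1)$ elsewhere):

```latex
% Observation-weight update after fitting tree t (the Numo::NMath.exp line):
w_i \leftarrow w_i \exp\!\Big( -\tfrac{K-1}{K}\, y_i^{\top} \log p^{(t)}(x_i) \Big)

% Confidence score accumulated in decision_function, averaged over T trees:
h_k(x) = \frac{1}{T} \sum_{t=1}^{T} (K-1) \Big( \log p_k^{(t)}(x)
        - \frac{1}{K} \sum_{k'=1}^{K} \log p_{k'}^{(t)}(x) \Big)
```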
data/lib/rumale/ensemble/ada_boost_regressor.rb ADDED
@@ -0,0 +1,167 @@
+ # frozen_string_literal: true
+
+ require 'rumale/utils'
+ require 'rumale/validation'
+ require 'rumale/base/estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostRegressor is a class that implements AdaBoost for regression.
+     # This class uses decision tree for a weak learner.
+     #
+     # @example
+     #   require 'rumale/ensemble/ada_boost_regressor'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Shrestha, D. L., and Solomatine, D. P., "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
+     class AdaBoostRegressor < ::Rumale::Base::Estimator
+       include ::Rumale::Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the weight for each weak learner.
+       # @return [Numo::DFloat] (size: n_estimators)
+       attr_reader :estimator_weights
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
+       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. The value is constrained to the interval [0, 1].
+       # @param exponent [Float] The exponent for the weight of each weak learner.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         super()
+         @params = {
+           n_estimators: n_estimators,
+           threshold: threshold,
+           exponent: exponent,
+           criterion: criterion,
+           max_depth: max_depth,
+           max_leaf_nodes: max_leaf_nodes,
+           min_samples_leaf: min_samples_leaf,
+           max_features: max_features,
+           random_seed: random_seed || srand
+         }
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+       # @return [AdaBoostRegressor] The learned regressor itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_target_value_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+         unless y.ndim == 1
+           raise ArgumentError,
+                 'AdaBoostRegressor supports only single-target variable regression; ' \
+                 'the target value array is expected to be 1-D'
+         end
+
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @estimators = []
+         @estimator_weights = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         sub_rng = @rng.dup
+         # Boosting: fit one weak learner per round on weighted resamples.
+         @params[:n_estimators].times do |_t|
+           # Fit weak learner.
+           ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
+           tree = ::Rumale::Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+           )
+           tree.fit(x[ids, true], y[ids])
+           pred = tree.predict(x)
+           # Calculate relative errors.
+           abs_err = ((pred - y) / y).abs
+           incorrect = abs_err.gt(@params[:threshold])
+           break if incorrect.count.zero?
+
+           err = observation_weights[incorrect].sum
+           break if err <= 0.0
+
+           # Calculate the learner weight.
+           beta = err**@params[:exponent]
+           weight = Math.log(1.fdiv(beta))
+           # Store model.
+           @estimators.push(tree)
+           @estimator_weights.push(weight)
+           @feature_importances += weight * tree.feature_importances
+           # Update observation weights.
+           update = Numo::DFloat.ones(n_samples)
+           update_target = abs_err.le(@params[:threshold])
+           break if update_target.count.zero?
+
+           update[update_target] = beta
+           observation_weights *= update
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+
+           observation_weights /= sum_observation_weights
+         end
+         if @estimators.empty?
+           warn('Failed to converge, check hyper-parameters of AdaBoostRegressor.')
+           return self
+         end
+         @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
+         @feature_importances /= @estimator_weights.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         n_samples, = x.shape
+         predictions = Numo::DFloat.zeros(n_samples)
+         @estimators.size.times do |t|
+           predictions += @estimator_weights[t] * @estimators[t].predict(x)
+         end
+         sum_weight = @estimator_weights.sum
+         predictions / sum_weight
+       end
+     end
+   end
+ end
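Likewise, the AdaBoost.RT bookkeeping in `fit` above reduces to the following, transcribed from the code (with $\phi$ = `threshold`, $n$ = `exponent`, $\hat{y}^{(t)}$ the predictions of tree $t$, and $w_i$ the observation weights):

```latex
% Relative error per sample (abs_err) and weighted error rate (err):
e_i^{(t)} = \Big| \frac{\hat{y}_i^{(t)} - y_i}{y_i} \Big|, \qquad
\varepsilon_t = \sum_{i:\ e_i^{(t)} > \phi} w_i

% Learner weight (beta, weight) and observation-weight update, then normalize w:
\beta_t = \varepsilon_t^{\,n}, \qquad \alpha_t = \ln\frac{1}{\beta_t}, \qquad
w_i \leftarrow \begin{cases} w_i \beta_t & \text{if } e_i^{(t)} \le \phi \\ w_i & \text{otherwise} \end{cases}

% Final prediction is the alpha-weighted mean over learners (predict):
\hat{y}(x) = \frac{\sum_t \alpha_t\, \hat{y}^{(t)}(x)}{\sum_t \alpha_t}
```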
data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED
@@ -0,0 +1,140 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/ensemble/random_forest_classifier'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # It differs in that it does not apply the bagging procedure and
+     # selects the splitting threshold for the feature space at random.
+     #
+     # @example
+     #   require 'rumale/ensemble/extra_trees_classifier'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesClassifier < RandomForestClassifier
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing the extremely randomized trees.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers 'Math.sqrt(n_features)' features.
+       # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
+       #   If nil is given, the method does not execute in parallel.
+       #   If zero or less is given, it becomes equal to the number of processors.
+       #   This parameter is ignored if the Parallel gem is not loaded.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, n_jobs: nil, random_seed: nil)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreesClassifier] The learned classifier itself.
+       def fit(x, y)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_label_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         sub_rng = @rng.dup
+         # Construct trees.
+         rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+         @estimators = if enable_parallel?
+                         parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       else
+                         Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       end
+         @feature_importances =
+           if enable_parallel?
+             parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+           else
+             @estimators.sum(&:feature_importances)
+           end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples for which to obtain leaf indices.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index per sample.
+       def apply(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       private
+
+       def plant_tree(rnd_seed)
+         ::Rumale::Tree::ExtraTreeClassifier.new(
+           criterion: @params[:criterion], max_depth: @params[:max_depth],
+           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+           max_features: @params[:max_features], random_seed: rnd_seed
+         )
+       end
+     end
+   end
+ end
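A short sketch of the parallel-fitting path documented in the `n_jobs` parameter above. It assumes the optional `parallel` gem is installed; without it, `n_jobs` is silently ignored, as the doc comment notes. The data here is made up.

```ruby
require 'parallel' # optional dependency; enables n_jobs below
require 'numo/narray'
require 'rumale/ensemble/extra_trees_classifier'

x = Numo::DFloat[[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [5.2, 5.0]]
y = Numo::Int32[0, 0, 1, 1]

# n_jobs of zero or less becomes the number of processors.
estimator = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, n_jobs: -1, random_seed: 1)
estimator.fit(x, y)

p estimator.predict_proba(Numo::DFloat[[0.1, 0.1]]) # per-class probabilities
p estimator.apply(Numo::DFloat[[0.1, 0.1]])         # leaf index reached in each of the 20 trees
```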
data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED
@@ -0,0 +1,125 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/tree/extra_tree_regressor'
+ require 'rumale/ensemble/random_forest_regressor'
+ require 'rumale/ensemble/value'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # It differs in that it does not apply the bagging procedure and
+     # selects the splitting threshold for the feature space at random.
+     #
+     # @example
+     #   require 'rumale/ensemble/extra_trees_regressor'
+     #
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesRegressor < RandomForestRegressor
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing the extremely randomized trees.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers 'Math.sqrt(n_features)' features.
+       # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
+       #   If nil is given, the methods do not execute in parallel.
+       #   If zero or less is given, it becomes equal to the number of processors.
+       #   This parameter is ignored if the Parallel gem is not loaded.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, n_jobs: nil, random_seed: nil)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreesRegressor] The learned regressor itself.
+       def fit(x, y)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+         y = ::Rumale::Validation.check_convert_target_value_array(y)
+         ::Rumale::Validation.check_sample_size(x, y)
+
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         sub_rng = @rng.dup
+         # Construct forest.
+         rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
+         @estimators = if enable_parallel?
+                         parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       else
+                         Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
+                       end
+         @feature_importances =
+           if enable_parallel?
+             parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
+           else
+             @estimators.sum(&:feature_importances)
+           end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+       def predict(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples for which to obtain leaf indices.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index per sample.
+       def apply(x)
+         x = ::Rumale::Validation.check_convert_sample_array(x)
+
+         super
+       end
+
+       private
+
+       def plant_tree(rnd_seed)
+         ::Rumale::Tree::ExtraTreeRegressor.new(
+           criterion: @params[:criterion], max_depth: @params[:max_depth],
+           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+           max_features: @params[:max_features], random_seed: rnd_seed
+         )
+       end
+     end
+   end
+ end
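Finally, a small sketch of the regressor on made-up one-dimensional data, again illustrative only (it assumes the gem and its Numo::NArray dependency are installed):

```ruby
require 'numo/narray'
require 'rumale/ensemble/extra_trees_regressor'

# Noisy samples of roughly y = 2x.
x = Numo::DFloat[[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]]
y = Numo::DFloat[0.1, 1.9, 4.2, 5.8, 8.1, 9.9]

estimator = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 30, random_seed: 1)
estimator.fit(x, y)
p estimator.predict(Numo::DFloat[[2.5]]) # around 5.0 for this toy data
```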