rumale-ensemble 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +176 -0
- data/lib/rumale/ensemble/ada_boost_regressor.rb +167 -0
- data/lib/rumale/ensemble/extra_trees_classifier.rb +140 -0
- data/lib/rumale/ensemble/extra_trees_regressor.rb +125 -0
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +296 -0
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +223 -0
- data/lib/rumale/ensemble/random_forest_classifier.rb +184 -0
- data/lib/rumale/ensemble/random_forest_regressor.rb +146 -0
- data/lib/rumale/ensemble/stacking_classifier.rb +224 -0
- data/lib/rumale/ensemble/stacking_regressor.rb +168 -0
- data/lib/rumale/ensemble/value.rb +13 -0
- data/lib/rumale/ensemble/version.rb +10 -0
- data/lib/rumale/ensemble/voting_classifier.rb +129 -0
- data/lib/rumale/ensemble/voting_regressor.rb +84 -0
- data/lib/rumale/ensemble.rb +20 -0
- metadata +152 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 71f67ae6338e6907a02b66affa8ad12b22254da82d6a1fdfea092844f8809a51
  data.tar.gz: 7b301905c59c580ace8f17edc4dd2b526af267493f60f74c294652f6e137fc12
SHA512:
  metadata.gz: 65391ee173334b7b2bc41761fe4a66dd8bd0c1158c948187b9059b78b80c9343393e3a42d52e6906e54388e7e3ce86340eb479a3c443130bdf004b1954570853
  data.tar.gz: 7f78362e3a06aacc18f1a71a0c0340a5322fd8d78a2acd74ac7e4a8b4bfcd9396b84cfa0dc2a01ad1f872ff057b6847b7cd6c06d3bbab45f0fc9087035715d11
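The digests above cover the metadata.gz and data.tar.gz archives packed inside the gem. As a minimal sketch (not part of the package), a local archive's SHA256 digest can be computed with Ruby's standard library and compared against the registry values; the file names below are hypothetical local paths:

```ruby
require 'digest'

# Hypothetical paths to the archives extracted from the downloaded gem.
%w[metadata.gz data.tar.gz].each do |name|
  # Digest::SHA256.file streams the file, so large archives are fine.
  puts "#{name}: #{Digest::SHA256.file(name).hexdigest}"
end
```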
data/LICENSE.txt
ADDED
@@ -0,0 +1,27 @@
Copyright (c) 2022 Atsushi Tatsuma
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md
ADDED
@@ -0,0 +1,34 @@
# Rumale::Ensemble

[Gem Version](https://badge.fury.io/rb/rumale-ensemble)
[License](https://github.com/yoshoku/rumale/blob/main/rumale-ensemble/LICENSE.txt)
[Documentation](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)

Rumale is a machine learning library in Ruby.
Rumale::Ensemble provides ensemble learning algorithms,
such as AdaBoost, Gradient Tree Boosting, and Random Forest,
with the Rumale interface.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'rumale-ensemble'
```

And then execute:

    $ bundle install

Or install it yourself as:

    $ gem install rumale-ensemble

## Documentation

- [Rumale API Documentation - Ensemble](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble.html)

## License

The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
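For a quick post-install smoke test, a minimal sketch; it assumes the gem follows the usual convention of defining a `VERSION` constant in data/lib/rumale/ensemble/version.rb:

```ruby
require 'rumale/ensemble'

# Should print '0.24.0' if the constant follows the usual gem convention.
puts Rumale::Ensemble::VERSION
```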
data/lib/rumale/ensemble/ada_boost_classifier.rb
ADDED
@@ -0,0 +1,176 @@
# frozen_string_literal: true

require 'rumale/utils'
require 'rumale/validation'
require 'rumale/base/estimator'
require 'rumale/base/classifier'
require 'rumale/tree/decision_tree_classifier'
require 'rumale/ensemble/value'

module Rumale
  module Ensemble
    # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
    # This class uses decision tree for a weak learner.
    #
    # @example
    #   require 'rumale/ensemble/ada_boost_classifier'
    #
    #   estimator =
    #     Rumale::Ensemble::AdaBoostClassifier.new(
    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Zhu, J., Rosset, S., Zou, H., and Hastie, T., "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
    class AdaBoostClassifier < ::Rumale::Base::Estimator
      include ::Rumale::Base::Classifier

      # Return the set of estimators.
      # @return [Array<DecisionTreeClassifier>]
      attr_reader :estimators

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new classifier with AdaBoost.
      #
      # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(n_estimators: 50,
                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        super()
        @params = {
          n_estimators: n_estimators,
          criterion: criterion,
          max_depth: max_depth,
          max_leaf_nodes: max_leaf_nodes,
          min_samples_leaf: min_samples_leaf,
          max_features: max_features,
          random_seed: random_seed || srand
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [AdaBoostClassifier] The learned classifier itself.
      def fit(x, y) # rubocop:disable Metrics/AbcSize
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        # Initialize some variables.
        n_samples, n_features = x.shape
        @estimators = []
        @feature_importances = Numo::DFloat.zeros(n_features)
        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
        n_classes = @classes.shape[0]
        sub_rng = @rng.dup
        # Boosting.
        classes_arr = @classes.to_a
        y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
        n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
        @params[:n_estimators].times do |_t|
          # Fit classifier.
          ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
          break if y[ids].to_a.uniq.size != n_classes

          tree = ::Rumale::Tree::DecisionTreeClassifier.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
          )
          tree.fit(x[ids, true], y[ids])
          # Calculate estimator error.
          proba = tree.predict_proba(x).clip(1.0e-15, nil)
          pred = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
          inds = pred.ne(y)
          error = (observation_weights * inds).sum / observation_weights.sum
          # Store model.
          @estimators.push(tree)
          @feature_importances += tree.feature_importances
          break if error.zero?

          # Update observation weights.
          log_proba = Numo::NMath.log(proba)
          observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(axis: 1))
          observation_weights = observation_weights.clip(1.0e-15, nil)
          sum_observation_weights = observation_weights.sum
          break if sum_observation_weights.zero?

          observation_weights /= sum_observation_weights
        end
        @feature_importances /= @feature_importances.sum
        self
      end

      # Calculate confidence scores for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
      def decision_function(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        n_samples, = x.shape
        n_classes = @classes.size
        sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
        @estimators.each do |tree|
          log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
          sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(axis: 1)].transpose)
        end
        sum_probs /= @estimators.size
      end

      # Predict class labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        n_samples, = x.shape
        probs = decision_function(x)
        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
      end

      # Predict probability for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        n_classes = @classes.size
        probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
        sum_probs = probs.sum(axis: 1)
        probs /= Numo::DFloat[sum_probs].transpose
        probs
      end
    end
  end
end
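To make the example in the class comment concrete, a minimal self-contained sketch with synthetic data; the toy data and variable names here are illustrative, not part of the gem:

```ruby
require 'numo/narray'
require 'rumale/ensemble/ada_boost_classifier'

# Toy binary problem: the label depends only on the sign of the first feature.
x = Numo::DFloat.new(100, 2).rand - 0.5
y = Numo::Int32.cast(x[true, 0].gt(0))

estimator = Rumale::Ensemble::AdaBoostClassifier.new(n_estimators: 10, max_depth: 3, random_seed: 1)
estimator.fit(x, y)

p estimator.predict(x[0...5, true]).to_a        # predicted labels for five samples
p estimator.predict_proba(x[0...5, true]).shape # => [5, 2]
```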
data/lib/rumale/ensemble/ada_boost_regressor.rb
ADDED
@@ -0,0 +1,167 @@
# frozen_string_literal: true

require 'rumale/utils'
require 'rumale/validation'
require 'rumale/base/estimator'
require 'rumale/base/regressor'
require 'rumale/tree/decision_tree_regressor'
require 'rumale/ensemble/value'

module Rumale
  module Ensemble
    # AdaBoostRegressor is a class that implements AdaBoost for regression.
    # This class uses decision tree for a weak learner.
    #
    # @example
    #   require 'rumale/ensemble/ada_boost_regressor'
    #
    #   estimator =
    #     Rumale::Ensemble::AdaBoostRegressor.new(
    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Shrestha, D. L., and Solomatine, D. P., "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
    class AdaBoostRegressor < ::Rumale::Base::Estimator
      include ::Rumale::Base::Regressor

      # Return the set of estimators.
      # @return [Array<DecisionTreeRegressor>]
      attr_reader :estimators

      # Return the weight for each weak learner.
      # @return [Numo::DFloat] (size: n_estimators)
      attr_reader :estimator_weights

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new regressor with AdaBoost.
      #
      # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
      # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. It is constrained to [0, 1].
      # @param exponent [Float] The exponent for the weight of each weak learner.
      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        super()
        @params = {
          n_estimators: n_estimators,
          threshold: threshold,
          exponent: exponent,
          criterion: criterion,
          max_depth: max_depth,
          max_leaf_nodes: max_leaf_nodes,
          min_samples_leaf: min_samples_leaf,
          max_features: max_features,
          random_seed: random_seed || srand
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
      # @return [AdaBoostRegressor] The learned regressor itself.
      def fit(x, y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_target_value_array(y)
        ::Rumale::Validation.check_sample_size(x, y)
        unless y.ndim == 1
          raise ArgumentError,
                'AdaBoostRegressor supports only single-target variable regression; ' \
                'the target value array is expected to be 1-D'
        end

        # Initialize some variables.
        n_samples, n_features = x.shape
        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
        @estimators = []
        @estimator_weights = []
        @feature_importances = Numo::DFloat.zeros(n_features)
        sub_rng = @rng.dup
        # Construct forest.
        @params[:n_estimators].times do |_t|
          # Fit weak learner.
          ids = ::Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
          tree = ::Rumale::Tree::DecisionTreeRegressor.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
          )
          tree.fit(x[ids, true], y[ids])
          pred = tree.predict(x)
          # Calculate errors.
          abs_err = ((pred - y) / y).abs
          sum_target = abs_err.gt(@params[:threshold])
          break if sum_target.count.zero?

          err = observation_weights[sum_target].sum
          break if err <= 0.0

          # Calculate weight.
          beta = err**@params[:exponent]
          weight = Math.log(1.fdiv(beta))
          # Store model.
          @estimators.push(tree)
          @estimator_weights.push(weight)
          @feature_importances += weight * tree.feature_importances
          # Update observation weights.
          update = Numo::DFloat.ones(n_samples)
          update_target = abs_err.le(@params[:threshold])
          break if update_target.count.zero?

          update[update_target] = beta
          observation_weights *= update
          observation_weights = observation_weights.clip(1.0e-15, nil)
          sum_observation_weights = observation_weights.sum
          break if sum_observation_weights.zero?

          observation_weights /= sum_observation_weights
        end
        if @estimators.empty?
          warn('Failed to converge, check hyper-parameters of AdaBoostRegressor.')
          return self
        end
        @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
        @feature_importances /= @estimator_weights.sum
        self
      end

      # Predict values for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        n_samples, = x.shape
        predictions = Numo::DFloat.zeros(n_samples)
        @estimators.size.times do |t|
          predictions += @estimator_weights[t] * @estimators[t].predict(x)
        end
        sum_weight = @estimator_weights.sum
        predictions / sum_weight
      end
    end
  end
end
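A minimal usage sketch with synthetic data (all names illustrative). The AdaBoost.RT error measure in fit divides by y, so the toy targets stay away from zero; the threshold is lowered so the weak depth-3 trees leave some relative errors above it and boosting actually proceeds:

```ruby
require 'numo/narray'
require 'rumale/ensemble/ada_boost_regressor'

# Toy single-target problem; targets lie in [1, 3], safely away from zero.
x = Numo::DFloat.new(200, 1).rand
y = 2.0 * x[true, 0] + 1.0

estimator = Rumale::Ensemble::AdaBoostRegressor.new(
  n_estimators: 10, threshold: 0.01, max_depth: 3, random_seed: 1
)
estimator.fit(x, y)

p estimator.predict(x[0...5, true]).to_a # weighted average of the trees' predictions
```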
data/lib/rumale/ensemble/extra_trees_classifier.rb
ADDED
@@ -0,0 +1,140 @@
# frozen_string_literal: true

require 'rumale/validation'
require 'rumale/tree/extra_tree_classifier'
require 'rumale/ensemble/random_forest_classifier'
require 'rumale/ensemble/value'

module Rumale
  module Ensemble
    # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
    # The algorithm of extremely randomized trees is similar to random forest.
    # The features of the algorithm of extremely randomized trees are
    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
    #
    # @example
    #   require 'rumale/ensemble/extra_trees_classifier'
    #
    #   estimator =
    #     Rumale::Ensemble::ExtraTreesClassifier.new(
    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
    class ExtraTreesClassifier < RandomForestClassifier
      # Return the set of estimators.
      # @return [Array<ExtraTreeClassifier>]
      attr_reader :estimators

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new classifier with extremely randomized trees.
      #
      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, extra tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers 'Math.sqrt(n_features)' features.
      # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
      #   If nil is given, the method does not execute in parallel.
      #   If zero or less is given, it becomes equal to the number of processors.
      #   This parameter is ignored if the Parallel gem is not loaded.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(n_estimators: 10,
                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, n_jobs: nil, random_seed: nil)
        super
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [ExtraTreesClassifier] The learned classifier itself.
      def fit(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        # Initialize some variables.
        n_features = x.shape[1]
        @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
        sub_rng = @rng.dup
        # Construct trees.
        rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
        @estimators = if enable_parallel?
                        parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                      else
                        Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                      end
        @feature_importances =
          if enable_parallel?
            parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
          else
            @estimators.sum(&:feature_importances)
          end
        @feature_importances /= @feature_importances.sum
        self
      end

      # Predict class labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        super
      end

      # Predict probability for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        super
      end

      # Return the index of the leaf that each sample reached.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
      def apply(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        super
      end

      private

      def plant_tree(rnd_seed)
        ::Rumale::Tree::ExtraTreeClassifier.new(
          criterion: @params[:criterion], max_depth: @params[:max_depth],
          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
          max_features: @params[:max_features], random_seed: rnd_seed
        )
      end
    end
  end
end
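A minimal sketch exercising the class on synthetic data, including the feature_importances and apply accessors defined above; the toy data and names are illustrative:

```ruby
require 'numo/narray'
require 'rumale/ensemble/extra_trees_classifier'

# Only the first of three features carries the label signal.
x = Numo::DFloat.new(100, 3).rand
y = Numo::Int32.cast(x[true, 0].gt(0.5))

estimator = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
estimator.fit(x, y)

p estimator.feature_importances.to_a    # the first entry should dominate
p estimator.apply(x[0...3, true]).shape # => [3, 20] leaf index per tree
```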
data/lib/rumale/ensemble/extra_trees_regressor.rb
ADDED
@@ -0,0 +1,125 @@
# frozen_string_literal: true

require 'rumale/validation'
require 'rumale/tree/extra_tree_regressor'
require 'rumale/ensemble/random_forest_regressor'
require 'rumale/ensemble/value'

module Rumale
  module Ensemble
    # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
    # The algorithm of extremely randomized trees is similar to random forest.
    # The features of the algorithm of extremely randomized trees are
    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
    #
    # @example
    #   require 'rumale/ensemble/extra_trees_regressor'
    #
    #   estimator =
    #     Rumale::Ensemble::ExtraTreesRegressor.new(
    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
    class ExtraTreesRegressor < RandomForestRegressor
      # Return the set of estimators.
      # @return [Array<ExtraTreeRegressor>]
      attr_reader :estimators

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new regressor with extremely randomized trees.
      #
      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, extra tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers 'Math.sqrt(n_features)' features.
      # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
      #   If nil is given, the methods do not execute in parallel.
      #   If zero or less is given, it becomes equal to the number of processors.
      #   This parameter is ignored if the Parallel gem is not loaded.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding the splitting point.
      def initialize(n_estimators: 10,
                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, n_jobs: nil, random_seed: nil)
        super
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
      # @return [ExtraTreesRegressor] The learned regressor itself.
      def fit(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_target_value_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        # Initialize some variables.
        n_features = x.shape[1]
        @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        sub_rng = @rng.dup
        # Construct forest.
        rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE) }
        @estimators = if enable_parallel?
                        parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                      else
                        Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                      end
        @feature_importances =
          if enable_parallel?
            parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
          else
            @estimators.sum(&:feature_importances)
          end
        @feature_importances /= @feature_importances.sum
        self
      end

      # Predict values for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        super
      end

      # Return the index of the leaf that each sample reached.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
      def apply(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        super
      end

      private

      def plant_tree(rnd_seed)
        ::Rumale::Tree::ExtraTreeRegressor.new(
          criterion: @params[:criterion], max_depth: @params[:max_depth],
          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
          max_features: @params[:max_features], random_seed: rnd_seed
        )
      end
    end
  end
end
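A minimal usage sketch for the regressor, mirroring the classifier example above; the toy function and names are illustrative:

```ruby
require 'numo/narray'
require 'rumale/ensemble/extra_trees_regressor'

# Smooth single-target function of two features.
x = Numo::DFloat.new(150, 2).rand
y = Numo::NMath.sin(2.0 * x[true, 0]) + 0.1 * x[true, 1]

estimator = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 20, random_seed: 1)
estimator.fit(x, y)

p estimator.predict(x[0...5, true]).to_a # averaged predictions of the 20 extra trees
```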