rumale-ensemble 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
+# frozen_string_literal: true
+
+require 'rumale/validation'
+require 'rumale/base/estimator'
+require 'rumale/base/classifier'
+require 'rumale/tree/gradient_tree_regressor'
+require 'rumale/ensemble/value'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
+    # The class uses the negative binomial log-likelihood as the loss function.
+    # For multiclass classification problems, it uses the one-vs-the-rest strategy.
+    #
+    # @example
+    #   require 'rumale/ensemble/gradient_boosting_classifier'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingClassifier.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingClassifier < ::Rumale::Base::Estimator # rubocop:disable Metrics/ClassLength
+      include ::Rumale::Base::Classifier
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the number of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing the classifier.
+      # @param learning_rate [Float] The boosting learning rate.
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param subsample [Float] The subsampling ratio of the training samples.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, the decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+      #   If nil is given, the number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+      #   If nil is given, the split process considers all features.
+      # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
+      #   If nil is given, the methods do not execute in parallel.
+      #   If zero or less is given, it becomes equal to the number of processors.
+      #   This parameter is ignored if the Parallel gem is not loaded.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, n_jobs: nil, random_seed: nil)
+        super()
+        @params = {
+          n_estimators: n_estimators,
+          learning_rate: learning_rate,
+          reg_lambda: reg_lambda,
+          subsample: subsample,
+          max_depth: max_depth,
+          max_leaf_nodes: max_leaf_nodes,
+          min_samples_leaf: min_samples_leaf,
+          max_features: max_features,
+          n_jobs: n_jobs,
+          random_seed: random_seed || srand
+        }
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [GradientBoostingClassifier] The learned classifier itself.
+      def fit(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_label_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+
+        # initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32[*y.to_a.uniq.sort]
+        n_classes = @classes.size
+        # train estimator.
+        if n_classes > 2
+          @base_predictions = multiclass_base_predictions(y)
+          @estimators = multiclass_estimators(x, y)
+        else
+          negative_label = y.to_a.uniq.min
+          bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
+          y_mean = bin_y.mean
+          @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
+          @estimators = partial_fit(x, bin_y, @base_predictions)
+        end
+        # calculate feature importances.
+        @feature_importances = if n_classes > 2
+                                 multiclass_feature_importances
+                               else
+                                 @estimators.sum(&:feature_importances)
+                               end
+        self
+      end
+
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+      def decision_function(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+
+        n_classes = @classes.size
+        if n_classes > 2
+          multiclass_scores(x)
+        else
+          @estimators.sum { |tree| tree.predict(x) } + @base_predictions
+        end
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+
+        n_samples = x.shape[0]
+        probs = predict_proba(x)
+        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+
+        proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
+
+        return (proba.transpose / proba.sum(axis: 1)).transpose.dup if @classes.size > 2
+
+        n_samples, = x.shape
+        probs = Numo::DFloat.zeros(n_samples, 2)
+        probs[true, 1] = proba
+        probs[true, 0] = 1.0 - proba
+        probs
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
+      def apply(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+
+        n_classes = @classes.size
+        leaf_ids = if n_classes > 2
+                     Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose.dup
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        sub_rng = @rng.dup
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: sub_rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(y_sub, y_pred_sub)
+          tree = plant_tree(sub_rng)
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   # y_true in {-1, 1}
+      #   Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        # y in {-1, 1}
+        -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
+      end
+
+      def hessian(y_true, y_pred)
+        abs_response = gradient(y_true, y_pred).abs
+        abs_response * (2.0 - abs_response)
+      end
+
+      def plant_tree(sub_rng)
+        ::Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+        )
+      end
+
+      def multiclass_base_predictions(y)
+        n_classes = @classes.size
+        b = if enable_parallel?
+              parallel_map(n_classes) do |n|
+                bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+                y_mean = bin_y.mean
+                0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
+              end
+            else
+              Array.new(n_classes) do |n|
+                bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+                y_mean = bin_y.mean
+                0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
+              end
+            end
+        Numo::DFloat.asarray(b)
+      end
+
+      def multiclass_estimators(x, y)
+        n_classes = @classes.size
+        if enable_parallel?
+          parallel_map(n_classes) do |n|
+            bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+            partial_fit(x, bin_y, @base_predictions[n])
+          end
+        else
+          Array.new(n_classes) do |n|
+            bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+            partial_fit(x, bin_y, @base_predictions[n])
+          end
+        end
+      end
+
+      def multiclass_feature_importances
+        n_classes = @classes.size
+        if enable_parallel?
+          parallel_map(n_classes) { |n| @estimators[n].sum(&:feature_importances) }.sum
+        else
+          Array.new(n_classes) { |n| @estimators[n].sum(&:feature_importances) }.sum
+        end
+      end
+
+      def multiclass_scores(x)
+        n_classes = @classes.size
+        s = if enable_parallel?
+              parallel_map(n_classes) do |n|
+                @estimators[n].sum { |tree| tree.predict(x) }
+              end
+            else
+              Array.new(n_classes) do |n|
+                @estimators[n].sum { |tree| tree.predict(x) }
+              end
+            end
+        Numo::DFloat.asarray(s).transpose + @base_predictions
+      end
+    end
+  end
+end
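
The gradient and hessian methods of GradientBoostingClassifier correspond to the negative binomial log-likelihood written in the commented-out loss above. A sketch of that correspondence, with labels y in {-1, +1} and raw score f (standard results restated here, not text from the package):

$$L(y, f) = \log\bigl(1 + e^{-2yf}\bigr), \qquad g = \frac{\partial L}{\partial f} = \frac{-2y}{1 + e^{2yf}}, \qquad h = \frac{\partial^2 L}{\partial f^2} = |g|\,(2 - |g|)$$

The base prediction 0.5 * log((1 + y_mean) / (1 - y_mean)) computed in fit and multiclass_base_predictions is the constant score that minimizes this loss, where y_mean is the mean of the {-1, +1}-encoded labels.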
@@ -0,0 +1,223 @@
+# frozen_string_literal: true
+
+require 'rumale/validation'
+require 'rumale/base/estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/gradient_tree_regressor'
+require 'rumale/ensemble/value'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
+    # The class uses the L2 loss as the loss function.
+    #
+    # @example
+    #   require 'rumale/ensemble/gradient_boosting_regressor'
+    #
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingRegressor.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingRegressor < ::Rumale::Base::Estimator
+      include ::Rumale::Base::Regressor
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the number of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing the regressor.
+      # @param learning_rate [Float] The boosting learning rate.
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param subsample [Float] The subsampling ratio of the training samples.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, the decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+      #   If nil is given, the number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+      #   If nil is given, the split process considers all features.
+      # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
+      #   If nil is given, the methods do not execute in parallel.
+      #   If zero or less is given, it becomes equal to the number of processors.
+      #   This parameter is ignored if the Parallel gem is not loaded.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, n_jobs: nil, random_seed: nil)
+        super()
+        @params = {
+          n_estimators: n_estimators,
+          learning_rate: learning_rate,
+          reg_lambda: reg_lambda,
+          subsample: subsample,
+          max_depth: max_depth,
+          max_leaf_nodes: max_leaf_nodes,
+          min_samples_leaf: min_samples_leaf,
+          max_features: max_features,
+          n_jobs: n_jobs,
+          random_seed: random_seed || srand
+        }
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+      # @return [GradientBoostingRegressor] The learned regressor itself.
+      def fit(x, y)
+        # initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        # train regressor.
+        @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
+        @estimators = if n_outputs > 1
+                        multivar_estimators(x, y)
+                      else
+                        partial_fit(x, y, @base_predictions)
+                      end
+        # calculate feature importances.
+        @feature_importances = if n_outputs > 1
+                                 multivar_feature_importances
+                               else
+                                 @estimators.sum(&:feature_importances)
+                               end
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples]) Predicted values per sample.
+      def predict(x)
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        if n_outputs > 1
+          multivar_predict(x)
+        elsif enable_parallel?
+          parallel_map(@params[:n_estimators]) { |n| @estimators[n].predict(x) }.sum + @base_predictions
+        else
+          @estimators.sum { |tree| tree.predict(x) } + @base_predictions
+        end
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        leaf_ids = if n_outputs > 1
+                     Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose.dup
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        sub_rng = @rng.dup
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: sub_rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(n_sub_samples)
+          tree = plant_tree(sub_rng)
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   ((y_true - y_pred)**2).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        y_pred - y_true
+      end
+
+      def hessian(n_samples)
+        Numo::DFloat.ones(n_samples)
+      end
+
+      def plant_tree(sub_rng)
+        ::Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)
+        )
+      end
+
+      def multivar_estimators(x, y)
+        n_outputs = y.shape[1]
+        if enable_parallel?
+          parallel_map(n_outputs) { |n| partial_fit(x, y[true, n], @base_predictions[n]) }
+        else
+          Array.new(n_outputs) { |n| partial_fit(x, y[true, n], @base_predictions[n]) }
+        end
+      end
+
+      def multivar_feature_importances
+        n_outputs = @estimators.size
+        if enable_parallel?
+          parallel_map(n_outputs) { |n| @estimators[n].sum(&:feature_importances) }.sum
+        else
+          Array.new(n_outputs) { |n| @estimators[n].sum(&:feature_importances) }.sum
+        end
+      end
+
+      def multivar_predict(x)
+        n_outputs = @estimators.size
+        pred = if enable_parallel?
+                 parallel_map(n_outputs) do |n|
+                   @estimators[n].sum { |tree| tree.predict(x) }
+                 end
+               else
+                 Array.new(n_outputs) do |n|
+                   @estimators[n].sum { |tree| tree.predict(x) }
+                 end
+               end
+        Numo::DFloat.asarray(pred).transpose + @base_predictions
+      end
+    end
+  end
+end
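
As the fit, predict, and multivar_* methods above show, GradientBoostingRegressor also accepts a two-dimensional target array and fits one boosted ensemble per output column. A minimal usage sketch of that path; the toy data, variable names, and hyperparameter values below are illustrative and not taken from the package:

  require 'numo/narray'
  require 'rumale/ensemble/gradient_boosting_regressor'

  # Toy data: 100 samples, 3 features (illustrative only).
  x = Numo::DFloat.new(100, 3).rand
  y_single = x[true, 0] * 2.0 - x[true, 1]   # single target, shape [100]
  y_multi = Numo::DFloat.zeros(100, 2)        # two targets, shape [100, 2]
  y_multi[true, 0] = y_single
  y_multi[true, 1] = x[true, 2]**2

  reg = Rumale::Ensemble::GradientBoostingRegressor.new(n_estimators: 50, learning_rate: 0.1, random_seed: 1)
  reg.fit(x, y_single)
  p reg.predict(x).shape   # => [100]

  reg.fit(x, y_multi)      # multi-output path via multivar_estimators
  p reg.predict(x).shape   # => [100, 2]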