rumale 0.9.1 → 0.9.2
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/ext/rumale/rumale.c +74 -0
- data/lib/rumale.rb +4 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +1 -1
- data/lib/rumale/ensemble/ada_boost_regressor.rb +1 -1
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +278 -0
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +230 -0
- data/lib/rumale/preprocessing/bin_discretizer.rb +108 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +1 -1
- data/lib/rumale/tree/base_decision_tree.rb +4 -5
- data/lib/rumale/tree/extra_tree_classifier.rb +2 -2
- data/lib/rumale/tree/extra_tree_regressor.rb +2 -2
- data/lib/rumale/tree/gradient_tree_regressor.rb +228 -0
- data/lib/rumale/tree/node.rb +1 -1
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d73f16bcd1d149babe18c1db66d3f72bb9a1206
+  data.tar.gz: 247fd7d548563ef27622c293073236468f634b7d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6a4a92d08ee0a8295d96a930a46fb67a9299a9e0beb717d52186347fef3b70727e35a2375e6c50f5da37ab699132fe0d3c3beeeb0a9730a158e3a5864f6b8364
+  data.tar.gz: a614c5002c750f9091a0b7b80b678115ea6b65a1a7d0de621431ee942f8f1678d36c64a271cdb1cc0c4a68c49d20bacfe934d844381ad78361c09e762e02e872
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
+# 0.9.2
+- Add class for Gradient tree boosting classifier.
+- Add class for Gradient tree boosting regressor.
+- Add class for discretizing feature values.
+- Refactor extra-trees estimators.
+- Refactor decision tree base class.
+- Fix some typos on document ([#6](https://github.com/yoshoku/rumale/pull/6)).
+
 # 0.9.1
 - Add class for Extra-Trees classifier.
 - Add class for Extra-Trees regressor.
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
+Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c
CHANGED
@@ -334,6 +334,72 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   return opt_params;
 }
 
+/**
+ * @!visibility private
+ * Find for split point with maximum information gain.
+ *
+ * @overload find_split_params(sorted_features, sorted_gradient, sorted_hessian, sum_gradient, sum_hessian) -> Array<Float>
+ *
+ * @param sorted_features [Array<Float>] (size: n_samples) The feature values sorted in ascending order.
+ * @param sorted_targets [Array<Float>] (size: n_samples) The target values sorted according to feature values.
+ * @param sorted_gradient [Array<Float>] (size: n_samples) The gradient values of loss function sorted according to feature values.
+ * @param sorted_hessian [Array<Float>] (size: n_samples) The hessian values of loss function sorted according to feature values.
+ * @param sum_gradient [Float] The sum of gradient values.
+ * @param sum_hessian [Float] The sum of hessian values.
+ * @param reg_lambda [Float] The L2 regularization term on weight.
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
+ */
+static VALUE
+find_split_params_grad_reg
+(VALUE self, VALUE sorted_f, VALUE sorted_g, VALUE sorted_h, VALUE sum_g, VALUE sum_h, VALUE reg_l)
+{
+  const long n_elements = RARRAY_LEN(sorted_f);
+  const double s_grad = NUM2DBL(sum_g);
+  const double s_hess = NUM2DBL(sum_h);
+  const double reg_lambda = NUM2DBL(reg_l);
+  long curr_pos = 0;
+  long next_pos = 0;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
+  double l_grad = 0.0;
+  double l_hess = 0.0;
+  double r_grad;
+  double r_hess;
+  double gain;
+  VALUE opt_params = rb_ary_new2(2);
+
+  /* Initialize optimal parameters. */
+  rb_ary_store(opt_params, 0, rb_ary_entry(sorted_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 1, DBL2NUM(0)); /* gain */
+
+  /* Find optimal parameters. */
+  while (curr_pos < n_elements && curr_el != last_el) {
+    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    while (next_pos < n_elements && next_el == curr_el) {
+      l_grad += NUM2DBL(rb_ary_entry(sorted_g, next_pos));
+      l_hess += NUM2DBL(rb_ary_entry(sorted_h, next_pos));
+      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+    }
+    /* Calculate gain of new split. */
+    r_grad = s_grad - l_grad;
+    r_hess = s_hess - l_hess;
+    gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
+           (r_grad * r_grad) / (r_hess + reg_lambda) -
+           (s_grad * s_grad) / (s_hess + reg_lambda);
+    /* Update optimal parameters. */
+    if (gain > NUM2DBL(rb_ary_entry(opt_params, 1))) {
+      rb_ary_store(opt_params, 0, DBL2NUM(0.5 * (curr_el + next_el)));
+      rb_ary_store(opt_params, 1, DBL2NUM(gain));
+    }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
+  }
+
+  return opt_params;
+}
+
 /**
  * @!visibility private
  * Calculate impurity based on criterion.
@@ -406,9 +472,17 @@ void Init_rumale(void)
   * This module is used internally.
   */
  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
+  /**
+   * Document-module: Rumale::Tree::ExtGradientTreeRegressor
+   * @!visibility private
+   * The mixin module consisting of extension method for GradientTreeRegressor class.
+   * This module is used internally.
+   */
+  VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
 
  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+ rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 6);
 rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
 rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }
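The gain accumulated in the loop of find_split_params_grad_reg above corresponds to the standard second-order split criterion of gradient tree boosting (cf. the XGBoost paper cited in the new Ruby classes). As a sketch in LaTeX, where G_L, H_L (resp. G_R, H_R) denote the sums of gradients and hessians of the samples falling left (resp. right) of a candidate threshold and \lambda is reg_lambda (this notation is introduced here, not taken from the source):

\[ \mathrm{gain} = \frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda} \]

The reported threshold is the midpoint 0.5 * (curr_el + next_el) between two consecutive distinct feature values, and the second hunk registers the function on the new Rumale::Tree::ExtGradientTreeRegressor mixin with arity 6.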
data/lib/rumale.rb
CHANGED
@@ -47,8 +47,11 @@ require 'rumale/tree/decision_tree_classifier'
 require 'rumale/tree/decision_tree_regressor'
 require 'rumale/tree/extra_tree_classifier'
 require 'rumale/tree/extra_tree_regressor'
+require 'rumale/tree/gradient_tree_regressor'
 require 'rumale/ensemble/ada_boost_classifier'
 require 'rumale/ensemble/ada_boost_regressor'
+require 'rumale/ensemble/gradient_boosting_classifier'
+require 'rumale/ensemble/gradient_boosting_regressor'
 require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
 require 'rumale/ensemble/extra_trees_classifier'
@@ -61,6 +64,7 @@ require 'rumale/preprocessing/l2_normalizer'
 require 'rumale/preprocessing/min_max_scaler'
 require 'rumale/preprocessing/max_abs_scaler'
 require 'rumale/preprocessing/standard_scaler'
+require 'rumale/preprocessing/bin_discretizer'
 require 'rumale/preprocessing/label_encoder'
 require 'rumale/preprocessing/one_hot_encoder'
 require 'rumale/model_selection/k_fold'
data/lib/rumale/ensemble/ada_boost_classifier.rb
CHANGED
@@ -42,7 +42,7 @@ module Rumale
 
       # Create a new classifier with AdaBoost.
       #
-      # @param n_estimators [Integer] The numeber of decision trees for contructing
+      # @param n_estimators [Integer] The numeber of decision trees for contructing AdaBoost classifier.
       # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
       # @param max_depth [Integer] The maximum depth of the tree.
       #   If nil is given, decision tree grows without concern for depth.
data/lib/rumale/ensemble/ada_boost_regressor.rb
CHANGED
@@ -42,7 +42,7 @@ module Rumale
 
       # Create a new regressor with random forest.
       #
-      # @param n_estimators [Integer] The numeber of decision trees for contructing
+      # @param n_estimators [Integer] The numeber of decision trees for contructing AdaBoost regressor.
       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1]
       # @param exponent [Float] The exponent for the weight of each weak learner.
       # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
data/lib/rumale/ensemble/gradient_boosting_classifier.rb
ADDED
@@ -0,0 +1,278 @@
+# frozen_string_literal: true
+
+require 'rumale/values'
+require 'rumale/base/base_estimator'
+require 'rumale/base/classifier'
+require 'rumale/tree/gradient_tree_regressor'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
+    # The class use negative binomial log-likelihood for the loss function.
+    # For multiclass classification problem, it uses one-vs-the-rest strategy.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingClassifier.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingClassifier
+      include Base::BaseEstimator
+      include Base::Classifier
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing classifier.
+      # @param learning_rate [Float] The boosting learining rate
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
+        check_params_positive(n_estimators: n_estimators,
+                              learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:learning_rate] = learning_rate
+        @params[:reg_lambda] = reg_lambda
+        @params[:subsample] = subsample
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @classes = nil
+        @base_predictions = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [GradientBoostingClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+
+        # train estimator.
+        @classes = Numo::Int32[*y.to_a.uniq.sort]
+        n_classes = @classes.size
+        if n_classes > 2
+          @base_predictions = Numo::DFloat.zeros(n_classes)
+          @estimators = Array.new(n_classes) do |n|
+            bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+            y_mean = bin_y.mean
+            @base_predictions[n] = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
+            partial_fit(x, bin_y, @base_predictions[n])
+          end
+        else
+          negative_label = y.to_a.uniq.min
+          bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
+          y_mean = bin_y.mean
+          @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
+          @estimators = partial_fit(x, bin_y, @base_predictions)
+        end
+
+        # calculate feature importances.
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        if n_classes > 2
+          n_classes.times do |n|
+            @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
+          end
+        else
+          @estimators.each { |tree| @feature_importances += tree.feature_importances }
+        end
+
+        self
+      end
+
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+      def decision_function(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        n_classes = @classes.size
+        if n_classes > 2
+          scores = Numo::DFloat.ones(n_samples, n_classes) * @base_predictions
+          n_classes.times do |n|
+            @estimators[n].each { |tree| scores[true, n] += tree.predict(x) }
+          end
+        else
+          scores = Numo::DFloat.ones(n_samples) * @base_predictions
+          @estimators.each { |tree| scores += tree.predict(x) }
+        end
+        scores
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        probs = predict_proba(x)
+        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+
+        proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
+
+        return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
+
+        n_samples, = x.shape
+        probs = Numo::DFloat.zeros(n_samples, 2)
+        probs[true, 1] = proba
+        probs[true, 0] = 1.0 - proba
+        probs
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        n_classes = @classes.size
+        leaf_ids = if n_classes > 2
+                     Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about GradientBoostingClassifier.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          classes: @classes,
+          base_predictions: @base_predictions,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @classes = obj[:classes]
+        @base_predictions = obj[:base_predictions]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: @rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(y_sub, y_pred_sub)
+          tree = plant_tree
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   # y_true in {-1, 1}
+      #   Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        # y in {-1, 1}
+        -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
+      end
+
+      def hessian(y_true, y_pred)
+        abs_response = gradient(y_true, y_pred).abs
+        abs_response * (2.0 - abs_response)
+      end
+
+      def plant_tree
+        Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+        )
+      end
+    end
+  end
+end
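For reference, the private gradient and hessian methods above are the first and second derivatives of the negative binomial log-likelihood computed by the commented-out loss method, with labels encoded as y ∈ {−1, +1}; a sketch of the correspondence (the symbols below are introduced here, not taken from the source):

\[ L(y, F) = \log\bigl(1 + e^{-2yF}\bigr), \qquad g = \frac{\partial L}{\partial F} = \frac{-2y}{1 + e^{2yF}}, \qquad h = \frac{\partial^2 L}{\partial F^2} = |g|\,(2 - |g|) \]

Each base prediction 0.5 * log((1 + ȳ) / (1 − ȳ)) is the constant score that minimizes this loss for the binarized labels, which is why partial_fit starts every boosting run from it.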
data/lib/rumale/ensemble/gradient_boosting_regressor.rb
ADDED
@@ -0,0 +1,230 @@
+# frozen_string_literal: true
+
+require 'rumale/values'
+require 'rumale/base/base_estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/gradient_tree_regressor'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
+    # The class use L2 loss for the loss function.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingRegressor.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing regressor.
+      # @param learning_rate [Float] The boosting learining rate
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
+        check_params_positive(n_estimators: n_estimators,
+                              learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:learning_rate] = learning_rate
+        @params[:reg_lambda] = reg_lambda
+        @params[:subsample] = subsample
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @base_predictions = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+      # @return [GradientBoostingRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+
+        # train regressor.
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
+        @estimators = if n_outputs > 1
+                        Array.new(n_outputs) do |n|
+                          partial_fit(x, y[true, n], @base_predictions[n])
+                        end
+                      else
+                        partial_fit(x, y, @base_predictions)
+                      end
+
+        # calculate feature importances.
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        if n_outputs > 1
+          n_outputs.times do |n|
+            @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
+          end
+        else
+          @estimators.each { |tree| @feature_importances += tree.feature_importances }
+        end
+
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        if n_outputs > 1
+          predicted = Numo::DFloat.ones(n_samples, n_outputs) * @base_predictions
+          n_outputs.times do |n|
+            @estimators[n].each { |tree| predicted[true, n] += tree.predict(x) }
+          end
+        else
+          predicted = Numo::DFloat.ones(n_samples) * @base_predictions
+          @estimators.each { |tree| predicted += tree.predict(x) }
+        end
+        predicted
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        leaf_ids = if n_outputs > 1
+                     Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about GradientBoostingRegressor.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          base_predictions: @base_predictions,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @base_predictions = obj[:base_predictions]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: @rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(n_sub_samples)
+          tree = plant_tree
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   ((y_true - y_pred)**2).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        y_pred - y_true
+      end
+
+      def hessian(n_samples)
+        Numo::DFloat.ones(n_samples)
+      end
+
+      def plant_tree
+        Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+        )
+      end
+    end
+  end
+end
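Similarly, the regressor's gradient and hessian methods above follow from a squared-error loss; as a sketch (the ½ scaling is notation introduced here so the derivatives match the code, while the commented-out loss method tracks the plain mean squared error for debugging):

\[ L(y, F) = \tfrac{1}{2}\,(y - F)^2, \qquad \frac{\partial L}{\partial F} = F - y, \qquad \frac{\partial^2 L}{\partial F^2} = 1 \]

so gradient returns y_pred - y_true, hessian is simply a vector of ones, and the per-output mean of y serves as the loss-minimizing base prediction.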
data/lib/rumale/preprocessing/bin_discretizer.rb
ADDED
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  module Preprocessing
+    # Discretizes features with a given number of bins.
+    # In some cases, discretizing features may accelerate decision tree training.
+    #
+    # @example
+    #   discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 4)
+    #   samples = Numo::DFloat.new(5, 2).rand - 0.5
+    #   transformed = discretizer.fit_transform(samples)
+    #   # > pp samples
+    #   # Numo::DFloat#shape=[5,2]
+    #   # [[-0.438246, -0.126933],
+    #   #  [ 0.294815, -0.298958],
+    #   #  [-0.383959, -0.155968],
+    #   #  [ 0.039948,  0.237815],
+    #   #  [-0.334911, -0.449117]]
+    #   # > pp transformed
+    #   # Numo::DFloat#shape=[5,2]
+    #   # [[0, 1],
+    #   #  [3, 0],
+    #   #  [0, 1],
+    #   #  [2, 3],
+    #   #  [0, 0]]
+    class BinDiscretizer
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the feature steps to be used discretizing.
+      # @return [Array<Numo::DFloat>] (shape: [n_features, n_bins])
+      attr_reader :feature_steps
+
+      # Create a new discretizer for features with given number of bins.
+      #
+      # @param n_bins [Integer] The number of bins to be used disretizing feature values.
+      def initialize(n_bins: 32)
+        @params = {}
+        @params[:n_bins] = n_bins
+        @feature_steps = nil
+      end
+
+      # Fit feature ranges to be discretized.
+      #
+      # @overload fit(x) -> BinDiscretizer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the feature ranges.
+      # @return [BinDiscretizer]
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        n_features = x.shape[1]
+        max_vals = x.max(0)
+        min_vals = x.min(0)
+        @feature_steps = Array.new(n_features) do |n|
+          Numo::DFloat.linspace(min_vals[n], max_vals[n], @params[:n_bins] + 1)[0...@params[:n_bins]]
+        end
+        self
+      end
+
+      # Fit feature ranges to be discretized, then return discretized samples.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
+      # @return [Numo::DFloat] The discretized samples.
+      def fit_transform(x, _y = nil)
+        check_sample_array(x)
+        fit(x).transform(x)
+      end
+
+      # Peform discretizing the given samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
+      # @return [Numo::DFloat] The discretized samples.
+      def transform(x)
+        check_sample_array(x)
+        n_samples, n_features = x.shape
+        transformed = Numo::DFloat.zeros(n_samples, n_features)
+        n_features.times do |n|
+          steps = @feature_steps[n]
+          @params[:n_bins].times do |bin|
+            mask = x[true, n].ge(steps[bin]).where
+            transformed[mask, n] = bin
+          end
+        end
+        transformed
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about BinDiscretizer
+      def marshal_dump
+        { params: @params,
+          feature_steps: @feature_steps }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @feature_steps = obj[:feature_steps]
+        nil
+      end
+    end
+  end
+end
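A minimal usage sketch for the new discretizer (the sample matrices below are hypothetical, not taken from the gem): the bin edges are learned once from training data with fit and then reused on unseen samples with transform.

require 'rumale'

# Hypothetical Numo::DFloat matrices of shape [n_samples, n_features].
training_samples = Numo::DFloat.new(100, 3).rand
testing_samples  = Numo::DFloat.new(20, 3).rand

discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 8)
discretizer.fit(training_samples)                       # learn per-feature bin edges
binned_train = discretizer.transform(training_samples)  # bin indices 0..7
binned_test  = discretizer.transform(testing_samples)   # reuse the training edges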
data/lib/rumale/preprocessing/l2_normalizer.rb
CHANGED
@@ -9,7 +9,7 @@ module Rumale
     # Normalize samples to unit L2-norm.
     #
     # @example
-    #   normalizer = Rumale::Preprocessing::
+    #   normalizer = Rumale::Preprocessing::L2Normalizer.new
     #   new_samples = normalizer.fit_transform(samples)
     class L2Normalizer
       include Base::BaseEstimator
data/lib/rumale/tree/base_decision_tree.rb
CHANGED
@@ -86,14 +86,13 @@ module Rumale
         return put_leaf(node, y) if stop_growing?(y)
 
         # calculate optimal parameters.
-        feature_id,
-
-          limp, rimp, th, ga = best_split(ft, y, whole_impurity)
-          [fid, ft.le(th).where, ft.gt(th).where, limp, rimp, th, ga]
-        end.max_by(&:last)
+        feature_id, left_imp, right_imp, threshold, gain =
+          rand_ids(n_features).map { |n| [n, *best_split(x[true, n], y, whole_impurity)] }.max_by(&:last)
 
         return put_leaf(node, y) if gain.nil? || gain.zero?
 
+        left_ids = x[true, feature_id].le(threshold).where
+        right_ids = x[true, feature_id].gt(threshold).where
         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
 
data/lib/rumale/tree/extra_tree_classifier.rb
CHANGED
@@ -107,8 +107,8 @@ module Rumale
         threshold = @rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
         r_ids = features.gt(threshold).where
-        l_impurity = l_ids.
-        r_impurity = r_ids.
+        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
+        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
         gain = whole_impurity -
                l_impurity * l_ids.size.fdiv(y.shape[0]) -
                r_impurity * r_ids.size.fdiv(y.shape[0])
data/lib/rumale/tree/extra_tree_regressor.rb
CHANGED
@@ -94,8 +94,8 @@ module Rumale
         threshold = @rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
        r_ids = features.gt(threshold).where
-        l_impurity = l_ids.
-        r_impurity = r_ids.
+        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
+        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
         gain = whole_impurity -
               l_impurity * l_ids.size.fdiv(y.shape[0]) -
               r_impurity * r_ids.size.fdiv(y.shape[0])
data/lib/rumale/tree/gradient_tree_regressor.rb
ADDED
@@ -0,0 +1,228 @@
+# frozen_string_literal: true
+
+require 'rumale/rumale'
+require 'rumale/base/base_estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/node'
+
+module Rumale
+  module Tree
+    # GradientTreeRegressor is a class that implements decision tree for regression with exact gredy algorithm.
+    # This class is used internally for estimators with gradient tree boosting.
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientTreeRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include ExtGradientTreeRegressor
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned each leaf.
+      # @return [Numo::DFloat] (shape: [n_leaves])
+      attr_reader :leaf_weights
+
+      # Initialize a gradient tree regressor
+      #
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param shrinkage_rate [Float] The shrinkage rate for weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_float(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        @params = {}
+        @params[:reg_lambda] = reg_lambda
+        @params[:shrinkage_rate] = shrinkage_rate
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @tree = nil
+        @feature_importances = nil
+        @n_leaves = nil
+        @leaf_weights = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The taget values to be used for fitting the model.
+      # @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
+      # @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
+      # @return [GradientTreeRegressor] The learned regressor itself.
+      def fit(x, y, g, h)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        check_params_type(Numo::DFloat, g: g, h: g)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] ||= n_features
+        @n_leaves = 0
+        @leaf_weights = []
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Build tree.
+        build_tree(x, y, g, h)
+        @leaf_weights = Numo::DFloat[*@leaf_weights]
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        @leaf_weights[apply(x)].dup
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about DecisionTreeRegressor
+      def marshal_dump
+        { params: @params,
+          tree: @tree,
+          feature_importances: @feature_importances,
+          leaf_weights: @leaf_weights,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @tree = obj[:tree]
+        @feature_importances = obj[:feature_importances]
+        @leaf_weights = obj[:leaf_weights]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def apply_at_node(node, sample)
+        return node.leaf_id if node.leaf
+        return apply_at_node(node.left, sample) if node.right.nil?
+        return apply_at_node(node.right, sample) if node.left.nil?
+        if sample[node.feature_id] <= node.threshold
+          apply_at_node(node.left, sample)
+        else
+          apply_at_node(node.right, sample)
+        end
+      end
+
+      def build_tree(x, y, g, h)
+        @tree = grow_node(0, x, y, g, h)
+        nil
+      end
+
+      def grow_node(depth, x, y, g, h)
+        # intialize some variables.
+        sum_g = g.sum
+        sum_h = h.sum
+        n_samples, n_features = x.shape
+        node = Node.new(depth: depth, n_samples: n_samples)
+
+        # terminate growing.
+        unless @params[:max_leaf_nodes].nil?
+          return nil if @n_leaves >= @params[:max_leaf_nodes]
+        end
+
+        return nil if n_samples < @params[:min_samples_leaf]
+        return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
+
+        unless @params[:max_depth].nil?
+          return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
+        end
+
+        return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
+
+        # calculate optimal parameters.
+        feature_id, threshold, gain =
+          rand_ids(n_features).map { |n| [n, *best_split(x[true, n], g, h, sum_g, sum_h)] }.max_by(&:last)
+
+        return put_leaf(node, sum_g, sum_h) if gain.nil? || gain.zero?
+
+        left_ids = x[true, feature_id].le(threshold).where
+        right_ids = x[true, feature_id].gt(threshold).where
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])
+
+        return put_leaf(node, sum_g, sum_h) if node.left.nil? && node.right.nil?
+
+        @feature_importances[feature_id] += 1.0
+
+        node.feature_id = feature_id
+        node.threshold = threshold
+        node.leaf = false
+        node
+      end
+
+      def stop_growing?(y)
+        y.to_a.uniq.size == 1
+      end
+
+      def put_leaf(node, sum_g, sum_h)
+        node.probs = nil
+        node.leaf = true
+        node.leaf_id = @n_leaves
+        weight = -@params[:shrinkage_rate] * sum_g / (sum_h + @params[:reg_lambda])
+        @leaf_weights.push(weight)
+        @n_leaves += 1
+        node
+      end
+
+      def best_split(features, g, h, sum_g, sum_h)
+        order = features.sort_index
+        sorted_f = features[order].to_a
+        sorted_g = g[order].to_a
+        sorted_h = h[order].to_a
+        find_split_params(sorted_f, sorted_g, sorted_h, sum_g, sum_h, @params[:reg_lambda])
+      end
+
+      def rand_ids(n)
+        [*0...n].sample(@params[:max_features], random: @rng)
+      end
+    end
+  end
+end
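The weight stored for each leaf in put_leaf above is the regularized Newton step, scaled by the shrinkage rate; as a sketch, with G and H the sums of the gradient and hessian values of the samples reaching the leaf, λ = reg_lambda, and η = shrinkage_rate (notation introduced here):

\[ w^{\ast} = -\,\eta\,\frac{G}{H + \lambda} \]

This is why GradientBoostingClassifier and GradientBoostingRegressor pass their learning_rate in as shrinkage_rate and can form the ensemble prediction by simply summing tree.predict(x) over the fitted trees.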
data/lib/rumale/tree/node.rb
CHANGED
@@ -21,7 +21,7 @@ module Rumale
       # @param feature_id [Integer] The feature index used for evaluation.
       # @param threshold [Float] The threshold value of the feature for splitting the node.
       def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                     leaf:
+                     leaf: false, leaf_id: nil,
                      left: nil, right: nil, feature_id: 0, threshold: 0.0)
         @depth = depth
         @impurity = impurity
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
     Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-    Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
+    Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
     K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
   MSG
   spec.homepage = 'https://github.com/yoshoku/rumale'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.9.
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-05-
+date: 2019-05-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -99,7 +99,7 @@ description: |
   Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
+  Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
   K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -142,6 +142,8 @@ files:
 - lib/rumale/ensemble/ada_boost_regressor.rb
 - lib/rumale/ensemble/extra_trees_classifier.rb
 - lib/rumale/ensemble/extra_trees_regressor.rb
+- lib/rumale/ensemble/gradient_boosting_classifier.rb
+- lib/rumale/ensemble/gradient_boosting_regressor.rb
 - lib/rumale/ensemble/random_forest_classifier.rb
 - lib/rumale/ensemble/random_forest_regressor.rb
 - lib/rumale/evaluation_measure/accuracy.rb
@@ -191,6 +193,7 @@ files:
 - lib/rumale/polynomial_model/base_factorization_machine.rb
 - lib/rumale/polynomial_model/factorization_machine_classifier.rb
 - lib/rumale/polynomial_model/factorization_machine_regressor.rb
+- lib/rumale/preprocessing/bin_discretizer.rb
 - lib/rumale/preprocessing/l2_normalizer.rb
 - lib/rumale/preprocessing/label_encoder.rb
 - lib/rumale/preprocessing/max_abs_scaler.rb
@@ -203,6 +206,7 @@ files:
 - lib/rumale/tree/decision_tree_regressor.rb
 - lib/rumale/tree/extra_tree_classifier.rb
 - lib/rumale/tree/extra_tree_regressor.rb
+- lib/rumale/tree/gradient_tree_regressor.rb
 - lib/rumale/tree/node.rb
 - lib/rumale/utils.rb
 - lib/rumale/validation.rb