rumale 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/ext/rumale/rumale.c +74 -0
- data/lib/rumale.rb +4 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +1 -1
- data/lib/rumale/ensemble/ada_boost_regressor.rb +1 -1
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +278 -0
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +230 -0
- data/lib/rumale/preprocessing/bin_discretizer.rb +108 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +1 -1
- data/lib/rumale/tree/base_decision_tree.rb +4 -5
- data/lib/rumale/tree/extra_tree_classifier.rb +2 -2
- data/lib/rumale/tree/extra_tree_regressor.rb +2 -2
- data/lib/rumale/tree/gradient_tree_regressor.rb +228 -0
- data/lib/rumale/tree/node.rb +1 -1
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d73f16bcd1d149babe18c1db66d3f72bb9a1206
+  data.tar.gz: 247fd7d548563ef27622c293073236468f634b7d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6a4a92d08ee0a8295d96a930a46fb67a9299a9e0beb717d52186347fef3b70727e35a2375e6c50f5da37ab699132fe0d3c3beeeb0a9730a158e3a5864f6b8364
+  data.tar.gz: a614c5002c750f9091a0b7b80b678115ea6b65a1a7d0de621431ee942f8f1678d36c64a271cdb1cc0c4a68c49d20bacfe934d844381ad78361c09e762e02e872
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
+# 0.9.2
+- Add class for Gradient tree boosting classifier.
+- Add class for Gradient tree boosting regressor.
+- Add class for discretizing feature values.
+- Refactor extra-trees estimators.
+- Refactor decision tree base class.
+- Fix some typos on document ([#6](https://github.com/yoshoku/rumale/pull/6)).
+
 # 0.9.1
 - Add class for Extra-Trees classifier.
 - Add class for Extra-Trees regressor.
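For orientation, a minimal sketch of how the new 0.9.2 pieces fit together; the random data, labels, and parameter choices below are invented for illustration and assume the gem and Numo::NArray are installed:

```ruby
require 'rumale'

# Toy data: 100 random 2-D samples with labels -1/1 (illustrative only).
samples = Numo::DFloat.new(100, 2).rand
labels = Numo::Int32.cast(samples[true, 0].gt(0.5)) * 2 - 1

# Discretizing features (new in 0.9.2) can speed up tree training.
discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 8)
binned = discretizer.fit_transform(samples)

# Gradient tree boosting classifier (new in 0.9.2).
estimator = Rumale::Ensemble::GradientBoostingClassifier.new(
  n_estimators: 50, learning_rate: 0.1, reg_lambda: 0.001, random_seed: 1
)
estimator.fit(binned, labels)

# Training accuracy on the toy data.
puts estimator.predict(binned).eq(labels).count.fdiv(labels.size)
```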
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
+Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c
CHANGED
@@ -334,6 +334,72 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   return opt_params;
 }
 
+/**
+ * @!visibility private
+ * Find for split point with maximum information gain.
+ *
+ * @overload find_split_params(sorted_features, sorted_gradient, sorted_hessian, sum_gradient, sum_hessian) -> Array<Float>
+ *
+ * @param sorted_features [Array<Float>] (size: n_samples) The feature values sorted in ascending order.
+ * @param sorted_targets [Array<Float>] (size: n_samples) The target values sorted according to feature values.
+ * @param sorted_gradient [Array<Float>] (size: n_samples) The gradient values of loss function sorted according to feature values.
+ * @param sorted_hessian [Array<Float>] (size: n_samples) The hessian values of loss function sorted according to feature values.
+ * @param sum_gradient [Float] The sum of gradient values.
+ * @param sum_hessian [Float] The sum of hessian values.
+ * @param reg_lambda [Float] The L2 regularization term on weight.
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
+ */
+static VALUE
+find_split_params_grad_reg
+(VALUE self, VALUE sorted_f, VALUE sorted_g, VALUE sorted_h, VALUE sum_g, VALUE sum_h, VALUE reg_l)
+{
+  const long n_elements = RARRAY_LEN(sorted_f);
+  const double s_grad = NUM2DBL(sum_g);
+  const double s_hess = NUM2DBL(sum_h);
+  const double reg_lambda = NUM2DBL(reg_l);
+  long curr_pos = 0;
+  long next_pos = 0;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
+  double l_grad = 0.0;
+  double l_hess = 0.0;
+  double r_grad;
+  double r_hess;
+  double gain;
+  VALUE opt_params = rb_ary_new2(2);
+
+  /* Initialize optimal parameters. */
+  rb_ary_store(opt_params, 0, rb_ary_entry(sorted_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 1, DBL2NUM(0)); /* gain */
+
+  /* Find optimal parameters. */
+  while (curr_pos < n_elements && curr_el != last_el) {
+    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    while (next_pos < n_elements && next_el == curr_el) {
+      l_grad += NUM2DBL(rb_ary_entry(sorted_g, next_pos));
+      l_hess += NUM2DBL(rb_ary_entry(sorted_h, next_pos));
+      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+    }
+    /* Calculate gain of new split. */
+    r_grad = s_grad - l_grad;
+    r_hess = s_hess - l_hess;
+    gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
+           (r_grad * r_grad) / (r_hess + reg_lambda) -
+           (s_grad * s_grad) / (s_hess + reg_lambda);
+    /* Update optimal parameters. */
+    if (gain > NUM2DBL(rb_ary_entry(opt_params, 1))) {
+      rb_ary_store(opt_params, 0, DBL2NUM(0.5 * (curr_el + next_el)));
+      rb_ary_store(opt_params, 1, DBL2NUM(gain));
+    }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
+  }
+
+  return opt_params;
+}
+
 /**
  * @!visibility private
  * Calculate impurity based on criterion.
@@ -406,9 +472,17 @@ void Init_rumale(void)
    * This module is used internally.
    */
   VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
+  /**
+   * Document-module: Rumale::Tree::ExtGradientTreeRegressor
+   * @!visibility private
+   * The mixin module consisting of extension method for GradientTreeRegressor class.
+   * This module is used internally.
+   */
+  VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
 
   rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
   rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+  rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 6);
   rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
   rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }
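The new C routine scans split candidates in feature-sorted order and scores each candidate with the regularized gain from the XGBoost paper, gain = G_L^2/(H_L + λ) + G_R^2/(H_R + λ) - G^2/(H + λ). As a reading aid only, here is a plain-Ruby mirror of the same loop; the gem itself calls the C version through the private find_split_params method, so this sketch is purely illustrative:

```ruby
# Illustrative Ruby port of find_split_params_grad_reg: returns the
# [threshold, gain] pair maximizing the regularized split gain.
def find_split_params_grad_reg(sorted_f, sorted_g, sorted_h, sum_g, sum_h, reg_lambda)
  best_threshold = sorted_f.first
  best_gain = 0.0
  l_grad = 0.0
  l_hess = 0.0
  i = 0
  while i < sorted_f.size && sorted_f[i] != sorted_f.last
    # Move every sample sharing the current feature value to the left side.
    curr = sorted_f[i]
    while i < sorted_f.size && sorted_f[i] == curr
      l_grad += sorted_g[i]
      l_hess += sorted_h[i]
      i += 1
    end
    r_grad = sum_g - l_grad
    r_hess = sum_h - l_hess
    gain = l_grad**2 / (l_hess + reg_lambda) +
           r_grad**2 / (r_hess + reg_lambda) -
           sum_g**2 / (sum_h + reg_lambda)
    if gain > best_gain
      # Threshold is the midpoint between adjacent distinct feature values.
      best_threshold = 0.5 * (curr + sorted_f[i])
      best_gain = gain
    end
  end
  [best_threshold, best_gain]
end
```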
data/lib/rumale.rb
CHANGED
@@ -47,8 +47,11 @@ require 'rumale/tree/decision_tree_classifier'
 require 'rumale/tree/decision_tree_regressor'
 require 'rumale/tree/extra_tree_classifier'
 require 'rumale/tree/extra_tree_regressor'
+require 'rumale/tree/gradient_tree_regressor'
 require 'rumale/ensemble/ada_boost_classifier'
 require 'rumale/ensemble/ada_boost_regressor'
+require 'rumale/ensemble/gradient_boosting_classifier'
+require 'rumale/ensemble/gradient_boosting_regressor'
 require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
 require 'rumale/ensemble/extra_trees_classifier'
@@ -61,6 +64,7 @@ require 'rumale/preprocessing/l2_normalizer'
 require 'rumale/preprocessing/min_max_scaler'
 require 'rumale/preprocessing/max_abs_scaler'
 require 'rumale/preprocessing/standard_scaler'
+require 'rumale/preprocessing/bin_discretizer'
 require 'rumale/preprocessing/label_encoder'
 require 'rumale/preprocessing/one_hot_encoder'
 require 'rumale/model_selection/k_fold'
data/lib/rumale/ensemble/ada_boost_classifier.rb
CHANGED
@@ -42,7 +42,7 @@ module Rumale
 
       # Create a new classifier with AdaBoost.
       #
-      # @param n_estimators [Integer] The numeber of decision trees for contructing
+      # @param n_estimators [Integer] The numeber of decision trees for contructing AdaBoost classifier.
       # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
       # @param max_depth [Integer] The maximum depth of the tree.
       #   If nil is given, decision tree grows without concern for depth.
data/lib/rumale/ensemble/ada_boost_regressor.rb
CHANGED
@@ -42,7 +42,7 @@ module Rumale
 
       # Create a new regressor with random forest.
       #
-      # @param n_estimators [Integer] The numeber of decision trees for contructing
+      # @param n_estimators [Integer] The numeber of decision trees for contructing AdaBoost regressor.
       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1]
       # @param exponent [Float] The exponent for the weight of each weak learner.
       # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
data/lib/rumale/ensemble/gradient_boosting_classifier.rb
ADDED
@@ -0,0 +1,278 @@
+# frozen_string_literal: true
+
+require 'rumale/values'
+require 'rumale/base/base_estimator'
+require 'rumale/base/classifier'
+require 'rumale/tree/gradient_tree_regressor'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
+    # The class use negative binomial log-likelihood for the loss function.
+    # For multiclass classification problem, it uses one-vs-the-rest strategy.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingClassifier.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingClassifier
+      include Base::BaseEstimator
+      include Base::Classifier
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing classifier.
+      # @param learning_rate [Float] The boosting learining rate
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
+        check_params_positive(n_estimators: n_estimators,
+                              learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:learning_rate] = learning_rate
+        @params[:reg_lambda] = reg_lambda
+        @params[:subsample] = subsample
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @classes = nil
+        @base_predictions = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [GradientBoostingClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+
+        # train estimator.
+        @classes = Numo::Int32[*y.to_a.uniq.sort]
+        n_classes = @classes.size
+        if n_classes > 2
+          @base_predictions = Numo::DFloat.zeros(n_classes)
+          @estimators = Array.new(n_classes) do |n|
+            bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
+            y_mean = bin_y.mean
+            @base_predictions[n] = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
+            partial_fit(x, bin_y, @base_predictions[n])
+          end
+        else
+          negative_label = y.to_a.uniq.min
+          bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
+          y_mean = bin_y.mean
+          @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
+          @estimators = partial_fit(x, bin_y, @base_predictions)
+        end
+
+        # calculate feature importances.
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        if n_classes > 2
+          n_classes.times do |n|
+            @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
+          end
+        else
+          @estimators.each { |tree| @feature_importances += tree.feature_importances }
+        end
+
+        self
+      end
+
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+      def decision_function(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        n_classes = @classes.size
+        if n_classes > 2
+          scores = Numo::DFloat.ones(n_samples, n_classes) * @base_predictions
+          n_classes.times do |n|
+            @estimators[n].each { |tree| scores[true, n] += tree.predict(x) }
+          end
+        else
+          scores = Numo::DFloat.ones(n_samples) * @base_predictions
+          @estimators.each { |tree| scores += tree.predict(x) }
+        end
+        scores
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        probs = predict_proba(x)
+        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+
+        proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
+
+        return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
+
+        n_samples, = x.shape
+        probs = Numo::DFloat.zeros(n_samples, 2)
+        probs[true, 1] = proba
+        probs[true, 0] = 1.0 - proba
+        probs
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        n_classes = @classes.size
+        leaf_ids = if n_classes > 2
+                     Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about GradientBoostingClassifier.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          classes: @classes,
+          base_predictions: @base_predictions,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @classes = obj[:classes]
+        @base_predictions = obj[:base_predictions]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: @rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(y_sub, y_pred_sub)
+          tree = plant_tree
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   # y_true in {-1, 1}
+      #   Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        # y in {-1, 1}
+        -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
+      end
+
+      def hessian(y_true, y_pred)
+        abs_response = gradient(y_true, y_pred).abs
+        abs_response * (2.0 - abs_response)
+      end
+
+      def plant_tree
+        Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+        )
+      end
+    end
+  end
+end
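A quick usage sketch of the new classifier; the toy data, labels, and parameter values below are made up for illustration. In the binary case decision_function returns one score per sample, and predict_proba maps it through a sigmoid into two columns:

```ruby
require 'rumale'

# Synthetic binary problem (illustrative only).
x = Numo::DFloat.new(200, 3).rand
y = Numo::Int32.cast(x[true, 0].gt(x[true, 1])) # labels 0/1

clf = Rumale::Ensemble::GradientBoostingClassifier.new(
  n_estimators: 30, learning_rate: 0.1, subsample: 0.8, random_seed: 1
)
clf.fit(x, y)

scores = clf.decision_function(x) # shape: [200] in the binary case
probs  = clf.predict_proba(x)     # shape: [200, 2], columns sum to 1
labels = clf.predict(x)           # Numo::Int32 of 0/1
```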
data/lib/rumale/ensemble/gradient_boosting_regressor.rb
ADDED
@@ -0,0 +1,230 @@
+# frozen_string_literal: true
+
+require 'rumale/values'
+require 'rumale/base/base_estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/gradient_tree_regressor'
+
+module Rumale
+  module Ensemble
+    # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
+    # The class use L2 loss for the loss function.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::GradientBoostingRegressor.new(
+    #       n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientBoostingRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+
+      # Return the set of estimators.
+      # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with gradient tree boosting.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing regressor.
+      # @param learning_rate [Float] The boosting learining rate
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
+        check_params_positive(n_estimators: n_estimators,
+                              learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:learning_rate] = learning_rate
+        @params[:reg_lambda] = reg_lambda
+        @params[:subsample] = subsample
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @base_predictions = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+      # @return [GradientBoostingRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+
+        n_features = x.shape[1]
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+
+        # train regressor.
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
+        @estimators = if n_outputs > 1
+                        Array.new(n_outputs) do |n|
+                          partial_fit(x, y[true, n], @base_predictions[n])
+                        end
+                      else
+                        partial_fit(x, y, @base_predictions)
+                      end
+
+        # calculate feature importances.
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        if n_outputs > 1
+          n_outputs.times do |n|
+            @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
+          end
+        else
+          @estimators.each { |tree| @feature_importances += tree.feature_importances }
+        end
+
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        if n_outputs > 1
+          predicted = Numo::DFloat.ones(n_samples, n_outputs) * @base_predictions
+          n_outputs.times do |n|
+            @estimators[n].each { |tree| predicted[true, n] += tree.predict(x) }
+          end
+        else
+          predicted = Numo::DFloat.ones(n_samples) * @base_predictions
+          @estimators.each { |tree| predicted += tree.predict(x) }
+        end
+        predicted
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
+        leaf_ids = if n_outputs > 1
+                     Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
+                   else
+                     @estimators.map { |tree| tree.apply(x) }
+                   end
+        Numo::Int32[*leaf_ids].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about GradientBoostingRegressor.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          base_predictions: @base_predictions,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @base_predictions = obj[:base_predictions]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def partial_fit(x, y, init_pred)
+        # initialize some variables.
+        estimators = []
+        n_samples = x.shape[0]
+        n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
+        whole_ids = Array.new(n_samples) { |v| v }
+        y_pred = Numo::DFloat.ones(n_samples) * init_pred
+        # grow trees.
+        @params[:n_estimators].times do |_t|
+          # subsampling
+          ids = whole_ids.sample(n_sub_samples, random: @rng)
+          x_sub = x[ids, true]
+          y_sub = y[ids]
+          y_pred_sub = y_pred[ids]
+          # train tree
+          g = gradient(y_sub, y_pred_sub)
+          h = hessian(n_sub_samples)
+          tree = plant_tree
+          tree.fit(x_sub, y_sub, g, h)
+          estimators.push(tree)
+          # update
+          y_pred += tree.predict(x)
+        end
+        estimators
+      end
+
+      # for debug
+      #
+      # def loss(y_true, y_pred)
+      #   ((y_true - y_pred)**2).mean
+      # end
+
+      def gradient(y_true, y_pred)
+        y_pred - y_true
+      end
+
+      def hessian(n_samples)
+        Numo::DFloat.ones(n_samples)
+      end
+
+      def plant_tree
+        Rumale::Tree::GradientTreeRegressor.new(
+          reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
+          max_depth: @params[:max_depth],
+          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+        )
+      end
+    end
+  end
+end
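A matching sketch for the regressor, again on made-up data: fit stores the mean of y as the base prediction, and each tree adds a shrunken correction on top of it:

```ruby
require 'rumale'

# Noiseless synthetic target (illustrative only).
x = Numo::DFloat.new(150, 2).rand
y = x[true, 0] * 2.0 + x[true, 1]

reg = Rumale::Ensemble::GradientBoostingRegressor.new(
  n_estimators: 50, learning_rate: 0.1, reg_lambda: 0.001, random_seed: 1
)
reg.fit(x, y)

pred = reg.predict(x)
mse = ((y - pred)**2).mean # same quantity as the commented-out loss helper
```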
data/lib/rumale/preprocessing/bin_discretizer.rb
ADDED
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  module Preprocessing
+    # Discretizes features with a given number of bins.
+    # In some cases, discretizing features may accelerate decision tree training.
+    #
+    # @example
+    #   discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 4)
+    #   samples = Numo::DFloat.new(5, 2).rand - 0.5
+    #   transformed = discretizer.fit_transform(samples)
+    #   # > pp samples
+    #   # Numo::DFloat#shape=[5,2]
+    #   # [[-0.438246, -0.126933],
+    #   #  [ 0.294815, -0.298958],
+    #   #  [-0.383959, -0.155968],
+    #   #  [ 0.039948,  0.237815],
+    #   #  [-0.334911, -0.449117]]
+    #   # > pp transformed
+    #   # Numo::DFloat#shape=[5,2]
+    #   # [[0, 1],
+    #   #  [3, 0],
+    #   #  [0, 1],
+    #   #  [2, 3],
+    #   #  [0, 0]]
+    class BinDiscretizer
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the feature steps to be used discretizing.
+      # @return [Array<Numo::DFloat>] (shape: [n_features, n_bins])
+      attr_reader :feature_steps
+
+      # Create a new discretizer for features with given number of bins.
+      #
+      # @param n_bins [Integer] The number of bins to be used disretizing feature values.
+      def initialize(n_bins: 32)
+        @params = {}
+        @params[:n_bins] = n_bins
+        @feature_steps = nil
+      end
+
+      # Fit feature ranges to be discretized.
+      #
+      # @overload fit(x) -> BinDiscretizer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the feature ranges.
+      # @return [BinDiscretizer]
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        n_features = x.shape[1]
+        max_vals = x.max(0)
+        min_vals = x.min(0)
+        @feature_steps = Array.new(n_features) do |n|
+          Numo::DFloat.linspace(min_vals[n], max_vals[n], @params[:n_bins] + 1)[0...@params[:n_bins]]
+        end
+        self
+      end
+
+      # Fit feature ranges to be discretized, then return discretized samples.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
+      # @return [Numo::DFloat] The discretized samples.
+      def fit_transform(x, _y = nil)
+        check_sample_array(x)
+        fit(x).transform(x)
+      end
+
+      # Peform discretizing the given samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
+      # @return [Numo::DFloat] The discretized samples.
+      def transform(x)
+        check_sample_array(x)
+        n_samples, n_features = x.shape
+        transformed = Numo::DFloat.zeros(n_samples, n_features)
+        n_features.times do |n|
+          steps = @feature_steps[n]
+          @params[:n_bins].times do |bin|
+            mask = x[true, n].ge(steps[bin]).where
+            transformed[mask, n] = bin
+          end
+        end
+        transformed
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about BinDiscretizer
+      def marshal_dump
+        { params: @params,
+          feature_steps: @feature_steps }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @feature_steps = obj[:feature_steps]
+        nil
+      end
+    end
+  end
+end
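The fit method derives, per feature, n_bins + 1 evenly spaced points over [min, max] and keeps all but the last as bin edges; transform then assigns each value the highest bin whose edge it reaches. A hand-worked sketch with made-up values:

```ruby
require 'numo/narray'

# Mirror of BinDiscretizer#fit for one feature with n_bins = 4:
# n_bins + 1 evenly spaced points over [min, max], keeping all but the last.
values = Numo::DFloat[0.0, 0.2, 0.5, 0.9, 1.0]
n_bins = 4
edges = Numo::DFloat.linspace(values.min, values.max, n_bins + 1)[0...n_bins]
# edges => [0.0, 0.25, 0.5, 0.75]

# transform assigns each value the highest bin whose left edge it reaches.
bins = values.to_a.map { |v| (0...n_bins).select { |b| v >= edges[b] }.max }
# bins => [0, 0, 2, 3, 3]
```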
data/lib/rumale/preprocessing/l2_normalizer.rb
CHANGED
@@ -9,7 +9,7 @@ module Rumale
     # Normalize samples to unit L2-norm.
     #
     # @example
-    #   normalizer = Rumale::Preprocessing::
+    #   normalizer = Rumale::Preprocessing::L2Normalizer.new
     #   new_samples = normalizer.fit_transform(samples)
     class L2Normalizer
       include Base::BaseEstimator
data/lib/rumale/tree/base_decision_tree.rb
CHANGED
@@ -86,14 +86,13 @@ module Rumale
         return put_leaf(node, y) if stop_growing?(y)
 
         # calculate optimal parameters.
-        feature_id,
-
-        limp, rimp, th, ga = best_split(ft, y, whole_impurity)
-        [fid, ft.le(th).where, ft.gt(th).where, limp, rimp, th, ga]
-        end.max_by(&:last)
+        feature_id, left_imp, right_imp, threshold, gain =
+          rand_ids(n_features).map { |n| [n, *best_split(x[true, n], y, whole_impurity)] }.max_by(&:last)
 
         return put_leaf(node, y) if gain.nil? || gain.zero?
 
+        left_ids = x[true, feature_id].le(threshold).where
+        right_ids = x[true, feature_id].gt(threshold).where
         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
 
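The refactor collapses the split search into one expression: build a [feature_id, left_imp, right_imp, threshold, gain] tuple per sampled feature and keep the tuple whose last element, the gain, is largest. A stripped-down sketch with hypothetical tuples:

```ruby
# Hypothetical per-feature split results; the gain is always the last element.
candidates = [
  [0, 0.40, 0.35, 1.2, 0.05],
  [3, 0.10, 0.20, 0.7, 0.21],
  [1, 0.30, 0.28, 2.4, 0.13]
]
feature_id, _left_imp, _right_imp, _threshold, gain = candidates.max_by(&:last)
feature_id # => 3
gain       # => 0.21
```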
data/lib/rumale/tree/extra_tree_classifier.rb
CHANGED
@@ -107,8 +107,8 @@ module Rumale
         threshold = @rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
         r_ids = features.gt(threshold).where
-        l_impurity = l_ids.
-        r_impurity = r_ids.
+        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
+        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
         gain = whole_impurity -
                l_impurity * l_ids.size.fdiv(y.shape[0]) -
                r_impurity * r_ids.size.fdiv(y.shape[0])
data/lib/rumale/tree/extra_tree_regressor.rb
CHANGED
@@ -94,8 +94,8 @@ module Rumale
         threshold = @rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
         r_ids = features.gt(threshold).where
-        l_impurity = l_ids.
-        r_impurity = r_ids.
+        l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
+        r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
         gain = whole_impurity -
                l_impurity * l_ids.size.fdiv(y.shape[0]) -
                r_impurity * r_ids.size.fdiv(y.shape[0])
data/lib/rumale/tree/gradient_tree_regressor.rb
ADDED
@@ -0,0 +1,228 @@
+# frozen_string_literal: true
+
+require 'rumale/rumale'
+require 'rumale/base/base_estimator'
+require 'rumale/base/regressor'
+require 'rumale/tree/node'
+
+module Rumale
+  module Tree
+    # GradientTreeRegressor is a class that implements decision tree for regression with exact gredy algorithm.
+    # This class is used internally for estimators with gradient tree boosting.
+    #
+    # *reference*
+    # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
+    # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
+    # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
+    #
+    class GradientTreeRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include ExtGradientTreeRegressor
+
+      # Return the importance for each feature.
+      # The feature importances are calculated based on the numbers of times the feature is used for splitting.
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned each leaf.
+      # @return [Numo::DFloat] (shape: [n_leaves])
+      attr_reader :leaf_weights
+
+      # Initialize a gradient tree regressor
+      #
+      # @param reg_lambda [Float] The L2 regularization term on weight.
+      # @param shrinkage_rate [Float] The shrinkage rate for weight.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
+                     max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_float(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
+                              max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        @params = {}
+        @params[:reg_lambda] = reg_lambda
+        @params[:shrinkage_rate] = shrinkage_rate
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @tree = nil
+        @feature_importances = nil
+        @n_leaves = nil
+        @leaf_weights = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples]) The taget values to be used for fitting the model.
+      # @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
+      # @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
+      # @return [GradientTreeRegressor] The learned regressor itself.
+      def fit(x, y, g, h)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        check_params_type(Numo::DFloat, g: g, h: g)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] ||= n_features
+        @n_leaves = 0
+        @leaf_weights = []
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Build tree.
+        build_tree(x, y, g, h)
+        @leaf_weights = Numo::DFloat[*@leaf_weights]
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        @leaf_weights[apply(x)].dup
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about DecisionTreeRegressor
+      def marshal_dump
+        { params: @params,
+          tree: @tree,
+          feature_importances: @feature_importances,
+          leaf_weights: @leaf_weights,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @tree = obj[:tree]
+        @feature_importances = obj[:feature_importances]
+        @leaf_weights = obj[:leaf_weights]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def apply_at_node(node, sample)
+        return node.leaf_id if node.leaf
+        return apply_at_node(node.left, sample) if node.right.nil?
+        return apply_at_node(node.right, sample) if node.left.nil?
+        if sample[node.feature_id] <= node.threshold
+          apply_at_node(node.left, sample)
+        else
+          apply_at_node(node.right, sample)
+        end
+      end
+
+      def build_tree(x, y, g, h)
+        @tree = grow_node(0, x, y, g, h)
+        nil
+      end
+
+      def grow_node(depth, x, y, g, h)
+        # intialize some variables.
+        sum_g = g.sum
+        sum_h = h.sum
+        n_samples, n_features = x.shape
+        node = Node.new(depth: depth, n_samples: n_samples)
+
+        # terminate growing.
+        unless @params[:max_leaf_nodes].nil?
+          return nil if @n_leaves >= @params[:max_leaf_nodes]
+        end
+
+        return nil if n_samples < @params[:min_samples_leaf]
+        return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
+
+        unless @params[:max_depth].nil?
+          return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
+        end
+
+        return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
+
+        # calculate optimal parameters.
+        feature_id, threshold, gain =
+          rand_ids(n_features).map { |n| [n, *best_split(x[true, n], g, h, sum_g, sum_h)] }.max_by(&:last)
+
+        return put_leaf(node, sum_g, sum_h) if gain.nil? || gain.zero?
+
+        left_ids = x[true, feature_id].le(threshold).where
+        right_ids = x[true, feature_id].gt(threshold).where
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])
+
+        return put_leaf(node, sum_g, sum_h) if node.left.nil? && node.right.nil?
+
+        @feature_importances[feature_id] += 1.0
+
+        node.feature_id = feature_id
+        node.threshold = threshold
+        node.leaf = false
+        node
+      end
+
+      def stop_growing?(y)
+        y.to_a.uniq.size == 1
+      end
+
+      def put_leaf(node, sum_g, sum_h)
+        node.probs = nil
+        node.leaf = true
+        node.leaf_id = @n_leaves
+        weight = -@params[:shrinkage_rate] * sum_g / (sum_h + @params[:reg_lambda])
+        @leaf_weights.push(weight)
+        @n_leaves += 1
+        node
+      end
+
+      def best_split(features, g, h, sum_g, sum_h)
+        order = features.sort_index
+        sorted_f = features[order].to_a
+        sorted_g = g[order].to_a
+        sorted_h = h[order].to_a
+        find_split_params(sorted_f, sorted_g, sorted_h, sum_g, sum_h, @params[:reg_lambda])
+      end
+
+      def rand_ids(n)
+        [*0...n].sample(@params[:max_features], random: @rng)
+      end
+    end
+  end
+end
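put_leaf assigns each leaf the Newton-step weight w = -shrinkage_rate * ΣG / (ΣH + reg_lambda) over the samples reaching it, which is what leaf_weights stores and predict later looks up. A worked one-liner with made-up sums:

```ruby
shrinkage_rate = 0.1   # eta, the booster's learning_rate
reg_lambda = 0.001     # L2 regularization on leaf weights
sum_g = -4.0           # sum of loss gradients at the leaf (illustrative)
sum_h = 8.0            # sum of loss hessians at the leaf (illustrative)

weight = -shrinkage_rate * sum_g / (sum_h + reg_lambda)
# => 0.04999... : the leaf nudges predictions against the gradient
```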
data/lib/rumale/tree/node.rb
CHANGED
@@ -21,7 +21,7 @@ module Rumale
       # @param feature_id [Integer] The feature index used for evaluation.
       # @param threshold [Float] The threshold value of the feature for splitting the node.
       def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                     leaf:
+                     leaf: false, leaf_id: nil,
                      left: nil, right: nil, feature_id: 0, threshold: 0.0)
         @depth = depth
         @impurity = impurity
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
     Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-    Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
+    Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
     K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
   MSG
   spec.homepage = 'https://github.com/yoshoku/rumale'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.9.
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-05-
+date: 2019-05-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -99,7 +99,7 @@ description: |
   Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
+  Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
   K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -142,6 +142,8 @@ files:
 - lib/rumale/ensemble/ada_boost_regressor.rb
 - lib/rumale/ensemble/extra_trees_classifier.rb
 - lib/rumale/ensemble/extra_trees_regressor.rb
+- lib/rumale/ensemble/gradient_boosting_classifier.rb
+- lib/rumale/ensemble/gradient_boosting_regressor.rb
 - lib/rumale/ensemble/random_forest_classifier.rb
 - lib/rumale/ensemble/random_forest_regressor.rb
 - lib/rumale/evaluation_measure/accuracy.rb
@@ -191,6 +193,7 @@ files:
 - lib/rumale/polynomial_model/base_factorization_machine.rb
 - lib/rumale/polynomial_model/factorization_machine_classifier.rb
 - lib/rumale/polynomial_model/factorization_machine_regressor.rb
+- lib/rumale/preprocessing/bin_discretizer.rb
 - lib/rumale/preprocessing/l2_normalizer.rb
 - lib/rumale/preprocessing/label_encoder.rb
 - lib/rumale/preprocessing/max_abs_scaler.rb
@@ -203,6 +206,7 @@ files:
 - lib/rumale/tree/decision_tree_regressor.rb
 - lib/rumale/tree/extra_tree_classifier.rb
 - lib/rumale/tree/extra_tree_regressor.rb
+- lib/rumale/tree/gradient_tree_regressor.rb
 - lib/rumale/tree/node.rb
 - lib/rumale/utils.rb
 - lib/rumale/validation.rb