rumale 0.9.1 → 0.9.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
4
- data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
3
+ metadata.gz: 1d73f16bcd1d149babe18c1db66d3f72bb9a1206
4
+ data.tar.gz: 247fd7d548563ef27622c293073236468f634b7d
5
5
  SHA512:
6
- metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
7
- data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
6
+ metadata.gz: 6a4a92d08ee0a8295d96a930a46fb67a9299a9e0beb717d52186347fef3b70727e35a2375e6c50f5da37ab699132fe0d3c3beeeb0a9730a158e3a5864f6b8364
7
+ data.tar.gz: a614c5002c750f9091a0b7b80b678115ea6b65a1a7d0de621431ee942f8f1678d36c64a271cdb1cc0c4a68c49d20bacfe934d844381ad78361c09e762e02e872
@@ -1,3 +1,5 @@
1
+ require: rubocop-performance
2
+
1
3
  inherit_from: .rubocop_todo.yml
2
4
 
3
5
  AllCops:
@@ -1,3 +1,11 @@
1
+ # 0.9.2
2
+ - Add class for Gradient tree boosting classifier.
3
+ - Add class for Gradient tree boosting regressor.
4
+ - Add class for discretizing feature values.
5
+ - Refactor extra-trees estimators.
6
+ - Refactor decision tree base class.
7
+ - Fix some typos in the documentation ([#6](https://github.com/yoshoku/rumale/pull/6)).
8
+
1
9
  # 0.9.1
2
10
  - Add class for Extra-Trees classifier.
3
11
  - Add class for Extra-Trees regressor.
data/README.md CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
12
12
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
13
13
  Rumale supports Linear / Kernel Support Vector Machine,
14
14
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
15
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
15
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
16
16
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
17
17
 
18
18
  This project was formerly known as "SVMKit".
@@ -334,6 +334,72 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
334
334
  return opt_params;
335
335
  }
336
336
 
337
+ /**
338
+ * @!visibility private
339
+ * Find the split point with maximum information gain.
340
+ *
341
+ * @overload find_split_params(sorted_features, sorted_gradient, sorted_hessian, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
342
+ *
343
+ * @param sorted_features [Array<Float>] (size: n_samples) The feature values sorted in ascending order.
345
+ * @param sorted_gradient [Array<Float>] (size: n_samples) The gradient values of loss function sorted according to feature values.
346
+ * @param sorted_hessian [Array<Float>] (size: n_samples) The hessian values of loss function sorted according to feature values.
347
+ * @param sum_gradient [Float] The sum of gradient values.
348
+ * @param sum_hessian [Float] The sum of hessian values.
349
+ * @param reg_lambda [Float] The L2 regularization term on weight.
350
+ * @return [Array<Float>] The array containing the optimal threshold and gain.
351
+ */
352
+ static VALUE
353
+ find_split_params_grad_reg
354
+ (VALUE self, VALUE sorted_f, VALUE sorted_g, VALUE sorted_h, VALUE sum_g, VALUE sum_h, VALUE reg_l)
355
+ {
356
+ const long n_elements = RARRAY_LEN(sorted_f);
357
+ const double s_grad = NUM2DBL(sum_g);
358
+ const double s_hess = NUM2DBL(sum_h);
359
+ const double reg_lambda = NUM2DBL(reg_l);
360
+ long curr_pos = 0;
361
+ long next_pos = 0;
362
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
363
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
364
+ double next_el;
365
+ double l_grad = 0.0;
366
+ double l_hess = 0.0;
367
+ double r_grad;
368
+ double r_hess;
369
+ double gain;
370
+ VALUE opt_params = rb_ary_new2(2);
371
+
372
+ /* Initialize optimal parameters. */
373
+ rb_ary_store(opt_params, 0, rb_ary_entry(sorted_f, 0)); /* threshold */
374
+ rb_ary_store(opt_params, 1, DBL2NUM(0)); /* gain */
375
+
376
+ /* Find optimal parameters. */
377
+ while (curr_pos < n_elements && curr_el != last_el) {
378
+ next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
379
+ while (next_pos < n_elements && next_el == curr_el) {
380
+ l_grad += NUM2DBL(rb_ary_entry(sorted_g, next_pos));
381
+ l_hess += NUM2DBL(rb_ary_entry(sorted_h, next_pos));
382
+ next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
383
+ }
384
+ /* Calculate gain of new split. */
385
+ r_grad = s_grad - l_grad;
386
+ r_hess = s_hess - l_hess;
387
+ gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
388
+ (r_grad * r_grad) / (r_hess + reg_lambda) -
389
+ (s_grad * s_grad) / (s_hess + reg_lambda);
390
+ /* Update optimal parameters. */
391
+ if (gain > NUM2DBL(rb_ary_entry(opt_params, 1))) {
392
+ rb_ary_store(opt_params, 0, DBL2NUM(0.5 * (curr_el + next_el)));
393
+ rb_ary_store(opt_params, 1, DBL2NUM(gain));
394
+ }
395
+ if (next_pos == n_elements) break;
396
+ curr_pos = next_pos;
397
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
398
+ }
399
+
400
+ return opt_params;
401
+ }
402
+
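Editorial note: the loop above computes the standard second-order split gain from the XGBoost reference cited in the accompanying Ruby sources, with $G_L, H_L$ and $G_R, H_R$ the gradient/hessian sums over the left and right partitions, $G = G_L + G_R$, $H = H_L + H_R$, and $\lambda$ = `reg_lambda`:

$$\mathrm{gain} = \frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{G^2}{H + \lambda}$$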
337
403
  /**
338
404
  * @!visibility private
339
405
  * Calculate impurity based on criterion.
@@ -406,9 +472,17 @@ void Init_rumale(void)
406
472
  * This module is used internally.
407
473
  */
408
474
  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
475
+ /**
476
+ * Document-module: Rumale::Tree::ExtGradientTreeRegressor
477
+ * @!visibility private
478
+ * The mixin module consisting of extension method for GradientTreeRegressor class.
479
+ * This module is used internally.
480
+ */
481
+ VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
409
482
 
410
483
  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
411
484
  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
485
+ rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 6);
412
486
  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
413
487
  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
414
488
  }
@@ -47,8 +47,11 @@ require 'rumale/tree/decision_tree_classifier'
47
47
  require 'rumale/tree/decision_tree_regressor'
48
48
  require 'rumale/tree/extra_tree_classifier'
49
49
  require 'rumale/tree/extra_tree_regressor'
50
+ require 'rumale/tree/gradient_tree_regressor'
50
51
  require 'rumale/ensemble/ada_boost_classifier'
51
52
  require 'rumale/ensemble/ada_boost_regressor'
53
+ require 'rumale/ensemble/gradient_boosting_classifier'
54
+ require 'rumale/ensemble/gradient_boosting_regressor'
52
55
  require 'rumale/ensemble/random_forest_classifier'
53
56
  require 'rumale/ensemble/random_forest_regressor'
54
57
  require 'rumale/ensemble/extra_trees_classifier'
@@ -61,6 +64,7 @@ require 'rumale/preprocessing/l2_normalizer'
61
64
  require 'rumale/preprocessing/min_max_scaler'
62
65
  require 'rumale/preprocessing/max_abs_scaler'
63
66
  require 'rumale/preprocessing/standard_scaler'
67
+ require 'rumale/preprocessing/bin_discretizer'
64
68
  require 'rumale/preprocessing/label_encoder'
65
69
  require 'rumale/preprocessing/one_hot_encoder'
66
70
  require 'rumale/model_selection/k_fold'
@@ -42,7 +42,7 @@ module Rumale
42
42
 
43
43
  # Create a new classifier with AdaBoost.
44
44
  #
45
- # @param n_estimators [Integer] The numeber of decision trees for contructing random forest.
45
+ # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
46
46
  # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
47
47
  # @param max_depth [Integer] The maximum depth of the tree.
48
48
  # If nil is given, decision tree grows without concern for depth.
@@ -42,7 +42,7 @@ module Rumale
42
42
 
43
43
  # Create a new regressor with random forest.
44
44
  #
45
- # @param n_estimators [Integer] The numeber of decision trees for contructing random forest.
45
+ # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
46
46
  # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1]
47
47
  # @param exponent [Float] The exponent for the weight of each weak learner.
48
48
  # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
@@ -0,0 +1,278 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/values'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/classifier'
6
+ require 'rumale/tree/gradient_tree_regressor'
7
+
8
+ module Rumale
9
+ module Ensemble
10
+ # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
11
+ # The class uses the negative binomial log-likelihood as the loss function.
12
+ # For multiclass classification problems, it uses the one-vs-rest strategy.
13
+ #
14
+ # @example
15
+ # estimator =
16
+ # Rumale::Ensemble::GradientBoostingClassifier.new(
17
+ # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
18
+ # estimator.fit(training_samples, training_values)
19
+ # results = estimator.predict(testing_samples)
20
+ #
21
+ # *reference*
22
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
23
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
24
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
25
+ #
26
+ class GradientBoostingClassifier
27
+ include Base::BaseEstimator
28
+ include Base::Classifier
29
+
30
+ # Return the set of estimators.
31
+ # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
32
+ attr_reader :estimators
33
+
34
+ # Return the class labels.
35
+ # @return [Numo::Int32] (size: n_classes)
36
+ attr_reader :classes
37
+
38
+ # Return the importance for each feature.
39
+ # The feature importances are calculated based on the number of times the feature is used for splitting.
40
+ # @return [Numo::DFloat] (size: n_features)
41
+ attr_reader :feature_importances
42
+
43
+ # Return the random generator for random selection of feature index.
44
+ # @return [Random]
45
+ attr_reader :rng
46
+
47
+ # Create a new classifier with gradient tree boosting.
48
+ #
49
+ # @param n_estimators [Integer] The number of trees for constructing the classifier.
50
+ # @param learning_rate [Float] The boosting learning rate.
51
+ # @param reg_lambda [Float] The L2 regularization term on weight.
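+ # @param subsample [Float] The fraction of the training samples drawn at random (without replacement) to fit each tree.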
52
+ # @param max_depth [Integer] The maximum depth of the tree.
53
+ # If nil is given, decision tree grows without concern for depth.
54
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
55
+ # If nil is given, number of leaves is not limited.
56
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
57
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
58
+ # If nil is given, split process considers all features.
59
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
60
+ # It is used to randomly determine the order of features when deciding the splitting point.
61
+ def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
62
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
63
+ max_features: nil, random_seed: nil)
64
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
65
+ max_features: max_features, random_seed: random_seed)
66
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
67
+ check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
68
+ check_params_positive(n_estimators: n_estimators,
69
+ learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
70
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
71
+ max_features: max_features)
72
+ @params = {}
73
+ @params[:n_estimators] = n_estimators
74
+ @params[:learning_rate] = learning_rate
75
+ @params[:reg_lambda] = reg_lambda
76
+ @params[:subsample] = subsample
77
+ @params[:max_depth] = max_depth
78
+ @params[:max_leaf_nodes] = max_leaf_nodes
79
+ @params[:min_samples_leaf] = min_samples_leaf
80
+ @params[:max_features] = max_features
81
+ @params[:random_seed] = random_seed
82
+ @params[:random_seed] ||= srand
83
+ @estimators = nil
84
+ @classes = nil
85
+ @base_predictions = nil
86
+ @feature_importances = nil
87
+ @rng = Random.new(@params[:random_seed])
88
+ end
89
+
90
+ # Fit the model with given training data.
91
+ #
92
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
93
+ # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
94
+ # @return [GradientBoostingClassifier] The learned classifier itself.
95
+ def fit(x, y)
96
+ check_sample_array(x)
97
+ check_label_array(y)
98
+ check_sample_label_size(x, y)
99
+
100
+ n_features = x.shape[1]
101
+ @params[:max_features] = n_features if @params[:max_features].nil?
102
+ @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
103
+
104
+ # train estimator.
105
+ @classes = Numo::Int32[*y.to_a.uniq.sort]
106
+ n_classes = @classes.size
107
+ if n_classes > 2
108
+ @base_predictions = Numo::DFloat.zeros(n_classes)
109
+ @estimators = Array.new(n_classes) do |n|
110
+ bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
111
+ y_mean = bin_y.mean
112
+ @base_predictions[n] = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
113
+ partial_fit(x, bin_y, @base_predictions[n])
114
+ end
115
+ else
116
+ negative_label = y.to_a.uniq.min
117
+ bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
118
+ y_mean = bin_y.mean
119
+ @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
120
+ @estimators = partial_fit(x, bin_y, @base_predictions)
121
+ end
122
+
123
+ # calculate feature importances.
124
+ @feature_importances = Numo::DFloat.zeros(n_features)
125
+ if n_classes > 2
126
+ n_classes.times do |n|
127
+ @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
128
+ end
129
+ else
130
+ @estimators.each { |tree| @feature_importances += tree.feature_importances }
131
+ end
132
+
133
+ self
134
+ end
135
+
136
+ # Calculate confidence scores for samples.
137
+ #
138
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
139
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
140
+ def decision_function(x)
141
+ check_sample_array(x)
142
+ n_samples = x.shape[0]
143
+ n_classes = @classes.size
144
+ if n_classes > 2
145
+ scores = Numo::DFloat.ones(n_samples, n_classes) * @base_predictions
146
+ n_classes.times do |n|
147
+ @estimators[n].each { |tree| scores[true, n] += tree.predict(x) }
148
+ end
149
+ else
150
+ scores = Numo::DFloat.ones(n_samples) * @base_predictions
151
+ @estimators.each { |tree| scores += tree.predict(x) }
152
+ end
153
+ scores
154
+ end
155
+
156
+ # Predict class labels for samples.
157
+ #
158
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
159
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
160
+ def predict(x)
161
+ check_sample_array(x)
162
+ n_samples = x.shape[0]
163
+ probs = predict_proba(x)
164
+ Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
165
+ end
166
+
167
+ # Predict probability for samples.
168
+ #
169
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
170
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
171
+ def predict_proba(x)
172
+ check_sample_array(x)
173
+
174
+ proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
175
+
176
+ return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
177
+
178
+ n_samples, = x.shape
179
+ probs = Numo::DFloat.zeros(n_samples, 2)
180
+ probs[true, 1] = proba
181
+ probs[true, 0] = 1.0 - proba
182
+ probs
183
+ end
184
+
185
+ # Return the index of the leaf that each sample reached.
186
+ #
187
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
188
+ # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
189
+ def apply(x)
190
+ check_sample_array(x)
191
+ n_classes = @classes.size
192
+ leaf_ids = if n_classes > 2
193
+ Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
194
+ else
195
+ @estimators.map { |tree| tree.apply(x) }
196
+ end
197
+ Numo::Int32[*leaf_ids].transpose
198
+ end
199
+
200
+ # Dump marshal data.
201
+ # @return [Hash] The marshal data about GradientBoostingClassifier.
202
+ def marshal_dump
203
+ { params: @params,
204
+ estimators: @estimators,
205
+ classes: @classes,
206
+ base_predictions: @base_predictions,
207
+ feature_importances: @feature_importances,
208
+ rng: @rng }
209
+ end
210
+
211
+ # Load marshal data.
212
+ # @return [nil]
213
+ def marshal_load(obj)
214
+ @params = obj[:params]
215
+ @estimators = obj[:estimators]
216
+ @classes = obj[:classes]
217
+ @base_predictions = obj[:base_predictions]
218
+ @feature_importances = obj[:feature_importances]
219
+ @rng = obj[:rng]
220
+ nil
221
+ end
222
+
223
+ private
224
+
225
+ def partial_fit(x, y, init_pred)
226
+ # initialize some variables.
227
+ estimators = []
228
+ n_samples = x.shape[0]
229
+ n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
230
+ whole_ids = Array.new(n_samples) { |v| v }
231
+ y_pred = Numo::DFloat.ones(n_samples) * init_pred
232
+ # grow trees.
233
+ @params[:n_estimators].times do |_t|
234
+ # subsampling
235
+ ids = whole_ids.sample(n_sub_samples, random: @rng)
236
+ x_sub = x[ids, true]
237
+ y_sub = y[ids]
238
+ y_pred_sub = y_pred[ids]
239
+ # train tree
240
+ g = gradient(y_sub, y_pred_sub)
241
+ h = hessian(y_sub, y_pred_sub)
242
+ tree = plant_tree
243
+ tree.fit(x_sub, y_sub, g, h)
244
+ estimators.push(tree)
245
+ # update
246
+ y_pred += tree.predict(x)
247
+ end
248
+ estimators
249
+ end
250
+
251
+ # for debug
252
+ #
253
+ # def loss(y_true, y_pred)
254
+ # # y_true in {-1, 1}
255
+ # Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
256
+ # end
257
+
258
+ def gradient(y_true, y_pred)
259
+ # y in {-1, 1}
260
+ -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
261
+ end
262
+
263
+ def hessian(y_true, y_pred)
264
+ abs_response = gradient(y_true, y_pred).abs
265
+ abs_response * (2.0 - abs_response)
266
+ end
267
+
268
+ def plant_tree
269
+ Rumale::Tree::GradientTreeRegressor.new(
270
+ reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
271
+ max_depth: @params[:max_depth],
272
+ max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
273
+ max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
274
+ )
275
+ end
276
+ end
277
+ end
278
+ end
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/values'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/regressor'
6
+ require 'rumale/tree/gradient_tree_regressor'
7
+
8
+ module Rumale
9
+ module Ensemble
10
+ # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
11
+ # The class uses the L2 loss as the loss function.
12
+ #
13
+ # @example
14
+ # estimator =
15
+ # Rumale::Ensemble::GradientBoostingRegressor.new(
16
+ # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
17
+ # estimator.fit(training_samples, training_values)
18
+ # results = estimator.predict(testing_samples)
19
+ #
20
+ # *reference*
21
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
22
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
23
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
24
+ #
25
+ class GradientBoostingRegressor
26
+ include Base::BaseEstimator
27
+ include Base::Regressor
28
+
29
+ # Return the set of estimators.
30
+ # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
31
+ attr_reader :estimators
32
+
33
+ # Return the importance for each feature.
34
+ # The feature importances are calculated based on the number of times the feature is used for splitting.
35
+ # @return [Numo::DFloat] (size: n_features)
36
+ attr_reader :feature_importances
37
+
38
+ # Return the random generator for random selection of feature index.
39
+ # @return [Random]
40
+ attr_reader :rng
41
+
42
+ # Create a new regressor with gradient tree boosting.
43
+ #
44
+ # @param n_estimators [Integer] The number of trees for constructing the regressor.
45
+ # @param learning_rate [Float] The boosting learning rate.
46
+ # @param reg_lambda [Float] The L2 regularization term on weight.
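+ # @param subsample [Float] The fraction of the training samples drawn at random (without replacement) to fit each tree.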
47
+ # @param max_depth [Integer] The maximum depth of the tree.
48
+ # If nil is given, decision tree grows without concern for depth.
49
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
50
+ # If nil is given, number of leaves is not limited.
51
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
52
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
53
+ # If nil is given, split process considers all features.
54
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
55
+ # It is used to randomly determine the order of features when deciding the splitting point.
56
+ def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
57
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
58
+ max_features: nil, random_seed: nil)
59
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
60
+ max_features: max_features, random_seed: random_seed)
61
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
62
+ check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
63
+ check_params_positive(n_estimators: n_estimators,
64
+ learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
65
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
66
+ max_features: max_features)
67
+ @params = {}
68
+ @params[:n_estimators] = n_estimators
69
+ @params[:learning_rate] = learning_rate
70
+ @params[:reg_lambda] = reg_lambda
71
+ @params[:subsample] = subsample
72
+ @params[:max_depth] = max_depth
73
+ @params[:max_leaf_nodes] = max_leaf_nodes
74
+ @params[:min_samples_leaf] = min_samples_leaf
75
+ @params[:max_features] = max_features
76
+ @params[:random_seed] = random_seed
77
+ @params[:random_seed] ||= srand
78
+ @estimators = nil
79
+ @base_predictions = nil
80
+ @feature_importances = nil
81
+ @rng = Random.new(@params[:random_seed])
82
+ end
83
+
84
+ # Fit the model with given training data.
85
+ #
86
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
87
+ # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
88
+ # @return [GradientBoostingRegressor] The learned regressor itself.
89
+ def fit(x, y)
90
+ check_sample_array(x)
91
+ check_tvalue_array(y)
92
+ check_sample_tvalue_size(x, y)
93
+
94
+ n_features = x.shape[1]
95
+ @params[:max_features] = n_features if @params[:max_features].nil?
96
+ @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
97
+
98
+ # train regressor.
99
+ n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
100
+ @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
101
+ @estimators = if n_outputs > 1
102
+ Array.new(n_outputs) do |n|
103
+ partial_fit(x, y[true, n], @base_predictions[n])
104
+ end
105
+ else
106
+ partial_fit(x, y, @base_predictions)
107
+ end
108
+
109
+ # calculate feature importances.
110
+ @feature_importances = Numo::DFloat.zeros(n_features)
111
+ if n_outputs > 1
112
+ n_outputs.times do |n|
113
+ @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
114
+ end
115
+ else
116
+ @estimators.each { |tree| @feature_importances += tree.feature_importances }
117
+ end
118
+
119
+ self
120
+ end
121
+
122
+ # Predict values for samples.
123
+ #
124
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
125
+ # @return [Numo::DFloat] (shape: [n_samples]) Predicted values per sample.
126
+ def predict(x)
127
+ check_sample_array(x)
128
+ n_samples = x.shape[0]
129
+ n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
130
+ if n_outputs > 1
131
+ predicted = Numo::DFloat.ones(n_samples, n_outputs) * @base_predictions
132
+ n_outputs.times do |n|
133
+ @estimators[n].each { |tree| predicted[true, n] += tree.predict(x) }
134
+ end
135
+ else
136
+ predicted = Numo::DFloat.ones(n_samples) * @base_predictions
137
+ @estimators.each { |tree| predicted += tree.predict(x) }
138
+ end
139
+ predicted
140
+ end
141
+
142
+ # Return the index of the leaf that each sample reached.
143
+ #
144
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
145
+ # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
146
+ def apply(x)
147
+ check_sample_array(x)
148
+ n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
149
+ leaf_ids = if n_outputs > 1
150
+ Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
151
+ else
152
+ @estimators.map { |tree| tree.apply(x) }
153
+ end
154
+ Numo::Int32[*leaf_ids].transpose
155
+ end
156
+
157
+ # Dump marshal data.
158
+ # @return [Hash] The marshal data about GradientBoostingRegressor.
159
+ def marshal_dump
160
+ { params: @params,
161
+ estimators: @estimators,
162
+ base_predictions: @base_predictions,
163
+ feature_importances: @feature_importances,
164
+ rng: @rng }
165
+ end
166
+
167
+ # Load marshal data.
168
+ # @return [nil]
169
+ def marshal_load(obj)
170
+ @params = obj[:params]
171
+ @estimators = obj[:estimators]
172
+ @base_predictions = obj[:base_predictions]
173
+ @feature_importances = obj[:feature_importances]
174
+ @rng = obj[:rng]
175
+ nil
176
+ end
177
+
178
+ private
179
+
180
+ def partial_fit(x, y, init_pred)
181
+ # initialize some variables.
182
+ estimators = []
183
+ n_samples = x.shape[0]
184
+ n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
185
+ whole_ids = Array.new(n_samples) { |v| v }
186
+ y_pred = Numo::DFloat.ones(n_samples) * init_pred
187
+ # grow trees.
188
+ @params[:n_estimators].times do |_t|
189
+ # subsampling
190
+ ids = whole_ids.sample(n_sub_samples, random: @rng)
191
+ x_sub = x[ids, true]
192
+ y_sub = y[ids]
193
+ y_pred_sub = y_pred[ids]
194
+ # train tree
195
+ g = gradient(y_sub, y_pred_sub)
196
+ h = hessian(n_sub_samples)
197
+ tree = plant_tree
198
+ tree.fit(x_sub, y_sub, g, h)
199
+ estimators.push(tree)
200
+ # update
201
+ y_pred += tree.predict(x)
202
+ end
203
+ estimators
204
+ end
205
+
206
+ # for debug
207
+ #
208
+ # def loss(y_true, y_pred)
209
+ # ((y_true - y_pred)**2).mean
210
+ # end
211
+
212
+ def gradient(y_true, y_pred)
213
+ y_pred - y_true
214
+ end
215
+
216
+ def hessian(n_samples)
217
+ Numo::DFloat.ones(n_samples)
218
+ end
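Editorial note: these correspond to the squared-error loss $L(y, f) = \tfrac{1}{2}(y - f)^2$ (the commented-out `loss` method reports the unscaled mean of $(y - f)^2$), for which $\partial L / \partial f = f - y$ and $\partial^2 L / \partial f^2 = 1$.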
219
+
220
+ def plant_tree
221
+ Rumale::Tree::GradientTreeRegressor.new(
222
+ reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
223
+ max_depth: @params[:max_depth],
224
+ max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
225
+ max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
226
+ )
227
+ end
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
+ # Discretizes features with a given number of bins.
9
+ # In some cases, discretizing features may accelerate decision tree training.
10
+ #
11
+ # @example
12
+ # discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 4)
13
+ # samples = Numo::DFloat.new(5, 2).rand - 0.5
14
+ # transformed = discretizer.fit_transform(samples)
15
+ # # > pp samples
16
+ # # Numo::DFloat#shape=[5,2]
17
+ # # [[-0.438246, -0.126933],
18
+ # # [ 0.294815, -0.298958],
19
+ # # [-0.383959, -0.155968],
20
+ # # [ 0.039948, 0.237815],
21
+ # # [-0.334911, -0.449117]]
22
+ # # > pp transformed
23
+ # # Numo::DFloat#shape=[5,2]
24
+ # # [[0, 1],
25
+ # # [3, 0],
26
+ # # [0, 1],
27
+ # # [2, 3],
28
+ # # [0, 0]]
29
+ class BinDiscretizer
30
+ include Base::BaseEstimator
31
+ include Base::Transformer
32
+
33
+ # Return the feature steps to be used for discretizing.
34
+ # @return [Array<Numo::DFloat>] (shape: [n_features, n_bins])
35
+ attr_reader :feature_steps
36
+
37
+ # Create a new discretizer for features with given number of bins.
38
+ #
39
+ # @param n_bins [Integer] The number of bins to be used for discretizing feature values.
40
+ def initialize(n_bins: 32)
41
+ @params = {}
42
+ @params[:n_bins] = n_bins
43
+ @feature_steps = nil
44
+ end
45
+
46
+ # Fit feature ranges to be discretized.
47
+ #
48
+ # @overload fit(x) -> BinDiscretizer
49
+ #
50
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the feature ranges.
51
+ # @return [BinDiscretizer]
52
+ def fit(x, _y = nil)
53
+ check_sample_array(x)
54
+ n_features = x.shape[1]
55
+ max_vals = x.max(0)
56
+ min_vals = x.min(0)
57
+ @feature_steps = Array.new(n_features) do |n|
58
+ Numo::DFloat.linspace(min_vals[n], max_vals[n], @params[:n_bins] + 1)[0...@params[:n_bins]]
59
+ end
60
+ self
61
+ end
62
+
63
+ # Fit feature ranges to be discretized, then return discretized samples.
64
+ #
65
+ # @overload fit_transform(x) -> Numo::DFloat
66
+ #
67
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
68
+ # @return [Numo::DFloat] The discretized samples.
69
+ def fit_transform(x, _y = nil)
70
+ check_sample_array(x)
71
+ fit(x).transform(x)
72
+ end
73
+
74
+ # Perform discretization of the given samples.
75
+ #
76
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
77
+ # @return [Numo::DFloat] The discretized samples.
78
+ def transform(x)
79
+ check_sample_array(x)
80
+ n_samples, n_features = x.shape
81
+ transformed = Numo::DFloat.zeros(n_samples, n_features)
82
+ n_features.times do |n|
83
+ steps = @feature_steps[n]
84
+ @params[:n_bins].times do |bin|
85
+ mask = x[true, n].ge(steps[bin]).where
86
+ transformed[mask, n] = bin
87
+ end
88
+ end
89
+ transformed
90
+ end
91
+
92
+ # Dump marshal data.
93
+ # @return [Hash] The marshal data about BinDiscretizer
94
+ def marshal_dump
95
+ { params: @params,
96
+ feature_steps: @feature_steps }
97
+ end
98
+
99
+ # Load marshal data.
100
+ # @return [nil]
101
+ def marshal_load(obj)
102
+ @params = obj[:params]
103
+ @feature_steps = obj[:feature_steps]
104
+ nil
105
+ end
106
+ end
107
+ end
108
+ end
@@ -9,7 +9,7 @@ module Rumale
9
9
  # Normalize samples to unit L2-norm.
10
10
  #
11
11
  # @example
12
- # normalizer = Rumale::Preprocessing::StandardScaler.new
12
+ # normalizer = Rumale::Preprocessing::L2Normalizer.new
13
13
  # new_samples = normalizer.fit_transform(samples)
14
14
  class L2Normalizer
15
15
  include Base::BaseEstimator
@@ -86,14 +86,13 @@ module Rumale
86
86
  return put_leaf(node, y) if stop_growing?(y)
87
87
 
88
88
  # calculate optimal parameters.
89
- feature_id, left_ids, right_ids, left_imp, right_imp, threshold, gain = rand_ids(n_features).map do |fid|
90
- ft = x[true, fid]
91
- limp, rimp, th, ga = best_split(ft, y, whole_impurity)
92
- [fid, ft.le(th).where, ft.gt(th).where, limp, rimp, th, ga]
93
- end.max_by(&:last)
89
+ feature_id, left_imp, right_imp, threshold, gain =
90
+ rand_ids(n_features).map { |n| [n, *best_split(x[true, n], y, whole_impurity)] }.max_by(&:last)
94
91
 
95
92
  return put_leaf(node, y) if gain.nil? || gain.zero?
96
93
 
94
+ left_ids = x[true, feature_id].le(threshold).where
95
+ right_ids = x[true, feature_id].gt(threshold).where
97
96
  node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
98
97
  node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
99
98
 
@@ -107,8 +107,8 @@ module Rumale
107
107
  threshold = @rng.rand(features.min..features.max)
108
108
  l_ids = features.le(threshold).where
109
109
  r_ids = features.gt(threshold).where
110
- l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
111
- r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
110
+ l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
111
+ r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
112
112
  gain = whole_impurity -
113
113
  l_impurity * l_ids.size.fdiv(y.shape[0]) -
114
114
  r_impurity * r_ids.size.fdiv(y.shape[0])
@@ -94,8 +94,8 @@ module Rumale
94
94
  threshold = @rng.rand(features.min..features.max)
95
95
  l_ids = features.le(threshold).where
96
96
  r_ids = features.gt(threshold).where
97
- l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
98
- r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
97
+ l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
98
+ r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
99
99
  gain = whole_impurity -
100
100
  l_impurity * l_ids.size.fdiv(y.shape[0]) -
101
101
  r_impurity * r_ids.size.fdiv(y.shape[0])
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/rumale'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/regressor'
6
+ require 'rumale/tree/node'
7
+
8
+ module Rumale
9
+ module Tree
10
+ # GradientTreeRegressor is a class that implements a decision tree for regression with the exact greedy algorithm.
11
+ # This class is used internally for estimators with gradient tree boosting.
12
+ #
13
+ # *reference*
14
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
15
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
16
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
17
+ #
18
+ class GradientTreeRegressor
19
+ include Base::BaseEstimator
20
+ include Base::Regressor
21
+ include ExtGradientTreeRegressor
22
+
23
+ # Return the importance for each feature.
24
+ # The feature importances are calculated based on the number of times the feature is used for splitting.
25
+ # @return [Numo::DFloat] (shape: [n_features])
26
+ attr_reader :feature_importances
27
+
28
+ # Return the learned tree.
29
+ # @return [Node]
30
+ attr_reader :tree
31
+
32
+ # Return the random generator for random selection of feature index.
33
+ # @return [Random]
34
+ attr_reader :rng
35
+
36
+ # Return the values assigned to each leaf.
37
+ # @return [Numo::DFloat] (shape: [n_leaves])
38
+ attr_reader :leaf_weights
39
+
40
+ # Initialize a gradient tree regressor
41
+ #
42
+ # @param reg_lambda [Float] The L2 regularization term on weight.
43
+ # @param shrinkage_rate [Float] The shrinkage rate for weight.
44
+ # @param max_depth [Integer] The maximum depth of the tree.
45
+ # If nil is given, decision tree grows without concern for depth.
46
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
47
+ # If nil is given, number of leaves is not limited.
48
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
49
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
50
+ # If nil is given, split process considers all features.
51
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
52
+ # It is used to randomly determine the order of features when deciding the splitting point.
53
+ def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
54
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
55
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
56
+ max_features: max_features, random_seed: random_seed)
57
+ check_params_float(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate)
58
+ check_params_integer(min_samples_leaf: min_samples_leaf)
59
+ check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
60
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
61
+ min_samples_leaf: min_samples_leaf, max_features: max_features)
62
+ @params = {}
63
+ @params[:reg_lambda] = reg_lambda
64
+ @params[:shrinkage_rate] = shrinkage_rate
65
+ @params[:max_depth] = max_depth
66
+ @params[:max_leaf_nodes] = max_leaf_nodes
67
+ @params[:min_samples_leaf] = min_samples_leaf
68
+ @params[:max_features] = max_features
69
+ @params[:random_seed] = random_seed
70
+ @params[:random_seed] ||= srand
71
+ @tree = nil
72
+ @feature_importances = nil
73
+ @n_leaves = nil
74
+ @leaf_weights = nil
75
+ @rng = Random.new(@params[:random_seed])
76
+ end
77
+
78
+ # Fit the model with given training data.
79
+ #
80
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
81
+ # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
82
+ # @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
83
+ # @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
84
+ # @return [GradientTreeRegressor] The learned regressor itself.
85
+ def fit(x, y, g, h)
86
+ check_sample_array(x)
87
+ check_tvalue_array(y)
88
+ check_sample_tvalue_size(x, y)
89
+ check_params_type(Numo::DFloat, g: g, h: h)
90
+ # Initialize some variables.
91
+ n_features = x.shape[1]
92
+ @params[:max_features] ||= n_features
93
+ @n_leaves = 0
94
+ @leaf_weights = []
95
+ @feature_importances = Numo::DFloat.zeros(n_features)
96
+ # Build tree.
97
+ build_tree(x, y, g, h)
98
+ @leaf_weights = Numo::DFloat[*@leaf_weights]
99
+ self
100
+ end
101
+
102
+ # Predict values for samples.
103
+ #
104
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
105
+ # @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
106
+ def predict(x)
107
+ check_sample_array(x)
108
+ @leaf_weights[apply(x)].dup
109
+ end
110
+
111
+ # Return the index of the leaf that each sample reached.
112
+ #
113
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
114
+ # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
115
+ def apply(x)
116
+ check_sample_array(x)
117
+ Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
118
+ end
119
+
120
+ # Dump marshal data.
121
+ # @return [Hash] The marshal data about GradientTreeRegressor.
122
+ def marshal_dump
123
+ { params: @params,
124
+ tree: @tree,
125
+ feature_importances: @feature_importances,
126
+ leaf_weights: @leaf_weights,
127
+ rng: @rng }
128
+ end
129
+
130
+ # Load marshal data.
131
+ # @return [nil]
132
+ def marshal_load(obj)
133
+ @params = obj[:params]
134
+ @tree = obj[:tree]
135
+ @feature_importances = obj[:feature_importances]
136
+ @leaf_weights = obj[:leaf_weights]
137
+ @rng = obj[:rng]
138
+ nil
139
+ end
140
+
141
+ private
142
+
143
+ def apply_at_node(node, sample)
144
+ return node.leaf_id if node.leaf
145
+ return apply_at_node(node.left, sample) if node.right.nil?
146
+ return apply_at_node(node.right, sample) if node.left.nil?
147
+ if sample[node.feature_id] <= node.threshold
148
+ apply_at_node(node.left, sample)
149
+ else
150
+ apply_at_node(node.right, sample)
151
+ end
152
+ end
153
+
154
+ def build_tree(x, y, g, h)
155
+ @tree = grow_node(0, x, y, g, h)
156
+ nil
157
+ end
158
+
159
+ def grow_node(depth, x, y, g, h)
160
+ # initialize some variables.
161
+ sum_g = g.sum
162
+ sum_h = h.sum
163
+ n_samples, n_features = x.shape
164
+ node = Node.new(depth: depth, n_samples: n_samples)
165
+
166
+ # terminate growing.
167
+ unless @params[:max_leaf_nodes].nil?
168
+ return nil if @n_leaves >= @params[:max_leaf_nodes]
169
+ end
170
+
171
+ return nil if n_samples < @params[:min_samples_leaf]
172
+ return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
173
+
174
+ unless @params[:max_depth].nil?
175
+ return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
176
+ end
177
+
178
+ return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
179
+
180
+ # calculate optimal parameters.
181
+ feature_id, threshold, gain =
182
+ rand_ids(n_features).map { |n| [n, *best_split(x[true, n], g, h, sum_g, sum_h)] }.max_by(&:last)
183
+
184
+ return put_leaf(node, sum_g, sum_h) if gain.nil? || gain.zero?
185
+
186
+ left_ids = x[true, feature_id].le(threshold).where
187
+ right_ids = x[true, feature_id].gt(threshold).where
188
+ node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
189
+ node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])
190
+
191
+ return put_leaf(node, sum_g, sum_h) if node.left.nil? && node.right.nil?
192
+
193
+ @feature_importances[feature_id] += 1.0
194
+
195
+ node.feature_id = feature_id
196
+ node.threshold = threshold
197
+ node.leaf = false
198
+ node
199
+ end
200
+
201
+ def stop_growing?(y)
202
+ y.to_a.uniq.size == 1
203
+ end
204
+
205
+ def put_leaf(node, sum_g, sum_h)
206
+ node.probs = nil
207
+ node.leaf = true
208
+ node.leaf_id = @n_leaves
209
+ weight = -@params[:shrinkage_rate] * sum_g / (sum_h + @params[:reg_lambda])
210
+ @leaf_weights.push(weight)
211
+ @n_leaves += 1
212
+ node
213
+ end
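Editorial note: the weight stored above is the closed-form minimizer of the second-order objective for a single leaf (as in the XGBoost reference), scaled by the shrinkage rate $\nu$ (`shrinkage_rate`), with $\lambda$ = `reg_lambda` and $g_i, h_i$ the gradients and hessians of the samples in the leaf:

$$w^{*} = -\nu\,\frac{\sum_i g_i}{\sum_i h_i + \lambda}$$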
214
+
215
+ def best_split(features, g, h, sum_g, sum_h)
216
+ order = features.sort_index
217
+ sorted_f = features[order].to_a
218
+ sorted_g = g[order].to_a
219
+ sorted_h = h[order].to_a
220
+ find_split_params(sorted_f, sorted_g, sorted_h, sum_g, sum_h, @params[:reg_lambda])
221
+ end
222
+
223
+ def rand_ids(n)
224
+ [*0...n].sample(@params[:max_features], random: @rng)
225
+ end
226
+ end
227
+ end
228
+ end
@@ -21,7 +21,7 @@ module Rumale
21
21
  # @param feature_id [Integer] The feature index used for evaluation.
22
22
  # @param threshold [Float] The threshold value of the feature for splitting the node.
23
23
  def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
24
- leaf: true, leaf_id: 0,
24
+ leaf: false, leaf_id: nil,
25
25
  left: nil, right: nil, feature_id: 0, threshold: 0.0)
26
26
  @depth = depth
27
27
  @impurity = impurity
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.9.1'
6
+ VERSION = '0.9.2'
7
7
  end
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
17
17
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
18
18
  Rumale currently supports Linear / Kernel Support Vector Machine,
19
19
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
20
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
20
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
21
21
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
22
22
  MSG
23
23
  spec.homepage = 'https://github.com/yoshoku/rumale'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-05-01 00:00:00.000000000 Z
11
+ date: 2019-05-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -99,7 +99,7 @@ description: |
99
99
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
100
100
  Rumale currently supports Linear / Kernel Support Vector Machine,
101
101
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
102
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
102
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
103
103
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
104
104
  email:
105
105
  - yoshoku@outlook.com
@@ -142,6 +142,8 @@ files:
142
142
  - lib/rumale/ensemble/ada_boost_regressor.rb
143
143
  - lib/rumale/ensemble/extra_trees_classifier.rb
144
144
  - lib/rumale/ensemble/extra_trees_regressor.rb
145
+ - lib/rumale/ensemble/gradient_boosting_classifier.rb
146
+ - lib/rumale/ensemble/gradient_boosting_regressor.rb
145
147
  - lib/rumale/ensemble/random_forest_classifier.rb
146
148
  - lib/rumale/ensemble/random_forest_regressor.rb
147
149
  - lib/rumale/evaluation_measure/accuracy.rb
@@ -191,6 +193,7 @@ files:
191
193
  - lib/rumale/polynomial_model/base_factorization_machine.rb
192
194
  - lib/rumale/polynomial_model/factorization_machine_classifier.rb
193
195
  - lib/rumale/polynomial_model/factorization_machine_regressor.rb
196
+ - lib/rumale/preprocessing/bin_discretizer.rb
194
197
  - lib/rumale/preprocessing/l2_normalizer.rb
195
198
  - lib/rumale/preprocessing/label_encoder.rb
196
199
  - lib/rumale/preprocessing/max_abs_scaler.rb
@@ -203,6 +206,7 @@ files:
203
206
  - lib/rumale/tree/decision_tree_regressor.rb
204
207
  - lib/rumale/tree/extra_tree_classifier.rb
205
208
  - lib/rumale/tree/extra_tree_regressor.rb
209
+ - lib/rumale/tree/gradient_tree_regressor.rb
206
210
  - lib/rumale/tree/node.rb
207
211
  - lib/rumale/utils.rb
208
212
  - lib/rumale/validation.rb