rumale 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
4
- data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
3
+ metadata.gz: 1d73f16bcd1d149babe18c1db66d3f72bb9a1206
4
+ data.tar.gz: 247fd7d548563ef27622c293073236468f634b7d
5
5
  SHA512:
6
- metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
7
- data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
6
+ metadata.gz: 6a4a92d08ee0a8295d96a930a46fb67a9299a9e0beb717d52186347fef3b70727e35a2375e6c50f5da37ab699132fe0d3c3beeeb0a9730a158e3a5864f6b8364
7
+ data.tar.gz: a614c5002c750f9091a0b7b80b678115ea6b65a1a7d0de621431ee942f8f1678d36c64a271cdb1cc0c4a68c49d20bacfe934d844381ad78361c09e762e02e872
@@ -1,3 +1,5 @@
1
+ require: rubocop-performance
2
+
1
3
  inherit_from: .rubocop_todo.yml
2
4
 
3
5
  AllCops:
@@ -1,3 +1,11 @@
1
+ # 0.9.2
2
+ - Add class for Gradient tree boosting classifier.
3
+ - Add class for Gradient tree boosting regressor.
4
+ - Add class for discretizing feature values.
5
+ - Refactor extra-trees estimators.
6
+ - Refactor decision tree base class.
7
+ - Fix some typos in the documentation ([#6](https://github.com/yoshoku/rumale/pull/6)).
8
+
1
9
  # 0.9.1
2
10
  - Add class for Extra-Trees classifier.
3
11
  - Add class for Extra-Trees regressor.
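
As a rough illustration of the 0.9.2 additions listed above, the sketch below wires the new BinDiscretizer and GradientBoostingClassifier together. It is a minimal, hypothetical example: the data is generated on the spot, and it assumes the Numo::DFloat / Numo::Int32 conventions used throughout Rumale.

    require 'rumale'

    # Hypothetical data: 100 samples, 4 features, binary labels derived from the first feature.
    samples = Numo::DFloat.new(100, 4).rand
    labels  = Numo::Int32.cast(samples[true, 0].gt(0.5))

    # New in 0.9.2: equal-width binning of feature values.
    discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 8)
    binned = discretizer.fit_transform(samples)

    # New in 0.9.2: gradient tree boosting classifier.
    estimator = Rumale::Ensemble::GradientBoostingClassifier.new(
      n_estimators: 50, learning_rate: 0.1, reg_lambda: 0.001, random_seed: 1
    )
    estimator.fit(binned, labels)
    puts estimator.predict(binned).eq(labels).count.fdiv(labels.size) # training accuracy
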
data/README.md CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
12
12
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
13
13
  Rumale supports Linear / Kernel Support Vector Machine,
14
14
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
15
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
15
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
16
16
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
17
17
 
18
18
  This project was formerly known as "SVMKit".
@@ -334,6 +334,72 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
334
334
  return opt_params;
335
335
  }
336
336
 
337
+ /**
338
+ * @!visibility private
339
+ * Find the split point with the maximum information gain.
340
+ *
341
+ * @overload find_split_params(sorted_features, sorted_gradient, sorted_hessian, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
342
+ *
343
+ * @param sorted_features [Array<Float>] (size: n_samples) The feature values sorted in ascending order.
345
+ * @param sorted_gradient [Array<Float>] (size: n_samples) The gradient values of loss function sorted according to feature values.
346
+ * @param sorted_hessian [Array<Float>] (size: n_samples) The hessian values of loss function sorted according to feature values.
347
+ * @param sum_gradient [Float] The sum of gradient values.
348
+ * @param sum_hessian [Float] The sum of hessian values.
349
+ * @param reg_lambda [Float] The L2 regularization term on weight.
350
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
351
+ */
352
+ static VALUE
353
+ find_split_params_grad_reg
354
+ (VALUE self, VALUE sorted_f, VALUE sorted_g, VALUE sorted_h, VALUE sum_g, VALUE sum_h, VALUE reg_l)
355
+ {
356
+ const long n_elements = RARRAY_LEN(sorted_f);
357
+ const double s_grad = NUM2DBL(sum_g);
358
+ const double s_hess = NUM2DBL(sum_h);
359
+ const double reg_lambda = NUM2DBL(reg_l);
360
+ long curr_pos = 0;
361
+ long next_pos = 0;
362
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
363
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
364
+ double next_el;
365
+ double l_grad = 0.0;
366
+ double l_hess = 0.0;
367
+ double r_grad;
368
+ double r_hess;
369
+ double gain;
370
+ VALUE opt_params = rb_ary_new2(2);
371
+
372
+ /* Initialize optimal parameters. */
373
+ rb_ary_store(opt_params, 0, rb_ary_entry(sorted_f, 0)); /* threshold */
374
+ rb_ary_store(opt_params, 1, DBL2NUM(0)); /* gain */
375
+
376
+ /* Find optimal parameters. */
377
+ while (curr_pos < n_elements && curr_el != last_el) {
378
+ next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
379
+ while (next_pos < n_elements && next_el == curr_el) {
380
+ l_grad += NUM2DBL(rb_ary_entry(sorted_g, next_pos));
381
+ l_hess += NUM2DBL(rb_ary_entry(sorted_h, next_pos));
382
+ next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
383
+ }
384
+ /* Calculate gain of new split. */
385
+ r_grad = s_grad - l_grad;
386
+ r_hess = s_hess - l_hess;
387
+ gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
388
+ (r_grad * r_grad) / (r_hess + reg_lambda) -
389
+ (s_grad * s_grad) / (s_hess + reg_lambda);
390
+ /* Update optimal parameters. */
391
+ if (gain > NUM2DBL(rb_ary_entry(opt_params, 1))) {
392
+ rb_ary_store(opt_params, 0, DBL2NUM(0.5 * (curr_el + next_el)));
393
+ rb_ary_store(opt_params, 1, DBL2NUM(gain));
394
+ }
395
+ if (next_pos == n_elements) break;
396
+ curr_pos = next_pos;
397
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
398
+ }
399
+
400
+ return opt_params;
401
+ }
402
+
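
For readability: the score computed by find_split_params_grad_reg above is the familiar XGBoost-style split gain, i.e. the regularized squared-gradient sums of the left and right partitions minus that of the parent. A pure-Ruby sketch of the same formula (illustrative names, not part of the C extension's API):

    # Gain of splitting a node into (l_grad, l_hess) and (r_grad, r_hess) partitions.
    def split_gain(l_grad, l_hess, r_grad, r_hess, reg_lambda)
      s_grad = l_grad + r_grad
      s_hess = l_hess + r_hess
      (l_grad**2) / (l_hess + reg_lambda) +
        (r_grad**2) / (r_hess + reg_lambda) -
        (s_grad**2) / (s_hess + reg_lambda)
    end

    # A positive gain means splitting beats keeping the parent as a single leaf, e.g.:
    split_gain(-3.0, 4.0, 2.0, 5.0, 0.001) # => about 2.94
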
337
403
  /**
338
404
  * @!visibility private
339
405
  * Calculate impurity based on criterion.
@@ -406,9 +472,17 @@ void Init_rumale(void)
406
472
  * This module is used internally.
407
473
  */
408
474
  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
475
+ /**
476
+ * Document-module: Rumale::Tree::ExtGradientTreeRegressor
477
+ * @!visibility private
478
+ * The mixin module consisting of extension method for GradientTreeRegressor class.
479
+ * This module is used internally.
480
+ */
481
+ VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
409
482
 
410
483
  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
411
484
  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
485
+ rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 6);
412
486
  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
413
487
  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
414
488
  }
@@ -47,8 +47,11 @@ require 'rumale/tree/decision_tree_classifier'
47
47
  require 'rumale/tree/decision_tree_regressor'
48
48
  require 'rumale/tree/extra_tree_classifier'
49
49
  require 'rumale/tree/extra_tree_regressor'
50
+ require 'rumale/tree/gradient_tree_regressor'
50
51
  require 'rumale/ensemble/ada_boost_classifier'
51
52
  require 'rumale/ensemble/ada_boost_regressor'
53
+ require 'rumale/ensemble/gradient_boosting_classifier'
54
+ require 'rumale/ensemble/gradient_boosting_regressor'
52
55
  require 'rumale/ensemble/random_forest_classifier'
53
56
  require 'rumale/ensemble/random_forest_regressor'
54
57
  require 'rumale/ensemble/extra_trees_classifier'
@@ -61,6 +64,7 @@ require 'rumale/preprocessing/l2_normalizer'
61
64
  require 'rumale/preprocessing/min_max_scaler'
62
65
  require 'rumale/preprocessing/max_abs_scaler'
63
66
  require 'rumale/preprocessing/standard_scaler'
67
+ require 'rumale/preprocessing/bin_discretizer'
64
68
  require 'rumale/preprocessing/label_encoder'
65
69
  require 'rumale/preprocessing/one_hot_encoder'
66
70
  require 'rumale/model_selection/k_fold'
@@ -42,7 +42,7 @@ module Rumale
42
42
 
43
43
  # Create a new classifier with AdaBoost.
44
44
  #
45
- # @param n_estimators [Integer] The numeber of decision trees for contructing random forest.
45
+ # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
46
46
  # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
47
47
  # @param max_depth [Integer] The maximum depth of the tree.
48
48
  # If nil is given, decision tree grows without concern for depth.
@@ -42,7 +42,7 @@ module Rumale
42
42
 
43
43
  # Create a new regressor with random forest.
44
44
  #
45
- # @param n_estimators [Integer] The numeber of decision trees for contructing random forest.
45
+ # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
46
46
  # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1]
47
47
  # @param exponent [Float] The exponent for the weight of each weak learner.
48
48
  # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
@@ -0,0 +1,278 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/values'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/classifier'
6
+ require 'rumale/tree/gradient_tree_regressor'
7
+
8
+ module Rumale
9
+ module Ensemble
10
+ # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
11
+ # The class uses the negative binomial log-likelihood as the loss function.
12
+ # For multiclass classification problems, it uses the one-vs-the-rest strategy.
13
+ #
14
+ # @example
15
+ # estimator =
16
+ # Rumale::Ensemble::GradientBoostingClassifier.new(
17
+ # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
18
+ # estimator.fit(training_samples, training_values)
19
+ # results = estimator.predict(testing_samples)
20
+ #
21
+ # *reference*
22
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
23
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
24
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
25
+ #
26
+ class GradientBoostingClassifier
27
+ include Base::BaseEstimator
28
+ include Base::Classifier
29
+
30
+ # Return the set of estimators.
31
+ # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
32
+ attr_reader :estimators
33
+
34
+ # Return the class labels.
35
+ # @return [Numo::Int32] (size: n_classes)
36
+ attr_reader :classes
37
+
38
+ # Return the importance for each feature.
39
+ # The feature importances are calculated based on the numbers of times the feature is used for splitting.
40
+ # @return [Numo::DFloat] (size: n_features)
41
+ attr_reader :feature_importances
42
+
43
+ # Return the random generator for random selection of feature index.
44
+ # @return [Random]
45
+ attr_reader :rng
46
+
47
+ # Create a new classifier with gradient tree boosting.
48
+ #
49
+ # @param n_estimators [Integer] The number of trees for constructing the classifier.
50
+ # @param learning_rate [Float] The boosting learning rate.
51
+ # @param reg_lambda [Float] The L2 regularization term on weight.
52
+ # @param max_depth [Integer] The maximum depth of the tree.
53
+ # If nil is given, decision tree grows without concern for depth.
54
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
55
+ # If nil is given, number of leaves is not limited.
56
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
57
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
58
+ # If nil is given, split process considers all features.
59
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
60
+ # It is used to randomly determine the order of features when deciding the splitting point.
61
+ def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
62
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
63
+ max_features: nil, random_seed: nil)
64
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
65
+ max_features: max_features, random_seed: random_seed)
66
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
67
+ check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
68
+ check_params_positive(n_estimators: n_estimators,
69
+ learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
70
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
71
+ max_features: max_features)
72
+ @params = {}
73
+ @params[:n_estimators] = n_estimators
74
+ @params[:learning_rate] = learning_rate
75
+ @params[:reg_lambda] = reg_lambda
76
+ @params[:subsample] = subsample
77
+ @params[:max_depth] = max_depth
78
+ @params[:max_leaf_nodes] = max_leaf_nodes
79
+ @params[:min_samples_leaf] = min_samples_leaf
80
+ @params[:max_features] = max_features
81
+ @params[:random_seed] = random_seed
82
+ @params[:random_seed] ||= srand
83
+ @estimators = nil
84
+ @classes = nil
85
+ @base_predictions = nil
86
+ @feature_importances = nil
87
+ @rng = Random.new(@params[:random_seed])
88
+ end
89
+
90
+ # Fit the model with given training data.
91
+ #
92
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
93
+ # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
94
+ # @return [GradientBoostingClassifier] The learned classifier itself.
95
+ def fit(x, y)
96
+ check_sample_array(x)
97
+ check_label_array(y)
98
+ check_sample_label_size(x, y)
99
+
100
+ n_features = x.shape[1]
101
+ @params[:max_features] = n_features if @params[:max_features].nil?
102
+ @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
103
+
104
+ # train estimator.
105
+ @classes = Numo::Int32[*y.to_a.uniq.sort]
106
+ n_classes = @classes.size
107
+ if n_classes > 2
108
+ @base_predictions = Numo::DFloat.zeros(n_classes)
109
+ @estimators = Array.new(n_classes) do |n|
110
+ bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
111
+ y_mean = bin_y.mean
112
+ @base_predictions[n] = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
113
+ partial_fit(x, bin_y, @base_predictions[n])
114
+ end
115
+ else
116
+ negative_label = y.to_a.uniq.min
117
+ bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
118
+ y_mean = bin_y.mean
119
+ @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
120
+ @estimators = partial_fit(x, bin_y, @base_predictions)
121
+ end
122
+
123
+ # calculate feature importances.
124
+ @feature_importances = Numo::DFloat.zeros(n_features)
125
+ if n_classes > 2
126
+ n_classes.times do |n|
127
+ @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
128
+ end
129
+ else
130
+ @estimators.each { |tree| @feature_importances += tree.feature_importances }
131
+ end
132
+
133
+ self
134
+ end
135
+
136
+ # Calculate confidence scores for samples.
137
+ #
138
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
139
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
140
+ def decision_function(x)
141
+ check_sample_array(x)
142
+ n_samples = x.shape[0]
143
+ n_classes = @classes.size
144
+ if n_classes > 2
145
+ scores = Numo::DFloat.ones(n_samples, n_classes) * @base_predictions
146
+ n_classes.times do |n|
147
+ @estimators[n].each { |tree| scores[true, n] += tree.predict(x) }
148
+ end
149
+ else
150
+ scores = Numo::DFloat.ones(n_samples) * @base_predictions
151
+ @estimators.each { |tree| scores += tree.predict(x) }
152
+ end
153
+ scores
154
+ end
155
+
156
+ # Predict class labels for samples.
157
+ #
158
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
159
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
160
+ def predict(x)
161
+ check_sample_array(x)
162
+ n_samples = x.shape[0]
163
+ probs = predict_proba(x)
164
+ Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
165
+ end
166
+
167
+ # Predict probability for samples.
168
+ #
169
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
170
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
171
+ def predict_proba(x)
172
+ check_sample_array(x)
173
+
174
+ proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
175
+
176
+ return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
177
+
178
+ n_samples, = x.shape
179
+ probs = Numo::DFloat.zeros(n_samples, 2)
180
+ probs[true, 1] = proba
181
+ probs[true, 0] = 1.0 - proba
182
+ probs
183
+ end
184
+
185
+ # Return the index of the leaf that each sample reached.
186
+ #
187
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
188
+ # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
189
+ def apply(x)
190
+ check_sample_array(x)
191
+ n_classes = @classes.size
192
+ leaf_ids = if n_classes > 2
193
+ Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
194
+ else
195
+ @estimators.map { |tree| tree.apply(x) }
196
+ end
197
+ Numo::Int32[*leaf_ids].transpose
198
+ end
199
+
200
+ # Dump marshal data.
201
+ # @return [Hash] The marshal data about GradientBoostingClassifier.
202
+ def marshal_dump
203
+ { params: @params,
204
+ estimators: @estimators,
205
+ classes: @classes,
206
+ base_predictions: @base_predictions,
207
+ feature_importances: @feature_importances,
208
+ rng: @rng }
209
+ end
210
+
211
+ # Load marshal data.
212
+ # @return [nil]
213
+ def marshal_load(obj)
214
+ @params = obj[:params]
215
+ @estimators = obj[:estimators]
216
+ @classes = obj[:classes]
217
+ @base_predictions = obj[:base_predictions]
218
+ @feature_importances = obj[:feature_importances]
219
+ @rng = obj[:rng]
220
+ nil
221
+ end
222
+
223
+ private
224
+
225
+ def partial_fit(x, y, init_pred)
226
+ # initialize some variables.
227
+ estimators = []
228
+ n_samples = x.shape[0]
229
+ n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
230
+ whole_ids = Array.new(n_samples) { |v| v }
231
+ y_pred = Numo::DFloat.ones(n_samples) * init_pred
232
+ # grow trees.
233
+ @params[:n_estimators].times do |_t|
234
+ # subsampling
235
+ ids = whole_ids.sample(n_sub_samples, random: @rng)
236
+ x_sub = x[ids, true]
237
+ y_sub = y[ids]
238
+ y_pred_sub = y_pred[ids]
239
+ # train tree
240
+ g = gradient(y_sub, y_pred_sub)
241
+ h = hessian(y_sub, y_pred_sub)
242
+ tree = plant_tree
243
+ tree.fit(x_sub, y_sub, g, h)
244
+ estimators.push(tree)
245
+ # update
246
+ y_pred += tree.predict(x)
247
+ end
248
+ estimators
249
+ end
250
+
251
+ # for debug
252
+ #
253
+ # def loss(y_true, y_pred)
254
+ # # y_true in {-1, 1}
255
+ # Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
256
+ # end
257
+
258
+ def gradient(y_true, y_pred)
259
+ # y in {-1, 1}
260
+ -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
261
+ end
262
+
263
+ def hessian(y_true, y_pred)
264
+ abs_response = gradient(y_true, y_pred).abs
265
+ abs_response * (2.0 - abs_response)
266
+ end
267
+
268
+ def plant_tree
269
+ Rumale::Tree::GradientTreeRegressor.new(
270
+ reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
271
+ max_depth: @params[:max_depth],
272
+ max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
273
+ max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
274
+ )
275
+ end
276
+ end
277
+ end
278
+ end
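
A brief usage sketch for the classifier defined above. The training_samples / training_labels / testing_samples names are placeholders in the same spirit as the class's own @example; any Numo arrays of the documented shapes work.

    estimator = Rumale::Ensemble::GradientBoostingClassifier.new(n_estimators: 20, random_seed: 1)
    estimator.fit(training_samples, training_labels)

    scores = estimator.decision_function(testing_samples) # summed tree outputs (one column per class when multiclass)
    probs  = estimator.predict_proba(testing_samples)     # sigmoid of the scores, normalized across classes
    labels = estimator.predict(testing_samples)           # class with the highest probability

    # The estimator round-trips through Marshal thanks to marshal_dump / marshal_load.
    restored = Marshal.load(Marshal.dump(estimator))
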
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/values'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/regressor'
6
+ require 'rumale/tree/gradient_tree_regressor'
7
+
8
+ module Rumale
9
+ module Ensemble
10
+ # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
11
+ # The class uses the L2 loss as the loss function.
12
+ #
13
+ # @example
14
+ # estimator =
15
+ # Rumale::Ensemble::GradientBoostingRegressor.new(
16
+ # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
17
+ # estimator.fit(training_samples, training_values)
18
+ # results = estimator.predict(testing_samples)
19
+ #
20
+ # *reference*
21
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
22
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
23
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
24
+ #
25
+ class GradientBoostingRegressor
26
+ include Base::BaseEstimator
27
+ include Base::Regressor
28
+
29
+ # Return the set of estimators.
30
+ # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
31
+ attr_reader :estimators
32
+
33
+ # Return the importance for each feature.
34
+ # The feature importances are calculated based on the numbers of times the feature is used for splitting.
35
+ # @return [Numo::DFloat] (size: n_features)
36
+ attr_reader :feature_importances
37
+
38
+ # Return the random generator for random selection of feature index.
39
+ # @return [Random]
40
+ attr_reader :rng
41
+
42
+ # Create a new regressor with gradient tree boosting.
43
+ #
44
+ # @param n_estimators [Integer] The number of trees for constructing the regressor.
45
+ # @param learning_rate [Float] The boosting learning rate.
46
+ # @param reg_lambda [Float] The L2 regularization term on weight.
47
+ # @param max_depth [Integer] The maximum depth of the tree.
48
+ # If nil is given, decision tree grows without concern for depth.
49
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
50
+ # If nil is given, number of leaves is not limited.
51
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
52
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
53
+ # If nil is given, split process considers all features.
54
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
55
+ # It is used to randomly determine the order of features when deciding the splitting point.
56
+ def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
57
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
58
+ max_features: nil, random_seed: nil)
59
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
60
+ max_features: max_features, random_seed: random_seed)
61
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
62
+ check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
63
+ check_params_positive(n_estimators: n_estimators,
64
+ learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
65
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
66
+ max_features: max_features)
67
+ @params = {}
68
+ @params[:n_estimators] = n_estimators
69
+ @params[:learning_rate] = learning_rate
70
+ @params[:reg_lambda] = reg_lambda
71
+ @params[:subsample] = subsample
72
+ @params[:max_depth] = max_depth
73
+ @params[:max_leaf_nodes] = max_leaf_nodes
74
+ @params[:min_samples_leaf] = min_samples_leaf
75
+ @params[:max_features] = max_features
76
+ @params[:random_seed] = random_seed
77
+ @params[:random_seed] ||= srand
78
+ @estimators = nil
79
+ @base_predictions = nil
80
+ @feature_importances = nil
81
+ @rng = Random.new(@params[:random_seed])
82
+ end
83
+
84
+ # Fit the model with given training data.
85
+ #
86
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
87
+ # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
88
+ # @return [GradientBoostingRegressor] The learned regressor itself.
89
+ def fit(x, y)
90
+ check_sample_array(x)
91
+ check_tvalue_array(y)
92
+ check_sample_tvalue_size(x, y)
93
+
94
+ n_features = x.shape[1]
95
+ @params[:max_features] = n_features if @params[:max_features].nil?
96
+ @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
97
+
98
+ # train regressor.
99
+ n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
100
+ @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
101
+ @estimators = if n_outputs > 1
102
+ Array.new(n_outputs) do |n|
103
+ partial_fit(x, y[true, n], @base_predictions[n])
104
+ end
105
+ else
106
+ partial_fit(x, y, @base_predictions)
107
+ end
108
+
109
+ # calculate feature importances.
110
+ @feature_importances = Numo::DFloat.zeros(n_features)
111
+ if n_outputs > 1
112
+ n_outputs.times do |n|
113
+ @estimators[n].each { |tree| @feature_importances += tree.feature_importances }
114
+ end
115
+ else
116
+ @estimators.each { |tree| @feature_importances += tree.feature_importances }
117
+ end
118
+
119
+ self
120
+ end
121
+
122
+ # Predict values for samples.
123
+ #
124
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
125
+ # @return [Numo::DFloat] (shape: [n_samples]) Predicted values per sample.
126
+ def predict(x)
127
+ check_sample_array(x)
128
+ n_samples = x.shape[0]
129
+ n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
130
+ if n_outputs > 1
131
+ predicted = Numo::DFloat.ones(n_samples, n_outputs) * @base_predictions
132
+ n_outputs.times do |n|
133
+ @estimators[n].each { |tree| predicted[true, n] += tree.predict(x) }
134
+ end
135
+ else
136
+ predicted = Numo::DFloat.ones(n_samples) * @base_predictions
137
+ @estimators.each { |tree| predicted += tree.predict(x) }
138
+ end
139
+ predicted
140
+ end
141
+
142
+ # Return the index of the leaf that each sample reached.
143
+ #
144
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
145
+ # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
146
+ def apply(x)
147
+ check_sample_array(x)
148
+ n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
149
+ leaf_ids = if n_outputs > 1
150
+ Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
151
+ else
152
+ @estimators.map { |tree| tree.apply(x) }
153
+ end
154
+ Numo::Int32[*leaf_ids].transpose
155
+ end
156
+
157
+ # Dump marshal data.
158
+ # @return [Hash] The marshal data about GradientBoostingRegressor.
159
+ def marshal_dump
160
+ { params: @params,
161
+ estimators: @estimators,
162
+ base_predictions: @base_predictions,
163
+ feature_importances: @feature_importances,
164
+ rng: @rng }
165
+ end
166
+
167
+ # Load marshal data.
168
+ # @return [nil]
169
+ def marshal_load(obj)
170
+ @params = obj[:params]
171
+ @estimators = obj[:estimators]
172
+ @base_predictions = obj[:base_predictions]
173
+ @feature_importances = obj[:feature_importances]
174
+ @rng = obj[:rng]
175
+ nil
176
+ end
177
+
178
+ private
179
+
180
+ def partial_fit(x, y, init_pred)
181
+ # initialize some variables.
182
+ estimators = []
183
+ n_samples = x.shape[0]
184
+ n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
185
+ whole_ids = Array.new(n_samples) { |v| v }
186
+ y_pred = Numo::DFloat.ones(n_samples) * init_pred
187
+ # grow trees.
188
+ @params[:n_estimators].times do |_t|
189
+ # subsampling
190
+ ids = whole_ids.sample(n_sub_samples, random: @rng)
191
+ x_sub = x[ids, true]
192
+ y_sub = y[ids]
193
+ y_pred_sub = y_pred[ids]
194
+ # train tree
195
+ g = gradient(y_sub, y_pred_sub)
196
+ h = hessian(n_sub_samples)
197
+ tree = plant_tree
198
+ tree.fit(x_sub, y_sub, g, h)
199
+ estimators.push(tree)
200
+ # update
201
+ y_pred += tree.predict(x)
202
+ end
203
+ estimators
204
+ end
205
+
206
+ # for debug
207
+ #
208
+ # def loss(y_true, y_pred)
209
+ # ((y_true - y_pred)**2).mean
210
+ # end
211
+
212
+ def gradient(y_true, y_pred)
213
+ y_pred - y_true
214
+ end
215
+
216
+ def hessian(n_samples)
217
+ Numo::DFloat.ones(n_samples)
218
+ end
219
+
220
+ def plant_tree
221
+ Rumale::Tree::GradientTreeRegressor.new(
222
+ reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
223
+ max_depth: @params[:max_depth],
224
+ max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
225
+ max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
226
+ )
227
+ end
228
+ end
229
+ end
230
+ end
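
Because GradientBoostingRegressor uses the L2 loss, the derivatives fed to each GradientTreeRegressor are simply the residuals and a vector of ones, so every boosting round fits the current residuals. A small illustrative sketch mirroring the private #gradient and #hessian methods above (the lambda names are hypothetical):

    residual_gradient = ->(y_true, y_pred) { y_pred - y_true }        # dL/dF for L = 0.5 * (y - F)**2
    unit_hessian      = ->(n_samples) { Numo::DFloat.ones(n_samples) } # d2L/dF2 is constant 1

    # With unit hessians, a leaf weight -shrinkage_rate * sum_g / (sum_h + reg_lambda) reduces to
    #   shrinkage_rate * sum(y_true - y_pred) / (n_leaf_samples + reg_lambda),
    # i.e. a shrunken, regularized mean of the residuals falling into that leaf.
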
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
+ # Discretizes features with a given number of bins.
9
+ # In some cases, discretizing features may accelerate decision tree training.
10
+ #
11
+ # @example
12
+ # discretizer = Rumale::Preprocessing::BinDiscretizer.new(n_bins: 4)
13
+ # samples = Numo::DFloat.new(5, 2).rand - 0.5
14
+ # transformed = discretizer.fit_transform(samples)
15
+ # # > pp samples
16
+ # # Numo::DFloat#shape=[5,2]
17
+ # # [[-0.438246, -0.126933],
18
+ # # [ 0.294815, -0.298958],
19
+ # # [-0.383959, -0.155968],
20
+ # # [ 0.039948, 0.237815],
21
+ # # [-0.334911, -0.449117]]
22
+ # # > pp transformed
23
+ # # Numo::DFloat#shape=[5,2]
24
+ # # [[0, 1],
25
+ # # [3, 0],
26
+ # # [0, 1],
27
+ # # [2, 3],
28
+ # # [0, 0]]
29
+ class BinDiscretizer
30
+ include Base::BaseEstimator
31
+ include Base::Transformer
32
+
33
+ # Return the feature steps to be used for discretizing.
34
+ # @return [Array<Numo::DFloat>] (shape: [n_features, n_bins])
35
+ attr_reader :feature_steps
36
+
37
+ # Create a new discretizer for features with given number of bins.
38
+ #
39
+ # @param n_bins [Integer] The number of bins to be used for discretizing feature values.
40
+ def initialize(n_bins: 32)
41
+ @params = {}
42
+ @params[:n_bins] = n_bins
43
+ @feature_steps = nil
44
+ end
45
+
46
+ # Fit feature ranges to be discretized.
47
+ #
48
+ # @overload fit(x) -> BinDiscretizer
49
+ #
50
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the feature ranges.
51
+ # @return [BinDiscretizer]
52
+ def fit(x, _y = nil)
53
+ check_sample_array(x)
54
+ n_features = x.shape[1]
55
+ max_vals = x.max(0)
56
+ min_vals = x.min(0)
57
+ @feature_steps = Array.new(n_features) do |n|
58
+ Numo::DFloat.linspace(min_vals[n], max_vals[n], @params[:n_bins] + 1)[0...@params[:n_bins]]
59
+ end
60
+ self
61
+ end
62
+
63
+ # Fit feature ranges to be discretized, then return discretized samples.
64
+ #
65
+ # @overload fit_transform(x) -> Numo::DFloat
66
+ #
67
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
68
+ # @return [Numo::DFloat] The discretized samples.
69
+ def fit_transform(x, _y = nil)
70
+ check_sample_array(x)
71
+ fit(x).transform(x)
72
+ end
73
+
74
+ # Perform discretization of the given samples.
75
+ #
76
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be discretized.
77
+ # @return [Numo::DFloat] The discretized samples.
78
+ def transform(x)
79
+ check_sample_array(x)
80
+ n_samples, n_features = x.shape
81
+ transformed = Numo::DFloat.zeros(n_samples, n_features)
82
+ n_features.times do |n|
83
+ steps = @feature_steps[n]
84
+ @params[:n_bins].times do |bin|
85
+ mask = x[true, n].ge(steps[bin]).where
86
+ transformed[mask, n] = bin
87
+ end
88
+ end
89
+ transformed
90
+ end
91
+
92
+ # Dump marshal data.
93
+ # @return [Hash] The marshal data about BinDiscretizer
94
+ def marshal_dump
95
+ { params: @params,
96
+ feature_steps: @feature_steps }
97
+ end
98
+
99
+ # Load marshal data.
100
+ # @return [nil]
101
+ def marshal_load(obj)
102
+ @params = obj[:params]
103
+ @feature_steps = obj[:feature_steps]
104
+ nil
105
+ end
106
+ end
107
+ end
108
+ end
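
BinDiscretizer#transform assigns each value the index of the highest bin edge it reaches, where the edges are the first n_bins points of an equal-width linspace over the feature's range. A hand-rolled equivalent for one feature column (illustrative values):

    require 'numo/narray'

    n_bins = 4
    col    = Numo::DFloat[-0.44, 0.29, -0.38, 0.04, -0.33]
    edges  = Numo::DFloat.linspace(col.min, col.max, n_bins + 1)[0...n_bins]

    binned = Numo::DFloat.zeros(col.size)
    n_bins.times { |b| binned[col.ge(edges[b]).where] = b } # the highest edge a value reaches wins

    # binned => Numo::DFloat[0, 3, 0, 2, 0]
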
@@ -9,7 +9,7 @@ module Rumale
9
9
  # Normalize samples to unit L2-norm.
10
10
  #
11
11
  # @example
12
- # normalizer = Rumale::Preprocessing::StandardScaler.new
12
+ # normalizer = Rumale::Preprocessing::L2Normalizer.new
13
13
  # new_samples = normalizer.fit_transform(samples)
14
14
  class L2Normalizer
15
15
  include Base::BaseEstimator
@@ -86,14 +86,13 @@ module Rumale
86
86
  return put_leaf(node, y) if stop_growing?(y)
87
87
 
88
88
  # calculate optimal parameters.
89
- feature_id, left_ids, right_ids, left_imp, right_imp, threshold, gain = rand_ids(n_features).map do |fid|
90
- ft = x[true, fid]
91
- limp, rimp, th, ga = best_split(ft, y, whole_impurity)
92
- [fid, ft.le(th).where, ft.gt(th).where, limp, rimp, th, ga]
93
- end.max_by(&:last)
89
+ feature_id, left_imp, right_imp, threshold, gain =
90
+ rand_ids(n_features).map { |n| [n, *best_split(x[true, n], y, whole_impurity)] }.max_by(&:last)
94
91
 
95
92
  return put_leaf(node, y) if gain.nil? || gain.zero?
96
93
 
94
+ left_ids = x[true, feature_id].le(threshold).where
95
+ right_ids = x[true, feature_id].gt(threshold).where
97
96
  node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
98
97
  node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
99
98
 
@@ -107,8 +107,8 @@ module Rumale
107
107
  threshold = @rng.rand(features.min..features.max)
108
108
  l_ids = features.le(threshold).where
109
109
  r_ids = features.gt(threshold).where
110
- l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
111
- r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
110
+ l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
111
+ r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
112
112
  gain = whole_impurity -
113
113
  l_impurity * l_ids.size.fdiv(y.shape[0]) -
114
114
  r_impurity * r_ids.size.fdiv(y.shape[0])
@@ -94,8 +94,8 @@ module Rumale
94
94
  threshold = @rng.rand(features.min..features.max)
95
95
  l_ids = features.le(threshold).where
96
96
  r_ids = features.gt(threshold).where
97
- l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
98
- r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
97
+ l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
98
+ r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
99
99
  gain = whole_impurity -
100
100
  l_impurity * l_ids.size.fdiv(y.shape[0]) -
101
101
  r_impurity * r_ids.size.fdiv(y.shape[0])
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/rumale'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/regressor'
6
+ require 'rumale/tree/node'
7
+
8
+ module Rumale
9
+ module Tree
10
+ # GradientTreeRegressor is a class that implements a decision tree for regression with the exact greedy algorithm.
11
+ # This class is used internally for estimators with gradient tree boosting.
12
+ #
13
+ # *reference*
14
+ # - J H. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
15
+ # - J H. Friedman, "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
16
+ # - T. Chen and C. Guestrin, "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
17
+ #
18
+ class GradientTreeRegressor
19
+ include Base::BaseEstimator
20
+ include Base::Regressor
21
+ include ExtGradientTreeRegressor
22
+
23
+ # Return the importance for each feature.
24
+ # The feature importances are calculated based on the numbers of times the feature is used for splitting.
25
+ # @return [Numo::DFloat] (shape: [n_features])
26
+ attr_reader :feature_importances
27
+
28
+ # Return the learned tree.
29
+ # @return [Node]
30
+ attr_reader :tree
31
+
32
+ # Return the random generator for random selection of feature index.
33
+ # @return [Random]
34
+ attr_reader :rng
35
+
36
+ # Return the values assigned to each leaf.
37
+ # @return [Numo::DFloat] (shape: [n_leaves])
38
+ attr_reader :leaf_weights
39
+
40
+ # Initialize a gradient tree regressor
41
+ #
42
+ # @param reg_lambda [Float] The L2 regularization term on weight.
43
+ # @param shrinkage_rate [Float] The shrinkage rate for weight.
44
+ # @param max_depth [Integer] The maximum depth of the tree.
45
+ # If nil is given, decision tree grows without concern for depth.
46
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
47
+ # If nil is given, number of leaves is not limited.
48
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
49
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
50
+ # If nil is given, split process considers all features.
51
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
52
+ # It is used to randomly determine the order of features when deciding the splitting point.
53
+ def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
54
+ max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
55
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
56
+ max_features: max_features, random_seed: random_seed)
57
+ check_params_float(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate)
58
+ check_params_integer(min_samples_leaf: min_samples_leaf)
59
+ check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
60
+ max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
61
+ min_samples_leaf: min_samples_leaf, max_features: max_features)
62
+ @params = {}
63
+ @params[:reg_lambda] = reg_lambda
64
+ @params[:shrinkage_rate] = shrinkage_rate
65
+ @params[:max_depth] = max_depth
66
+ @params[:max_leaf_nodes] = max_leaf_nodes
67
+ @params[:min_samples_leaf] = min_samples_leaf
68
+ @params[:max_features] = max_features
69
+ @params[:random_seed] = random_seed
70
+ @params[:random_seed] ||= srand
71
+ @tree = nil
72
+ @feature_importances = nil
73
+ @n_leaves = nil
74
+ @leaf_weights = nil
75
+ @rng = Random.new(@params[:random_seed])
76
+ end
77
+
78
+ # Fit the model with given training data.
79
+ #
80
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
81
+ # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
82
+ # @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
83
+ # @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
84
+ # @return [GradientTreeRegressor] The learned regressor itself.
85
+ def fit(x, y, g, h)
86
+ check_sample_array(x)
87
+ check_tvalue_array(y)
88
+ check_sample_tvalue_size(x, y)
89
+ check_params_type(Numo::DFloat, g: g, h: h)
90
+ # Initialize some variables.
91
+ n_features = x.shape[1]
92
+ @params[:max_features] ||= n_features
93
+ @n_leaves = 0
94
+ @leaf_weights = []
95
+ @feature_importances = Numo::DFloat.zeros(n_features)
96
+ # Build tree.
97
+ build_tree(x, y, g, h)
98
+ @leaf_weights = Numo::DFloat[*@leaf_weights]
99
+ self
100
+ end
101
+
102
+ # Predict values for samples.
103
+ #
104
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
105
+ # @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
106
+ def predict(x)
107
+ check_sample_array(x)
108
+ @leaf_weights[apply(x)].dup
109
+ end
110
+
111
+ # Return the index of the leaf that each sample reached.
112
+ #
113
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
114
+ # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
115
+ def apply(x)
116
+ check_sample_array(x)
117
+ Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
118
+ end
119
+
120
+ # Dump marshal data.
121
+ # @return [Hash] The marshal data about DecisionTreeRegressor
122
+ def marshal_dump
123
+ { params: @params,
124
+ tree: @tree,
125
+ feature_importances: @feature_importances,
126
+ leaf_weights: @leaf_weights,
127
+ rng: @rng }
128
+ end
129
+
130
+ # Load marshal data.
131
+ # @return [nil]
132
+ def marshal_load(obj)
133
+ @params = obj[:params]
134
+ @tree = obj[:tree]
135
+ @feature_importances = obj[:feature_importances]
136
+ @leaf_weights = obj[:leaf_weights]
137
+ @rng = obj[:rng]
138
+ nil
139
+ end
140
+
141
+ private
142
+
143
+ def apply_at_node(node, sample)
144
+ return node.leaf_id if node.leaf
145
+ return apply_at_node(node.left, sample) if node.right.nil?
146
+ return apply_at_node(node.right, sample) if node.left.nil?
147
+ if sample[node.feature_id] <= node.threshold
148
+ apply_at_node(node.left, sample)
149
+ else
150
+ apply_at_node(node.right, sample)
151
+ end
152
+ end
153
+
154
+ def build_tree(x, y, g, h)
155
+ @tree = grow_node(0, x, y, g, h)
156
+ nil
157
+ end
158
+
159
+ def grow_node(depth, x, y, g, h)
160
+ # initialize some variables.
161
+ sum_g = g.sum
162
+ sum_h = h.sum
163
+ n_samples, n_features = x.shape
164
+ node = Node.new(depth: depth, n_samples: n_samples)
165
+
166
+ # terminate growing.
167
+ unless @params[:max_leaf_nodes].nil?
168
+ return nil if @n_leaves >= @params[:max_leaf_nodes]
169
+ end
170
+
171
+ return nil if n_samples < @params[:min_samples_leaf]
172
+ return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
173
+
174
+ unless @params[:max_depth].nil?
175
+ return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
176
+ end
177
+
178
+ return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
179
+
180
+ # calculate optimal parameters.
181
+ feature_id, threshold, gain =
182
+ rand_ids(n_features).map { |n| [n, *best_split(x[true, n], g, h, sum_g, sum_h)] }.max_by(&:last)
183
+
184
+ return put_leaf(node, sum_g, sum_h) if gain.nil? || gain.zero?
185
+
186
+ left_ids = x[true, feature_id].le(threshold).where
187
+ right_ids = x[true, feature_id].gt(threshold).where
188
+ node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
189
+ node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])
190
+
191
+ return put_leaf(node, sum_g, sum_h) if node.left.nil? && node.right.nil?
192
+
193
+ @feature_importances[feature_id] += 1.0
194
+
195
+ node.feature_id = feature_id
196
+ node.threshold = threshold
197
+ node.leaf = false
198
+ node
199
+ end
200
+
201
+ def stop_growing?(y)
202
+ y.to_a.uniq.size == 1
203
+ end
204
+
205
+ def put_leaf(node, sum_g, sum_h)
206
+ node.probs = nil
207
+ node.leaf = true
208
+ node.leaf_id = @n_leaves
209
+ weight = -@params[:shrinkage_rate] * sum_g / (sum_h + @params[:reg_lambda])
210
+ @leaf_weights.push(weight)
211
+ @n_leaves += 1
212
+ node
213
+ end
214
+
215
+ def best_split(features, g, h, sum_g, sum_h)
216
+ order = features.sort_index
217
+ sorted_f = features[order].to_a
218
+ sorted_g = g[order].to_a
219
+ sorted_h = h[order].to_a
220
+ find_split_params(sorted_f, sorted_g, sorted_h, sum_g, sum_h, @params[:reg_lambda])
221
+ end
222
+
223
+ def rand_ids(n)
224
+ [*0...n].sample(@params[:max_features], random: @rng)
225
+ end
226
+ end
227
+ end
228
+ end
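
GradientTreeRegressor is primarily an internal building block: it is fit on explicit gradient and hessian vectors and stores one weight per leaf, which #predict looks up via #apply. It can nevertheless be driven directly; the sketch below performs a single boosting step with L2-loss derivatives on made-up data.

    require 'rumale'

    x = Numo::DFloat.new(20, 3).rand
    y = x[true, 0] * 2.0                      # made-up regression target
    y_pred = Numo::DFloat.zeros(20)           # start from a zero prediction
    g = y_pred - y                            # L2-loss gradient
    h = Numo::DFloat.ones(20)                 # L2-loss hessian

    tree = Rumale::Tree::GradientTreeRegressor.new(
      reg_lambda: 0.001, shrinkage_rate: 0.1, max_depth: 3, random_seed: 1
    )
    tree.fit(x, y, g, h)
    y_pred += tree.predict(x)                 # one boosting step
    leaf_ids = tree.apply(x)                  # leaf index per sample
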
@@ -21,7 +21,7 @@ module Rumale
21
21
  # @param feature_id [Integer] The feature index used for evaluation.
22
22
  # @param threshold [Float] The threshold value of the feature for splitting the node.
23
23
  def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
24
- leaf: true, leaf_id: 0,
24
+ leaf: false, leaf_id: nil,
25
25
  left: nil, right: nil, feature_id: 0, threshold: 0.0)
26
26
  @depth = depth
27
27
  @impurity = impurity
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.9.1'
6
+ VERSION = '0.9.2'
7
7
  end
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
17
17
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
18
18
  Rumale currently supports Linear / Kernel Support Vector Machine,
19
19
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
20
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
20
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
21
21
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
22
22
  MSG
23
23
  spec.homepage = 'https://github.com/yoshoku/rumale'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-05-01 00:00:00.000000000 Z
11
+ date: 2019-05-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -99,7 +99,7 @@ description: |
99
99
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
100
100
  Rumale currently supports Linear / Kernel Support Vector Machine,
101
101
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
102
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
102
+ Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
103
103
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
104
104
  email:
105
105
  - yoshoku@outlook.com
@@ -142,6 +142,8 @@ files:
142
142
  - lib/rumale/ensemble/ada_boost_regressor.rb
143
143
  - lib/rumale/ensemble/extra_trees_classifier.rb
144
144
  - lib/rumale/ensemble/extra_trees_regressor.rb
145
+ - lib/rumale/ensemble/gradient_boosting_classifier.rb
146
+ - lib/rumale/ensemble/gradient_boosting_regressor.rb
145
147
  - lib/rumale/ensemble/random_forest_classifier.rb
146
148
  - lib/rumale/ensemble/random_forest_regressor.rb
147
149
  - lib/rumale/evaluation_measure/accuracy.rb
@@ -191,6 +193,7 @@ files:
191
193
  - lib/rumale/polynomial_model/base_factorization_machine.rb
192
194
  - lib/rumale/polynomial_model/factorization_machine_classifier.rb
193
195
  - lib/rumale/polynomial_model/factorization_machine_regressor.rb
196
+ - lib/rumale/preprocessing/bin_discretizer.rb
194
197
  - lib/rumale/preprocessing/l2_normalizer.rb
195
198
  - lib/rumale/preprocessing/label_encoder.rb
196
199
  - lib/rumale/preprocessing/max_abs_scaler.rb
@@ -203,6 +206,7 @@ files:
203
206
  - lib/rumale/tree/decision_tree_regressor.rb
204
207
  - lib/rumale/tree/extra_tree_classifier.rb
205
208
  - lib/rumale/tree/extra_tree_regressor.rb
209
+ - lib/rumale/tree/gradient_tree_regressor.rb
206
210
  - lib/rumale/tree/node.rb
207
211
  - lib/rumale/utils.rb
208
212
  - lib/rumale/validation.rb