rumale 0.9.0 → 0.9.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: a2dfbc60c9d47e741fc91497f8c58ade390e6c8f
- data.tar.gz: d4cbc26e0d81fbe0de5e83d785cc836e9a5b2099
+ metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
+ data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
  SHA512:
- metadata.gz: 7f2b4b8ba5d7511215a2e850add19f0942cbff4157a8373eba1950c0eac9fcd0e44925d3a88b2a709c0308ef4c03cca44c501b710f4a22dc4dd573e6866d94dc
- data.tar.gz: 4630710eef59af88274e9a411a6ad12de7e4a616280f8fc94d185e24c7bc667bf8c1f662425c64cf05f6ec9accd914ac32e1039688d09629b920329ad85354c8
+ metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
+ data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ # 0.9.1
+ - Add class for Extra-Trees classifier.
+ - Add class for Extra-Trees regressor.
+ - Refactor extension modules of decision tree estimators for improving performance.
+
  # 0.9.0
  ## Breaking changes
  - Decide to introduce Ruby extensions for improving performance.
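The two new changelog entries correspond to the `Rumale::Ensemble::ExtraTreesClassifier` and `Rumale::Ensemble::ExtraTreesRegressor` classes added further down in this diff. A minimal usage sketch, assuming synthetic Numo data (the dataset and shapes here are illustrative, not from the diff):

```ruby
require 'rumale'
require 'numo/narray'

# Synthetic data: 100 samples, 4 features; targets depend on the first feature.
x = Numo::DFloat.new(100, 4).rand
labels = Numo::Int32.cast(x[true, 0].gt(0.5).to_a)
values = x[true, 0] * 2.0

cls = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 10, random_seed: 1)
p cls.fit(x, labels).predict(x[0...3, true]).to_a

reg = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 10, random_seed: 1)
p reg.fit(x, values).predict(x[0...3, true]).to_a
```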
data/README.md CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor classifier,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.

  This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c CHANGED
@@ -183,33 +183,32 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, uniqed_features, n_classes) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_labels [Numo::Int32] (shape: [n_samples]) The labels sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @param n_classes [Integer] The number of classes.
  * @return [Array<Float>] The array of optimal parameters: impurities of the child nodes, threshold, and gain.
  */
  static VALUE
- find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f, VALUE n_classes_)
+ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
  {
- long i;
- long curr_pos;
- long next_pos;
- long n_l_elements;
- long n_r_elements;
  const long n_classes = NUM2LONG(n_classes_);
  const long n_elements = RARRAY_LEN(sorted_f);
- const long n_uniq_elements = RARRAY_LEN(uniqed_f);
  const double w_impurity = NUM2DBL(whole_impurity);
+ long iter = 0;
+ long curr_pos = 0;
+ long next_pos = 0;
+ long n_l_elements = 0;
+ long n_r_elements = n_elements;
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+ double next_el;
  double l_impurity;
  double r_impurity;
  double gain;
- double curr_el;
- double next_el;
  VALUE l_histogram = create_zero_vector(n_classes);
  VALUE r_histogram = create_zero_vector(n_classes);
  VALUE opt_params = rb_ary_new2(4);
@@ -217,22 +216,18 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  /* Initialize optimal parameters. */
  rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
- rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+ rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
  rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

  /* Initialize child node variables. */
- n_l_elements = 0;
- n_r_elements = n_elements;
- for (i = 0; i < n_elements; i++) {
- increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+ for (iter = 0; iter < n_elements; iter++) {
+ increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
  }

  /* Find optimal parameters. */
- for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
- /* Find new split point. */
- curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+ while (curr_pos < n_elements && curr_el != last_el) {
  next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
- while (next_pos < n_elements && next_el <= curr_el) {
+ while (next_pos < n_elements && next_el == curr_el) {
  increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
  n_l_elements++;
  decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
@@ -250,6 +245,9 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
  rb_ary_store(opt_params, 3, DBL2NUM(gain));
  }
+ if (next_pos == n_elements) break;
+ curr_pos = next_pos;
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
  }

  return opt_params;
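The refactor above replaces the old per-unique-value loop (which rescanned the sorted array with a `<=` comparison for every entry of `uniqed_f`) with a single pass over the already-sorted feature values. An illustrative Ruby re-expression of the new loop, not the extension code itself (the `gini` helper is a stand-in; labels are assumed encoded as 0...n_classes):

```ruby
def find_split_params_sketch(sorted_f, sorted_y, n_classes, w_impurity)
  n = sorted_f.size
  l_hist = Array.new(n_classes, 0)
  r_hist = Array.new(n_classes, 0)
  sorted_y.each { |c| r_hist[c] += 1 } # everything starts on the right side
  best = [0.0, w_impurity, sorted_f[0], 0.0] # [l_impurity, r_impurity, threshold, gain]
  n_l = 0
  n_r = n
  curr_pos = 0
  curr_el = sorted_f[0]
  last_el = sorted_f[-1]
  while curr_pos < n && curr_el != last_el
    # Move the whole run of values equal to curr_el from the right side to the left.
    next_pos = curr_pos
    while next_pos < n && sorted_f[next_pos] == curr_el
      c = sorted_y[next_pos]
      l_hist[c] += 1
      r_hist[c] -= 1
      n_l += 1
      n_r -= 1
      next_pos += 1
    end
    next_el = sorted_f[next_pos] # first value strictly greater than curr_el
    l_imp = gini(l_hist, n_l)
    r_imp = gini(r_hist, n_r)
    gain = w_impurity - l_imp * n_l.fdiv(n) - r_imp * n_r.fdiv(n)
    # Candidate threshold is the midpoint between adjacent distinct values.
    best = [l_imp, r_imp, 0.5 * (curr_el + next_el), gain] if gain > best[3]
    curr_pos = next_pos
    curr_el = next_el
  end
  best
end

# Stand-in impurity helper (not the extension's implementation).
def gini(hist, n)
  n.zero? ? 0.0 : 1.0 - hist.sum { |c| c.fdiv(n)**2 }
end

p find_split_params_sketch([0.1, 0.1, 0.5, 0.9], [0, 0, 1, 1], 2, 0.5)
# => [0.0, 0.0, 0.3, 0.5] — a perfect split between 0.1 and 0.5
```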
@@ -259,32 +257,31 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets, uniqed_features) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @return [Array<Float>] The array of optimal parameters: impurities of the child nodes, threshold, and gain.
  */
  static VALUE
- find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f)
+ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y)
  {
- long i;
- long curr_pos;
- long next_pos;
- long n_l_elements;
- long n_r_elements;
  const long n_elements = RARRAY_LEN(sorted_f);
- const long n_uniq_elements = RARRAY_LEN(uniqed_f);
  const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
  const double w_impurity = NUM2DBL(whole_impurity);
+ long iter = 0;
+ long curr_pos = 0;
+ long next_pos = 0;
+ long n_l_elements = 0;
+ long n_r_elements = n_elements;
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+ double next_el;
  double l_impurity;
  double r_impurity;
  double gain;
- double curr_el;
- double next_el;
  VALUE l_sum_vec = create_zero_vector(n_dimensions);
  VALUE r_sum_vec = create_zero_vector(n_dimensions);
  VALUE l_target_vecs = rb_ary_new();
@@ -295,24 +292,20 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  /* Initialize optimal parameters. */
  rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
- rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+ rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
  rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

  /* Initialize child node variables. */
- n_l_elements = 0;
- n_r_elements = n_elements;
- for (i = 0; i < n_elements; i++) {
- target = rb_ary_entry(sorted_y, i);
+ for (iter = 0; iter < n_elements; iter++) {
+ target = rb_ary_entry(sorted_y, iter);
  add_sum_vec(r_sum_vec, target);
  rb_ary_push(r_target_vecs, target);
  }

  /* Find optimal parameters. */
- for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
- /* Find new split point. */
- curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+ while (curr_pos < n_elements && curr_el != last_el) {
  next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
- while (next_pos < n_elements && next_el <= curr_el) {
+ while (next_pos < n_elements && next_el == curr_el) {
  target = rb_ary_entry(sorted_y, next_pos);
  add_sum_vec(l_sum_vec, target);
  rb_ary_push(l_target_vecs, target);
@@ -333,6 +326,9 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
  rb_ary_store(opt_params, 3, DBL2NUM(gain));
  }
+ if (next_pos == n_elements) break;
+ curr_pos = next_pos;
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
  }

  return opt_params;
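The regression variant follows the same single-pass pattern as the classifier; only the per-side state differs: running sums of the (possibly multi-output) targets (`l_sum_vec`/`r_sum_vec`) plus the target vectors themselves, from which the child-node impurity is recomputed at each candidate threshold. A stand-in MSE helper sketching that computation (one plausible multi-output definition, assumed rather than taken from the extension):

```ruby
# Mean over samples of the mean squared deviation from the per-output mean,
# using the running sum vector the scan loop already maintains.
def mse_impurity(target_vecs, sum_vec)
  return 0.0 if target_vecs.empty?
  n = target_vecs.size
  mean = sum_vec.map { |s| s.fdiv(n) }
  target_vecs.sum { |t| t.each_index.sum { |d| (t[d] - mean[d])**2 }.fdiv(t.size) }.fdiv(n)
end

p mse_impurity([[1.0], [3.0]], [4.0]) # => 1.0 (variance of {1, 3})
```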
@@ -411,8 +407,8 @@ void Init_rumale(void)
  */
  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");

- rb_define_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
- rb_define_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
- rb_define_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
- rb_define_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
+ rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+ rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+ rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+ rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
  }
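A self-contained Ruby analogy for the switch to `rb_define_private_method` (illustrative only; `SplitFinder` and `SomeTree` are hypothetical names): a private method defined in a mixed-in module can still be called receiver-less from inside the including class, which matches how `best_split` invokes `find_split_params` in the Ruby hunks further down, so the extension methods no longer need to be public.

```ruby
module SplitFinder
  private

  # Stand-in for the C-implemented find_split_params.
  def find_split_params(*args)
    args
  end
end

class SomeTree
  include SplitFinder

  def best_split(features)
    find_split_params('gini', features) # receiver-less call to a private mix-in method
  end
end

p SomeTree.new.best_split([0.1, 0.2]) # => ["gini", [0.1, 0.2]]
# SomeTree.new.find_split_params('gini') would raise NoMethodError (private method)
```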
data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED
@@ -0,0 +1,135 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/ensemble/random_forest_classifier'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # The features of the algorithm of extremely randomized trees are
+     # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesClassifier < RandomForestClassifier
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreesClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct trees.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::ExtraTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x, y)
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+       def apply(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreesClassifier.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+     end
+   end
+ end
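As the class comment states, the extra-trees ensemble skips bagging: `fit` above hands the full `(x, y)` to every tree, so per-tree diversity comes only from the randomized thresholds and feature subsampling. A short side-by-side sketch on assumed synthetic data (`score` is the accuracy method from the classifier base module):

```ruby
require 'rumale'
require 'numo/narray'

x = Numo::DFloat.new(200, 4).rand
y = Numo::Int32.cast(x[true, 0].gt(0.5).to_a)

rf = Rumale::Ensemble::RandomForestClassifier.new(n_estimators: 20, random_seed: 1)
et = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
puts "random forest: #{rf.fit(x, y).score(x, y)}"
puts "extra-trees:   #{et.fit(x, y).score(x, y)}"
```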
data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED
@@ -0,0 +1,121 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/extra_tree_regressor'
+ require 'rumale/ensemble/random_forest_regressor'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # The features of the algorithm of extremely randomized trees are
+     # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesRegressor < RandomForestRegressor
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreesRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::ExtraTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x, y)
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+       def apply(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreesRegressor.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+     end
+   end
+ end
data/lib/rumale/tree/decision_tree_classifier.rb CHANGED
@@ -155,9 +155,9 @@ module Rumale
  def best_split(features, y, whole_impurity)
    order = features.sort_index
    sorted_f = features[order].to_a
-   sorted_y = y[order, true].to_a.flatten
+   sorted_y = y[order, 0].to_a
    n_classes = @classes.size
-   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq, n_classes)
+   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
  end

  def impurity(y)
data/lib/rumale/tree/decision_tree_regressor.rb CHANGED
@@ -129,7 +129,7 @@ module Rumale
    order = features.sort_index
    sorted_f = features[order].to_a
    sorted_y = y[order, true].to_a
-   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq)
+   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
  end

  def impurity(y)
data/lib/rumale/tree/extra_tree_classifier.rb ADDED
@@ -0,0 +1,119 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/decision_tree_classifier'
+
+ module Rumale
+   module Tree
+     # ExtraTreeClassifier is a class that implements extra randomized tree for classification.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Tree::ExtraTreeClassifier.new(
+     #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreeClassifier < DecisionTreeClassifier
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the learned tree.
+       # @return [Node]
+       attr_reader :tree
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Return the labels assigned each leaf.
+       # @return [Numo::Int32] (size: n_leafs)
+       attr_reader :leaf_labels
+
+       # Create a new classifier with extra randomized tree algorithm.
+       #
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                      random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                               min_samples_leaf: min_samples_leaf, max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreeClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         super
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreeClassifier.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+
+       private
+
+       def best_split(features, y, whole_impurity)
+         threshold = @rng.rand(features.min..features.max)
+         l_ids = features.le(threshold).where
+         r_ids = features.gt(threshold).where
+         l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+         r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+         gain = whole_impurity -
+                l_impurity * l_ids.size.fdiv(y.shape[0]) -
+                r_impurity * r_ids.size.fdiv(y.shape[0])
+         [l_impurity, r_impurity, threshold, gain]
+       end
+     end
+   end
+ end
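The `best_split` override above is the heart of the extra-tree algorithm: rather than scanning every candidate threshold the way the parent DecisionTreeClassifier does, it draws one threshold uniformly from [min, max] of the sampled feature and scores only that cut. A toy illustration with an assumed tiny dataset (requires numo-narray):

```ruby
require 'numo/narray'

features = Numo::DFloat[0.1, 0.4, 0.5, 0.9]
rng = Random.new(1)
threshold = rng.rand(features.min..features.max) # one uniformly drawn cut point
l_ids = features.le(threshold).where             # indices going to the left child
r_ids = features.gt(threshold).where             # indices going to the right child
puts "threshold=#{threshold.round(3)}, left=#{l_ids.to_a}, right=#{r_ids.to_a}"
```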
data/lib/rumale/tree/extra_tree_regressor.rb ADDED
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Tree
+     # ExtraTreeRegressor is a class that implements extra randomized tree for regression.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Tree::ExtraTreeRegressor.new(
+     #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreeRegressor < DecisionTreeRegressor
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the learned tree.
+       # @return [Node]
+       attr_reader :tree
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Return the values assigned each leaf.
+       # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
+       attr_reader :leaf_values
+
+       # Create a new regressor with extra randomized tree algorithm.
+       #
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                      random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                               min_samples_leaf: min_samples_leaf, max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreeRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         super
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreeRegressor.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+
+       private
+
+       def best_split(features, y, whole_impurity)
+         threshold = @rng.rand(features.min..features.max)
+         l_ids = features.le(threshold).where
+         r_ids = features.gt(threshold).where
+         l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+         r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+         gain = whole_impurity -
+                l_impurity * l_ids.size.fdiv(y.shape[0]) -
+                r_impurity * r_ids.size.fdiv(y.shape[0])
+         [l_impurity, r_impurity, threshold, gain]
+       end
+     end
+   end
+ end
data/lib/rumale/version.rb CHANGED
@@ -3,5 +3,5 @@
  # Rumale is a machine learning library in Ruby.
  module Rumale
    # The version of Rumale you are using.
-   VERSION = '0.9.0'
+   VERSION = '0.9.1'
  end
data/lib/rumale.rb CHANGED
@@ -45,10 +45,14 @@ require 'rumale/tree/node'
  require 'rumale/tree/base_decision_tree'
  require 'rumale/tree/decision_tree_classifier'
  require 'rumale/tree/decision_tree_regressor'
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/tree/extra_tree_regressor'
  require 'rumale/ensemble/ada_boost_classifier'
  require 'rumale/ensemble/ada_boost_regressor'
  require 'rumale/ensemble/random_forest_classifier'
  require 'rumale/ensemble/random_forest_regressor'
+ require 'rumale/ensemble/extra_trees_classifier'
+ require 'rumale/ensemble/extra_trees_regressor'
  require 'rumale/clustering/k_means'
  require 'rumale/clustering/dbscan'
  require 'rumale/decomposition/pca'
data/rumale.gemspec CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale currently supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
  MSG
  spec.homepage = 'https://github.com/yoshoku/rumale'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rumale
  version: !ruby/object:Gem::Version
- version: 0.9.0
+ version: 0.9.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-04-22 00:00:00.000000000 Z
+ date: 2019-05-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray
@@ -99,7 +99,7 @@ description: |
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale currently supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
  email:
  - yoshoku@outlook.com
@@ -140,6 +140,8 @@ files:
  - lib/rumale/decomposition/pca.rb
  - lib/rumale/ensemble/ada_boost_classifier.rb
  - lib/rumale/ensemble/ada_boost_regressor.rb
+ - lib/rumale/ensemble/extra_trees_classifier.rb
+ - lib/rumale/ensemble/extra_trees_regressor.rb
  - lib/rumale/ensemble/random_forest_classifier.rb
  - lib/rumale/ensemble/random_forest_regressor.rb
  - lib/rumale/evaluation_measure/accuracy.rb
@@ -199,6 +201,8 @@ files:
  - lib/rumale/tree/base_decision_tree.rb
  - lib/rumale/tree/decision_tree_classifier.rb
  - lib/rumale/tree/decision_tree_regressor.rb
+ - lib/rumale/tree/extra_tree_classifier.rb
+ - lib/rumale/tree/extra_tree_regressor.rb
  - lib/rumale/tree/node.rb
  - lib/rumale/utils.rb
  - lib/rumale/validation.rb